diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790b..6783378d39 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -213,6 +213,9 @@ In other words, do not use it unless really needed. *-s*, *--leave-stopped*:: Leave tasks in stopped state after checkpoint, instead of killing. +*--no-resume-on-error*:: + Leave tasks in stopped state even if checkpoint completed unsuccessfully. + *--external* __type__**[**__id__**]:**__value__:: Dump an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal diff --git a/criu/config.c b/criu/config.c index 1322a490ab..a380a836ac 100644 --- a/criu/config.c +++ b/criu/config.c @@ -412,6 +412,7 @@ void init_opts(void) /* Default options */ opts.final_state = TASK_DEAD; + opts.resume_on_dump_error = true; INIT_LIST_HEAD(&opts.ext_mounts); INIT_LIST_HEAD(&opts.inherit_fds); INIT_LIST_HEAD(&opts.external); @@ -622,6 +623,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "tree", required_argument, 0, 't' }, { "leave-stopped", no_argument, 0, 's' }, { "leave-running", no_argument, 0, 'R' }, + BOOL_OPT("resume-on-error", &opts.resume_on_dump_error), BOOL_OPT("restore-detached", &opts.restore_detach), BOOL_OPT("restore-sibling", &opts.restore_sibling), BOOL_OPT("daemon", &opts.restore_detach), diff --git a/criu/cr-dump.c b/criu/cr-dump.c index ee5974acc9..3454e04923 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2070,7 +2070,9 @@ static int cr_dump_finish(int ret) * consistency of the FS and other resources, we simply * start rollback procedure and cleanup everything. 
*/ - if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { + if (opts.resume_on_dump_error && (ret || post_dump_ret)) + opts.final_state = TASK_ALIVE; + if (opts.final_state == TASK_ALIVE) { unsuspend_lsm(); network_unlock(); delete_link_remaps(); @@ -2082,7 +2084,7 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; - pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); + pstree_switch_state(root_item, opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); seccomp_free_entries(); diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ffe..f3b965f2ce 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -483,6 +483,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_stopped && req->leave_stopped) opts.final_state = TASK_STOPPED; + if (req->has_resume_on_dump_error) + opts.resume_on_dump_error = req->resume_on_dump_error; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..6609ace0e2 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -394,6 +394,8 @@ int main(int argc, char *argv[], char *envp[]) " -d|--restore-detached detach after restore\n" " -S|--restore-sibling restore root task as sibling\n" " -s|--leave-stopped leave tasks in stopped state after checkpoint\n" + " --no-resume-on-error\n" + " don't resume tasks on dump failure if they were stopped\n" " -R|--leave-running leave tasks in running state after checkpoint\n" " -D|--images-dir DIR directory for image files\n" " --pidfile FILE write root task, service or page-server pid to FILE\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 60cf9437e6..a8da23f392 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -131,6 +131,7 @@ struct cr_options { bool daemon_mode; }; int restore_sibling; + int resume_on_dump_error; bool ext_unix_sk; int shell_job; int 
handle_file_locks; diff --git a/images/rpc.proto b/images/rpc.proto index 1a4722a9ce..b7f133534c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -145,6 +145,7 @@ message criu_opts { optional bool leave_stopped = 69; optional bool display_stats = 70; optional bool log_to_stderr = 71; + optional bool resume_on_dump_error = 72 [default = true]; /* optional bool check_mounts = 128; */ } diff --git a/test/jenkins/criu-stop.sh b/test/jenkins/criu-stop.sh index 64da2ee8af..a3c86f79fa 100644 --- a/test/jenkins/criu-stop.sh +++ b/test/jenkins/criu-stop.sh @@ -5,3 +5,4 @@ set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run -t zdtm/transition/fork --stop --iter 3 || fail +./test/zdtm.py run -t zdtm/static/sigtrap --stop-on-error || fail diff --git a/test/zdtm.py b/test/zdtm.py index 7a7cdfd3b6..412a2d6154 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1055,6 +1055,7 @@ def __init__(self, opts): self.__user = bool(opts['user']) self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) + self.__stop_on_error = bool(opts['stop_on_error']) self.__stream = bool(opts['stream']) self.__show_stats = bool(opts['show_stats']) self.__lazy_pages_p = None @@ -1390,6 +1391,8 @@ def dump(self, action, opts=[]): if self.__leave_stopped: a_opts += ['--leave-stopped'] + if self.__stop_on_error: + a_opts += ['--no-resume-on-error'] if self.__empty_ns: a_opts += ['--empty-ns', 'net'] if self.__pre_dump_mode: @@ -1399,9 +1402,16 @@ def dump(self, action, opts=[]): if self.__lazy_migrate and action == "dump": a_opts += ["--lazy-pages", "--port", "12345"] + self.__tls nowait = True - self.__dump_process = self.__criu_act(action, - opts=a_opts + opts, - nowait=nowait) + try: + self.__dump_process = self.__criu_act(action, + opts=a_opts + opts, + nowait=nowait) + except test_fail_expected_exc: + if self.__stop_on_error: + pstree_check_stopped(self.__test.getpid(), "--no-resume-on-error") + pstree_signal(self.__test.getpid(), signal.SIGKILL) + raise + if 
self.__stream: ret = self.wait_for_criu_image_streamer() if ret: @@ -1888,10 +1898,10 @@ def is_thread_stopped(status): return True -def pstree_check_stopped(root_pid): +def pstree_check_stopped(root_pid, test_flag="--leave-stopped"): for pid in pstree_each_pid(root_pid): if not is_proc_stopped(pid): - raise test_fail_exc("CRIU --leave-stopped %s" % pid) + raise test_fail_exc("CRIU %s %s" % (test_flag, pid)) def pstree_signal(root_pid, signal): @@ -2083,7 +2093,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'stop_on_error') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2706,6 +2716,9 @@ def get_cli_args(): rp.add_argument("--stop", help="Check that --leave-stopped option stops ps tree.", action='store_true') + rp.add_argument("--stop-on-error", + help="Check that --no-resume-on-error stops ps tree on dump error.", + action='store_true') rp.add_argument("--iters", help="Do CR cycle several times before check (n[:pause])") rp.add_argument("--fault", help="Test fault injection")