Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dump: Don't unfreeze tasks on dump failure with --no-resume-on-error. #2215

Draft
wants to merge 2 commits into
base: criu-dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions Documentation/criu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ In other words, do not use it unless really needed.
*-s*, *--leave-stopped*::
Leave tasks in stopped state after checkpoint, instead of killing.

*--no-resume-on-error*::
Leave tasks in stopped state even if checkpoint completed unsuccessfully.

Comment on lines +216 to +218
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add some lines about what situations it is useful and why? What benefit does it provide?
How should a user decide what is preferred for them in case of criu failure?
Current doc lines are copy'n'paste from above without adding any useful context.

*--external* __type__**[**__id__**]:**__value__::
Dump an instance of an external resource. The generic syntax is
'type' of resource, followed by resource 'id' (enclosed in literal
Expand Down
2 changes: 2 additions & 0 deletions criu/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ void init_opts(void)

/* Default options */
opts.final_state = TASK_DEAD;
opts.resume_on_dump_error = true;
INIT_LIST_HEAD(&opts.ext_mounts);
INIT_LIST_HEAD(&opts.inherit_fds);
INIT_LIST_HEAD(&opts.external);
Expand Down Expand Up @@ -622,6 +623,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
{ "tree", required_argument, 0, 't' },
{ "leave-stopped", no_argument, 0, 's' },
{ "leave-running", no_argument, 0, 'R' },
BOOL_OPT("resume-on-error", &opts.resume_on_dump_error),
BOOL_OPT("restore-detached", &opts.restore_detach),
BOOL_OPT("restore-sibling", &opts.restore_sibling),
BOOL_OPT("daemon", &opts.restore_detach),
Expand Down
6 changes: 4 additions & 2 deletions criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -2070,7 +2070,9 @@ static int cr_dump_finish(int ret)
* consistency of the FS and other resources, we simply
* start rollback procedure and cleanup everything.
*/
if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) {
if (opts.resume_on_dump_error && (ret || post_dump_ret))
opts.final_state = TASK_ALIVE;
if (opts.final_state == TASK_ALIVE) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to check that final_state isn't TASK_DEAD here. If finale_state is TASK_STOPPED, we have to do all these actions, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is kinda strange that we don't unlock network for --leave-stopped. One would not be able to just resume such stopped container with SIGCONT, but will also need to manually unlock network and remove link remaps and do other cleanups.

unsuspend_lsm();
network_unlock();
delete_link_remaps();
Expand All @@ -2082,7 +2084,7 @@ static int cr_dump_finish(int ret)

if (arch_set_thread_regs(root_item, true) < 0)
return -1;
pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state);
pstree_switch_state(root_item, opts.final_state);
timing_stop(TIME_FROZEN);
free_pstree(root_item);
seccomp_free_entries();
Expand Down
3 changes: 3 additions & 0 deletions criu/cr-service.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
if (req->has_leave_stopped && req->leave_stopped)
opts.final_state = TASK_STOPPED;

if (req->has_resume_on_dump_error)
opts.resume_on_dump_error = req->resume_on_dump_error;

if (!req->has_pid) {
req->has_pid = true;
req->pid = ids.pid;
Expand Down
2 changes: 2 additions & 0 deletions criu/crtools.c
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,8 @@ int main(int argc, char *argv[], char *envp[])
" -d|--restore-detached detach after restore\n"
" -S|--restore-sibling restore root task as sibling\n"
" -s|--leave-stopped leave tasks in stopped state after checkpoint\n"
" --no-resume-on-error\n"
" don't resume tasks on dump failure if they were stopped\n"
osctobe marked this conversation as resolved.
Show resolved Hide resolved
" -R|--leave-running leave tasks in running state after checkpoint\n"
" -D|--images-dir DIR directory for image files\n"
" --pidfile FILE write root task, service or page-server pid to FILE\n"
Expand Down
1 change: 1 addition & 0 deletions criu/include/cr_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ struct cr_options {
bool daemon_mode;
};
int restore_sibling;
int resume_on_dump_error;
bool ext_unix_sk;
int shell_job;
int handle_file_locks;
Expand Down
1 change: 1 addition & 0 deletions images/rpc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ message criu_opts {
optional bool leave_stopped = 69;
optional bool display_stats = 70;
optional bool log_to_stderr = 71;
optional bool resume_on_dump_error = 72 [default = true];
/* optional bool check_mounts = 128; */
}

Expand Down
1 change: 1 addition & 0 deletions test/jenkins/criu-stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ set -e
source `dirname $0`/criu-lib.sh
prep
./test/zdtm.py run -t zdtm/transition/fork --stop --iter 3 || fail
./test/zdtm.py run -t zdtm/static/sigtrap --stop-on-error || fail
25 changes: 19 additions & 6 deletions test/zdtm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1055,6 +1055,7 @@ def __init__(self, opts):
self.__user = bool(opts['user'])
self.__rootless = bool(opts['rootless'])
self.__leave_stopped = bool(opts['stop'])
self.__stop_on_error = bool(opts['stop_on_error'])
self.__stream = bool(opts['stream'])
self.__show_stats = bool(opts['show_stats'])
self.__lazy_pages_p = None
Expand Down Expand Up @@ -1390,6 +1391,8 @@ def dump(self, action, opts=[]):

if self.__leave_stopped:
a_opts += ['--leave-stopped']
if self.__stop_on_error:
a_opts += ['--no-resume-on-error']
if self.__empty_ns:
a_opts += ['--empty-ns', 'net']
if self.__pre_dump_mode:
Expand All @@ -1399,9 +1402,16 @@ def dump(self, action, opts=[]):
if self.__lazy_migrate and action == "dump":
a_opts += ["--lazy-pages", "--port", "12345"] + self.__tls
nowait = True
self.__dump_process = self.__criu_act(action,
opts=a_opts + opts,
nowait=nowait)
try:
self.__dump_process = self.__criu_act(action,
opts=a_opts + opts,
nowait=nowait)
except test_fail_expected_exc:
if self.__stop_on_error:
pstree_check_stopped(self.__test.getpid(), "--no-resume-on-error")
pstree_signal(self.__test.getpid(), signal.SIGKILL)
raise

if self.__stream:
ret = self.wait_for_criu_image_streamer()
if ret:
Expand Down Expand Up @@ -1888,10 +1898,10 @@ def is_thread_stopped(status):
return True


def pstree_check_stopped(root_pid):
def pstree_check_stopped(root_pid, test_flag="--leave_stopped"):
for pid in pstree_each_pid(root_pid):
if not is_proc_stopped(pid):
raise test_fail_exc("CRIU --leave-stopped %s" % pid)
raise test_fail_exc("CRIU %s %s" % (test_flag, pid))


def pstree_signal(root_pid, signal):
Expand Down Expand Up @@ -2083,7 +2093,7 @@ def run_test(self, name, desc, flavor):
'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup',
'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream',
'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode',
'rootless')
'rootless', 'stop_on_error')
arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd}))

if self.__use_log:
Expand Down Expand Up @@ -2706,6 +2716,9 @@ def get_cli_args():
rp.add_argument("--stop",
help="Check that --leave-stopped option stops ps tree.",
action='store_true')
rp.add_argument("--stop-on-error",
help="Check that --no-resume-on-error stops ps tree on dump error.",
action='store_true')
rp.add_argument("--iters",
help="Do CR cycle several times before check (n[:pause])")
rp.add_argument("--fault", help="Test fault injection")
Expand Down