From bca304ff3e1a52a58bf5c0564affbca35ff278bc Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 23 Aug 2018 22:22:27 -0700 Subject: [PATCH] Fix an issue that container/sandbox can't be stopped. Signed-off-by: Lantao Liu --- pkg/server/container_stop.go | 78 ++++++++++++++++++++---------------- pkg/server/sandbox_stop.go | 13 +++++- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/pkg/server/container_stop.go b/pkg/server/container_stop.go index b8f980a4281b..ca1e430d9863 100644 --- a/pkg/server/container_stop.go +++ b/pkg/server/container_stop.go @@ -33,7 +33,10 @@ import ( // killContainerTimeout is the timeout that we wait for the container to // be SIGKILLed. -const killContainerTimeout = 2 * time.Minute +// The timeout is set to 1 min, because the default CRI operation timeout +// for StopContainer is (2 min + stop timeout). Set to 1 min, so that we +// have enough time for kill(all=true) and kill(all=false). +const killContainerTimeout = 1 * time.Minute // StopContainer stops a running container with a grace period (i.e., timeout). func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) { @@ -63,6 +66,16 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore return nil } + task, err := container.Container.Task(ctx, nil) + if err != nil { + if !errdefs.IsNotFound(err) { + return errors.Wrapf(err, "failed to stop container, task not found for container %q", id) + } + return nil + } + + // We only need to kill the task. The event handler will Delete the + // task from containerd after it handles the Exited event. if timeout > 0 { stopSignal := unix.SIGTERM image, err := c.imageStore.Get(container.ImageRef) @@ -81,52 +94,47 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore } } logrus.Infof("Stop container %q with signal %v", id, stopSignal) - task, err := container.Container.Task(ctx, nil) - if err != nil { - if !errdefs.IsNotFound(err) { - return errors.Wrapf(err, "failed to stop container, task not found for container %q", id) - } - return nil - } - if task != nil { - if err = task.Kill(ctx, stopSignal); err != nil { - if !errdefs.IsNotFound(err) { - return errors.Wrapf(err, "failed to stop container %q", id) - } - // Move on to make sure container status is updated. - } + if err = task.Kill(ctx, stopSignal); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrapf(err, "failed to stop container %q", id) } - err = c.waitContainerStop(ctx, container, timeout) - if err == nil { + if err = c.waitContainerStop(ctx, container, timeout); err == nil { return nil } - logrus.WithError(err).Errorf("Stop container %q timed out", id) + logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be stopped", id) } - task, err := container.Container.Task(ctx, nil) - if err != nil { - if !errdefs.IsNotFound(err) { - return errors.Wrapf(err, "failed to stop container, task not found for container %q", id) - } + logrus.Infof("Kill container %q", id) + if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrapf(err, "failed to kill container %q", id) + } + + // Wait for a fixed timeout until container stop is observed by event monitor. + if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil { return nil } - // Event handler will Delete the container from containerd after it handles the Exited event. - logrus.Infof("Kill container %q", id) - if task != nil { - if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil { - if !errdefs.IsNotFound(err) { - return errors.Wrapf(err, "failed to kill container %q", id) - } - // Move on to make sure container status is updated. - } + logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be killed", id) + + // This is a fix for `runc`, and should not break other runtimes. With + // containerd.WithKillAll, `runc` will get all processes from the container + // cgroups, and kill them. However, sometimes the processes may be moved + // out from the container cgroup, e.g. users manually move them by mistake, + // or systemd.Delegate=true is not set. + // In these cases, we should try our best to do cleanup, kill the container + // without containerd.WithKillAll, so that runc can kill the container init + // process directly. + // NOTE(random-liu): If pid namespace is shared inside the pod, non-init processes + // of this container will be left running until the pause container is stopped. + logrus.Infof("Kill container %q init process", id) + if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrapf(err, "failed to kill container %q init process", id) } // Wait for a fixed timeout until container stop is observed by event monitor. - if err := c.waitContainerStop(ctx, container, killContainerTimeout); err != nil { - return errors.Wrapf(err, "an error occurs during waiting for container %q to stop", id) + if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil { + return nil } - return nil + return errors.Wrapf(err, "an error occurs during waiting for container %q init process to be killed", id) } // waitContainerStop waits for container to be stopped until timeout exceeds or context is cancelled. diff --git a/pkg/server/sandbox_stop.go b/pkg/server/sandbox_stop.go index 9def53576539..2d5b3ca3380f 100644 --- a/pkg/server/sandbox_stop.go +++ b/pkg/server/sandbox_stop.go @@ -109,11 +109,20 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst } // Kill the sandbox container. - err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll) - if err != nil && !errdefs.IsNotFound(err) { + if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) { return errors.Wrap(err, "failed to kill sandbox container") } + if err = c.waitSandboxStop(ctx, sandbox, killContainerTimeout); err == nil { + return nil + } + logrus.WithError(err).Errorf("An error occurs during waiting for sandbox %q to be killed", sandbox.ID) + + // Kill the sandbox container init process. + if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to kill sandbox container init process") + } + return c.waitSandboxStop(ctx, sandbox, killContainerTimeout) }