
Commit

Merge pull request #885 from Random-Liu/enhance-container-stop
Fix an issue where a container/sandbox can't be stopped.
Random-Liu committed Aug 27, 2018
2 parents a3af739 + bca304f commit df67dff
Showing 2 changed files with 54 additions and 37 deletions.
78 changes: 43 additions & 35 deletions pkg/server/container_stop.go
@@ -33,7 +33,10 @@ import (
 
 // killContainerTimeout is the timeout that we wait for the container to
 // be SIGKILLed.
-const killContainerTimeout = 2 * time.Minute
+// The timeout is set to 1 min, because the default CRI operation timeout
+// for StopContainer is (2 min + stop timeout). Set to 1 min, so that we
+// have enough time for kill(all=true) and kill(all=false).
+const killContainerTimeout = 1 * time.Minute
 
 // StopContainer stops a running container with a grace period (i.e., timeout).
 func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) {
@@ -63,6 +66,16 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
 		return nil
 	}
 
+	task, err := container.Container.Task(ctx, nil)
+	if err != nil {
+		if !errdefs.IsNotFound(err) {
+			return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
+		}
+		return nil
+	}
+
+	// We only need to kill the task. The event handler will Delete the
+	// task from containerd after it handles the Exited event.
 	if timeout > 0 {
 		stopSignal := unix.SIGTERM
 		image, err := c.imageStore.Get(container.ImageRef)
@@ -81,52 +94,47 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
 			}
 		}
 		logrus.Infof("Stop container %q with signal %v", id, stopSignal)
-		task, err := container.Container.Task(ctx, nil)
-		if err != nil {
-			if !errdefs.IsNotFound(err) {
-				return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
-			}
-			return nil
-		}
-		if task != nil {
-			if err = task.Kill(ctx, stopSignal); err != nil {
-				if !errdefs.IsNotFound(err) {
-					return errors.Wrapf(err, "failed to stop container %q", id)
-				}
-				// Move on to make sure container status is updated.
-			}
+		if err = task.Kill(ctx, stopSignal); err != nil && !errdefs.IsNotFound(err) {
+			return errors.Wrapf(err, "failed to stop container %q", id)
 		}
 
-		err = c.waitContainerStop(ctx, container, timeout)
-		if err == nil {
+		if err = c.waitContainerStop(ctx, container, timeout); err == nil {
 			return nil
 		}
-		logrus.WithError(err).Errorf("Stop container %q timed out", id)
+		logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be stopped", id)
 	}
 
-	task, err := container.Container.Task(ctx, nil)
-	if err != nil {
-		if !errdefs.IsNotFound(err) {
-			return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
-		}
+	logrus.Infof("Kill container %q", id)
+	if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) {
+		return errors.Wrapf(err, "failed to kill container %q", id)
+	}
+
+	// Wait for a fixed timeout until container stop is observed by event monitor.
+	if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil {
 		return nil
 	}
-	// Event handler will Delete the container from containerd after it handles the Exited event.
-	logrus.Infof("Kill container %q", id)
-	if task != nil {
-		if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil {
-			if !errdefs.IsNotFound(err) {
-				return errors.Wrapf(err, "failed to kill container %q", id)
-			}
-			// Move on to make sure container status is updated.
-		}
+	logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be killed", id)
+
+	// This is a fix for `runc`, and should not break other runtimes. With
+	// containerd.WithKillAll, `runc` will get all processes from the container
+	// cgroups, and kill them. However, sometimes the processes may be moved
+	// out from the container cgroup, e.g. users manually move them by mistake,
+	// or systemd.Delegate=true is not set.
+	// In these cases, we should try our best to do cleanup, kill the container
+	// without containerd.WithKillAll, so that runc can kill the container init
+	// process directly.
+	// NOTE(random-liu): If pid namespace is shared inside the pod, non-init processes
+	// of this container will be left running until the pause container is stopped.
+	logrus.Infof("Kill container %q init process", id)
+	if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
+		return errors.Wrapf(err, "failed to kill container %q init process", id)
 	}
 
 	// Wait for a fixed timeout until container stop is observed by event monitor.
-	if err := c.waitContainerStop(ctx, container, killContainerTimeout); err != nil {
-		return errors.Wrapf(err, "an error occurs during waiting for container %q to stop", id)
+	if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil {
+		return nil
 	}
-	return nil
+	return errors.Wrapf(err, "an error occurs during waiting for container %q init process to be killed", id)
 }
 
 // waitContainerStop waits for container to be stopped until timeout exceeds or context is cancelled.
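For reference, the new stop flow in container_stop.go escalates in three steps: the stop signal with the caller's grace period, then SIGKILL of every process in the container cgroup (containerd.WithKillAll), then SIGKILL of the init process alone, waiting killContainerTimeout (1 min) after each SIGKILL so that both attempts fit inside the CRI deadline of roughly 2 min plus the stop timeout. The Go sketch below is a minimal, self-contained illustration of that escalation, not the cri code itself: the package, stopTask, and killEscalationTimeout names are made up here, it sends plain SIGTERM instead of resolving the image's StopSignal, and it waits on task.Wait rather than the event monitor the real stopContainer uses.

// Hypothetical sketch package illustrating the kill escalation; not part of cri.
package containerstop

import (
	"context"
	"time"

	"github.com/containerd/containerd"
	"github.com/containerd/containerd/errdefs"
	"github.com/pkg/errors"
	"golang.org/x/sys/unix"
)

// killEscalationTimeout mirrors killContainerTimeout above: 1 min per SIGKILL
// attempt, so kill(all=true) and kill(all=false) both fit in the CRI deadline.
const killEscalationTimeout = 1 * time.Minute

// stopTask escalates: stop signal + grace period, SIGKILL(all), SIGKILL(init).
func stopTask(ctx context.Context, task containerd.Task, gracePeriod time.Duration) error {
	// Register the wait before sending any signal so the exit is not missed.
	exitCh, err := task.Wait(ctx)
	if err != nil {
		return errors.Wrap(err, "failed to wait on task")
	}
	waitExit := func(timeout time.Duration) bool {
		select {
		case <-exitCh:
			return true
		case <-time.After(timeout):
			return false
		case <-ctx.Done():
			return false
		}
	}

	// Step 1: graceful stop with SIGTERM (the real code resolves the image's
	// StopSignal first) and the caller-provided grace period.
	if gracePeriod > 0 {
		if err := task.Kill(ctx, unix.SIGTERM); err != nil && !errdefs.IsNotFound(err) {
			return errors.Wrap(err, "failed to send stop signal")
		}
		if waitExit(gracePeriod) {
			return nil
		}
	}

	// Step 2: SIGKILL every process in the container cgroup. With runc this
	// can miss processes that were moved out of the cgroup.
	if err := task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) {
		return errors.Wrap(err, "failed to SIGKILL all container processes")
	}
	if waitExit(killEscalationTimeout) {
		return nil
	}

	// Step 3: last resort, SIGKILL only the init process so runc can reap the
	// container even when the cgroup view is stale.
	if err := task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
		return errors.Wrap(err, "failed to SIGKILL container init process")
	}
	if !waitExit(killEscalationTimeout) {
		return errors.New("timed out waiting for container task to exit")
	}
	return nil
}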
13 changes: 11 additions & 2 deletions pkg/server/sandbox_stop.go
@@ -109,11 +109,20 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst
 	}
 
 	// Kill the sandbox container.
-	err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll)
-	if err != nil && !errdefs.IsNotFound(err) {
+	if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) {
 		return errors.Wrap(err, "failed to kill sandbox container")
 	}
 
+	if err = c.waitSandboxStop(ctx, sandbox, killContainerTimeout); err == nil {
+		return nil
+	}
+	logrus.WithError(err).Errorf("An error occurs during waiting for sandbox %q to be killed", sandbox.ID)
+
+	// Kill the sandbox container init process.
+	if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
+		return errors.Wrap(err, "failed to kill sandbox container init process")
+	}
+
 	return c.waitSandboxStop(ctx, sandbox, killContainerTimeout)
 }
 
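The sandbox_stop.go change applies the same two SIGKILL stages to the pause container, which never gets a SIGTERM grace period. Continuing the hypothetical sketch package above, it is roughly equivalent to:

// stopSandboxTask reuses the escalation above with no grace period: SIGKILL of
// the whole cgroup first, then SIGKILL of the sandbox (pause) init process.
func stopSandboxTask(ctx context.Context, task containerd.Task) error {
	return stopTask(ctx, task, 0)
}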