
Commit

Merge pull request #1037 from Random-Liu/support-unknown-state
Support unknown state
Random-Liu committed Feb 5, 2019
2 parents dd2846d + c27a12d commit 7c2498d
Showing 168 changed files with 3,590 additions and 1,417 deletions.
2 changes: 1 addition & 1 deletion hack/test-integration.sh
@@ -33,7 +33,7 @@ mkdir -p ${REPORT_DIR}
test_setup ${REPORT_DIR}

# Run integration test.
sudo ${ROOT}/_output/integration.test --test.run="${FOCUS}" --test.v \
sudo PATH=${PATH} ${ROOT}/_output/integration.test --test.run="${FOCUS}" --test.v \
--cri-endpoint=${CONTAINERD_SOCK} \
--cri-root=${CRI_ROOT} \
--runtime-handler=${RUNTIME}
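The only functional change here is the PATH=${PATH} prefix: sudo normally resets the environment, while the new restart test below locates runc with exec.LookPath, which only searches the PATH inherited by the test binary. A minimal standalone sketch of that dependency (illustrative only, not part of the commit):

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// exec.LookPath consults only the PATH this process inherited, so the
	// integration test binary must be launched with the caller's PATH
	// preserved (hence `sudo PATH=${PATH} ...` in test-integration.sh).
	fmt.Println("PATH =", os.Getenv("PATH"))
	if p, err := exec.LookPath("runc"); err == nil {
		fmt.Println("runc found at", p)
	} else {
		fmt.Println("runc not found:", err)
	}
}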
149 changes: 149 additions & 0 deletions integration/restart_test.go
@@ -17,8 +17,11 @@ limitations under the License.
package integration

import (
"os"
"os/exec"
"sort"
"testing"
"time"

"github.com/containerd/containerd"
"github.com/stretchr/testify/assert"
@@ -192,3 +195,149 @@ func TestContainerdRestart(t *testing.T) {
assert.True(t, found, "should find image %+v", i1)
}
}

// Note: This test moves the runc binary.
// The test requires that:
// 1) the runtime is runc;
// 2) runc is in PATH.
func TestUnknownStateAfterContainerdRestart(t *testing.T) {
if *runtimeHandler != "" {
t.Skip("unsupported config: runtime handler is set")
}
runcPath, err := exec.LookPath("runc")
if err != nil {
t.Skip("unsupported config: runc not in PATH")
}

const testImage = "busybox"
t.Logf("Pull test image %q", testImage)
img, err := imageService.PullImage(&runtime.ImageSpec{Image: testImage}, nil)
require.NoError(t, err)
defer func() {
assert.NoError(t, imageService.RemoveImage(&runtime.ImageSpec{Image: img}))
}()

sbConfig := PodSandboxConfig("sandbox", "sandbox-unknown-state")
t.Log("Should not be able to create sandbox without runc")
tmpRuncPath := Randomize(runcPath)
require.NoError(t, os.Rename(runcPath, tmpRuncPath))
defer func() {
os.Rename(tmpRuncPath, runcPath)
}()
sb, err := runtimeService.RunPodSandbox(sbConfig, "")
if err == nil {
assert.NoError(t, runtimeService.StopPodSandbox(sb))
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
t.Skip("unsupported config: runc is not being used")
}
require.NoError(t, os.Rename(tmpRuncPath, runcPath))

t.Log("Create a sandbox")
sb, err = runtimeService.RunPodSandbox(sbConfig, "")
require.NoError(t, err)
defer func() {
// Make sure the sandbox is cleaned up in any case.
runtimeService.StopPodSandbox(sb)
runtimeService.RemovePodSandbox(sb)
}()
ps, err := runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_READY, ps.GetState())

t.Log("Create a container")
cnConfig := ContainerConfig(
"container-unknown-state",
testImage,
WithCommand("sleep", "1000"),
)
cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
require.NoError(t, err)

t.Log("Start the container")
require.NoError(t, runtimeService.StartContainer(cn))
cs, err := runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_RUNNING, cs.GetState())

t.Log("Move runc binary, so that container/sandbox can't be loaded after restart")
tmpRuncPath = Randomize(runcPath)
require.NoError(t, os.Rename(runcPath, tmpRuncPath))
defer func() {
os.Rename(tmpRuncPath, runcPath)
}()

t.Log("Restart containerd")
RestartContainerd(t)

t.Log("Sandbox should be in NOTREADY state after containerd restart")
ps, err = runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_NOTREADY, ps.GetState())

t.Log("Container should be in UNKNOWN state after containerd restart")
cs, err = runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_UNKNOWN, cs.GetState())

t.Log("Stop/remove the sandbox should fail for the lack of runc")
assert.Error(t, runtimeService.StopPodSandbox(sb))
assert.Error(t, runtimeService.RemovePodSandbox(sb))

t.Log("Stop/remove the container should fail for the lack of runc")
assert.Error(t, runtimeService.StopContainer(cn, 10))
assert.Error(t, runtimeService.RemoveContainer(cn))

t.Log("Move runc back")
require.NoError(t, os.Rename(tmpRuncPath, runcPath))

t.Log("Sandbox should still be in NOTREADY state")
ps, err = runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_NOTREADY, ps.GetState())

t.Log("Container should still be in UNKNOWN state")
cs, err = runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_UNKNOWN, cs.GetState())

t.Log("Sandbox operations which require running state should fail")
_, err = runtimeService.PortForward(&runtime.PortForwardRequest{
PodSandboxId: sb,
Port: []int32{8080},
})
assert.Error(t, err)

t.Log("Container operations which require running state should fail")
assert.Error(t, runtimeService.ReopenContainerLog(cn))
_, _, err = runtimeService.ExecSync(cn, []string{"ls"}, 10*time.Second)
assert.Error(t, err)
_, err = runtimeService.Attach(&runtime.AttachRequest{
ContainerId: cn,
Stdin: true,
Stdout: true,
Stderr: true,
})
assert.Error(t, err)

t.Log("Containerd should still be running now")
_, err = runtimeService.Status()
require.NoError(t, err)

t.Log("Remove the container should fail in this state")
assert.Error(t, runtimeService.RemoveContainer(cn))

t.Log("Remove the sandbox should fail in this state")
assert.Error(t, runtimeService.RemovePodSandbox(sb))

t.Log("Should be able to stop container in this state")
assert.NoError(t, runtimeService.StopContainer(cn, 10))

t.Log("Should be able to stop sandbox in this state")
assert.NoError(t, runtimeService.StopPodSandbox(sb))

t.Log("Should be able to remove container after stop")
assert.NoError(t, runtimeService.RemoveContainer(cn))

t.Log("Should be able to remove sandbox after stop")
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
}
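Taken together, the assertions above spell out the caller-visible contract for a container left in UNKNOWN state after a containerd restart: status queries still work, and stop is the only way out of UNKNOWN — RemoveContainer is rejected until StopContainer has moved the container to a terminal state. A minimal sketch of that call order; criRuntime is a hypothetical local stand-in for the subset of the CRI RuntimeService used here, not an interface defined by this change:

package example

// criRuntime is a hypothetical stand-in for the CRI RuntimeService methods
// exercised by the test above.
type criRuntime interface {
	StopContainer(containerID string, timeout int64) error
	RemoveContainer(containerID string) error
}

// removeUnknownContainer stops the container first (RemoveContainer is
// rejected while the container is RUNNING or UNKNOWN, see
// pkg/server/container_remove.go below) and only then removes it.
func removeUnknownContainer(rs criRuntime, id string) error {
	if err := rs.StopContainer(id, 10); err != nil {
		return err
	}
	return rs.RemoveContainer(id)
}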
7 changes: 5 additions & 2 deletions pkg/server/container_remove.go
@@ -98,9 +98,12 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
// container will not be started or removed again.
func setContainerRemoving(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
// Do not remove container if it's still running.
// Do not remove container if it's still running or unknown.
if status.State() == runtime.ContainerState_CONTAINER_RUNNING {
return status, errors.New("container is still running")
return status, errors.New("container is still running, to stop first")
}
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
return status, errors.New("container state is unknown, to stop first")
}
if status.Removing {
return status, errors.New("container is already in removing state")
9 changes: 9 additions & 0 deletions pkg/server/container_status.go
@@ -60,6 +60,15 @@ func (c *criService) ContainerStatus(ctx context.Context, r *runtime.ContainerSt
}
}
status := toCRIContainerStatus(container, spec, imageRef)
if status.GetCreatedAt() == 0 {
// CRI doesn't allow CreatedAt == 0.
info, err := container.Container.Info(ctx)
if err != nil {
return nil, errors.Wrapf(err, "failed to get CreatedAt in %q state", status.State)
}
status.CreatedAt = info.CreatedAt.UnixNano()
}

info, err := toCRIContainerInfo(ctx, container, r.GetVerbose())
if err != nil {
return nil, errors.Wrap(err, "failed to get verbose container info")
59 changes: 55 additions & 4 deletions pkg/server/container_stop.go
@@ -19,6 +19,8 @@ package server
import (
"time"

"github.com/containerd/containerd"
eventtypes "github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/errdefs"
"github.com/docker/docker/pkg/signal"
"github.com/pkg/errors"
@@ -60,18 +62,49 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
// Return without error if container is not running. This makes sure that
// stop only takes real action after the container is started.
state := container.Status.Get().State()
if state != runtime.ContainerState_CONTAINER_RUNNING {
logrus.Infof("Container to stop %q is not running, current state %q",
if state != runtime.ContainerState_CONTAINER_RUNNING &&
state != runtime.ContainerState_CONTAINER_UNKNOWN {
logrus.Infof("Container to stop %q must be in running or unknown state, current state %q",
id, criContainerStateToString(state))
return nil
}

task, err := container.Container.Task(ctx, nil)
if err != nil {
if !errdefs.IsNotFound(err) {
return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
return errors.Wrapf(err, "failed to get task for container %q", id)
}
// Don't return for unknown state; some cleanup still needs to be done.
if state != runtime.ContainerState_CONTAINER_UNKNOWN {
return nil
}
// Task is an interface; explicitly set it to nil just in case.
task = nil
}

// Handle unknown state.
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
status, err := getTaskStatus(ctx, task)
if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", id)
}
switch status.Status {
case containerd.Running, containerd.Created:
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited. If the task exited after containerd
// started, the event monitor will receive its exit event; if it
// exited before containerd started, the event monitor will never
// receive its exit event.
// However, we can't tell that because the task state was not
// successfully loaded during containerd start (container is
// in UNKNOWN state).
// So always do the cleanup here, in case we missed the exit event.
return cleanupUnknownContainer(ctx, id, status, container)
default:
// err is nil at this point; errors.Wrapf(nil, ...) would return nil,
// so construct a new error instead of wrapping.
return errors.Errorf("unsupported task status %q", status.Status)
}
return nil
}

// We only need to kill the task. The event handler will Delete the
@@ -141,3 +174,21 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
return nil
}
}

// cleanupUnknownContainer cleans up a stopped container in unknown state.
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status,
cntr containerstore.Container) error {
// Reuse handleContainerExit to do the cleanup.
// NOTE(random-liu): If the task did exit after containerd started, both
// the event monitor and this cleanup function would update the container
// state. The final container state will be whichever update is applied first.
// There is no way to completely avoid this race, and for a best-effort
// cleanup of unknown-state containers it seems acceptable.
return handleContainerExit(ctx, &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: 0,
ExitStatus: status.ExitStatus,
ExitedAt: status.ExitTime,
}, cntr)
}
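One subtlety in the unknown-state branch above: in the default case, err is already nil (it was checked right after the call to getTaskStatus, a helper whose definition is not part of the hunks shown), and github.com/pkg/errors.Wrapf returns nil when given a nil error, which would make the unsupported-status case silently succeed. That is why the switch builds a fresh error with errors.Errorf. A standalone illustration of the gotcha:

package main

import (
	"fmt"

	"github.com/pkg/errors"
)

func main() {
	var err error // nil, as it is in the default branch above

	// Wrapf returns nil for a nil error, so the caller would see success.
	fmt.Println(errors.Wrapf(err, "unsupported task status %q", "pausing")) // <nil>

	// Errorf always produces a real error.
	fmt.Println(errors.Errorf("unsupported task status %q", "pausing"))
}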
21 changes: 15 additions & 6 deletions pkg/server/events.go
@@ -213,7 +213,7 @@ func (em *eventMonitor) handleEvent(any interface{}) error {
} else if err != store.ErrNotExist {
return errors.Wrap(err, "can't find container for TaskExit event")
}
// Use GetAll to include sandbox in unknown state.
// Use GetAll to include sandbox in init state.
sb, err := em.c.sandboxStore.GetAll(e.ID)
if err == nil {
if err := handleSandboxExit(ctx, e, sb); err != nil {
@@ -260,7 +260,16 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
// Attach container IO so that `Delete` could cleanup the stream properly.
task, err := cntr.Container.Task(ctx,
func(*containerdio.FIFOSet) (containerdio.IO, error) {
return cntr.IO, nil
// We can't return cntr.IO directly here, because
// even when cntr.IO is nil, returning it through
// the cio.IO interface yields a non-nil interface
// value.
// See https://tour.golang.org/methods/12:
// an interface value that holds a nil concrete
// value is itself non-nil.
// (A standalone example follows this file's diff.)
if cntr.IO != nil {
return cntr.IO, nil
}
return nil, nil
},
)
if err != nil {
@@ -313,13 +322,13 @@ func handleSandboxExit(ctx context.Context, e *eventtypes.TaskExit, sb sandboxst
}
}
err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// NOTE(random-liu): We SHOULD NOT change UNKNOWN state here.
// If sandbox state is UNKNOWN when event monitor receives an TaskExit event,
// NOTE(random-liu): We SHOULD NOT change INIT state here.
// If sandbox state is INIT when the event monitor receives a TaskExit event,
// it means that sandbox start has failed. In that case, `RunPodSandbox` will
// clean up everything immediately.
// Once sandbox state goes out of UNKNOWN, it becomes visable to the user, which
// Once the sandbox state goes out of INIT, it becomes visible to the user, which
// is not what we want.
if status.State != sandboxstore.StateUnknown {
if status.State != sandboxstore.StateInit {
status.State = sandboxstore.StateNotReady
}
status.Pid = 0
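The cntr.IO workaround in handleContainerExit above leans on a Go subtlety worth seeing in isolation: an interface value that wraps a nil concrete value is itself non-nil. A standalone illustration; the closer and containerIO types below are invented for the example:

package main

import "fmt"

// closer plays the role of an IO interface such as cio.IO; containerIO is a
// made-up concrete implementation standing in for the type behind cntr.IO.
type closer interface{ Close() error }

type containerIO struct{}

func (*containerIO) Close() error { return nil }

func main() {
	var concrete *containerIO   // nil concrete value
	var iface closer = concrete // interface holding that nil value

	fmt.Println(concrete == nil) // true
	fmt.Println(iface == nil)    // false: the interface itself is non-nil
}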
