
Commit

Merge pull request #1037 from Random-Liu/support-unknown-state
Support unknown state
Random-Liu committed Feb 5, 2019
2 parents dd2846d + c27a12d commit 7c2498d
Showing 168 changed files with 3,590 additions and 1,417 deletions.
2 changes: 1 addition & 1 deletion hack/test-integration.sh
@@ -33,7 +33,7 @@ mkdir -p ${REPORT_DIR}
test_setup ${REPORT_DIR}

# Run integration test.
sudo ${ROOT}/_output/integration.test --test.run="${FOCUS}" --test.v \
sudo PATH=${PATH} ${ROOT}/_output/integration.test --test.run="${FOCUS}" --test.v \
--cri-endpoint=${CONTAINERD_SOCK} \
--cri-root=${CRI_ROOT} \
--runtime-handler=${RUNTIME}
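The only functional change here is the PATH=${PATH} prefix: sudo normally resets the environment, while the new restart test below locates runc with exec.LookPath, which only searches the PATH inherited by the test binary. A minimal standalone sketch of that dependency (illustrative only, not part of the commit):

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// exec.LookPath consults only the PATH this process inherited, so the
	// integration test binary must be launched with the caller's PATH
	// preserved (hence `sudo PATH=${PATH} ...` in test-integration.sh).
	fmt.Println("PATH =", os.Getenv("PATH"))
	if p, err := exec.LookPath("runc"); err == nil {
		fmt.Println("runc found at", p)
	} else {
		fmt.Println("runc not found:", err)
	}
}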
149 changes: 149 additions & 0 deletions integration/restart_test.go
@@ -17,8 +17,11 @@ limitations under the License.
package integration

import (
"os"
"os/exec"
"sort"
"testing"
"time"

"github.com/containerd/containerd"
"github.com/stretchr/testify/assert"
@@ -192,3 +195,149 @@ func TestContainerdRestart(t *testing.T) {
assert.True(t, found, "should find image %+v", i1)
}
}

// Note: This test moves the runc binary.
// The test requires that:
// 1) the runtime is runc;
// 2) runc is in PATH.
func TestUnknownStateAfterContainerdRestart(t *testing.T) {
if *runtimeHandler != "" {
t.Skip("unsupported config: runtime handler is set")
}
runcPath, err := exec.LookPath("runc")
if err != nil {
t.Skip("unsupported config: runc not in PATH")
}

const testImage = "busybox"
t.Logf("Pull test image %q", testImage)
img, err := imageService.PullImage(&runtime.ImageSpec{Image: testImage}, nil)
require.NoError(t, err)
defer func() {
assert.NoError(t, imageService.RemoveImage(&runtime.ImageSpec{Image: img}))
}()

sbConfig := PodSandboxConfig("sandbox", "sandbox-unknown-state")
t.Log("Should not be able to create sandbox without runc")
tmpRuncPath := Randomize(runcPath)
require.NoError(t, os.Rename(runcPath, tmpRuncPath))
defer func() {
os.Rename(tmpRuncPath, runcPath)
}()
sb, err := runtimeService.RunPodSandbox(sbConfig, "")
if err == nil {
assert.NoError(t, runtimeService.StopPodSandbox(sb))
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
t.Skip("unsupported config: runc is not being used")
}
require.NoError(t, os.Rename(tmpRuncPath, runcPath))

t.Log("Create a sandbox")
sb, err = runtimeService.RunPodSandbox(sbConfig, "")
require.NoError(t, err)
defer func() {
// Make sure the sandbox is cleaned up in any case.
runtimeService.StopPodSandbox(sb)
runtimeService.RemovePodSandbox(sb)
}()
ps, err := runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_READY, ps.GetState())

t.Log("Create a container")
cnConfig := ContainerConfig(
"container-unknown-state",
testImage,
WithCommand("sleep", "1000"),
)
cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
require.NoError(t, err)

t.Log("Start the container")
require.NoError(t, runtimeService.StartContainer(cn))
cs, err := runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_RUNNING, cs.GetState())

t.Log("Move runc binary, so that container/sandbox can't be loaded after restart")
tmpRuncPath = Randomize(runcPath)
require.NoError(t, os.Rename(runcPath, tmpRuncPath))
defer func() {
os.Rename(tmpRuncPath, runcPath)
}()

t.Log("Restart containerd")
RestartContainerd(t)

t.Log("Sandbox should be in NOTREADY state after containerd restart")
ps, err = runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_NOTREADY, ps.GetState())

t.Log("Container should be in UNKNOWN state after containerd restart")
cs, err = runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_UNKNOWN, cs.GetState())

t.Log("Stop/remove the sandbox should fail for the lack of runc")
assert.Error(t, runtimeService.StopPodSandbox(sb))
assert.Error(t, runtimeService.RemovePodSandbox(sb))

t.Log("Stop/remove the container should fail for the lack of runc")
assert.Error(t, runtimeService.StopContainer(cn, 10))
assert.Error(t, runtimeService.RemoveContainer(cn))

t.Log("Move runc back")
require.NoError(t, os.Rename(tmpRuncPath, runcPath))

t.Log("Sandbox should still be in NOTREADY state")
ps, err = runtimeService.PodSandboxStatus(sb)
require.NoError(t, err)
assert.Equal(t, runtime.PodSandboxState_SANDBOX_NOTREADY, ps.GetState())

t.Log("Container should still be in UNKNOWN state")
cs, err = runtimeService.ContainerStatus(cn)
require.NoError(t, err)
assert.Equal(t, runtime.ContainerState_CONTAINER_UNKNOWN, cs.GetState())

t.Log("Sandbox operations which require running state should fail")
_, err = runtimeService.PortForward(&runtime.PortForwardRequest{
PodSandboxId: sb,
Port: []int32{8080},
})
assert.Error(t, err)

t.Log("Container operations which require running state should fail")
assert.Error(t, runtimeService.ReopenContainerLog(cn))
_, _, err = runtimeService.ExecSync(cn, []string{"ls"}, 10*time.Second)
assert.Error(t, err)
_, err = runtimeService.Attach(&runtime.AttachRequest{
ContainerId: cn,
Stdin: true,
Stdout: true,
Stderr: true,
})
assert.Error(t, err)

t.Log("Containerd should still be running now")
_, err = runtimeService.Status()
require.NoError(t, err)

t.Log("Remove the container should fail in this state")
assert.Error(t, runtimeService.RemoveContainer(cn))

t.Log("Remove the sandbox should fail in this state")
assert.Error(t, runtimeService.RemovePodSandbox(sb))

t.Log("Should be able to stop container in this state")
assert.NoError(t, runtimeService.StopContainer(cn, 10))

t.Log("Should be able to stop sandbox in this state")
assert.NoError(t, runtimeService.StopPodSandbox(sb))

t.Log("Should be able to remove container after stop")
assert.NoError(t, runtimeService.RemoveContainer(cn))

t.Log("Should be able to remove sandbox after stop")
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
}
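Taken together, the assertions above spell out the caller-visible contract for a container left in UNKNOWN state after a containerd restart: status queries still work, and stop is the only way out of UNKNOWN — RemoveContainer is rejected until StopContainer has moved the container to a terminal state. A minimal sketch of that call order; criRuntime is a hypothetical local stand-in for the subset of the CRI RuntimeService used here, not an interface defined by this change:

package example

// criRuntime is a hypothetical stand-in for the CRI RuntimeService methods
// exercised by the test above.
type criRuntime interface {
	StopContainer(containerID string, timeout int64) error
	RemoveContainer(containerID string) error
}

// removeUnknownContainer stops the container first (RemoveContainer is
// rejected while the container is RUNNING or UNKNOWN, see
// pkg/server/container_remove.go below) and only then removes it.
func removeUnknownContainer(rs criRuntime, id string) error {
	if err := rs.StopContainer(id, 10); err != nil {
		return err
	}
	return rs.RemoveContainer(id)
}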
7 changes: 5 additions & 2 deletions pkg/server/container_remove.go
@@ -98,9 +98,12 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
// container will not be started or removed again.
func setContainerRemoving(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
// Do not remove container if it's still running.
// Do not remove container if it's still running or unknown.
if status.State() == runtime.ContainerState_CONTAINER_RUNNING {
return status, errors.New("container is still running")
return status, errors.New("container is still running, to stop first")
}
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
return status, errors.New("container state is unknown, to stop first")
}
if status.Removing {
return status, errors.New("container is already in removing state")
9 changes: 9 additions & 0 deletions pkg/server/container_status.go
@@ -60,6 +60,15 @@ func (c *criService) ContainerStatus(ctx context.Context, r *runtime.ContainerSt
}
}
status := toCRIContainerStatus(container, spec, imageRef)
if status.GetCreatedAt() == 0 {
// CRI doesn't allow CreatedAt == 0.
info, err := container.Container.Info(ctx)
if err != nil {
return nil, errors.Wrapf(err, "failed to get CreatedAt in %q state", status.State)
}
status.CreatedAt = info.CreatedAt.UnixNano()
}

info, err := toCRIContainerInfo(ctx, container, r.GetVerbose())
if err != nil {
return nil, errors.Wrap(err, "failed to get verbose container info")
59 changes: 55 additions & 4 deletions pkg/server/container_stop.go
@@ -19,6 +19,8 @@ package server
import (
"time"

"github.com/containerd/containerd"
eventtypes "github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/errdefs"
"github.com/docker/docker/pkg/signal"
"github.com/pkg/errors"
@@ -60,18 +62,49 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
// Return without error if container is not running. This makes sure that
// stop only takes real action after the container is started.
state := container.Status.Get().State()
if state != runtime.ContainerState_CONTAINER_RUNNING {
logrus.Infof("Container to stop %q is not running, current state %q",
if state != runtime.ContainerState_CONTAINER_RUNNING &&
state != runtime.ContainerState_CONTAINER_UNKNOWN {
logrus.Infof("Container to stop %q must be in running or unknown state, current state %q",
id, criContainerStateToString(state))
return nil
}

task, err := container.Container.Task(ctx, nil)
if err != nil {
if !errdefs.IsNotFound(err) {
return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
return errors.Wrapf(err, "failed to get task for container %q", id)
}
// Don't return for unknown state; some cleanup still needs to be done.
if state != runtime.ContainerState_CONTAINER_UNKNOWN {
return nil
}
// Task is an interface; explicitly set it to nil just in case.
task = nil
}

// Handle unknown state.
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
status, err := getTaskStatus(ctx, task)
if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", id)
}
switch status.Status {
case containerd.Running, containerd.Created:
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited. If the task exited after containerd
// started, the event monitor will receive its exit event; if it
// exited before containerd started, the event monitor will never
// receive its exit event.
// However, we can't tell that because the task state was not
// successfully loaded during containerd start (container is
// in UNKNOWN state).
// So always do the cleanup here, in case we missed the exit event.
return cleanupUnknownContainer(ctx, id, status, container)
default:
// err is nil at this point; errors.Wrapf(nil, ...) would return nil,
// so construct a new error instead of wrapping.
return errors.Errorf("unsupported task status %q", status.Status)
}
return nil
}

// We only need to kill the task. The event handler will Delete the
@@ -141,3 +174,21 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
return nil
}
}

// cleanupUnknownContainer cleans up a stopped container in unknown state.
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status,
cntr containerstore.Container) error {
// Reuse handleContainerExit to do the cleanup.
// NOTE(random-liu): If the task did exit after containerd started, both
// the event monitor and this cleanup function would update the container
// state. The final container state will be whichever update is applied first.
// There is no way to completely avoid this race, and for a best-effort
// cleanup of unknown-state containers it seems acceptable.
return handleContainerExit(ctx, &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: 0,
ExitStatus: status.ExitStatus,
ExitedAt: status.ExitTime,
}, cntr)
}
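One subtlety in the unknown-state branch above: in the default case, err is already nil (it was checked right after the call to getTaskStatus, a helper whose definition is not part of the hunks shown), and github.com/pkg/errors.Wrapf returns nil when given a nil error, which would make the unsupported-status case silently succeed. That is why the switch builds a fresh error with errors.Errorf. A standalone illustration of the gotcha:

package main

import (
	"fmt"

	"github.com/pkg/errors"
)

func main() {
	var err error // nil, as it is in the default branch above

	// Wrapf returns nil for a nil error, so the caller would see success.
	fmt.Println(errors.Wrapf(err, "unsupported task status %q", "pausing")) // <nil>

	// Errorf always produces a real error.
	fmt.Println(errors.Errorf("unsupported task status %q", "pausing"))
}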
21 changes: 15 additions & 6 deletions pkg/server/events.go
@@ -213,7 +213,7 @@ func (em *eventMonitor) handleEvent(any interface{}) error {
} else if err != store.ErrNotExist {
return errors.Wrap(err, "can't find container for TaskExit event")
}
// Use GetAll to include sandbox in unknown state.
// Use GetAll to include sandbox in init state.
sb, err := em.c.sandboxStore.GetAll(e.ID)
if err == nil {
if err := handleSandboxExit(ctx, e, sb); err != nil {
@@ -260,7 +260,16 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
// Attach container IO so that `Delete` could cleanup the stream properly.
task, err := cntr.Container.Task(ctx,
func(*containerdio.FIFOSet) (containerdio.IO, error) {
return cntr.IO, nil
// We can't return cntr.IO directly here, because
// even when cntr.IO is nil, returning it through
// the cio.IO interface yields a non-nil interface
// value.
// See https://tour.golang.org/methods/12:
// an interface value that holds a nil concrete
// value is itself non-nil.
// (A standalone example follows this file's diff.)
if cntr.IO != nil {
return cntr.IO, nil
}
return nil, nil
},
)
if err != nil {
@@ -313,13 +322,13 @@ func handleSandboxExit(ctx context.Context, e *eventtypes.TaskExit, sb sandboxst
}
}
err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// NOTE(random-liu): We SHOULD NOT change UNKNOWN state here.
// If sandbox state is UNKNOWN when event monitor receives an TaskExit event,
// NOTE(random-liu): We SHOULD NOT change INIT state here.
// If sandbox state is INIT when the event monitor receives a TaskExit event,
// it means that sandbox start has failed. In that case, `RunPodSandbox` will
// clean up everything immediately.
// Once sandbox state goes out of UNKNOWN, it becomes visable to the user, which
// Once the sandbox state goes out of INIT, it becomes visible to the user, which
// is not what we want.
if status.State != sandboxstore.StateUnknown {
if status.State != sandboxstore.StateInit {
status.State = sandboxstore.StateNotReady
}
status.Pid = 0
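The cntr.IO workaround in handleContainerExit above leans on a Go subtlety worth seeing in isolation: an interface value that wraps a nil concrete value is itself non-nil. A standalone illustration; the closer and containerIO types below are invented for the example:

package main

import "fmt"

// closer plays the role of an IO interface such as cio.IO; containerIO is a
// made-up concrete implementation standing in for the type behind cntr.IO.
type closer interface{ Close() error }

type containerIO struct{}

func (*containerIO) Close() error { return nil }

func main() {
	var concrete *containerIO   // nil concrete value
	var iface closer = concrete // interface holding that nil value

	fmt.Println(concrete == nil) // true
	fmt.Println(iface == nil)    // false: the interface itself is non-nil
}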
