diff --git a/cmd/dockerd/config_unix.go b/cmd/dockerd/config_unix.go index 1817c1da271d9..8cc11ae0a9408 100644 --- a/cmd/dockerd/config_unix.go +++ b/cmd/dockerd/config_unix.go @@ -9,6 +9,7 @@ import ( "github.com/docker/docker/opts" "github.com/docker/docker/rootless" units "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/pkg/errors" "github.com/spf13/pflag" ) @@ -64,6 +65,10 @@ func installConfigFlags(conf *config.Config, flags *pflag.FlagSet) error { // rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702) // Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless. flags.BoolVar(&conf.Rootless, "rootless", rootless.RunningWithRootlessKit(), "Enable rootless mode; typically used with RootlessKit (experimental)") - flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", config.DefaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`) + defaultCgroupNamespaceMode := "host" + if cgroups.IsCgroup2UnifiedMode() { + defaultCgroupNamespaceMode = "private" + } + flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", defaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`) return nil } diff --git a/daemon/config/config_unix.go b/daemon/config/config_unix.go index 92076d4d7faa9..343b85fef7867 100644 --- a/daemon/config/config_unix.go +++ b/daemon/config/config_unix.go @@ -11,8 +11,6 @@ import ( ) const ( - // DefaultCgroupNamespaceMode is the default for a container's CgroupnsMode, if not set otherwise - DefaultCgroupNamespaceMode = "host" // TODO: change to private // DefaultIpcMode is default for container's IpcMode, if not set otherwise DefaultIpcMode = "private" ) diff --git a/daemon/daemon.go b/daemon/daemon.go index 8a11fa5faa63d..f139fc6d83763 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -794,6 +794,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S PluginStore: pluginStore, startupDone: make(chan struct{}), } + // Ensure the daemon is properly shutdown if there is a failure during // initialization defer func() { @@ -914,7 +915,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S } } - return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m) + return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m, d.useShimV2()) } // Plugin system initialization should happen before restore. Do not change order. @@ -1063,7 +1064,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S go d.execCommandGC() - d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d) + d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d, d.useShimV2()) if err != nil { return nil, err } diff --git a/daemon/daemon_unix.go b/daemon/daemon_unix.go index 37bac78603b7c..cb5ecddd1bacd 100644 --- a/daemon/daemon_unix.go +++ b/daemon/daemon_unix.go @@ -364,10 +364,15 @@ func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConf // Set default cgroup namespace mode, if unset for container if hostConfig.CgroupnsMode.IsEmpty() { - if hostConfig.Privileged { + // for cgroup v2: unshare cgroupns even for privileged containers + // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 + if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() { hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host") } else { - m := config.DefaultCgroupNamespaceMode + m := "host" + if cgroups.IsCgroup2UnifiedMode() { + m = "private" + } if daemon.configStore != nil { m = daemon.configStore.CgroupNamespaceMode } @@ -708,8 +713,8 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes. warnings = append(warnings, "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded.") } - if hostConfig.Privileged { - return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces. You must run the container in the host cgroup namespace when running privileged mode") + if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() { + return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces on cgroup v1 host. You must run the container in the host cgroup namespace when running privileged mode") } } @@ -1594,6 +1599,10 @@ func (daemon *Daemon) initCgroupsPath(path string) error { return nil } + if cgroups.IsCgroup2UnifiedMode() { + return fmt.Errorf("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2") + } + // Recursively create cgroup to ensure that the system and all parent cgroups have values set // for the period and runtime as this limits what the children can be set to. daemon.initCgroupsPath(filepath.Dir(path)) @@ -1639,3 +1648,7 @@ func (daemon *Daemon) setupSeccompProfile() error { } return nil } + +func (daemon *Daemon) useShimV2() bool { + return cgroups.IsCgroup2UnifiedMode() +} diff --git a/daemon/daemon_windows.go b/daemon/daemon_windows.go index 8a912f493eb10..021b7b8f0a678 100644 --- a/daemon/daemon_windows.go +++ b/daemon/daemon_windows.go @@ -653,3 +653,7 @@ func (daemon *Daemon) initRuntimes(_ map[string]types.Runtime) error { func setupResolvConf(config *config.Config) { } + +func (daemon *Daemon) useShimV2() bool { + return true +} diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index 874a1a371b9ec..a90a69d8a028c 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -316,7 +316,9 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts { return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode) } - if cgroupNsMode.IsPrivate() && !c.HostConfig.Privileged { + // for cgroup v2: unshare cgroupns even for privileged containers + // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 + if cgroupNsMode.IsPrivate() && (cgroups.IsCgroup2UnifiedMode() || !c.HostConfig.Privileged) { nsCgroup := specs.LinuxNamespace{Type: "cgroup"} setNamespace(s, nsCgroup) } diff --git a/daemon/start_unix.go b/daemon/start_unix.go index e680b95f42149..73963b9cf6d44 100644 --- a/daemon/start_unix.go +++ b/daemon/start_unix.go @@ -8,6 +8,7 @@ import ( "path/filepath" "github.com/containerd/containerd/runtime/linux/runctypes" + v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" "github.com/docker/docker/container" "github.com/docker/docker/errdefs" "github.com/pkg/errors" @@ -43,6 +44,20 @@ func (daemon *Daemon) getLibcontainerdCreateOptions(container *container.Contain if err != nil { return nil, err } + if daemon.useShimV2() { + opts := &v2runcoptions.Options{ + BinaryName: path, + Root: filepath.Join(daemon.configStore.ExecRoot, + fmt.Sprintf("runtime-%s", container.HostConfig.Runtime)), + } + + if UsingSystemd(daemon.configStore) { + opts.SystemdCgroup = true + } + + return opts, nil + + } opts := &runctypes.RuncOptions{ Runtime: path, RuntimeRoot: filepath.Join(daemon.configStore.ExecRoot, diff --git a/integration/container/run_cgroupns_linux_test.go b/integration/container/run_cgroupns_linux_test.go index 64c18fa281601..f06d01e3dad71 100644 --- a/integration/container/run_cgroupns_linux_test.go +++ b/integration/container/run_cgroupns_linux_test.go @@ -115,7 +115,7 @@ func TestCgroupNamespacesRunPrivilegedAndPrivate(t *testing.T) { skip.If(t, !requirement.CgroupNamespacesEnabled()) // Running with both privileged and cgroupns=private is not allowed - errStr := "privileged mode is incompatible with private cgroup namespaces. You must run the container in the host cgroup namespace when running privileged mode" + errStr := "privileged mode is incompatible with private cgroup namespaces on cgroup v1 host. You must run the container in the host cgroup namespace when running privileged mode" testCreateFailureWithCgroupNs(t, "private", errStr, container.WithPrivileged(true), container.WithCgroupnsMode("private")) } diff --git a/libcontainerd/libcontainerd_linux.go b/libcontainerd/libcontainerd_linux.go index ec195a7905e6f..3b008fe2567b5 100644 --- a/libcontainerd/libcontainerd_linux.go +++ b/libcontainerd/libcontainerd_linux.go @@ -9,6 +9,6 @@ import ( ) // NewClient creates a new libcontainerd client from a containerd client -func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) { - return remote.NewClient(ctx, cli, stateDir, ns, b) +func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) { + return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2) } diff --git a/libcontainerd/libcontainerd_windows.go b/libcontainerd/libcontainerd_windows.go index 61f19ba087a3c..5a64180be4dfd 100644 --- a/libcontainerd/libcontainerd_windows.go +++ b/libcontainerd/libcontainerd_windows.go @@ -11,9 +11,10 @@ import ( ) // NewClient creates a new libcontainerd client from a containerd client -func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) { +func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) { if !system.ContainerdRuntimeSupported() { + // useShimV2 is ignored for windows return local.NewClient(ctx, cli, stateDir, ns, b) } - return remote.NewClient(ctx, cli, stateDir, ns, b) + return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2) } diff --git a/libcontainerd/remote/client.go b/libcontainerd/remote/client.go index 93cc4e73b9510..9bc4ffde02126 100644 --- a/libcontainerd/remote/client.go +++ b/libcontainerd/remote/client.go @@ -23,6 +23,7 @@ import ( "github.com/containerd/containerd/events" "github.com/containerd/containerd/images" "github.com/containerd/containerd/runtime/linux/runctypes" + v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" "github.com/containerd/typeurl" "github.com/docker/docker/errdefs" "github.com/docker/docker/libcontainerd/queue" @@ -45,21 +46,27 @@ type client struct { logger *logrus.Entry ns string - backend libcontainerdtypes.Backend - eventQ queue.Queue - oomMu sync.Mutex - oom map[string]bool + backend libcontainerdtypes.Backend + eventQ queue.Queue + oomMu sync.Mutex + oom map[string]bool + useShimV2 bool + v2runcoptionsMu sync.Mutex + // v2runcoptions is used for copying options specified on Create() to Start() + v2runcoptions map[string]v2runcoptions.Options } // NewClient creates a new libcontainerd client from a containerd client -func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) { +func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) { c := &client{ - client: cli, - stateDir: stateDir, - logger: logrus.WithField("module", "libcontainerd").WithField("namespace", ns), - ns: ns, - backend: b, - oom: make(map[string]bool), + client: cli, + stateDir: stateDir, + logger: logrus.WithField("module", "libcontainerd").WithField("namespace", ns), + ns: ns, + backend: b, + oom: make(map[string]bool), + useShimV2: useShimV2, + v2runcoptions: make(map[string]v2runcoptions.Options), } go c.processEventStream(ctx, ns) @@ -126,9 +133,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run bdir := c.bundleDir(id) c.logger.WithField("bundle", bdir).WithField("root", ociSpec.Root.Path).Debug("bundle dir created") + rt := runtimeName + if c.useShimV2 { + rt = shimV2RuntimeName + } newOpts := []containerd.NewContainerOpts{ containerd.WithSpec(ociSpec), - containerd.WithRuntime(runtimeName, runtimeOptions), + containerd.WithRuntime(rt, runtimeOptions), WithBundle(bdir, ociSpec), } opts = append(opts, newOpts...) @@ -140,6 +151,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run } return wrapError(err) } + if c.useShimV2 { + if x, ok := runtimeOptions.(*v2runcoptions.Options); ok { + c.v2runcoptionsMu.Lock() + c.v2runcoptions[id] = *x + c.v2runcoptionsMu.Unlock() + } + } return nil } @@ -200,11 +218,26 @@ func (c *client) Start(ctx context.Context, id, checkpointDir string, withStdin if runtime.GOOS != "windows" { taskOpts = append(taskOpts, func(_ context.Context, _ *containerd.Client, info *containerd.TaskInfo) error { - info.Options = &runctypes.CreateOptions{ - IoUid: uint32(uid), - IoGid: uint32(gid), - NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "", + if c.useShimV2 { + // For v2, we need to inherit options specified on Create + c.v2runcoptionsMu.Lock() + opts, ok := c.v2runcoptions[id] + c.v2runcoptionsMu.Unlock() + if !ok { + opts = v2runcoptions.Options{} + } + opts.IoUid = uint32(uid) + opts.IoGid = uint32(gid) + opts.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" + info.Options = &opts + } else { + info.Options = &runctypes.CreateOptions{ + IoUid: uint32(uid), + IoGid: uint32(gid), + NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "", + } } + return nil }) } else { @@ -466,6 +499,9 @@ func (c *client) Delete(ctx context.Context, containerID string) error { c.oomMu.Lock() delete(c.oom, containerID) c.oomMu.Unlock() + c.v2runcoptionsMu.Lock() + delete(c.v2runcoptions, containerID) + c.v2runcoptionsMu.Unlock() if os.Getenv("LIBCONTAINERD_NOCLEAN") != "1" { if err := os.RemoveAll(bundle); err != nil { c.logger.WithError(err).WithFields(logrus.Fields{ diff --git a/libcontainerd/remote/client_linux.go b/libcontainerd/remote/client_linux.go index 486c8538e0587..637ac94d8288e 100644 --- a/libcontainerd/remote/client_linux.go +++ b/libcontainerd/remote/client_linux.go @@ -16,7 +16,10 @@ import ( "github.com/sirupsen/logrus" ) -const runtimeName = "io.containerd.runtime.v1.linux" +const ( + runtimeName = "io.containerd.runtime.v1.linux" + shimV2RuntimeName = "io.containerd.runc.v2" +) func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) { return &libcontainerdtypes.Summary{}, nil diff --git a/libcontainerd/remote/client_windows.go b/libcontainerd/remote/client_windows.go index a086d7b3ba6c0..c371b9a8b47ec 100644 --- a/libcontainerd/remote/client_windows.go +++ b/libcontainerd/remote/client_windows.go @@ -16,7 +16,10 @@ import ( "github.com/sirupsen/logrus" ) -const runtimeName = "io.containerd.runhcs.v1" +const ( + runtimeName = "io.containerd.runhcs.v1" + shimV2RuntimeName = runtimeName +) func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) { switch pd := i.(type) { diff --git a/pkg/sysinfo/sysinfo_linux.go b/pkg/sysinfo/sysinfo_linux.go index 2c147d35d5246..a488ef0e4f643 100644 --- a/pkg/sysinfo/sysinfo_linux.go +++ b/pkg/sysinfo/sysinfo_linux.go @@ -60,6 +60,9 @@ func New(quiet bool) *SysInfo { w := o(sysInfo, cgMounts) warnings = append(warnings, w...) } + if cgroups.IsCgroup2UnifiedMode() { + warnings = append(warnings, "Your system is running cgroup v2 (unsupported)") + } if !quiet { for _, w := range warnings { logrus.Warn(w) @@ -70,6 +73,15 @@ func New(quiet bool) *SysInfo { // applyMemoryCgroupInfo reads the memory information from the memory cgroup mount point. func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.MemoryLimit = true + info.SwapLimit = true + info.MemoryReservation = true + info.OomKillDisable = true + info.MemorySwappiness = true + return nil + } var warnings []string mountPoint, ok := cgMounts["memory"] if !ok { @@ -108,6 +120,15 @@ func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyCPUCgroupInfo reads the cpu information from the cpu cgroup mount point. func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.CPUShares = true + info.CPUCfsPeriod = true + info.CPUCfsQuota = true + info.CPURealtimePeriod = true + info.CPURealtimeRuntime = true + return nil + } var warnings []string mountPoint, ok := cgMounts["cpu"] if !ok { @@ -145,6 +166,15 @@ func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyBlkioCgroupInfo reads the blkio information from the blkio cgroup mount point. func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.BlkioWeight = true + info.BlkioReadBpsDevice = true + info.BlkioWriteBpsDevice = true + info.BlkioReadIOpsDevice = true + info.BlkioWriteIOpsDevice = true + return nil + } var warnings []string mountPoint, ok := cgMounts["blkio"] if !ok { @@ -186,6 +216,11 @@ func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyCPUSetCgroupInfo reads the cpuset information from the cpuset cgroup mount point. func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.Cpuset = true + return nil + } var warnings []string mountPoint, ok := cgMounts["cpuset"] if !ok { @@ -213,6 +248,11 @@ func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyPIDSCgroupInfo reads the pids information from the pids cgroup mount point. func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.PidsLimit = true + return nil + } var warnings []string _, err := cgroups.FindCgroupMountpoint("", "pids") if err != nil { @@ -225,6 +265,11 @@ func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string { // applyDevicesCgroupInfo reads the pids information from the devices cgroup mount point. func applyDevicesCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { + if cgroups.IsCgroup2UnifiedMode() { + // TODO: check cgroup2 info correctly + info.CgroupDevicesEnabled = true + return nil + } var warnings []string _, ok := cgMounts["devices"] info.CgroupDevicesEnabled = ok diff --git a/plugin/executor/containerd/containerd.go b/plugin/executor/containerd/containerd.go index 91bae6c6b9322..aeeb2184ec3e3 100644 --- a/plugin/executor/containerd/containerd.go +++ b/plugin/executor/containerd/containerd.go @@ -26,13 +26,13 @@ type ExitHandler interface { } // New creates a new containerd plugin executor -func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler) (*Executor, error) { +func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler, useShimV2 bool) (*Executor, error) { e := &Executor{ rootDir: rootDir, exitHandler: exitHandler, } - client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e) + client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e, useShimV2) if err != nil { return nil, errors.Wrap(err, "error creating containerd exec client") }