Skip to content

Commit

Permalink
oci.WithPrivileged: set the current caps, not the known caps
Browse files Browse the repository at this point in the history
This change is needed for running the latest containerd inside a Docker
version that is not aware of the recently added caps (BPF, PERFMON, CHECKPOINT_RESTORE).

Without this change, containerd inside Docker fails to run containers with
"apply caps: operation not permitted" error.

See kubernetes-sigs/kind#2058

NOTE: The caller process of this function is now assumed to be as
privileged as possible.

Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
  • Loading branch information
AkihiroSuda committed Feb 10, 2021
1 parent ddcc431 commit a2d1a8a
Show file tree
Hide file tree
Showing 12 changed files with 424 additions and 35 deletions.
26 changes: 1 addition & 25 deletions oci/spec_opts.go
Expand Up @@ -38,7 +38,6 @@ import (
"github.com/opencontainers/runc/libcontainer/user"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/syndtr/gocapability/capability"
)

// SpecOpts sets spec specific information to a newly generated OCI spec
Expand Down Expand Up @@ -776,29 +775,6 @@ func WithCapabilities(caps []string) SpecOpts {
}
}

// WithAllCapabilities applies the capability list produced by
// GetAllCapabilities to the process through WithCapabilities.
var WithAllCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
	applyAll := WithCapabilities(GetAllCapabilities())
	return applyAll(ctx, client, c, s)
}

// GetAllCapabilities returns the names of all capabilities up to
// CAP_LAST_CAP (or up to CAP_BLOCK_SUSPEND on RHEL6), formatted as
// "CAP_<UPPERCASE_NAME>".
func GetAllCapabilities() []string {
	limit := capability.CAP_LAST_CAP
	if limit == capability.Cap(63) {
		// RHEL6 has no /proc/sys/kernel/cap_last_cap, so fall back to the
		// newest capability that kernel is expected to support.
		limit = capability.CAP_BLOCK_SUSPEND
	}

	var names []string
	for _, c := range capability.List() {
		if c <= limit {
			names = append(names, "CAP_"+strings.ToUpper(c.String()))
		}
	}
	return names
}

func capsContain(caps []string, s string) bool {
for _, c := range caps {
if c == s {
Expand Down Expand Up @@ -1132,7 +1108,7 @@ func WithDefaultUnixDevices(_ context.Context, _ Client, _ *containers.Container

// WithPrivileged sets up options for a privileged container
var WithPrivileged = Compose(
WithAllCapabilities,
WithAllCurrentCapabilities,
WithMaskedPaths(nil),
WithReadonlyPaths(nil),
WithWriteableSysfs,
Expand Down
17 changes: 17 additions & 0 deletions oci/spec_opts_linux.go
Expand Up @@ -25,6 +25,7 @@ import (
"path/filepath"

"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/pkg/cap"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
Expand Down Expand Up @@ -180,3 +181,19 @@ func WithCPUCFS(quota int64, period uint64) SpecOpts {
return nil
}
}

// WithAllCurrentCapabilities hands the capability names reported by
// cap.Current (the caller's effective set) to WithCapabilities.
// The resulting set can be smaller than what WithAllKnownCapabilities applies,
// for example when the caller itself runs inside a container.
//
// Any error from cap.Current is returned unchanged and the spec is not touched.
var WithAllCurrentCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
	effective, err := cap.Current()
	if err != nil {
		return err
	}
	apply := WithCapabilities(effective)
	return apply(ctx, client, c, s)
}

// WithAllKnownCapabilities sets every Linux capability listed by cap.Known
// for the container process.
var WithAllKnownCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
	return WithCapabilities(cap.Known())(ctx, client, c, s)
}
38 changes: 38 additions & 0 deletions oci/spec_opts_nonlinux.go
@@ -0,0 +1,38 @@
// +build !linux

/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package oci

import (
"context"

"github.com/containerd/containerd/containers"
)

// WithAllCurrentCapabilities is the non-Linux counterpart of the Linux option
// that propagates the caller's effective capabilities. On these platforms it
// simply passes a nil capability list to WithCapabilities.
//nolint: deadcode, unused
var WithAllCurrentCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
	noCaps := WithCapabilities(nil)
	return noCaps(ctx, client, c, s)
}

// WithAllKnownCapabilities is the non-Linux counterpart of the Linux option
// that sets all the known Linux capabilities. On these platforms it simply
// passes a nil capability list to WithCapabilities.
//nolint: deadcode, unused
var WithAllKnownCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
	noCaps := WithCapabilities(nil)
	return noCaps(ctx, client, c, s)
}
4 changes: 2 additions & 2 deletions oci/spec_opts_test.go
Expand Up @@ -574,7 +574,7 @@ func TestDropCaps(t *testing.T) {

var s specs.Spec

if err := WithAllCapabilities(context.Background(), nil, nil, &s); err != nil {
if err := WithAllKnownCapabilities(context.Background(), nil, nil, &s); err != nil {
t.Fatal(err)
}
if err := WithDroppedCapabilities([]string{"CAP_CHOWN"})(context.Background(), nil, nil, &s); err != nil {
Expand All @@ -593,7 +593,7 @@ func TestDropCaps(t *testing.T) {
}

// Add all capabilities back and drop a different cap.
if err := WithAllCapabilities(context.Background(), nil, nil, &s); err != nil {
if err := WithAllKnownCapabilities(context.Background(), nil, nil, &s); err != nil {
t.Fatal(err)
}
if err := WithDroppedCapabilities([]string{"CAP_FOWNER"})(context.Background(), nil, nil, &s); err != nil {
Expand Down
9 changes: 9 additions & 0 deletions oci/spec_test.go
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/namespaces"
"github.com/containerd/containerd/pkg/testutil"
specs "github.com/opencontainers/runtime-spec/specs-go"
)

Expand Down Expand Up @@ -251,6 +252,10 @@ func TestPopulateDefaultUnixSpec(t *testing.T) {

func TestWithPrivileged(t *testing.T) {
t.Parallel()
if runtime.GOOS == "linux" {
// because WithPrivileged depends on CapEff in /proc/self/status
testutil.RequiresRoot(t)
}

ctx := namespaces.WithNamespace(context.Background(), "testing")

Expand All @@ -272,6 +277,10 @@ func TestWithPrivileged(t *testing.T) {
t.Fatal(err)
}

if runtime.GOOS != "linux" {
return
}

if len(s.Process.Capabilities.Bounding) == 0 {
t.Error("Expected capabilities to be set with privileged")
}
Expand Down
172 changes: 172 additions & 0 deletions pkg/cap/cap_linux.go
@@ -0,0 +1,172 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package cap provides Linux capability utilities.
package cap

import (
"bufio"
"io"
"os"
"strconv"
"strings"

"github.com/pkg/errors"
"github.com/syndtr/gocapability/capability"
)

// FromUint64 decodes a capability bitmask into capability names such as
// []string{"CAP_SYS_ADMIN", ...}.
//
// Each set bit whose number maps to a name listed by capability.List() is
// reported in the first result; the bit numbers of all other set bits are
// reported in the second result. Both results are in ascending bit order.
func FromUint64(v uint64) ([]string, []int) {
	listed := capability.List()
	recognized := make(map[string]struct{}, len(listed))
	for _, c := range listed {
		recognized[c.String()] = struct{}{}
	}

	var (
		names     []string
		unknowns  []int
	)
	for bit := 0; bit < 64; bit++ {
		if v&(uint64(1)<<bit) == 0 {
			continue
		}
		raw := capability.Cap(bit).String()
		if _, ok := recognized[raw]; !ok {
			unknowns = append(unknowns, bit)
			continue
		}
		names = append(names, "CAP_"+strings.ToUpper(raw))
	}
	return names, unknowns
}

// ParseProcPIDStatus reads the content of a /proc/<PID>/status file from r
// and returns the capability bitmasks found in it, keyed by capability set.
//
// Only the CapInh, CapPrm, CapEff, CapBnd and CapAmb lines are interpreted;
// every other line, and any line without a ':' separator, is ignored. A set
// whose line is absent from the input is simply missing from the result map.
//
// An error is returned when one of the capability values is not a valid
// hexadecimal 64-bit integer (the underlying strconv error is preserved as
// the cause), or when reading from r fails.
func ParseProcPIDStatus(r io.Reader) (map[capability.CapType]uint64, error) {
	res := make(map[capability.CapType]uint64)
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()
		pair := strings.SplitN(line, ":", 2)
		if len(pair) != 2 {
			continue
		}

		// Map the field name to its capability set; skip unrelated fields
		// before attempting to parse their values.
		var capType capability.CapType
		switch strings.TrimSpace(pair[0]) {
		case "CapInh":
			capType = capability.INHERITABLE
		case "CapPrm":
			capType = capability.PERMITTED
		case "CapEff":
			capType = capability.EFFECTIVE
		case "CapBnd":
			capType = capability.BOUNDING
		case "CapAmb":
			capType = capability.AMBIENT
		default:
			continue
		}

		// The kernel prints the masks as hexadecimal without a "0x" prefix.
		ui64, err := strconv.ParseUint(strings.TrimSpace(pair[1]), 16, 64)
		if err != nil {
			// Keep the strconv error as the cause instead of discarding it.
			return nil, errors.Wrapf(err, "failed to parse line %q", line)
		}
		res[capType] = ui64
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return res, nil
}

// Current reports the effective capabilities of the calling process that are
// also known to the "github.com/syndtr/gocapability" library.
//
// The names are read from the CapEff line of /proc/self/status and look like
// []string{"CAP_SYS_ADMIN", ...}. Effective bits the library does not
// recognize are left out of the result.
//
// An error is returned when /proc/self/status cannot be opened or parsed.
func Current() ([]string, error) {
	status, err := os.Open("/proc/self/status")
	if err != nil {
		return nil, err
	}
	defer status.Close()

	sets, err := ParseProcPIDStatus(status)
	if err != nil {
		return nil, err
	}

	// The second result (unrecognized bit numbers) is deliberately dropped.
	effective, _ := FromUint64(sets[capability.EFFECTIVE])
	return effective, nil
}

// The capsNN tables list the capability names available as of a given kernel
// release. They are package-private and must be treated as read-only: the
// later tables are built by appending to the earlier ones and may share
// backing storage with them.
var (
	// caps35 is the caps of kernel 3.5 (37 entries)
	caps35 = []string{
		"CAP_CHOWN",            // 2.2
		"CAP_DAC_OVERRIDE",     // 2.2
		"CAP_DAC_READ_SEARCH",  // 2.2
		"CAP_FOWNER",           // 2.2
		"CAP_FSETID",           // 2.2
		"CAP_KILL",             // 2.2
		"CAP_SETGID",           // 2.2
		"CAP_SETUID",           // 2.2
		"CAP_SETPCAP",          // 2.2
		"CAP_LINUX_IMMUTABLE",  // 2.2
		"CAP_NET_BIND_SERVICE", // 2.2
		"CAP_NET_BROADCAST",    // 2.2
		"CAP_NET_ADMIN",        // 2.2
		"CAP_NET_RAW",          // 2.2
		"CAP_IPC_LOCK",         // 2.2
		"CAP_IPC_OWNER",        // 2.2
		"CAP_SYS_MODULE",       // 2.2
		"CAP_SYS_RAWIO",        // 2.2
		"CAP_SYS_CHROOT",       // 2.2
		"CAP_SYS_PTRACE",       // 2.2
		"CAP_SYS_PACCT",        // 2.2
		"CAP_SYS_ADMIN",        // 2.2
		"CAP_SYS_BOOT",         // 2.2
		"CAP_SYS_NICE",         // 2.2
		"CAP_SYS_RESOURCE",     // 2.2
		"CAP_SYS_TIME",         // 2.2
		"CAP_SYS_TTY_CONFIG",   // 2.2
		"CAP_MKNOD",            // 2.4
		"CAP_LEASE",            // 2.4
		"CAP_AUDIT_WRITE",      // 2.6.11
		"CAP_AUDIT_CONTROL",    // 2.6.11
		"CAP_SETFCAP",          // 2.6.24
		"CAP_MAC_OVERRIDE",     // 2.6.25
		"CAP_MAC_ADMIN",        // 2.6.25
		"CAP_SYSLOG",           // 2.6.37
		"CAP_WAKE_ALARM",       // 3.0
		"CAP_BLOCK_SUSPEND",    // 3.5
	}
	// caps316 is the caps of kernel 3.16 (38 entries)
	caps316 = append(caps35, "CAP_AUDIT_READ")
	// caps58 is the caps of kernel 5.8 (40 entries)
	caps58 = append(caps316, []string{"CAP_PERFMON", "CAP_BPF"}...)
	// caps59 is the caps of kernel 5.9 (41 entries)
	caps59 = append(caps58, "CAP_CHECKPOINT_RESTORE")
)

// Known returns the known cap strings as of kernel 5.9.
//
// The returned slice is a fresh copy on every call, so callers may modify or
// append to it freely without affecting the package-level table or the
// results seen by other callers.
func Known() []string {
	return append([]string(nil), caps59...)
}

0 comments on commit a2d1a8a

Please sign in to comment.