Skip to content

Commit

Permalink
runsc: umount /proc in the sandbox namespace
Browse files Browse the repository at this point in the history
The sandbox process is started with the procfs mount and then it is umounted
before executing guest processes.

Here are a few reasons why we need /proc in the sandbox namespace:
* /proc/self/exe is used by runsc to re-execute itself without capabilities and
  with a specific uid, gid, etc. execveat() can be blocked by some security
  policies. For example, on my workstation, it fails tomoyo_bprm_check_security
  checks.
* runsc reads /proc/cpuinfo, /proc/sys/vm/mmap_min_addr, /proc/self/auxv,
  /proc/sys/kernel/cap_last_cap.
* The kvm platform reads /proc/self/maps to get current mappings.

PiperOrigin-RevId: 488733897
  • Loading branch information
avagin authored and gvisor-bot committed Nov 15, 2022
1 parent 258f27e commit fcbc289
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 0 deletions.
6 changes: 6 additions & 0 deletions runsc/boot/loader.go
Expand Up @@ -124,6 +124,9 @@ type Loader struct {
// should be called when a sandbox is destroyed.
stopProfiling func()

// PreSeccompCallback is called right before installing seccomp filters.
PreSeccompCallback func()

// restore is set to true if we are restoring a container.
restore bool

Expand Down Expand Up @@ -543,6 +546,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {

// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
if l.PreSeccompCallback != nil {
l.PreSeccompCallback()
}
if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
Expand Down
1 change: 1 addition & 0 deletions runsc/cli/main.go
Expand Up @@ -102,6 +102,7 @@ func Main(version string) {
const internalGroup = "internal use only"
subcommands.Register(new(cmd.Boot), internalGroup)
subcommands.Register(new(cmd.Gofer), internalGroup)
subcommands.Register(new(cmd.Umount), internalGroup)

// Register with the main command line.
config.RegisterFlags(flag.CommandLine)
Expand Down
2 changes: 2 additions & 0 deletions runsc/cmd/BUILD
Expand Up @@ -37,6 +37,7 @@ go_library(
"statefile.go",
"symbolize.go",
"syscalls.go",
"umount_unsafe.go",
"usage.go",
"wait.go",
"write_control.go",
Expand All @@ -45,6 +46,7 @@ go_library(
"//runsc:__subpackages__",
],
deps = [
"//pkg/abi/linux",
"//pkg/coretag",
"//pkg/coverage",
"//pkg/log",
Expand Down
68 changes: 68 additions & 0 deletions runsc/cmd/boot.go
Expand Up @@ -16,8 +16,10 @@ package cmd

import (
"context"
"fmt"
"io/ioutil"
"os"
"os/exec"
"runtime/debug"
"strings"

Expand Down Expand Up @@ -101,6 +103,10 @@ type Boot struct {

// FDs for profile data.
profileFDs profile.FDArgs

// procMountSyncFD is a file descriptor that has to be closed when the
// procfs mount isn't needed anymore.
procMountSyncFD int
}

// Name implements subcommands.Command.Name.
Expand All @@ -125,6 +131,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be closed when /proc isn't needed")
f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name")
Expand Down Expand Up @@ -183,9 +190,24 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma
}

if !b.applyCaps && !conf.Rootless {
// /proc is umounted from a forked process, because the
// current one is going to re-execute itself without
// capabilities.
cmd, w := b.execProcUmounter()
defer w.Close()
defer cmd.Wait()
if b.procMountSyncFD != -1 {
panic("procMountSyncFD is set")
}
b.procMountSyncFD = int(w.Fd())

// Remove --apply-caps arg to call myself. It has already been done.
args := b.prepareArgs("setup-root")

// Clear FD_CLOEXEC.
if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
util.Fatalf("error clearing CLOEXEC: %v", errno)
}
// Note that we've already read the spec from the spec FD, and
// we will read it again after the exec call. This works
// because the ReadSpecFromFile function seeks to the beginning
Expand Down Expand Up @@ -303,6 +325,28 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma
// Fatalf exits the process and doesn't run defers.
// 'l' must be destroyed explicitly after this point!

if b.procMountSyncFD != -1 {
l.PreSeccompCallback = func() {
syncFile := os.NewFile(uintptr(b.procMountSyncFD), "sync file")
buf := make([]byte, 1)
if w, err := syncFile.Write(buf); err != nil || w != 1 {
util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
}
syncFile.Close()

var waitStatus unix.WaitStatus
if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
util.Fatalf("error waiting for the proc umounter process: %v", err)
}
if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
util.Fatalf("the proc umounter process failed: %v", waitStatus)
}
if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
util.Fatalf("/proc is still accessible")
}
}
}

// Notify the parent process the sandbox has booted (and that the controller
// is up).
startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
Expand Down Expand Up @@ -348,6 +392,9 @@ func (b *Boot) prepareArgs(exclude ...string) []string {
// process terminates.
args = append(args, "--attached")
}
if b.procMountSyncFD != -1 {
args = append(args, fmt.Sprintf("--proc-mount-sync-fd=%d", b.procMountSyncFD))
}
if len(b.productName) > 0 {
args = append(args, "--product-name", b.productName)
}
Expand All @@ -356,3 +403,24 @@ func (b *Boot) prepareArgs(exclude ...string) []string {
}
return args
}

// execProcUmounter starts a child process that runs the "umount" subcommand
// of the binary at specutils.ExePath against /proc. The child is handed the
// read end of a freshly created pipe as file descriptor 3 (the first
// ExtraFiles entry) and is told to use it as its sync descriptor.
//
// It returns the started command together with the write end of the pipe.
// The read end is closed in this process when the function returns. Any
// failure terminates the process via util.Fatalf.
func (b *Boot) execProcUmounter() (*exec.Cmd, *os.File) {
	readEnd, writeEnd, err := os.Pipe()
	if err != nil {
		util.Fatalf("error creating a pipe: %v", err)
	}
	// Only the child needs the read end once it has been started.
	defer readEnd.Close()

	umounter := exec.Command(specutils.ExePath, "umount", "--sync-fd=3", "/proc")
	// Share the standard streams so the child's diagnostics end up in the
	// same place as this process's output.
	umounter.Stdin, umounter.Stdout, umounter.Stderr = os.Stdin, os.Stdout, os.Stderr
	umounter.ExtraFiles = append(umounter.ExtraFiles, readEnd)

	if startErr := umounter.Start(); startErr != nil {
		util.Fatalf("error executing umounter: %v", startErr)
	}
	return umounter, writeEnd
}
80 changes: 80 additions & 0 deletions runsc/cmd/umount_unsafe.go
@@ -0,0 +1,80 @@
// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"context"
"os"
"syscall"
"unsafe"

"github.com/google/subcommands"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/runsc/cmd/util"
"gvisor.dev/gvisor/runsc/flag"
)

// Umount implements subcommands.Command for the "umount" command.
type Umount struct {
	// syncFD is the file descriptor set by the --sync-fd flag. Execute reads
	// one byte from it before unmounting the requested directory.
	syncFD int
}

// Name implements subcommands.Command.Name. It reports the name under which
// this subcommand is invoked.
func (*Umount) Name() string {
	const commandName = "umount"
	return commandName
}

// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
// description of the subcommand; the flag it mentions is the one registered
// by SetFlags ("sync-fd").
func (*Umount) Synopsis() string {
	return "umount the specified directory when one byte is read from sync-fd"
}

// Usage implements subcommands.Command.Usage. It returns the invocation
// template for the subcommand, using the real flag name ("--sync-fd") so that
// the printed usage can be copied verbatim.
func (*Umount) Usage() string {
	return `umount --sync-fd=FD <directory path>`
}

// SetFlags implements subcommands.Command.SetFlags. It registers the
// --sync-fd flag on f; the flag defaults to -1, meaning no descriptor was
// supplied.
func (u *Umount) SetFlags(f *flag.FlagSet) {
	f.IntVar(&u.syncFD, "sync-fd", -1, "file descriptor from which one byte is read before the directory is unmounted")
}

// Execute implements subcommands.Command.Execute.
//
// It expects exactly one positional argument, the directory to unmount, and a
// non-negative --sync-fd. It blocks until one byte has been read from the
// sync descriptor and then issues umount2 with MNT_DETACH on the directory.
//
// A wrong number of arguments prints the usage and returns
// subcommands.ExitUsageError. Every other failure terminates the process via
// util.Fatalf. On success it returns subcommands.ExitSuccess.
func (u *Umount) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if f.NArg() != 1 {
		f.Usage()
		return subcommands.ExitUsageError
	}
	dirPath := f.Arg(0)

	// The flag defaults to -1 and os.NewFile returns nil for a negative
	// descriptor, so fail with a clear message instead of an opaque read
	// error on a nil file.
	if u.syncFD < 0 {
		util.Fatalf("a valid --sync-fd is required, got %d", u.syncFD)
	}

	// Convert the path before blocking on the sync descriptor so that an
	// unusable path (one containing a NUL byte) is reported immediately
	// rather than panicking inside the deprecated syscall.StringBytePtr.
	pathPtr, err := syscall.BytePtrFromString(dirPath)
	if err != nil {
		util.Fatalf("invalid directory path %q: %v", dirPath, err)
	}

	syncFile := os.NewFile(uintptr(u.syncFD), "sync file")
	defer syncFile.Close()

	// Wait for the single byte that signals the directory is no longer
	// needed.
	buf := make([]byte, 1)
	if n, err := syncFile.Read(buf); err != nil || n != 1 {
		util.Fatalf("unable to read from the sync descriptor: %v, error %v", n, err)
	}

	// The unsafe.Pointer to uintptr conversion must stay inside the call
	// expression so the path buffer is kept alive for the syscall.
	if _, _, errno := unix.RawSyscall(
		unix.SYS_UMOUNT2,
		uintptr(unsafe.Pointer(pathPtr)),
		uintptr(linux.MNT_DETACH), 0); errno != 0 {
		util.Fatalf("Unable to umount %s: errno %v", dirPath, errno)
	}

	return subcommands.ExitSuccess
}

0 comments on commit fcbc289

Please sign in to comment.