Skip to content

Commit

Permalink
Implement mount namespaces
Browse files Browse the repository at this point in the history
This change implements only the basic functions of mount namespaces.
All features that depend on user namespaces will be implemented separately.

PiperOrigin-RevId: 552673896
  • Loading branch information
avagin authored and gvisor-bot committed Aug 1, 2023
1 parent e77ec6e commit 41bb04c
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 28 deletions.
69 changes: 41 additions & 28 deletions pkg/sentry/kernel/task_clone.go
Expand Up @@ -65,6 +65,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
return 0, nil, linuxerr.EINVAL
}
if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
return 0, nil, linuxerr.EINVAL
}

// Pull task registers and FPU state, a cloned task will inherit the
// state of the current task.
Expand Down Expand Up @@ -135,15 +138,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
netns.DecRef(t)
})

// TODO(b/63601033): Implement CLONE_NEWNS.
mntns := t.mountNamespace
if mntns != nil {
mntns.IncRef()
cu.Add(func() {
mntns.DecRef(t)
})
}

// We must hold t.mu to access t.image, but we can't hold it during Fork(),
// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
Expand All @@ -169,13 +163,27 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
}

var fsContext *FSContext
if args.Flags&linux.CLONE_FS == 0 {
if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
fsContext = t.fsContext.Fork()
} else {
fsContext = t.fsContext
fsContext.IncRef()
}

mntns := t.mountNamespace
if args.Flags&linux.CLONE_NEWNS != 0 {
var err error
mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd)
if err != nil {
return 0, nil, err
}
} else {
mntns.IncRef()
}
cu.Add(func() {
mntns.DecRef(t)
})

var fdTable *FDTable
if args.Flags&linux.CLONE_FILES == 0 {
fdTable = t.fdTable.Fork(t, MaxFdLimit)
Expand Down Expand Up @@ -534,49 +542,54 @@ func (t *Task) Unshare(flags int32) error {
t.mu.Unlock()
netns.DecRef(t)
}

cu := cleanup.Cleanup{}
// All cu actions have to be executed after releasing t.mu.
defer cu.Clean()
t.mu.Lock()
defer t.mu.Unlock()
// Can't defer unlock: DecRefs must occur without holding t.mu.
if flags&linux.CLONE_NEWUTS != 0 {
if !haveCapSysAdmin {
t.mu.Unlock()
return linuxerr.EPERM
}
// Note that this must happen after NewUserNamespace, so the
// new user namespace is used if there is one.
t.utsns = t.utsns.Clone(creds.UserNamespace)
}
var oldIPCNS *IPCNamespace
if flags&linux.CLONE_NEWIPC != 0 {
if !haveCapSysAdmin {
t.mu.Unlock()
return linuxerr.EPERM
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
oldIPCNS = t.ipcns
oldIPCNS := t.ipcns
t.ipcns = NewIPCNamespace(creds.UserNamespace)
t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
if oldIPCNS != nil {
oldIPCNS.DecRef(t)
}
cu.Add(func() { oldIPCNS.DecRef(t) })
}
var oldFDTable *FDTable
if flags&linux.CLONE_FILES != 0 {
oldFDTable = t.fdTable
oldFDTable := t.fdTable
t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
cu.Add(func() { oldFDTable.DecRef(t) })
}
var oldFSContext *FSContext
if flags&linux.CLONE_FS != 0 {
oldFSContext = t.fsContext
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
oldFSContext := t.fsContext
t.fsContext = oldFSContext.Fork()
cu.Add(func() { oldFSContext.DecRef(t) })
}
t.mu.Unlock()
if oldFDTable != nil {
oldFDTable.DecRef(t)
}
if oldFSContext != nil {
oldFSContext.DecRef(t)
if flags&linux.CLONE_NEWNS != 0 {
if !haveCapSysAdmin {
return linuxerr.EPERM
}
oldMountNS := t.mountNamespace
mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd)
if err != nil {
return err
}
t.mountNamespace = mntns
cu.Add(func() { oldMountNS.DecRef(t) })
}
return nil
}
Expand Down
78 changes: 78 additions & 0 deletions pkg/sentry/vfs/mount.go
Expand Up @@ -222,6 +222,84 @@ func (vfs *VirtualFilesystem) NewMountNamespaceFrom(ctx context.Context, creds *
return mntns
}

// cloneEntry is one unit of pending work for CloneMountNamespace: a mount of
// the namespace being copied, paired with the mount that stands in for it in
// the new namespace.
type cloneEntry struct {
	// prevMount is the mount in the source namespace whose children still
	// have to be cloned.
	//
	// parentMount is the counterpart of prevMount in the new namespace; the
	// clones of prevMount's children are connected on top of it.
	prevMount, parentMount *Mount
}

// updateRootAndCWD retargets root and cwd from the mount src to the mount dst.
//
// Each of the two that currently refers to src drops its reference on src and
// takes a reference on dst instead. A VirtualDentry that refers to any other
// mount is left untouched, and only the mount field is ever rewritten.
func (vfs *VirtualFilesystem) updateRootAndCWD(ctx context.Context, root *VirtualDentry, cwd *VirtualDentry, src *Mount, dst *Mount) {
	// root is handled before cwd, exactly as two back-to-back checks would.
	for _, vd := range [...]*VirtualDentry{root, cwd} {
		if vd.mount != src {
			continue
		}
		src.DecRef(ctx)
		vd.mount = dst
		dst.IncRef()
	}
}

// CloneMountNamespace makes a copy of the specified mount namespace.
//
// The new namespace is owned by creds.UserNamespace. Its root mount is built
// from the filesystem, root dentry, flags and read-only setting of ns's root
// mount. Every mount below ns's root is cloned and connected on top of the
// clone of its parent, at the dentry reported by the original mount's point().
// The clone of a mount whose propagation type is Shared is added as a peer of
// the original.
//
// If `root` or `cwd` have mounts in the old namespace, they will be replaced
// with proper mounts from the new namespace. They are updated as each mount is
// cloned, so they may already have been changed when an error is returned.
//
// On failure the partially built namespace is released and the error from
// connecting the mount is returned.
func (vfs *VirtualFilesystem) CloneMountNamespace(ctx context.Context, creds *auth.Credentials, ns *MountNamespace, root *VirtualDentry, cwd *VirtualDentry) (*MountNamespace, error) {
	newns := &MountNamespace{
		Owner:       creds.UserNamespace,
		mountpoints: make(map[*Dentry]uint32),
	}
	newns.InitRefs()

	// References that must only be dropped once vfs.mountMu is no longer
	// held. This defer is registered before the deferred Unlock below, so it
	// runs after it.
	var (
		vdsToDecRef []VirtualDentry
		failedNS    *MountNamespace
	)
	defer func() {
		// NOTE(review): releasing the last reference on a namespace is
		// expected to unmount its tree, which needs vfs.mountMu; doing it
		// here instead of under the lock avoids self-deadlock. Confirm
		// against MountNamespace.DecRef.
		if failedNS != nil {
			failedNS.DecRef(ctx)
		}
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
	}()

	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()

	// Take an extra reference on the old root mount's dentry and filesystem,
	// both of which the new root mount is built from.
	ns.root.root.IncRef()
	ns.root.fs.IncRef()
	newns.root = newMount(vfs, ns.root.fs, ns.root.root, newns, &MountOptions{Flags: ns.root.Flags, ReadOnly: ns.root.ReadOnly()})
	if ns.root.propType == Shared {
		vfs.addPeer(ns.root, newns.root)
	}
	vfs.updateRootAndCWD(ctx, root, cwd, ns.root, newns.root)

	// Walk the old mount tree breadth-first. Each queue entry pairs an old
	// mount with its clone; the old mount's children are cloned and connected
	// on top of that clone.
	queue := []cloneEntry{{ns.root, newns.root}}
	for len(queue) != 0 {
		p := queue[0]
		queue = queue[1:]
		for c := range p.prevMount.children {
			m := vfs.cloneMount(c, c.root, nil)
			// The connection point: the clone of the parent mount plus
			// the dentry reported by the original child's point().
			vd := VirtualDentry{
				mount:  p.parentMount,
				dentry: c.point(),
			}
			vd.IncRef()

			vds, err := vfs.connectMountAtLocked(ctx, m, vd)
			m.DecRef(ctx)
			// Whatever connectMountAtLocked hands back is released after
			// the lock is dropped, on success and on failure alike.
			vdsToDecRef = append(vdsToDecRef, vds...)
			if err != nil {
				// Schedule the release of the half-built namespace for
				// after mountMu is unlocked.
				failedNS = newns
				return nil, err
			}
			if c.propType == Shared {
				vfs.addPeer(c, m)
			}
			vfs.updateRootAndCWD(ctx, root, cwd, c, m)
			// Only mounts that have children of their own need another
			// pass.
			if len(c.children) != 0 {
				queue = append(queue, cloneEntry{c, m})
			}
		}
	}
	return newns, nil
}

// NewFilesystem creates a new filesystem object not yet associated with any
// mounts. It can be installed into the filesystem tree with ConnectMountAt.
// Note that only the filesystem-specific mount options from opts are used by
Expand Down
66 changes: 66 additions & 0 deletions test/syscalls/linux/mount.cc
Expand Up @@ -14,13 +14,17 @@

#include <errno.h>
#include <fcntl.h>
#include <linux/magic.h>
#include <sched.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/signalfd.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/vfs.h>
#include <unistd.h>

#include <cstdint>
Expand Down Expand Up @@ -1439,6 +1443,68 @@ TEST(MountTest, DeadMountsAreDecRefd) {
}
}

// Verifies that unmounting a filesystem from inside a freshly unshared mount
// namespace leaves the mount in place in the namespace the test runs in.
TEST(MountTest, MountNamespace) {
  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));

  auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
  auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
      Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0));
  // Computed before fork() so the child does not have to build it.
  auto const foo_path = JoinPath(dir.path(), "foo");
  EXPECT_NO_ERRNO(Open(foo_path, O_CREAT | O_RDWR, 0777));

  pid_t child = fork();
  if (child == 0) {
    // Create a new mount namespace and umount the test mount from it.
    TEST_CHECK(unshare(CLONE_NEWNS) == 0);
    // The file created above must be visible in the new namespace.
    TEST_CHECK(access(foo_path.c_str(), F_OK) == 0);
    TEST_CHECK(umount2(dir.path().c_str(), MNT_DETACH) == 0);
    // Leave with _exit(): exit() would also run the atexit handlers and flush
    // the stdio buffers inherited from the parent test process.
    _exit(0);
  }
  ASSERT_THAT(child, SyscallSucceeds());
  int status;
  ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);

  // Check that the test mount is still here.
  EXPECT_NO_ERRNO(Open(foo_path, O_RDWR));
}

// Verifies that mount events under a shared mount propagate between mount
// namespaces: a mount made by the child in its own, freshly unshared namespace
// must become visible in the namespace the test runs in.
TEST(MountTest, MountNamespacePropagation) {
  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));

  auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
  auto const mnt = ASSERT_NO_ERRNO_AND_VALUE(
      Mount("", dir.path(), "tmpfs", 0, "mode=0700", MNT_DETACH));
  auto child_dir = JoinPath(dir.path(), "test");
  // Computed before fork() so the child does not have to build them.
  auto const foo_path = JoinPath(child_dir, "foo");
  auto const boo_path = JoinPath(child_dir, "boo");

  // Mark the outer mount as shared, then stack a second tmpfs below it.
  ASSERT_THAT(mount(NULL, dir.path().c_str(), NULL, MS_SHARED, NULL),
              SyscallSucceeds());
  ASSERT_THAT(mkdir(child_dir.c_str(), 0700), SyscallSucceeds());
  ASSERT_THAT(mount("child", child_dir.c_str(), "tmpfs", 0, NULL),
              SyscallSucceeds());
  EXPECT_NO_ERRNO(Open(foo_path, O_CREAT | O_RDWR, 0777));

  pid_t child = fork();
  if (child == 0) {
    TEST_CHECK(unshare(CLONE_NEWNS) == 0);
    TEST_CHECK(access(foo_path.c_str(), F_OK) == 0);
    // The test mount has to be umounted from the second mount namespace too.
    TEST_CHECK(umount2(child_dir.c_str(), MNT_DETACH) == 0);
    // The new mount has to be propagated to the second mount namespace.
    TEST_CHECK(mount("test2", child_dir.c_str(), "tmpfs", 0, NULL) == 0);
    // Leave a marker file in the new mount for the parent to look for.
    TEST_CHECK(mknod(boo_path.c_str(), 0777 | S_IFREG, 0) == 0);
    // Leave with _exit(): exit() would also run the atexit handlers and flush
    // the stdio buffers inherited from the parent test process.
    _exit(0);
  }
  ASSERT_THAT(child, SyscallSucceeds());
  int status;
  ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);

  // Check that the mount created by the child, which holds the marker file,
  // is visible in this mount namespace, then unmount it here.
  EXPECT_NO_ERRNO(Open(boo_path, O_RDWR));
  EXPECT_THAT(umount2(child_dir.c_str(), MNT_DETACH), SyscallSucceeds());
}

} // namespace

} // namespace testing
Expand Down

0 comments on commit 41bb04c

Please sign in to comment.