Implement mount namespaces

This change implements only the basic functions of mount namespaces. All features that depends on user namespaces will be implemented separately. PiperOrigin-RevId: 552673896
google · Aug 1, 2023 · 41bb04c · 41bb04c
1 parent e77ec6e
commit 41bb04c
Show file tree

Hide file tree

Showing 3 changed files with 185 additions and 28 deletions.
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
@@ -65,6 +65,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
 	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
 		return 0, nil, linuxerr.EINVAL
 	}
+	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
+		return 0, nil, linuxerr.EINVAL
+	}
 
 	// Pull task registers and FPU state, a cloned task will inherit the
 	// state of the current task.
@@ -135,15 +138,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
 		netns.DecRef(t)
 	})
 
-	// TODO(b/63601033): Implement CLONE_NEWNS.
-	mntns := t.mountNamespace
-	if mntns != nil {
-		mntns.IncRef()
-		cu.Add(func() {
-			mntns.DecRef(t)
-		})
-	}
-
 	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
 	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
 	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
@@ -169,13 +163,27 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
 	}
 
 	var fsContext *FSContext
-	if args.Flags&linux.CLONE_FS == 0 {
+	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
 		fsContext = t.fsContext.Fork()
 	} else {
 		fsContext = t.fsContext
 		fsContext.IncRef()
 	}
 
+	mntns := t.mountNamespace
+	if args.Flags&linux.CLONE_NEWNS != 0 {
+		var err error
+		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd)
+		if err != nil {
+			return 0, nil, err
+		}
+	} else {
+		mntns.IncRef()
+	}
+	cu.Add(func() {
+		mntns.DecRef(t)
+	})
+
 	var fdTable *FDTable
 	if args.Flags&linux.CLONE_FILES == 0 {
 		fdTable = t.fdTable.Fork(t, MaxFdLimit)
@@ -534,49 +542,54 @@ func (t *Task) Unshare(flags int32) error {
 		t.mu.Unlock()
 		netns.DecRef(t)
 	}
+
+	cu := cleanup.Cleanup{}
+	// All cu actions has to be executed after releasing t.mu.
+	defer cu.Clean()
 	t.mu.Lock()
+	defer t.mu.Unlock()
 	// Can't defer unlock: DecRefs must occur without holding t.mu.
 	if flags&linux.CLONE_NEWUTS != 0 {
 		if !haveCapSysAdmin {
-			t.mu.Unlock()
 			return linuxerr.EPERM
 		}
 		// Note that this must happen after NewUserNamespace, so the
 		// new user namespace is used if there is one.
 		t.utsns = t.utsns.Clone(creds.UserNamespace)
 	}
-	var oldIPCNS *IPCNamespace
 	if flags&linux.CLONE_NEWIPC != 0 {
 		if !haveCapSysAdmin {
-			t.mu.Unlock()
 			return linuxerr.EPERM
 		}
 		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
 		// namespace"
-		oldIPCNS = t.ipcns
+		oldIPCNS := t.ipcns
 		t.ipcns = NewIPCNamespace(creds.UserNamespace)
 		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
 		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
-		if oldIPCNS != nil {
-			oldIPCNS.DecRef(t)
-		}
+		cu.Add(func() { oldIPCNS.DecRef(t) })
 	}
-	var oldFDTable *FDTable
 	if flags&linux.CLONE_FILES != 0 {
-		oldFDTable = t.fdTable
+		oldFDTable := t.fdTable
 		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
+		cu.Add(func() { oldFDTable.DecRef(t) })
 	}
-	var oldFSContext *FSContext
-	if flags&linux.CLONE_FS != 0 {
-		oldFSContext = t.fsContext
+	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
+		oldFSContext := t.fsContext
 		t.fsContext = oldFSContext.Fork()
+		cu.Add(func() { oldFSContext.DecRef(t) })
 	}
-	t.mu.Unlock()
-	if oldFDTable != nil {
-		oldFDTable.DecRef(t)
-	}
-	if oldFSContext != nil {
-		oldFSContext.DecRef(t)
+	if flags&linux.CLONE_NEWNS != 0 {
+		if !haveCapSysAdmin {
+			return linuxerr.EPERM
+		}
+		oldMountNS := t.mountNamespace
+		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd)
+		if err != nil {
+			return err
+		}
+		t.mountNamespace = mntns
+		cu.Add(func() { oldMountNS.DecRef(t) })
 	}
 	return nil
 }

diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
@@ -222,6 +222,84 @@ func (vfs *VirtualFilesystem) NewMountNamespaceFrom(ctx context.Context, creds *
 	return mntns
 }
 
+type cloneEntry struct {
+	prevMount   *Mount
+	parentMount *Mount
+}
+
+func (vfs *VirtualFilesystem) updateRootAndCWD(ctx context.Context, root *VirtualDentry, cwd *VirtualDentry, src *Mount, dst *Mount) {
+	if root.mount == src {
+		root.mount.DecRef(ctx)
+		root.mount = dst
+		root.mount.IncRef()
+	}
+	if cwd.mount == src {
+		cwd.mount.DecRef(ctx)
+		cwd.mount = dst
+		cwd.mount.IncRef()
+	}
+}
+
+// CloneMountNamespace makes a copy of the specified mount namespace.
+//
+// If `root` or `cwd` have mounts in the old namespace, they will be replaced
+// with proper mounts from the new namespace.
+func (vfs *VirtualFilesystem) CloneMountNamespace(ctx context.Context, creds *auth.Credentials, ns *MountNamespace, root *VirtualDentry, cwd *VirtualDentry) (*MountNamespace, error) {
+	newns := &MountNamespace{
+		Owner:       creds.UserNamespace,
+		mountpoints: make(map[*Dentry]uint32),
+	}
+	newns.InitRefs()
+
+	vdsToDecRef := []VirtualDentry{}
+	defer func() {
+		for _, vd := range vdsToDecRef {
+			vd.DecRef(ctx)
+		}
+	}()
+
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+
+	ns.root.root.IncRef()
+	ns.root.fs.IncRef()
+	newns.root = newMount(vfs, ns.root.fs, ns.root.root, newns, &MountOptions{Flags: ns.root.Flags, ReadOnly: ns.root.ReadOnly()})
+	if ns.root.propType == Shared {
+		vfs.addPeer(ns.root, newns.root)
+	}
+	vfs.updateRootAndCWD(ctx, root, cwd, ns.root, newns.root)
+
+	queue := []cloneEntry{cloneEntry{ns.root, newns.root}}
+	for len(queue) != 0 {
+		p := queue[0]
+		queue = queue[1:]
+		for c := range p.prevMount.children {
+			m := vfs.cloneMount(c, c.root, nil)
+			vd := VirtualDentry{
+				mount:  p.parentMount,
+				dentry: c.point(),
+			}
+			vd.IncRef()
+
+			vds, err := vfs.connectMountAtLocked(ctx, m, vd)
+			m.DecRef(ctx)
+			vdsToDecRef = append(vdsToDecRef, vds...)
+			if err != nil {
+				newns.DecRef(ctx)
+				return nil, err
+			}
+			if c.propType == Shared {
+				vfs.addPeer(c, m)
+			}
+			vfs.updateRootAndCWD(ctx, root, cwd, c, m)
+			if len(c.children) != 0 {
+				queue = append(queue, cloneEntry{c, m})
+			}
+		}
+	}
+	return newns, nil
+}
+
 // NewFilesystem creates a new filesystem object not yet associated with any
 // mounts. It can be installed into the filesystem tree with ConnectMountAt.
 // Note that only the filesystem-specific mount options from opts are used by

diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc
@@ -14,13 +14,17 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/magic.h>
+#include <sched.h>
 #include <stdio.h>
 #include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/signalfd.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/vfs.h>
 #include <unistd.h>
 
 #include <cstdint>
@@ -1439,6 +1443,68 @@ TEST(MountTest, DeadMountsAreDecRefd) {
   }
 }
 
+TEST(MountTest, MountNamespace) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+      Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0));
+  EXPECT_NO_ERRNO(Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777));
+
+  pid_t child = fork();
+  if (child == 0) {
+    // Create a new mount namespace and umount the test mount from it.
+    TEST_CHECK(unshare(CLONE_NEWNS) == 0);
+    TEST_CHECK(access(JoinPath(dir.path(), "foo").c_str(), F_OK) == 0);
+    TEST_CHECK(umount2(dir.path().c_str(), MNT_DETACH) == 0);
+    exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  int status;
+  ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
+  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+
+  // Check that the test mount is still here.
+  EXPECT_NO_ERRNO(Open(JoinPath(dir.path(), "foo"), O_RDWR));
+}
+
+TEST(MountTest, MountNamespacePropagation) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  auto const mnt = ASSERT_NO_ERRNO_AND_VALUE(
+      Mount("", dir.path(), "tmpfs", 0, "mode=0700", MNT_DETACH));
+  auto child_dir = JoinPath(dir.path(), "test");
+
+  ASSERT_THAT(mount(NULL, dir.path().c_str(), NULL, MS_SHARED, NULL),
+              SyscallSucceeds());
+  ASSERT_THAT(mkdir(child_dir.c_str(), 0700), SyscallSucceeds());
+  ASSERT_THAT(mount("child", child_dir.c_str(), "tmpfs", 0, NULL),
+              SyscallSucceeds());
+  EXPECT_NO_ERRNO(Open(JoinPath(child_dir, "foo"), O_CREAT | O_RDWR, 0777));
+
+  pid_t child = fork();
+  if (child == 0) {
+    TEST_CHECK(unshare(CLONE_NEWNS) == 0);
+    TEST_CHECK(access(JoinPath(child_dir, "foo").c_str(), F_OK) == 0);
+    // The test mount has to be umounted from the second mount namespace too.
+    TEST_CHECK(umount2(child_dir.c_str(), MNT_DETACH) == 0);
+    // The new mount has to be propagated to the second mount namespace.
+    TEST_CHECK(mount("test2", child_dir.c_str(), "tmpfs", 0, NULL) == 0);
+    TEST_CHECK(mknod(JoinPath(child_dir, "boo").c_str(), 0777 | S_IFREG, 0) ==
+               0);
+    exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  int status;
+  ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
+  ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+
+  // Check that the test mount is still here.
+  EXPECT_NO_ERRNO(Open(JoinPath(child_dir, "boo"), O_RDWR));
+  EXPECT_THAT(umount2(child_dir.c_str(), MNT_DETACH), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing