diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 6e8ef648ef..e820a74508 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET), "mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS), "pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID), - "user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"), + "user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER), "ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC), "uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS), }), diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 8c4cff84aa..708ddafe28 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -1461,6 +1461,12 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { return pidns.GetInode() } return nil + case linux.CLONE_NEWUSER: + inode, _ := t.UserNamespace().TryGetInode().(*nsfs.Inode) + if inode == nil { + return nil + } + return inode default: panic("unknown namespace") } diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index ff56b7d878..fb37931c77 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -158,6 +158,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns UserCounters: k.GetUserCounters(creds.RealKUID), } config.NetworkNamespace.IncRef() + config.Credentials.UserNamespace.IncRef() t, err := k.TaskSet().NewTask(ctx, config) if err != nil { config.ThreadGroup.Release(ctx) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 4a6760b651..fd238f4737 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -99,6 +99,7 @@ go_library( "//pkg/errors/linuxerr", "//pkg/log", "//pkg/rand", + "//pkg/refs", "//pkg/sentry/seccheck", "//pkg/sentry/seccheck/points:points_go_proto", "//pkg/sync", diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 69fa7d7d46..b97628ecc6 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/refs" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for @@ -60,6 +61,10 @@ type UserNamespace struct { // setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu. setgroupsAllowed bool + + // inode is the nsfs inode associated with this namespace. This is stored as + // refs.TryRefCounter instead of *nsfs.Inode because nsfs imports auth. + inode refs.TryRefCounter } // NewRootUserNamespace returns a UserNamespace that is appropriate for a @@ -99,6 +104,56 @@ func (ns *UserNamespace) Root() *UserNamespace { return ns } +// Type implements vfs.Namespace.Type. +func (ns *UserNamespace) Type() string { + return "user" +} + +// Destroy implements vfs.Namespace.Destroy. +func (ns *UserNamespace) Destroy(ctx context.Context) {} + +// UserNamespace implements vfs.Namespace.UserNamespace. +func (ns *UserNamespace) UserNamespace() *UserNamespace { + return ns +} + +// SetInode sets the nsfs inode associated with ns. The initial ref on inode is +// the task or kernel ref for a newly-created user namespace, so those callers +// don't need a separate IncRef. +func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) { + ns.mu.Lock() + defer ns.mu.Unlock() + ns.inode = inode +} + +// IncRef increments ns's inode refcount. +func (ns *UserNamespace) IncRef() { + ns.mu.Lock() + defer ns.mu.Unlock() + if ns.inode != nil { + ns.inode.IncRef() + } +} + +// TryGetInode returns ns's inode with an incremented refcount. +func (ns *UserNamespace) TryGetInode() refs.TryRefCounter { + ns.mu.Lock() + defer ns.mu.Unlock() + if ns.inode == nil || !ns.inode.TryIncRef() { + return nil + } + return ns.inode +} + +// DecRef decrements ns's inode refcount. +func (ns *UserNamespace) DecRef(ctx context.Context) { + ns.mu.Lock() + defer ns.mu.Unlock() + if ns.inode != nil { + ns.inode.DecRef(ctx) + } +} + // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user // namespaces." - user_namespaces(7) const maxUserNamespaceDepth = 32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1a07719e60..3cd6356d4c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { } defer nsfsFilesystem.DecRef(ctx) k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{}) + k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace)) k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace)) k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace)) k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace)) @@ -1330,6 +1331,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, config.UTSNamespace.IncRef() config.IPCNamespace.IncRef() config.NetworkNamespace.IncRef() + config.Credentials.UserNamespace.IncRef() refcountCu.Release() // refs(mntns, fsContext) are transferred to NewTask() t, err := k.tasks.NewTask(ctx, config) if err != nil { @@ -2105,6 +2107,7 @@ func (k *Kernel) Release() { k.cleaupDevGofers() k.mf.Destroy() k.RootPIDNamespace().DecRef(ctx) + k.rootUserNamespace.DecRef(ctx) } // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 5473c03458..0d801eadcb 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -124,6 +125,8 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace + cu := cleanup.Make(func() {}) + defer cu.Clean() if args.Flags&linux.CLONE_NEWUSER != 0 { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and @@ -138,14 +141,17 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { if err != nil { return 0, nil, err } + userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns)) + } else { + userns.IncRef() } + cu.Add(func() { + userns.DecRef(t) + }) if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, linuxerr.EPERM } - cu := cleanup.Make(func() {}) - defer cu.Clean() - utsns := t.utsns if args.Flags&linux.CLONE_NEWUTS != 0 { // Note that this must happen after NewUserNamespace so we get @@ -285,6 +291,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { if uc.uid != creds.RealKUID { uc = t.k.GetUserCounters(creds.RealKUID) } + childCreds := creds + if userns != creds.UserNamespace { + childCreds = creds.ForkIntoUserNamespace(userns) + } cfg := &TaskConfig{ Kernel: t.k, @@ -293,7 +303,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { TaskImage: image, FSContext: fsContext, FDTable: fdTable, - Credentials: creds, + Credentials: childCreds, NoNewPrivs: t.GetNoNewPrivs(), Niceness: t.Niceness(), NetworkNamespace: netns, @@ -341,10 +351,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { nt.SetSignalStack(t.SignalStack()) } - if userns != creds.UserNamespace { - nt.creds.Store(creds.ForkIntoUserNamespace(userns)) - } - // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) @@ -509,6 +515,7 @@ type namespaceSet struct { utsNS *UTSNamespace ipcNS *IPCNamespace mountNS *vfs.MountNamespace + userNS *auth.UserNamespace fsContext *FSContext } @@ -529,6 +536,9 @@ func (nss *namespaceSet) release(t *Task) { if nss.mountNS != nil { nss.mountNS.DecRef(t) } + if nss.userNS != nil { + nss.userNS.DecRef(t) + } if nss.fsContext != nil { nss.fsContext.DecRef(t) @@ -536,7 +546,7 @@ func (nss *namespaceSet) release(t *Task) { } func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error { - supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS) + supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER) if (uint32(flags) & ^supported) != 0 || flags == 0 { return linuxerr.EINVAL } @@ -567,6 +577,15 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error target.mu.Lock() defer target.mu.Unlock() + if flags&linux.CLONE_NEWUSER != 0 { + // User namespaces are stored in credentials, which outlive the other + // namespace fields cleared during task exit. + if target.ExitState() >= TaskExitInitiated { + return linuxerr.ESRCH + } + nss.userNS = target.Credentials().UserNamespace + nss.userNS.IncRef() + } if flags&linux.CLONE_NEWNET != 0 { nss.netNS = target.netns if nss.netNS == nil { @@ -631,6 +650,12 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error { } nss.mountNS = ns ns.IncRef() + case *auth.UserNamespace: + if flags != 0 && flags != linux.CLONE_NEWUSER { + return linuxerr.EINVAL + } + nss.userNS = ns + ns.IncRef() default: return linuxerr.EINVAL } @@ -667,8 +692,29 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { return err } + creds := t.Credentials() + checkCreds := creds + if nss.userNS != nil { + if nss.userNS == creds.UserNamespace { + return linuxerr.EINVAL + } + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return linuxerr.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + if t.FSContext().ReadRefs() != 1 { + return linuxerr.EINVAL + } + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) { + return linuxerr.EPERM + } + checkCreds = creds.ForkIntoUserNamespace(nss.userNS) + } + if nss.childPIDNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } // Allow setting the current or a child pid namespace. @@ -685,25 +731,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { } if nss.netNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.utsNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.ipcNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.mountNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldFSContext := t.FSContext() @@ -723,6 +769,10 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { // Swap to new namespaces. // Store replaced resources in nss so that they're cleaned up by the deferred function. t.mu.Lock() + if nss.userNS != nil { + t.creds.Store(checkCreds) + nss.userNS = creds.UserNamespace + } if nss.childPIDNS != nil { t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace } @@ -786,10 +836,12 @@ func (t *Task) Unshare(flags int32) error { // Prepare new execution context. creds := t.Credentials() + originalUserNS := creds.UserNamespace var ( newFSContext *FSContext newFDTable *FDTable newCreds bool + newUserNS *auth.UserNamespace newChildPIDNS *PIDNamespace newNetNS *inet.Namespace newUTSNS *UTSNamespace @@ -803,6 +855,9 @@ func (t *Task) Unshare(flags int32) error { if newFDTable != nil { newFDTable.DecRef(t) } + if newUserNS != nil { + newUserNS.DecRef(t) + } if newNetNS != nil { newNetNS.DecRef(t) } @@ -827,10 +882,11 @@ func (t *Task) Unshare(flags int32) error { return linuxerr.EPERM } var err error - newUserNS, err := creds.NewChildUserNamespace() + newUserNS, err = creds.NewChildUserNamespace() if err != nil { return err } + newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS)) creds = t.Credentials().ForkIntoUserNamespace(newUserNS) newCreds = true } @@ -869,11 +925,12 @@ func (t *Task) Unshare(flags int32) error { // Switch to new execution context. Store replaced resources in new* so // that they're cleaned up by the deferred function. + t.mu.Lock() + defer t.mu.Unlock() if newCreds { t.creds.Store(creds) + newUserNS = originalUserNS } - t.mu.Lock() - defer t.mu.Unlock() if newFSContext != nil { oldFSContext := t.FSContext() // unshareFromTask() lowers the old fs context's ref count, but its for us to diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 6e0792824a..d0978ab8ad 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -313,6 +313,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.ipcns = nil netns := t.netns t.netns = nil + userns := t.Credentials().UserNamespace childPIDNS := t.childPIDNamespace t.childPIDNamespace = nil t.mu.Unlock() @@ -320,6 +321,7 @@ func (*runExitMain) execute(t *Task) taskRunState { utsns.DecRef(t) ipcns.DecRef(t) netns.DecRef(t) + userns.DecRef(t) if childPIDNS != nil { childPIDNS.DecRef(t) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a5d4f19ee6..bfc9f2112e 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -74,7 +74,9 @@ type TaskConfig struct { // succeeds. FDTable *FDTable - // Credentials is the Credentials of the new task. + // Credentials is the Credentials of the new task. A reference must be held + // on Credentials.UserNamespace, which is transferred to TaskSet.NewTask + // whether or not it succeeds. Credentials *auth.Credentials // NoNewPrivs determines if the task can gain new privileges. @@ -143,6 +145,7 @@ func (ts *TaskSet) cloneNewTask(ctx context.Context, cfg *TaskConfig) (*Task, in var err error cleanup := func() { cfg.TaskImage.release(ctx) + cfg.Credentials.UserNamespace.DecRef(ctx) cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.UTSNamespace.DecRef(ctx) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f3682170be..90c35550b6 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2511,8 +2511,10 @@ cc_binary( malloc = "//test/util:errno_safe_allocator", deps = select_gtest() + [ "//test/util:capability_util", + "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:logging", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/setns.cc b/test/syscalls/linux/setns.cc index 5c625f7449..78b78ab1cf 100644 --- a/test/syscalls/linux/setns.cc +++ b/test/syscalls/linux/setns.cc @@ -12,22 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include #include #include #include +#include +#include #include +#include #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" +#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/linux_capability_util.h" #include "test/util/logging.h" +#include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/test_util.h" @@ -35,6 +41,44 @@ namespace gvisor { namespace testing { namespace { +struct UserNamespaceChild { + FileDescriptor nsfd; + Cleanup cleanup; +}; + +UserNamespaceChild CreateUserNamespaceChild() { + int pfd[2]; + TEST_PCHECK(pipe(pfd) == 0); + FileDescriptor pipe_read(pfd[0]); + FileDescriptor pipe_write(pfd[1]); + + pid_t child = fork(); + TEST_PCHECK(child >= 0); + if (child == 0) { + pipe_read.reset(); + TEST_CHECK_SUCCESS(unshare(CLONE_NEWUSER)); + TEST_CHECK_SUCCESS(write(pipe_write.get(), "R", 1)); + pipe_write.reset(); + pause(); + _exit(0); + } + Cleanup cleanup([child] { + kill(child, SIGKILL); + int status; + RetryEINTR(waitpid)(child, &status, 0); + }); + pipe_write.reset(); + + char buf; + TEST_PCHECK(read(pipe_read.get(), &buf, 1) == 1); + + char nspath[PATH_MAX]; + snprintf(nspath, sizeof(nspath), "/proc/%d/ns/user", child); + FileDescriptor nsfd = TEST_CHECK_NO_ERRNO_AND_VALUE(Open(nspath, O_RDONLY)); + + return {std::move(nsfd), std::move(cleanup)}; +} + TEST(SetnsTest, ChangeIPCNamespace) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); @@ -156,6 +200,139 @@ TEST(SetnsTest, ChangeMountNamespaceZeroFlags) { ASSERT_THAT(setns(nsfd.get(), 0), SyscallSucceedsWithValue(0)); } +TEST(SetnsTest, ChangeUserNamespace) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + + UserNamespaceChild child = CreateUserNamespaceChild(); + EXPECT_THAT(InForkedProcess([&child] { + TEST_CHECK_SUCCESS(setns(child.nsfd.get(), CLONE_NEWUSER)); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(SetnsTest, ChangeUserNamespaceRejectsCurrentUserNamespace) { + const FileDescriptor nsfd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/thread-self/ns/user", O_RDONLY)); + EXPECT_THAT(setns(nsfd.get(), CLONE_NEWUSER), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SetnsTest, ChangeUserNamespaceRejectsMultithreadedCaller) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + + UserNamespaceChild child = CreateUserNamespaceChild(); + + int nsfd = child.nsfd.get(); + EXPECT_THAT(InForkedProcess([nsfd] { + int ready_fds[2]; + TEST_PCHECK(pipe(ready_fds) == 0); + int stop_fds[2]; + TEST_PCHECK(pipe(stop_fds) == 0); + int done_fds[2]; + TEST_PCHECK(pipe(done_fds) == 0); + + struct ThreadArgs { + int ready_read_fd; + int ready_write_fd; + int stop_read_fd; + int stop_write_fd; + int done_read_fd; + int done_write_fd; + } args = {ready_fds[0], ready_fds[1], stop_fds[0], + stop_fds[1], done_fds[0], done_fds[1]}; + struct clone_arg { + char stack[1024] __attribute__((aligned(16))); + char stack_ptr[0]; + } ca; + pid_t tid = clone( + +[](void* arg) { + ThreadArgs* args = static_cast(arg); + TEST_PCHECK(close(args->ready_read_fd) == 0); + TEST_PCHECK(close(args->stop_write_fd) == 0); + TEST_PCHECK(close(args->done_read_fd) == 0); + TEST_PCHECK(write(args->ready_write_fd, "R", 1) == 1); + TEST_PCHECK(close(args->ready_write_fd) == 0); + char buf; + TEST_PCHECK(read(args->stop_read_fd, &buf, 1) == 1); + TEST_PCHECK(close(args->stop_read_fd) == 0); + TEST_PCHECK(write(args->done_write_fd, "D", 1) == 1); + TEST_PCHECK(close(args->done_write_fd) == 0); + return 0; + }, + ca.stack_ptr, CLONE_SIGHAND | CLONE_THREAD | CLONE_VM, + &args); + TEST_PCHECK(tid >= 0); + TEST_PCHECK(close(ready_fds[1]) == 0); + TEST_PCHECK(close(stop_fds[0]) == 0); + TEST_PCHECK(close(done_fds[1]) == 0); + + char buf; + TEST_PCHECK(read(ready_fds[0], &buf, 1) == 1); + TEST_PCHECK(close(ready_fds[0]) == 0); + TEST_CHECK_ERRNO(setns(nsfd, CLONE_NEWUSER), EINVAL); + TEST_PCHECK(write(stop_fds[1], "S", 1) == 1); + TEST_PCHECK(close(stop_fds[1]) == 0); + TEST_PCHECK(read(done_fds[0], &buf, 1) == 1); + TEST_PCHECK(close(done_fds[0]) == 0); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(SetnsTest, ChangeUserNamespaceRejectsSharedFS) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + + UserNamespaceChild userns_child = CreateUserNamespaceChild(); + int nsfd = userns_child.nsfd.get(); + + EXPECT_THAT( + InForkedProcess([nsfd] { + int pfd[2]; + TEST_PCHECK(pipe(pfd) == 0); + + struct CloneFSArgs { + int read_fd; + int write_fd; + } args = {pfd[0], pfd[1]}; + struct clone_arg { + char stack[128] __attribute__((aligned(16))); + char stack_ptr[0]; + } ca; + pid_t fs_child = clone( + +[](void* arg) { + CloneFSArgs* args = static_cast(arg); + TEST_PCHECK(close(args->write_fd) == 0); + char buf; + TEST_PCHECK(read(args->read_fd, &buf, 1) >= 0); + TEST_PCHECK(close(args->read_fd) == 0); + _exit(0); + return 0; + }, + ca.stack_ptr, CLONE_FS | SIGCHLD, &args); + TEST_PCHECK(fs_child >= 0); + TEST_PCHECK(close(pfd[0]) == 0); + + TEST_CHECK_ERRNO(setns(nsfd, CLONE_NEWUSER), EINVAL); + + TEST_PCHECK(close(pfd[1]) == 0); + int status; + TEST_PCHECK(RetryEINTR(waitpid)(fs_child, &status, 0) == fs_child); + TEST_CHECK(WIFEXITED(status) && WEXITSTATUS(status) == 0); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(SetnsTest, ChangeUserNamespaceRejectsMissingTargetCapability) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + + const FileDescriptor parent_nsfd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/thread-self/ns/user", O_RDONLY)); + EXPECT_THAT(InForkedProcess([&parent_nsfd] { + TEST_CHECK_SUCCESS(unshare(CLONE_NEWUSER)); + TEST_CHECK_ERRNO(setns(parent_nsfd.get(), CLONE_NEWUSER), + EPERM); + }), + IsPosixErrorOkAndHolds(0)); +} + } // namespace } // namespace testing } // namespace gvisor