Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/sentry/fsimpl/proc/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
"net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
"mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
"pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID),
"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
"user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER),
"ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
"uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),
}),
Expand Down
6 changes: 6 additions & 0 deletions pkg/sentry/fsimpl/proc/task_files.go
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,12 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
return pidns.GetInode()
}
return nil
case linux.CLONE_NEWUSER:
inode, _ := t.UserNamespace().TryGetInode().(*nsfs.Inode)
if inode == nil {
return nil
}
return inode
default:
panic("unknown namespace")
}
Expand Down
1 change: 1 addition & 0 deletions pkg/sentry/fsimpl/testutil/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
UserCounters: k.GetUserCounters(creds.RealKUID),
}
config.NetworkNamespace.IncRef()
config.Credentials.UserNamespace.IncRef()
t, err := k.TaskSet().NewTask(ctx, config)
if err != nil {
config.ThreadGroup.Release(ctx)
Expand Down
1 change: 1 addition & 0 deletions pkg/sentry/kernel/auth/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ go_library(
"//pkg/errors/linuxerr",
"//pkg/log",
"//pkg/rand",
"//pkg/refs",
"//pkg/sentry/seccheck",
"//pkg/sentry/seccheck/points:points_go_proto",
"//pkg/sync",
Expand Down
55 changes: 55 additions & 0 deletions pkg/sentry/kernel/auth/user_namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/refs"
)

// A UserNamespace represents a user namespace. See user_namespaces(7) for
Expand Down Expand Up @@ -60,6 +61,10 @@ type UserNamespace struct {

// setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu.
setgroupsAllowed bool

// inode is the nsfs inode associated with this namespace. This is stored as
// refs.TryRefCounter instead of *nsfs.Inode because nsfs imports auth.
inode refs.TryRefCounter
}

// NewRootUserNamespace returns a UserNamespace that is appropriate for a
Expand Down Expand Up @@ -99,6 +104,56 @@ func (ns *UserNamespace) Root() *UserNamespace {
return ns
}

// Type implements vfs.Namespace.Type. Every user namespace reports the same
// fixed type name.
func (ns *UserNamespace) Type() string {
	const nsType = "user"
	return nsType
}

// Destroy implements vfs.Namespace.Destroy. It performs no work for a user
// namespace.
func (ns *UserNamespace) Destroy(ctx context.Context) {
	// Nothing to tear down here.
}

// UserNamespace implements vfs.Namespace.UserNamespace. For a user namespace
// the answer is the receiver itself, which is returned unchanged.
func (ns *UserNamespace) UserNamespace() *UserNamespace {
	self := ns
	return self
}

// SetInode records inode as the nsfs inode associated with ns, replacing any
// previously stored value. The assignment is made while holding ns.mu.
//
// For a newly-created user namespace, the initial reference on inode serves
// as the task or kernel reference, so those callers do not need to follow up
// with a separate IncRef.
func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) {
	ns.mu.Lock()
	ns.inode = inode
	ns.mu.Unlock()
}

// IncRef takes an additional reference on the inode associated with ns. If no
// inode has been set, it does nothing. ns.mu is held for the duration of the
// call.
func (ns *UserNamespace) IncRef() {
	ns.mu.Lock()
	defer ns.mu.Unlock()

	counter := ns.inode
	if counter == nil {
		// No inode attached; there is nothing to reference.
		return
	}
	counter.IncRef()
}

// TryGetInode returns the inode associated with ns after successfully taking
// a reference on it via TryIncRef. It returns nil when no inode has been set
// or when TryIncRef reports failure. ns.mu is held for the duration of the
// call.
func (ns *UserNamespace) TryGetInode() refs.TryRefCounter {
	ns.mu.Lock()
	defer ns.mu.Unlock()

	counter := ns.inode
	if counter == nil {
		return nil
	}
	if acquired := counter.TryIncRef(); !acquired {
		return nil
	}
	return counter
}

// DecRef drops one reference on the inode associated with ns, passing ctx
// through to the inode's DecRef. If no inode has been set, it does nothing.
// ns.mu is held for the duration of the call.
func (ns *UserNamespace) DecRef(ctx context.Context) {
	ns.mu.Lock()
	defer ns.mu.Unlock()

	counter := ns.inode
	if counter == nil {
		// No inode attached; there is nothing to release.
		return
	}
	counter.DecRef(ctx)
}

// maxUserNamespaceDepth is the limit on how deeply user namespaces may be
// nested, matching the Linux value:
//
// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
// namespaces." - user_namespaces(7)
const maxUserNamespaceDepth = 32
Expand Down
3 changes: 3 additions & 0 deletions pkg/sentry/kernel/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
}
defer nsfsFilesystem.DecRef(ctx)
k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace))
k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
Expand Down Expand Up @@ -1330,6 +1331,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
config.UTSNamespace.IncRef()
config.IPCNamespace.IncRef()
config.NetworkNamespace.IncRef()
config.Credentials.UserNamespace.IncRef()
refcountCu.Release() // refs(mntns, fsContext) are transferred to NewTask()
t, err := k.tasks.NewTask(ctx, config)
if err != nil {
Expand Down Expand Up @@ -2105,6 +2107,7 @@ func (k *Kernel) Release() {
k.cleaupDevGofers()
k.mf.Destroy()
k.RootPIDNamespace().DecRef(ctx)
k.rootUserNamespace.DecRef(ctx)
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
Expand Down
91 changes: 74 additions & 17 deletions pkg/sentry/kernel/task_clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/seccheck"
pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
"gvisor.dev/gvisor/pkg/sentry/vfs"
Expand Down Expand Up @@ -124,6 +125,8 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
// user_namespaces(7)
creds := t.Credentials()
userns := creds.UserNamespace
cu := cleanup.Make(func() {})
defer cu.Clean()
if args.Flags&linux.CLONE_NEWUSER != 0 {
var err error
// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
Expand All @@ -138,14 +141,17 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
if err != nil {
return 0, nil, err
}
userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns))
} else {
userns.IncRef()
}
cu.Add(func() {
userns.DecRef(t)
})
if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
return 0, nil, linuxerr.EPERM
}

cu := cleanup.Make(func() {})
defer cu.Clean()

utsns := t.utsns
if args.Flags&linux.CLONE_NEWUTS != 0 {
// Note that this must happen after NewUserNamespace so we get
Expand Down Expand Up @@ -285,6 +291,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
if uc.uid != creds.RealKUID {
uc = t.k.GetUserCounters(creds.RealKUID)
}
childCreds := creds
if userns != creds.UserNamespace {
childCreds = creds.ForkIntoUserNamespace(userns)
}

cfg := &TaskConfig{
Kernel: t.k,
Expand All @@ -293,7 +303,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
TaskImage: image,
FSContext: fsContext,
FDTable: fdTable,
Credentials: creds,
Credentials: childCreds,
NoNewPrivs: t.GetNoNewPrivs(),
Niceness: t.Niceness(),
NetworkNamespace: netns,
Expand Down Expand Up @@ -341,10 +351,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
nt.SetSignalStack(t.SignalStack())
}

if userns != creds.UserNamespace {
nt.creds.Store(creds.ForkIntoUserNamespace(userns))
}

// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
// nt that it must receive before its task goroutine starts running.
tid := nt.k.tasks.Root.IDOfTask(nt)
Expand Down Expand Up @@ -509,6 +515,7 @@ type namespaceSet struct {
utsNS *UTSNamespace
ipcNS *IPCNamespace
mountNS *vfs.MountNamespace
userNS *auth.UserNamespace

fsContext *FSContext
}
Expand All @@ -529,14 +536,17 @@ func (nss *namespaceSet) release(t *Task) {
if nss.mountNS != nil {
nss.mountNS.DecRef(t)
}
if nss.userNS != nil {
nss.userNS.DecRef(t)
}

if nss.fsContext != nil {
nss.fsContext.DecRef(t)
}
}

func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error {
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS)
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER)
if (uint32(flags) & ^supported) != 0 || flags == 0 {
return linuxerr.EINVAL
}
Expand Down Expand Up @@ -567,6 +577,15 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error
target.mu.Lock()
defer target.mu.Unlock()

if flags&linux.CLONE_NEWUSER != 0 {
// User namespaces are stored in credentials, which outlive the other
// namespace fields cleared during task exit.
if target.ExitState() >= TaskExitInitiated {
return linuxerr.ESRCH
}
nss.userNS = target.Credentials().UserNamespace
nss.userNS.IncRef()
}
if flags&linux.CLONE_NEWNET != 0 {
nss.netNS = target.netns
if nss.netNS == nil {
Expand Down Expand Up @@ -631,6 +650,12 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error {
}
nss.mountNS = ns
ns.IncRef()
case *auth.UserNamespace:
if flags != 0 && flags != linux.CLONE_NEWUSER {
return linuxerr.EINVAL
}
nss.userNS = ns
ns.IncRef()
default:
return linuxerr.EINVAL
}
Expand Down Expand Up @@ -667,8 +692,29 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
return err
}

creds := t.Credentials()
checkCreds := creds
if nss.userNS != nil {
if nss.userNS == creds.UserNamespace {
return linuxerr.EINVAL
}
t.tg.signalHandlers.mu.Lock()
if t.tg.tasksCount != 1 {
t.tg.signalHandlers.mu.Unlock()
return linuxerr.EINVAL
}
t.tg.signalHandlers.mu.Unlock()
if t.FSContext().ReadRefs() != 1 {
return linuxerr.EINVAL
}
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) {
return linuxerr.EPERM
}
checkCreds = creds.ForkIntoUserNamespace(nss.userNS)
}

if nss.childPIDNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
// Allow setting the current or a child pid namespace.
Expand All @@ -685,25 +731,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
}

if nss.netNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.utsNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.ipcNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.mountNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
oldFSContext := t.FSContext()
Expand All @@ -723,6 +769,10 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
// Swap to new namespaces.
// Store replaced resources in nss so that they're cleaned up by the deferred function.
t.mu.Lock()
if nss.userNS != nil {
t.creds.Store(checkCreds)
nss.userNS = creds.UserNamespace
}
if nss.childPIDNS != nil {
t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace
}
Expand Down Expand Up @@ -786,10 +836,12 @@ func (t *Task) Unshare(flags int32) error {

// Prepare new execution context.
creds := t.Credentials()
originalUserNS := creds.UserNamespace
var (
newFSContext *FSContext
newFDTable *FDTable
newCreds bool
newUserNS *auth.UserNamespace
newChildPIDNS *PIDNamespace
newNetNS *inet.Namespace
newUTSNS *UTSNamespace
Expand All @@ -803,6 +855,9 @@ func (t *Task) Unshare(flags int32) error {
if newFDTable != nil {
newFDTable.DecRef(t)
}
if newUserNS != nil {
newUserNS.DecRef(t)
}
if newNetNS != nil {
newNetNS.DecRef(t)
}
Expand All @@ -827,10 +882,11 @@ func (t *Task) Unshare(flags int32) error {
return linuxerr.EPERM
}
var err error
newUserNS, err := creds.NewChildUserNamespace()
newUserNS, err = creds.NewChildUserNamespace()
if err != nil {
return err
}
newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS))
creds = t.Credentials().ForkIntoUserNamespace(newUserNS)
newCreds = true
}
Expand Down Expand Up @@ -869,11 +925,12 @@ func (t *Task) Unshare(flags int32) error {

// Switch to new execution context. Store replaced resources in new* so
// that they're cleaned up by the deferred function.
t.mu.Lock()
defer t.mu.Unlock()
if newCreds {
t.creds.Store(creds)
newUserNS = originalUserNS
}
t.mu.Lock()
defer t.mu.Unlock()
if newFSContext != nil {
oldFSContext := t.FSContext()
// unshareFromTask() lowers the old fs context's ref count, but it's for us to
Expand Down
Loading
Loading