Permalink
Browse files

syscall: use CLONE_VFORK safely

Currently, CLONE_VFORK is used without much regard to the stack. This
is dangerous, because anything the child does to the stack is visible
to the parent. For example, if the compiler were to reuse named stack
slots (which it currently doesn't do), it would be easy for the child
running in the same stack frame as the parent to corrupt local
variables that the parent then depended on. We're not sure of anything
specific going wrong in this code right now, but it is at best a
ticking time bomb.

CLONE_VFORK can only safely be used if we ensure the child does not
execute in any of the active stack frames of the parent. This commit
implements this by arranging for the parent to return immediately from
the frame the child will operate in, and for the child to never return
to the frame the parent will operate in.

Fixes #20732.

Change-Id: Iad5b4ddc2b994c082bd278bfd52ef53bd38c037f
Reviewed-on: https://go-review.googlesource.com/46173
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
  • Loading branch information...
aclements committed Jun 20, 2017
1 parent 5a5ac34 commit 67e537541c043c701001f002bed0cda70ce72767
Showing with 45 additions and 25 deletions.
  1. +45 −25 src/syscall/exec_linux.go
View
@@ -63,15 +63,46 @@ func runtime_AfterForkInChild()
// functions that do not grow the stack.
//go:norace
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
// Set up and fork. This returns immediately in the parent or
// if there's an error.
r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
if locked {
runtime_AfterFork()
}
if err1 != 0 {
return 0, err1
}
// parent; return PID
pid = int(r1)
if sys.UidMappings != nil || sys.GidMappings != nil {
Close(p[0])
err := writeUidGidMappings(pid, sys)
var err2 Errno
if err != nil {
err2 = err.(Errno)
}
RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
Close(p[1])
}
return pid, 0
}
//go:norace
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
// vfork requires that the child not touch any of the parent's
// active stack frames. Hence, the child does all post-fork
// processing in this stack frame and never returns, while the
// parent returns immediately from this frame and does all
// post-fork processing in the outer frame.
// Declare all variables at top in case any
// declarations require heap allocation (e.g., err1).
var (
r1 uintptr
err1 Errno
err2 Errno
nextfd int
i int
p [2]int
)
// Record parent PID so child can test if it has died.
@@ -94,13 +125,15 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr
// synchronizing writing of User ID/Group ID mappings.
if sys.UidMappings != nil || sys.GidMappings != nil {
if err := forkExecPipe(p[:]); err != nil {
return 0, err.(Errno)
err1 = err.(Errno)
return
}
}
// About to call fork.
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
locked = true
switch {
case runtime.GOARCH == "amd64" && sys.Cloneflags&CLONE_NEWUSER == 0:
r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
@@ -109,27 +142,14 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr
default:
r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
}
if err1 != 0 {
runtime_AfterFork()
return 0, err1
}
if r1 != 0 {
// parent; return PID
runtime_AfterFork()
pid = int(r1)
if sys.UidMappings != nil || sys.GidMappings != nil {
Close(p[0])
err := writeUidGidMappings(pid, sys)
if err != nil {
err2 = err.(Errno)
}
RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
Close(p[1])
}
return pid, 0
if err1 != 0 || r1 != 0 {
// If we're in the parent, we must return immediately
// so we're not in the same stack frame as the child.
// This can at most use the return PC, which the child
// will not modify, and the results of
// rawVforkSyscall, which must have been written after
// the child was replaced.
return
}
// Fork succeeded, now in child.

0 comments on commit 67e5375

Please sign in to comment.