/
unshare_linux.go
175 lines (160 loc) · 6.76 KB
/
unshare_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
//go:build go1.10
package unshare // import "github.com/docker/docker/internal/unshare"
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// init pins the main goroutine to the process's startup thread so that no
// other goroutine can ever be scheduled onto that thread.
func init() {
	// Why the startup thread needs protecting:
	//
	// The kernel treats the first thread of a process specially in several
	// ways. The one that matters here is that the files under /proc/[pid]/
	// and /proc/self/ which expose per-thread kernel state always report
	// the state of the startup thread, regardless of which thread opens
	// them. Usually that makes no difference, because state such as the
	// mount namespace or the current working directory is shared by, and
	// kept synchronized across, every thread of the process. It starts to
	// matter as soon as individual threads unshare and modify parts of
	// their own kernel state.
	//
	// The Go scheduler is free to run any goroutine on the startup thread.
	// Suppose a call to Go(unix.CLONE_NEWNS, ...) started a goroutine that
	// landed there: the startup thread's mount namespace would be unshared
	// and modified, and from then until the process exits
	// /proc/[pid]/mountinfo would describe that private mount tree rather
	// than the one seen by every other thread. The thread cannot simply be
	// thrown away afterwards, either: exiting the startup thread puts the
	// process into a "non-waitable zombie" state, so the Go runtime parks
	// the thread instead of exiting it when a goroutine returns while
	// still locked to it (see `go doc -u -src runtime.mexit`). The
	// github.com/moby/sys/mountinfo package reads /proc/self/mountinfo and
	// would therefore read the mount tree of the wrong namespace.
	//
	// Switching readers to /proc/thread-self/ (introduced in Linux 3.17)
	// is one possible remedy, but every package that opens files in
	// /proc/self/ would have to be updated, and a fallback to
	// /proc/self/task/[tid]/ would be needed for older kernels. Missing a
	// single reference to /proc/self/ would show up as a bug that only
	// reproduces some of the time, so this is far from ideal.
	//
	// None of that is a problem if the startup thread's per-thread state
	// is never modified nondeterministically in the first place. Calling
	// LockOSThread from an init function causes main() to run locked to
	// the startup thread, which keeps every other goroutine off it.
	runtime.LockOSThread()
}
// reversibleSetnsFlags lists the unshare(2) flags whose effects setns(2) can
// undo completely. Each flag maps to the basename of the magic symlink under
// /proc/self/task/[tid]/ns/ that is used to save the thread's namespace and
// to restore it afterwards.
//
// The remaining CLONE_NEW* flags are left out on purpose: when used with
// unshare(2), each of them implies a second flag that is irreversible.
//   - unix.CLONE_NEWIPC implies CLONE_SYSVSEM
//   - unix.CLONE_NEWNS implies CLONE_FS
//   - unix.CLONE_NEWUSER implies CLONE_FS since Linux 3.9
var reversibleSetnsFlags = map[int]string{
	unix.CLONE_NEWCGROUP: "cgroup",
	unix.CLONE_NEWNET:    "net",
	unix.CLONE_NEWPID:    "pid",
	unix.CLONE_NEWTIME:   "time",
	unix.CLONE_NEWUTS:    "uts",
}
// Go runs setupfn and then fn on a new goroutine that is locked to an OS
// thread whose execution state has been detached from the rest of the current
// process with [unshare(2)]. Go itself returns once the new goroutine has
// started and setupfn has returned; fn runs only when setupfn returned nil.
// Passing nil for setupfn or fn is the same as passing a function that does
// nothing.
//
// The detached execution state, along with any change made to it, is visible
// only to the goroutine the functions are called in. Every other goroutine,
// including any started from those functions, keeps seeing the same execution
// state as the rest of the process.
//
// flags accepts the values documented in the [unshare(2)] Linux man-page; the
// corresponding CLONE_* constants are defined in package [unix].
//
// The returned error is the failure to open a namespace file (an
// [*os.PathError]), the failure of unshare(2) (wrapped with
// [os.NewSyscallError]), or whatever setupfn returned; otherwise it is nil.
//
// # Warning
//
// The thread the new goroutine ran on may be terminated after fn returns.
// That can cause subprocesses started with the Pdeathsig field of
// [syscall.SysProcAttr] set to be signaled before the process terminates.
// Subprocesses started before this function was called may be affected, as
// well as any started inside setupfn or fn. More details are available at
// https://go.dev/issue/27505.
//
// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
func Go(flags int, setupfn func() error, fn func()) error {
	// The change can be undone only when every requested flag is one that
	// setns(2) can reverse: clear those bits and check that none remain.
	leftover := flags
	for bit := range reversibleSetnsFlags {
		leftover &^= bit
	}
	restorable := leftover == 0

	ready := make(chan error)
	go func() {
		// Per-thread kernel state is about to be manipulated, so this
		// goroutine has to stay on one thread.
		runtime.LockOSThread()

		// Some changes to the execution state cannot be rolled back. In
		// that case the only recourse is to return from this function
		// with the goroutine still wired to the thread: the Go runtime
		// then terminates the tampered thread and replaces it with a
		// fresh one as needed.
		if restorable {
			// Registered first, hence run last: by then every deferred
			// restore below has had its chance to clear restorable.
			defer func() {
				if !restorable {
					return
				}
				// All execution state was restored without error, so
				// the thread is fungible once more.
				runtime.UnlockOSThread()
			}()

			tid := unix.Gettid()
			for bit, name := range reversibleSetnsFlags {
				if flags&bit != bit {
					continue
				}
				// /proc/thread-self was only added in Linux 3.17; it
				// is avoided here to maximize compatibility.
				nsPath := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, name)
				fd, err := unix.Open(nsPath, unix.O_RDONLY|unix.O_CLOEXEC, 0)
				if err != nil {
					ready <- &os.PathError{Op: "open", Path: nsPath, Err: err}
					return
				}
				defer func() {
					// After one restore has failed the thread will not
					// be reused, so later setns(2) calls are skipped.
					if restorable && unix.Setns(fd, 0) != nil {
						restorable = false
					}
					_ = unix.Close(fd)
				}()
			}
		}

		// Linux implements threads as processes which share a virtual
		// memory space, so in a multithreaded process unshare(2)
		// disassociates parts of the calling thread's context from the
		// thread it was clone(2)'d from.
		if err := unix.Unshare(flags); err != nil {
			ready <- os.NewSyscallError("unshare", err)
			return
		}

		var setupErr error
		if setupfn != nil {
			setupErr = setupfn()
		}
		if setupErr != nil {
			ready <- setupErr
			return
		}
		// Closing the channel makes the receive in Go yield a nil error.
		close(ready)

		if fn != nil {
			fn()
		}
	}()

	return <-ready
}