/
mountutil_linux.go
467 lines (427 loc) · 14.6 KB
/
mountutil_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mountutil
import (
"context"
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/oci"
"github.com/containerd/log"
"github.com/containerd/nerdctl/pkg/mountutil/volumestore"
"github.com/docker/go-units"
mobymount "github.com/moby/sys/mount"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
/*
Portions from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go
Portions from https://github.com/moby/moby/blob/v20.10.5/volume/mounts/linux_parser.go
Copyright (C) Docker/Moby authors.
Licensed under the Apache License, Version 2.0
NOTICE: https://github.com/moby/moby/blob/v20.10.5/NOTICE
*/
const (
// DefaultMountType is the mount type name "none".
// NOTE(review): no use of this constant is visible in this part of the file;
// presumably it is the type recorded for mounts that have no filesystem
// type of their own (e.g. bind mounts) — confirm against the callers.
DefaultMountType = "none"
// DefaultPropagationMode is the default propagation of mounts
// where user doesn't specify mount propagation explicitly.
// It is the value appended to the options of a bind mount when none of the
// propagation options is given.
// See also: https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L145
DefaultPropagationMode = "rprivate"
)
// UnprivilegedMountFlags is from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go#L420-L450
//
// It returns the set of mount flags that are set on the mount that contains
// the given path and are locked by CL_UNPRIVILEGED. This is necessary to
// ensure that bind-mounting "with options" will not fail with user namespaces,
// due to kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
//
// The flags are returned as mount option names ("ro", "nodev", ...) in a fixed
// order, so that repeated calls on the same mount yield the same slice.
// A failing statfs call is reported as an *fs.PathError carrying the path.
func UnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, &fs.PathError{Op: "stat", Path: path, Err: err}
	}

	// The set of flags come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	// An ordered slice is used instead of a map: ranging over a map would
	// return the options in a different order on every call.
	unprivilegedFlags := []struct {
		mask uint64
		name string
	}{
		{unix.MS_RDONLY, "ro"},
		{unix.MS_NODEV, "nodev"},
		{unix.MS_NOEXEC, "noexec"},
		{unix.MS_NOSUID, "nosuid"},
		{unix.MS_NOATIME, "noatime"},
		{unix.MS_RELATIME, "relatime"},
		{unix.MS_NODIRATIME, "nodiratime"},
	}

	var flags []string
	for _, f := range unprivilegedFlags {
		if uint64(statfs.Flags)&f.mask == f.mask {
			flags = append(flags, f.name)
		}
	}

	return flags, nil
}
// parseVolumeOptions parses specified optsRaw with using information of
// the volume type and the src directory when necessary.
//
// It delegates to parseVolumeOptionsWithMountInfo, passing getMountInfo as the
// function used to look up the mount information of src, and returns that
// call's results unchanged.
func parseVolumeOptions(vType, src, optsRaw string) ([]string, []oci.SpecOpts, error) {
return parseVolumeOptionsWithMountInfo(vType, src, optsRaw, getMountInfo)
}
// getMountInfo returns the mount.Info found by mount.Lookup for dir.
//
// Symbolic links in dir are resolved first, so the lookup is done on the
// resolved path. If the resolution fails, a zero mount.Info and the
// resolution error are returned.
func getMountInfo(dir string) (mount.Info, error) {
	resolved, err := filepath.EvalSymlinks(dir)
	if err == nil {
		return mount.Lookup(resolved)
	}
	return mount.Info{}, err
}
// parseVolumeOptionsWithMountInfo is the testable implementation
// of parseVolumeOptions.
//
// optsRaw is a comma-separated list of options. Recognized options are the
// write modes ("rw", "ro", "rro"), the propagation modes ("private",
// "rprivate", "shared", "rshared", "slave", "rslave") and "bind"/"rbind".
// Empty items are skipped; any other option is logged as a warning and ignored.
//
// getMountInfoFunc is called with src only when a shared or slave propagation
// is requested, to verify that the mount containing src allows it.
//
// It returns the mount options to apply, plus the spec options that adjust
// the RootfsPropagation of the container spec when a shared/slave bind mount
// requires it. An error is returned for duplicated options of the same kind,
// for bind/propagation options on a non-bind volume, and when the mount
// information of src cannot be obtained or does not permit the propagation.
func parseVolumeOptionsWithMountInfo(vType, src, optsRaw string, getMountInfoFunc func(string) (mount.Info, error)) ([]string, []oci.SpecOpts, error) {
	var (
		writeModeRawOpts   []string
		propagationRawOpts []string
		bindOpts           []string
	)
	for _, opt := range strings.Split(optsRaw, ",") {
		switch opt {
		case "rw", "ro", "rro":
			writeModeRawOpts = append(writeModeRawOpts, opt)
		case "private", "rprivate", "shared", "rshared", "slave", "rslave":
			propagationRawOpts = append(propagationRawOpts, opt)
		case "bind", "rbind":
			// bind means not recursively bind-mounted, rbind is the opposite
			bindOpts = append(bindOpts, opt)
		case "":
			// NOP
		default:
			log.L.Warnf("unsupported volume option %q", opt)
		}
	}

	var opts []string
	var specOpts []oci.SpecOpts

	if len(bindOpts) > 0 && vType != Bind {
		return nil, nil, fmt.Errorf("volume bind/rbind option is only supported for bind mount: %+v", bindOpts)
	} else if len(bindOpts) > 1 {
		return nil, nil, fmt.Errorf("duplicated bind/rbind option: %+v", bindOpts)
	} else if len(bindOpts) > 0 {
		opts = append(opts, bindOpts[0])
	}

	if len(writeModeRawOpts) > 1 {
		return nil, nil, fmt.Errorf("duplicated read/write volume option: %+v", writeModeRawOpts)
	} else if len(writeModeRawOpts) > 0 {
		switch writeModeRawOpts[0] {
		case "ro":
			opts = append(opts, "ro")
		case "rro":
			// Mount option "rro" is supported since crun v1.4 / runc v1.1 (https://github.com/opencontainers/runc/pull/3272), with kernel >= 5.12.
			// Older version of runc just ignores "rro", so we have to add "ro" too, to our best effort.
			opts = append(opts, "ro", "rro")
			if len(propagationRawOpts) != 1 || propagationRawOpts[0] != "rprivate" {
				log.L.Warn("Mount option \"rro\" should be used in conjunction with \"rprivate\"")
			}
		case "rw":
			// NOP
		default:
			// NOTREACHED
			return nil, nil, fmt.Errorf("unexpected writeModeRawOpts[0]=%q", writeModeRawOpts[0])
		}
	}

	if len(propagationRawOpts) > 1 {
		return nil, nil, fmt.Errorf("duplicated volume propagation option: %+v", propagationRawOpts)
	} else if len(propagationRawOpts) > 0 && vType != Bind {
		return nil, nil, fmt.Errorf("volume propagation option is only supported for bind mount: %+v", propagationRawOpts)
	} else if vType == Bind {
		var pFlag string
		var got string
		if len(propagationRawOpts) > 0 {
			got = propagationRawOpts[0]
		}
		switch got {
		case "shared", "rshared":
			pFlag = got
			// a bind mount can be shared from shared mount
			mi, err := getMountInfoFunc(src)
			if err != nil {
				return nil, nil, err
			}
			if err := ensureMountOptionalValue(mi, "shared:"); err != nil {
				return nil, nil, err
			}

			// NOTE: Though OCI Runtime Spec doesn't explicitly describe, runc's default
			//       of RootfsPropagation is unix.MS_SLAVE | unix.MS_REC (i.e. runc applies
			//       "slave" to all mount points in the container recursively). This ends
			//       up marking the bind src directories "slave" and preventing it to shared
			//       with the host. So we set RootfsPropagation to "shared" here.
			//
			// See also:
			// - OCI Runtime Spec: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config-linux.md#rootfs-mount-propagation
			// - runc implementation: https://github.com/opencontainers/runc/blob/v1.0.0/libcontainer/rootfs_linux.go#L771-L777
			specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error {
				// The spec may not carry a Linux section yet; create it
				// instead of dereferencing a nil pointer below.
				if s.Linux == nil {
					s.Linux = &specs.Linux{}
				}
				switch s.Linux.RootfsPropagation {
				case "shared", "rshared":
					// NOP
				default:
					s.Linux.RootfsPropagation = "shared"
				}
				return nil
			})
		case "slave", "rslave":
			pFlag = got
			// a bind mount can be a slave of shared or an existing slave mount
			mi, err := getMountInfoFunc(src)
			if err != nil {
				return nil, nil, err
			}
			if err := ensureMountOptionalValue(mi, "shared:", "master:"); err != nil {
				return nil, nil, err
			}

			// See above comments about RootfsPropagation. Here we make sure that
			// the mountpoint can be a slave of the host mount.
			specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error {
				// The spec may not carry a Linux section yet; create it
				// instead of dereferencing a nil pointer below.
				if s.Linux == nil {
					s.Linux = &specs.Linux{}
				}
				switch s.Linux.RootfsPropagation {
				case "shared", "rshared", "slave", "rslave":
					// NOP
				default:
					s.Linux.RootfsPropagation = "rslave"
				}
				return nil
			})
		case "private", "rprivate":
			pFlag = got
		default:
			// No propagation is specified to this bind mount.
			// NOTE: When RootfsPropagation is set (e.g. by other bind mount option), that
			//       propagation mode will be applied to this bind mount as well. So we need
			//       to set "rprivate" explicitly for preventing this bind mount from unexpectedly
			//       shared with the host. This behaviour is compatible to docker:
			//       https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L320-L322
			//
			// TODO: directories managed by containerd (e.g. /var/lib/containerd, /run/containerd, ...)
			//       should be marked as "rslave" instead of "rprivate". This is because allowing
			//       containers to hold their private bind mounts will prevent containerd from remove
			//       them. See also: https://github.com/moby/moby/pull/36055.
			//       Unfortunately, containerd doesn't expose the locations of directories where it manages.
			//       Current workaround is explicitly add "rshared" or "rslave" option to these bind mounts.
			pFlag = DefaultPropagationMode
		}
		opts = append(opts, pFlag)
	}

	return opts, specOpts, nil
}
// ensureMountOptionalValue checks that the "optional" field of the given
// mountinfo entry (the /proc/<pid>/mountinfo entry of a directory's mount)
// contains at least one space-separated item starting with one of vals.
// It returns nil when such an item exists and an error naming the mountpoint
// otherwise.
//
// For more details about "optional" field:
// - https://github.com/moby/sys/blob/mountinfo/v0.4.1/mountinfo/mountinfo.go#L52-L56
func ensureMountOptionalValue(mi mount.Info, vals ...string) error {
	items := strings.Split(mi.Optional, " ")
	for i := range items {
		for j := range vals {
			if strings.HasPrefix(items[i], vals[j]) {
				// One matching item is enough.
				return nil
			}
		}
	}
	return fmt.Errorf("mountpoint %q doesn't have optional field neither of %+v", mi.Mountpoint, vals)
}
// ProcessFlagTmpfs converts a tmpfs spec of the form "DST[:OPT[,OPT...]]" into
// a Processed tmpfs mount.
//
// The part before the first ":" is used as the mount destination. The mount
// always starts from the options "noexec", "nosuid" and "nodev"; when options
// are given after the ":", they are appended to these defaults and the result
// of mobymount.MergeTmpfsOptions on that list is used instead. An error from
// the merge is returned as is.
func ProcessFlagTmpfs(s string) (*Processed, error) {
	parts := strings.SplitN(s, ":", 2)
	mountOpts := []string{"noexec", "nosuid", "nodev"}

	if len(parts) > 1 {
		userOpts := strings.Split(parts[1], ",")
		merged, err := mobymount.MergeTmpfsOptions(append(mountOpts, userOpts...))
		if err != nil {
			return nil, err
		}
		mountOpts = merged
	}

	return &Processed{
		Type: Tmpfs,
		Mode: strings.Join(mountOpts, ","),
		Mount: specs.Mount{
			Type:        "tmpfs",
			Source:      "tmpfs",
			Destination: parts[0],
			Options:     mountOpts,
		},
	}, nil
}
// ProcessFlagMount parses the value of a `--mount` flag and converts it into a
// Processed mount.
//
// s is a comma-separated list of fields. Keys are matched case-insensitively.
// "readonly", "ro", "rw", "rro" and "bind-nonrecursive" may be given either
// bare or as key=<bool>; every other field must be a key=value pair:
//   - type: "volume" (default), "bind" or "tmpfs"
//   - source / src, target / dst / destination
//   - bind-propagation (validated later by the legacy volume parser)
//   - tmpfs-size (parsed by units.RAMInBytes), tmpfs-mode (octal, default 1777)
//
// The fields are re-assembled into the legacy colon-separated spec and handed
// to ProcessFlagTmpfs (tmpfs) or ProcessFlagV (volume and bind, with
// createDir=false). volStore is passed through to ProcessFlagV.
//
// An error is returned for a malformed field, an unknown key, an unknown mount
// type or an unparsable value, as well as any error of the legacy parser.
func ProcessFlagMount(s string, volStore volumestore.VolumeStore) (*Processed, error) {
	fields := strings.Split(s, ",")
	var (
		mountType        string
		src              string
		dst              string
		bindPropagation  string
		bindNonRecursive bool
		rwOption         string
		tmpfsSize        int64
		tmpfsMode        os.FileMode
		err              error
	)

	// set default values
	mountType = Volume
	tmpfsMode = os.FileMode(01777)

	// three types of mount(and examples):
	// --mount type=bind,source="$(pwd)"/target,target=/app2,readonly,bind-propagation=shared
	// --mount type=tmpfs,destination=/app,tmpfs-mode=1770,tmpfs-size=1MB
	// --mount type=volume,src=vol-1,dst=/app,readonly
	// if type not specified, default will be set to volume
	// --mount src=`pwd`/tmp,target=/app
	for _, field := range fields {
		parts := strings.SplitN(field, "=", 2)
		key := strings.ToLower(parts[0])

		// Flags that are allowed without a value.
		if len(parts) == 1 {
			switch key {
			case "readonly", "ro", "rw", "rro":
				rwOption = key
				continue
			case "bind-nonrecursive":
				bindNonRecursive = true
				continue
			}
		}

		if len(parts) != 2 {
			return nil, fmt.Errorf("invalid field '%s' must be a key=value pair", field)
		}

		value := parts[1]
		switch key {
		case "type":
			switch value {
			case "tmpfs":
				mountType = Tmpfs
			case "bind":
				mountType = Bind
			case "volume":
				// Assign explicitly instead of relying on the default, so that
				// a later "type=volume" overrides an earlier "type=" field.
				mountType = Volume
			default:
				return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", value)
			}
		case "source", "src":
			src = value
		case "target", "dst", "destination":
			dst = value
		case "readonly", "ro", "rw", "rro":
			trueValue, err := strconv.ParseBool(value)
			if err != nil {
				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
			}
			// A false value leaves the write mode untouched.
			if trueValue {
				rwOption = key
			}
		case "bind-propagation":
			// here don't validate the propagation value
			// parseVolumeOptions will do that.
			bindPropagation = value
		case "bind-nonrecursive":
			bindNonRecursive, err = strconv.ParseBool(value)
			if err != nil {
				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
			}
		case "tmpfs-size":
			tmpfsSize, err = units.RAMInBytes(value)
			if err != nil {
				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
			}
		case "tmpfs-mode":
			ui64, err := strconv.ParseUint(value, 8, 32)
			if err != nil {
				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
			}
			tmpfsMode = os.FileMode(ui64)
		default:
			return nil, fmt.Errorf("unexpected key '%s' in '%s'", key, field)
		}
	}

	// compose new fields and join into a string
	// to call legacy ProcessFlagTmpfs or ProcessFlagV function
	fields = []string{}
	options := []string{}
	if rwOption != "" {
		// The legacy parsers only know the short spelling.
		if rwOption == "readonly" {
			rwOption = "ro"
		}
		options = append(options, rwOption)
	}

	switch mountType {
	case Tmpfs:
		fields = []string{dst}
		if tmpfsMode != 0 {
			options = append(options, fmt.Sprintf("mode=%o", tmpfsMode))
		}
		if tmpfsSize > 0 {
			options = append(options, getTmpfsSize(tmpfsSize))
		}
	case Volume, Bind:
		fields = []string{src, dst}
		if bindPropagation != "" {
			options = append(options, bindPropagation)
		}
		if mountType == Bind {
			if bindNonRecursive {
				options = append(options, "bind")
			} else {
				options = append(options, "rbind")
			}
		}
	}

	if len(options) > 0 {
		optionsStr := strings.Join(options, ",")
		fields = append(fields, optionsStr)
	}
	fieldsStr := strings.Join(fields, ":")

	log.L.Debugf("Call legacy %s process, spec: %s ", mountType, fieldsStr)

	switch mountType {
	case Tmpfs:
		return ProcessFlagTmpfs(fieldsStr)
	case Volume, Bind:
		// createDir=false for --mount option to disallow creating directories on host if not found
		return ProcessFlagV(fieldsStr, volStore, false)
	}
	return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", mountType)
}
// copy from https://github.com/moby/moby/blob/085c6a98d54720e70b28354ccec6da9b1b9e7fcf/volume/mounts/linux_parser.go#L375
//
// getTmpfsSize formats a byte count as a tmpfs "size=" mount option.
//
// The largest of the binary units g (1<<30), m (1<<20) and k (1<<10) that
// divides size evenly is used as suffix; when none does, the plain number is
// emitted without a suffix. This makes the function linux specific, which is
// okay since the API is that way anyways.
//
// No further semantic is enforced on the value: the operating system will
// usually align it and enforce minimums and maximums.
func getTmpfsSize(size int64) string {
	const (
		kib int64 = 1 << 10
		mib int64 = 1 << 20
		gib int64 = 1 << 30
	)

	value, unit := size, ""
	switch {
	case size%gib == 0:
		value, unit = size/gib, "g"
	case size%mib == 0:
		value, unit = size/mib, "m"
	case size%kib == 0:
		value, unit = size/kib, "k"
	}

	return fmt.Sprintf("size=%d%s", value, unit)
}