/
cgroup.go
386 lines (325 loc) · 10.4 KB
/
cgroup.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
// +build linux
/*
Copyright 2019 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cgroup
// #include <stdint.h>
// #include <stdlib.h>
// extern uint64_t cgroup_id(char *path);
import "C"
import (
"bufio"
"bytes"
"encoding/binary"
"io/ioutil"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/gravitational/teleport"
"github.com/gravitational/teleport/lib/defaults"
"github.com/gravitational/trace"
"github.com/pborman/uuid"
"github.com/sirupsen/logrus"
)
// log is the package-level logger; every entry is tagged with the cgroup
// component.
var log = logrus.WithFields(logrus.Fields{
trace.Component: teleport.ComponentCgroup,
})
// Config holds configuration for the cgroup service.
type Config struct {
// MountPath is the directory where the cgroupv2 hierarchy is mounted. When
// left empty, CheckAndSetDefaults replaces it with defaults.CgroupPath.
MountPath string
}
// CheckAndSetDefaults validates the cgroup configuration and fills in
// defaults for unset fields: an empty MountPath becomes defaults.CgroupPath.
// It currently always returns nil.
func (c *Config) CheckAndSetDefaults() error {
	if c.MountPath != "" {
		return nil
	}
	c.MountPath = defaults.CgroupPath
	return nil
}
// Service manages cgroup orchestration: it mounts the cgroup2 filesystem and
// creates, populates, looks up, and removes per-session cgroups.
type Service struct {
// Config is embedded, so MountPath is reachable directly on the service.
*Config
// teleportRoot is the root cgroup that holds all Teleport sessions. Used
// to remove all cgroups upon shutdown. New sets it to
// <MountPath>/teleport/<freshly generated UUID>.
teleportRoot string
}
// New builds a cgroup service from config. Defaults are applied to config
// first, then a per-service root cgroup path of the form
// <MountPath>/teleport/<uuid> is chosen, and finally the cgroup2 filesystem
// is mounted and that root cgroup created. Any failure is returned wrapped.
func New(config *Config) (*Service, error) {
	if err := config.CheckAndSetDefaults(); err != nil {
		return nil, trace.Wrap(err)
	}

	svc := &Service{
		Config:       config,
		teleportRoot: path.Join(config.MountPath, teleportRoot, uuid.New()),
	}

	// Mount the cgroup2 filesystem and create the Teleport root cgroup.
	if err := svc.mount(); err != nil {
		return nil, trace.Wrap(err)
	}
	log.Debugf("Teleport session hierarchy mounted at: %v.", svc.teleportRoot)

	return svc, nil
}
// Close removes the cgroups of all sessions found in the Teleport hierarchy
// and then unmounts the cgroup filesystem. If the cleanup step fails, its
// error is returned and the filesystem is left mounted.
func (s *Service) Close() error {
	if err := s.cleanupHierarchy(); err != nil {
		return trace.Wrap(err)
	}
	if err := s.unmount(); err != nil {
		return trace.Wrap(err)
	}

	log.Debugf("Cleaned up and unmounted Teleport session hierarchy at: %v.", s.teleportRoot)
	return nil
}
// Create makes the cgroup for the given session by creating a directory
// named after sessionID inside the Teleport root cgroup.
func (s *Service) Create(sessionID string) error {
	sessionDir := path.Join(s.teleportRoot, sessionID)
	if err := os.Mkdir(sessionDir, fileMode); err != nil {
		return trace.Wrap(err)
	}
	return nil
}
// Remove deletes the cgroup for a session. Any processes still inside the
// session cgroup are first moved to the root cgroup at MountPath.
func (s *Service) Remove(sessionID string) error {
	sessionDir := path.Join(s.teleportRoot, sessionID)

	// Collect every PID currently placed in the session cgroup.
	pids, err := readPids(path.Join(sessionDir, cgroupProcs))
	if err != nil {
		return trace.Wrap(err)
	}

	// A cgroup can only be removed once it is empty, so hand all of its
	// processes over to the root cgroup first.
	if err := writePids(path.Join(s.MountPath, cgroupProcs), pids); err != nil {
		return trace.Wrap(err)
	}

	// Removing a cgroup is done with the rmdir syscall.
	if err := unix.Rmdir(sessionDir); err != nil {
		return trace.Wrap(err)
	}

	log.Debugf("Removed cgroup for session: %v.", sessionID)
	return nil
}
// Place puts the process identified by pid into the cgroup of the given
// session by writing the PID to that cgroup's cgroup.procs file.
func (s *Service) Place(sessionID string, pid int) error {
	// Locate and open the cgroup.procs file of the session cgroup.
	procsPath := path.Join(s.teleportRoot, sessionID, cgroupProcs)
	procs, err := os.OpenFile(procsPath, os.O_APPEND|os.O_WRONLY, fileMode)
	if err != nil {
		return trace.Wrap(err)
	}
	defer procs.Close()

	// Writing the PID to the file is what places the process in the cgroup.
	if _, err := procs.WriteString(strconv.Itoa(pid)); err != nil {
		return trace.Wrap(err)
	}
	return nil
}
// readPids returns the PIDs listed in the file at path, one entry per line,
// as strings. It is used to get the list of all PIDs within a cgroup.
//
// An error is returned if the file cannot be opened or if scanning stops
// because of a failure (for example a read error or an over-long line); in
// that case no PIDs are returned.
func readPids(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, trace.Wrap(err)
	}
	defer f.Close()

	var pids []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		pids = append(pids, scanner.Text())
	}
	// Report the scanner's own error. The err from os.Open above is known to
	// be nil at this point, so wrapping it would not report the scan failure.
	if err := scanner.Err(); err != nil {
		return nil, trace.Wrap(err)
	}

	return pids, nil
}
// writePids writes each PID in pids, followed by a newline, to the file at
// path. It is used to add processes to a cgroup. Every PID is written with
// its own write call, and the first failure stops the loop and is returned.
func writePids(path string, pids []string) error {
	procs, err := os.OpenFile(path, os.O_WRONLY, fileMode)
	if err != nil {
		return trace.Wrap(err)
	}
	defer procs.Close()

	for i := range pids {
		if _, err := procs.WriteString(pids[i] + "\n"); err != nil {
			return trace.Wrap(err)
		}
	}
	return nil
}
// cleanupHierarchy removes the cgroups of any existing sessions.
//
// It walks the Teleport root cgroup looking for files that match the
// cgroup.procs pattern and sit exactly one directory below the root, i.e.
// <teleportRoot>/<sessionID>/cgroup.procs, where <sessionID> parses as a
// UUID. Every session found this way is then removed with Remove. The first
// failure to remove a session aborts the cleanup and is returned.
func (s *Service) cleanupHierarchy() error {
	var sessions []string

	root := path.Join(s.teleportRoot)

	// Recursively look within the Teleport hierarchy for session cgroups.
	// Errors reported by the walk for individual entries are deliberately
	// ignored so that cleanup stays best-effort.
	err := filepath.Walk(root, func(name string, _ os.FileInfo, _ error) error {
		// Only pick up cgroup.procs files.
		if !pattern.MatchString(name) {
			return nil
		}

		// Work with the path relative to the Teleport root so the result does
		// not depend on how many components the mount path has.
		rel, err := filepath.Rel(root, name)
		if err != nil {
			return nil
		}

		// A session's file is "<sessionID>/cgroup.procs". Anything else, such
		// as the root cgroup's own cgroup.procs or deeper nesting, is skipped.
		parts := strings.Split(rel, string(filepath.Separator))
		if len(parts) != 2 {
			return nil
		}
		sessionID := uuid.Parse(parts[0])
		if sessionID == nil {
			return nil
		}

		// Append to the list of sessions within the cgroup hierarchy.
		sessions = append(sessions, sessionID.String())
		return nil
	})
	if err != nil {
		return trace.Wrap(err)
	}

	// Remove all sessions that were found.
	for _, sessionID := range sessions {
		if err := s.Remove(sessionID); err != nil {
			return trace.Wrap(err)
		}
	}

	return nil
}
// mount makes sure the cgroup2 filesystem is available at MountPath and that
// the Teleport root cgroup exists underneath it.
//
// If MountPath can be listed and already contains entries, the filesystem is
// taken to be mounted and the mount syscall is skipped.
func (s *Service) mount() error {
	// Make sure path to cgroup2 mount point exists.
	if err := os.MkdirAll(s.MountPath, fileMode); err != nil {
		return trace.Wrap(err)
	}

	// A readable, non-empty mount point means the cgroup filesystem is already
	// mounted there.
	entries, readErr := ioutil.ReadDir(s.MountPath)
	alreadyMounted := readErr == nil && len(entries) > 0

	if !alreadyMounted {
		// Mount the cgroup2 filesystem. Even if the cgroup filesystem is already
		// mounted elsewhere, it is safe to mount it again at another location;
		// both mounts will have the exact same view of the hierarchy. From
		// "man cgroups":
		//
		//    It is not possible to mount the same controller against multiple
		//    cgroup hierarchies. For example, it is not possible to mount both
		//    the cpu and cpuacct controllers against one hierarchy, and to mount
		//    the cpu controller alone against another hierarchy. It is possible
		//    to create multiple mount points with exactly the same set of
		//    comounted controllers. However, in this case all that results is
		//    multiple mount points providing a view of the same hierarchy.
		//
		// The exact args to the mount syscall come from an strace of mount(8).
		// From the docs: https://www.kernel.org/doc/Documentation/cgroup-v2.txt:
		//
		//    Unlike v1, cgroup v2 has only single hierarchy. The cgroup v2
		//    hierarchy can be mounted with the following mount command:
		//
		//       # mount -t cgroup2 none $MOUNT_POINT
		//
		// The output of the strace looks like the following:
		//
		//    mount("none", "/cgroup3", "cgroup2", MS_MGC_VAL, NULL) = 0
		//
		// MS_MGC_VAL can be dropped because only kernels 4.18 and above are
		// supported for this feature. From mount(2):
		//
		//    The mountflags argument may have the magic number 0xC0ED (MS_MGC_VAL)
		//    in the top 16 bits. (All of the other flags discussed in DESCRIPTION
		//    occupy the low order 16 bits of mountflags.) Specifying MS_MGC_VAL
		//    was required in kernel versions prior to 2.4, but since Linux 2.4 is
		//    no longer required and is ignored if specified.
		if err := unix.Mount("none", s.MountPath, "cgroup2", 0, ""); err != nil {
			return trace.Wrap(err)
		}
		log.Debugf("Mounted cgroup filesystem to %v.", s.MountPath)
	}

	// Create cgroup that will hold Teleport sessions.
	if err := os.MkdirAll(s.teleportRoot, fileMode); err != nil {
		return trace.Wrap(err)
	}

	return nil
}
// unmount detaches the cgroupv2 filesystem mounted at MountPath.
func (s *Service) unmount() error {
	// The arguments mirror what an strace of umount(8) shows:
	//
	//    umount2("/cgroup2", 0) = 0
	if err := unix.Unmount(s.MountPath, 0); err != nil {
		return trace.Wrap(err)
	}
	return nil
}
// fileHandle is the layout that ID decodes the bytes of a cgroup directory's
// file handle into.
type fileHandle struct {
// CgroupID is the cgroup ID, read from the first 8 bytes of the handle.
CgroupID uint64
}
// ID returns the cgroup ID for the given session. The ID is obtained by
// asking the kernel for the file handle of the session's cgroup directory
// and decoding the handle bytes as a little-endian 64-bit integer.
func (s *Service) ID(sessionID string) (uint64, error) {
	cgroupPath := path.Join(s.teleportRoot, sessionID)

	// Use unix.NameToHandleAt, which invokes the "name_to_handle_at" syscall
	// directly, rather than going through the glibc wrapper. Older versions of
	// glibc (like the one CentOS 6 ships with) do not provide that wrapper.
	//
	// unix.NameToHandleAt is slightly more than a thin wrapper: it calls
	// "name_to_handle_at" in a loop to get the correct size of the returned
	// "f_handle" value. See the link below for more details.
	//
	// https://github.com/torvalds/linux/commit/f269099a7e7a0c6732c4a817d0e99e92216414d9
	handle, _, err := unix.NameToHandleAt(unix.AT_FDCWD, cgroupPath, 0)
	if err != nil {
		return 0, trace.Wrap(err)
	}

	// Decode "f_handle", which should be 8 bytes encoded little-endian.
	//
	// All currently supported platforms (Linux on either AMD64 or ARM) are
	// little-endian, so this is not an issue for now. Supporting a big-endian
	// platform would require splitting this file into platform specific
	// versions. See the following thread for more details:
	// https://groups.google.com/forum/#!topic/golang-nuts/3GEzwKfRRQw.
	var decoded fileHandle
	if err := binary.Read(bytes.NewReader(handle.Bytes()), binary.LittleEndian, &decoded); err != nil {
		return 0, trace.Wrap(err)
	}

	return decoded.CgroupID, nil
}
var (
// pattern matches cgroup process files, i.e. any path that ends in
// "cgroup.procs".
pattern = regexp.MustCompile(`cgroup\.procs$`)
)
const (
// fileMode is the permission mode (0555, r-xr-xr-x) passed when creating
// directories and opening files within the cgroup filesystem.
fileMode = 0555
// teleportRoot is the name of the directory, directly under the mount
// path, below which the root cgroups that hold all other Teleport cgroups
// are created.
teleportRoot = "teleport"
// cgroupProcs is the name of the file that contains all processes within
// a cgroup.
cgroupProcs = "cgroup.procs"
)