-
Notifications
You must be signed in to change notification settings - Fork 8
/
container.go
220 lines (192 loc) · 5.97 KB
/
container.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
package dockerutil
import (
"context"
"fmt"
"io"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/docker/docker/api/types/container"
dockerclient "github.com/docker/docker/client"
"github.com/spf13/afero"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/envbox/xunix"
"github.com/coder/retry"
)
const (
// runtime is the container runtime name that CreateContainer sets on the
// host config of every container it creates.
runtime = "sysbox-runc"
// DefaultCPUPeriod is the default CPU period for containers. CreateContainer
// passes it as the CPU period and multiplies it by the configured CPU count
// to derive the CPU quota.
DefaultCPUPeriod uint64 = 1e5
)
// DockerClient is the Docker API surface accepted by the functions in this
// file. It is the union of the docker client's system, container and image
// API client interfaces.
type DockerClient interface {
dockerclient.SystemAPIClient
dockerclient.ContainerAPIClient
dockerclient.ImageAPIClient
}
// ContainerConfig describes the container that CreateContainer should create.
type ContainerConfig struct {
// Log is a logger carried with the config. CreateContainer itself does not
// read it.
Log slog.Logger
// Mounts are converted to "source:mountpoint[:ro]" bind strings.
Mounts []xunix.Mount
// Devices are the device mappings exposed to the container.
Devices []container.DeviceMapping
// Envs is passed through as the container's environment.
Envs []string
// Name is the container name. It also serves as the hostname when Hostname
// is empty.
Name string
// Image is the image the container is created from.
Image string
// WorkingDir is the container's working directory.
WorkingDir string
// Hostname is the container's hostname. When empty, CreateContainer falls
// back to Name.
Hostname string
// HasInit dictates whether the entrypoint of the container is /sbin/init
// or 'sleep infinity'.
HasInit bool
// CPUs is multiplied by DefaultCPUPeriod to produce the container's CPU
// quota.
CPUs int64
// MemoryLimit is passed through as the container's memory limit.
MemoryLimit int64
}
// CreateContainer creates a sysbox-runc container described by conf and
// returns the ID reported by the Docker daemon. It only issues the create
// call; it does not start the container.
//
// The container is created with AutoRemove enabled, runs as root without a
// TTY, and maps host.docker.internal to the host gateway. Its entrypoint is
// /sbin/init when conf.HasInit is set and 'sleep infinity' otherwise. When
// conf.Hostname is empty the container name is used as the hostname.
//
// conf is treated as read-only: defaults are resolved into locals instead of
// being written back into the caller's struct. A nil conf yields an error
// rather than a panic.
func CreateContainer(ctx context.Context, client DockerClient, conf *ContainerConfig) (string, error) {
	if conf == nil {
		return "", xerrors.New("create container: nil config")
	}

	host := &container.HostConfig{
		Runtime:    runtime,
		AutoRemove: true,
		Resources: container.Resources{
			Devices: conf.Devices,
			// Set resources for the inner container.
			// This is important for processes inside the container to know what they
			// have to work with.
			// TODO: Sysbox does not copy cpu.cfs_{period,quota}_us into syscont-cgroup-root cgroup.
			// These will not be visible inside the child container.
			// See: https://github.com/nestybox/sysbox/issues/582
			CPUPeriod: int64(DefaultCPUPeriod),
			CPUQuota:  conf.CPUs * int64(DefaultCPUPeriod),
			Memory:    conf.MemoryLimit,
		},
		ExtraHosts: []string{"host.docker.internal:host-gateway"},
		Binds:      generateBindMounts(conf.Mounts),
	}

	entrypoint := []string{"sleep", "infinity"}
	if conf.HasInit {
		entrypoint = []string{"/sbin/init"}
	}

	// Resolve the hostname locally so the caller's config is left untouched.
	hostname := conf.Hostname
	if hostname == "" {
		hostname = conf.Name
	}

	cnt := &container.Config{
		Image:      conf.Image,
		Entrypoint: entrypoint,
		Cmd:        []string{},
		Env:        conf.Envs,
		Hostname:   hostname,
		WorkingDir: conf.WorkingDir,
		Tty:        false,
		User:       "root",
	}

	c, err := client.ContainerCreate(ctx, cnt, host, nil, nil, conf.Name)
	if err != nil {
		return "", xerrors.Errorf("create container: %w", err)
	}
	return c.ID, nil
}
// BootstrapConfig describes a script that BootstrapContainer should run inside
// an existing container.
type BootstrapConfig struct {
// ContainerID identifies the container to exec into.
ContainerID string
// User is the user the script is executed as.
User string
// Script is fed on stdin to `/bin/sh -s`. An empty script makes
// BootstrapContainer a noop.
Script string
// Env is the environment passed to the exec.
Env []string
// Detach is forwarded unchanged to the exec configuration.
Detach bool
// StdOutErr is forwarded unchanged to the exec configuration.
StdOutErr io.Writer
}
// BootstrapContainer runs a script inside the container as the provided user
// by piping conf.Script on stdin to `/bin/sh -s` through ExecContainer.
// If conf.Script is empty then it is a noop.
//
// The exec is attempted up to 10 times, paced by a retry.Retrier created with
// retry.New(time.Second, 2*time.Second). The loop ends as soon as one attempt
// succeeds, the attempts are exhausted, or the retrier stops waiting.
//
// It returns nil only when an attempt actually succeeded. Otherwise the error
// of the last failed attempt is returned, annotated with whatever output the
// exec produced. If no attempt could be made at all (for example because ctx
// was already done), the context's error is returned instead of reporting
// success.
func BootstrapContainer(ctx context.Context, client DockerClient, conf BootstrapConfig) error {
	if conf.Script == "" {
		return nil
	}

	const maxAttempts = 10

	var (
		err       error
		succeeded bool
	)
	// The attempt budget is checked before waiting so that no extra delay is
	// spent after the final failed attempt.
	for r, n := retry.New(time.Second, time.Second*2), 0; n < maxAttempts && r.Wait(ctx); n++ {
		var out io.Reader
		out, err = ExecContainer(ctx, client, ExecConfig{
			ContainerID: conf.ContainerID,
			User:        conf.User,
			Cmd:         "/bin/sh",
			Args:        []string{"-s"},
			Stdin:       strings.NewReader(conf.Script),
			Env:         conf.Env,
			StdOutErr:   conf.StdOutErr,
			Detach:      conf.Detach,
		})
		if err == nil {
			succeeded = true
			break
		}

		// A failed exec may not hand back a reader at all; reading from a nil
		// io.Reader would panic.
		if out == nil {
			err = xerrors.Errorf("boostrap container: %w", err)
			continue
		}

		output, rerr := io.ReadAll(out)
		if rerr != nil {
			// Keep the exec failure as the wrapped cause and surface the read
			// failure alongside it instead of dropping it.
			err = xerrors.Errorf("read all (%v): %w", rerr, err)
			continue
		}
		err = xerrors.Errorf("boostrap container (%s): %w", output, err)
	}
	if succeeded {
		return nil
	}

	if err == nil {
		// The retrier gave up before the first attempt, so the script never ran.
		err = ctx.Err()
		if err == nil {
			err = xerrors.New("no bootstrap attempt was made")
		}
	}
	return xerrors.Errorf("timed out boostrapping container: %w", err)
}
// SetContainerQuota writes the given CPU quota to the cgroup location used by
// the inner container, dispatching on quota.CGroup.
//
// HACK: until https://github.com/nestybox/sysbox/issues/582 is resolved, the
// CPU quota and period have to be copied from the outer container to the inner
// container so that applications inside it know how much CPU they may use.
//
// Files written for cgroupv2:
//   - /sys/fs/cgroup/<subpath>/init.scope/cpu.max
//
// Files written for cgroupv1:
//   - /sys/fs/cgroup/cpu,cpuacct/<subpath>/syscont-cgroup-root/cpu.cfs_quota_us
//   - /sys/fs/cgroup/cpu,cpuacct/<subpath>/syscont-cgroup-root/cpu.cfs_period_us
//
// Any other cgroup value results in an error.
func SetContainerQuota(ctx context.Context, containerID string, quota xunix.CPUQuota) error {
	if quota.CGroup == xunix.CGroupV2 {
		return setContainerQuotaCGroupV2(ctx, containerID, quota)
	}
	if quota.CGroup == xunix.CGroupV1 {
		return setContainerQuotaCGroupV1(ctx, containerID, quota)
	}
	return xerrors.Errorf("Unknown cgroup %d", quota.CGroup)
}
// setContainerQuotaCGroupV2 writes the inner container's CPU limit to
// /sys/fs/cgroup/docker/<containerID>/init.scope/cpu.max on the filesystem
// obtained from ctx via xunix.GetFS.
//
// The file content is "<quota> <period>\n"; a negative quota is written as the
// literal "max".
func setContainerQuotaCGroupV2(ctx context.Context, containerID string, quota xunix.CPUQuota) error {
	cgroupFS := xunix.GetFS(ctx)
	target := filepath.Join(
		fmt.Sprintf("/sys/fs/cgroup/docker/%s/init.scope/", containerID),
		"cpu.max",
	)

	// A negative quota means "no limit", which cpu.max spells as "max".
	limit := "max"
	if quota.Quota >= 0 {
		limit = fmt.Sprint(quota.Quota)
	}
	payload := fmt.Sprintf("%s %d\n", limit, quota.Period)

	if err := afero.WriteFile(cgroupFS, target, []byte(payload), 0o644); err != nil {
		return xerrors.Errorf("write cpu.max to inner container cgroup: %w", err)
	}
	return nil
}
// setContainerQuotaCGroupV1 writes the inner container's CPU period and quota
// to cpu.cfs_period_us and cpu.cfs_quota_us under
// /sys/fs/cgroup/cpu,cpuacct/docker/<containerID>/syscont-cgroup-root/ on the
// filesystem obtained from ctx via xunix.GetFS.
//
// The period is written first; if that write fails the quota is not written.
func setContainerQuotaCGroupV1(ctx context.Context, containerID string, quota xunix.CPUQuota) error {
	cgroupFS := xunix.GetFS(ctx)
	dir := fmt.Sprintf("/sys/fs/cgroup/cpu,cpuacct/docker/%s/syscont-cgroup-root/", containerID)

	periodFile := filepath.Join(dir, "cpu.cfs_period_us")
	periodValue := []byte(strconv.Itoa(quota.Period))
	if err := afero.WriteFile(cgroupFS, periodFile, periodValue, 0o644); err != nil {
		return xerrors.Errorf("write cpu.cfs_period_us to inner container cgroup: %w", err)
	}

	quotaFile := filepath.Join(dir, "cpu.cfs_quota_us")
	quotaValue := []byte(strconv.Itoa(quota.Quota))
	if err := afero.WriteFile(cgroupFS, quotaFile, quotaValue, 0o644); err != nil {
		return xerrors.Errorf("write cpu.cfs_quota_us to inner container cgroup: %w", err)
	}

	return nil
}
// generateBindMounts renders each mount as a bind string of the form
// "source:mountpoint", with ":ro" appended for read-only mounts. The result
// has one entry per input mount, in the same order, and is never nil.
func generateBindMounts(mounts []xunix.Mount) []string {
	specs := make([]string, len(mounts))
	for i := range mounts {
		m := mounts[i]

		suffix := ""
		if m.ReadOnly {
			suffix = ":ro"
		}
		specs[i] = fmt.Sprintf("%s:%s", m.Source, m.Mountpoint) + suffix
	}
	return specs
}