Skip to content

Commit ae18a84

Browse files
committed
feat: Support running createContainer hooks in CDI spec
Description ------------ This commit adds the ability for gVisor to run the createContainer hooks defined in the CDI spec. This is needed to support NVIDIA's k8s-device-plugin running in `DEVICE_LIST_STRATEGY=cdi-cri`. In this mode, the plugin creates a CDI spec file at `/var/run/cdi/[...].json` that contains instructions on how to mount GPU devices, which client libraries to bind-mount into the container, and which `nvidia-ctk` hooks need to be run. While the device cdev and client library injection mechanism already worked with gVisor, the createContainer hooks that create the library symlinks (e.g. `/usr/lib/x86_64-linux-gnu/libcuda.so -> libcuda.so.1`) and update the ldconfig cache (`nvidia-ctk hook update-ldcache`) were not being run. This meant that processes inside the container could not resolve the client libraries and thus did not know how to communicate with the `/dev/nvidiactl` and `/dev/nvidia${n}` cdevs. The CDI spec file contains the instructions on how to do this, so now gVisor follows it. gVisor previously solved this problem by using the `nvidia-container-cli configure` command. This largely did the same things that the CDI spec file instructs us to do, but it is a legacy path and does not use CDI at all. How it Works ------------ In gofer_mount.go, the code now explicitly distinguishes between the containerRootFs (usually under /var/lib/.../root) and the goferRootFs (/proc/fs/root). The issue with nvidia-ctk hooks is that they would pivot_root(2) into the containerRootFs while gVisor would operate under the goferRootFs. This meant that nvidia-ctk did not see any CDI devices mounted into the containerRootFs. This commit changes gVisor such that all device and mount setup is done under the containerRootFs. We then bind-mount containerRootFs onto goferRootFs after running the createContainer hooks. The gofer then pivot_roots into /proc/fs, the tmpfs that contains the goferRootFs, as before. Note that createContainer hooks are only run if the underlying rootfs is writable. 
There are many scenarios, such as when using EROFS, where the rootfs is not writable and createContainer hooks therefore can't be executed. Supporting those scenarios is left for future work. Signed-off-by: LandonTClipp <lclipp@coreweave.com>
1 parent eafda43 commit ae18a84

6 files changed

Lines changed: 123 additions & 51 deletions

File tree

runsc/cmd/gofer.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func (g *Gofer) Synopsis() string {
126126

127127
// Usage implements subcommands.Command.
128128
func (*Gofer) Usage() string {
129-
return "gofer [flags]\n"
129+
return "gofer [flags] <container ID>\n"
130130
}
131131

132132
// SetFlags implements subcommands.Command.
@@ -160,6 +160,11 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomm
160160
f.Usage()
161161
return subcommands.ExitUsageError
162162
}
163+
if f.NArg() != 1 {
164+
f.Usage()
165+
return subcommands.ExitUsageError
166+
}
167+
containerID := f.Arg(0)
163168

164169
conf := args[0].(*config.Config)
165170

@@ -185,7 +190,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomm
185190
defer goferToHostRPC.Close()
186191

187192
if g.setUpRoot {
188-
if err := sandboxsetup.SetupRootFS(spec, conf, g.mountConfs, g.devIoFD, makeRPCMountOpener(goferToHostRPC)); err != nil {
193+
if err := sandboxsetup.SetupRootFS(spec, conf, g.mountConfs, g.devIoFD, makeRPCMountOpener(goferToHostRPC), containerID, g.bundleDir); err != nil {
189194
util.Fatalf("Error setting up root FS: %v", err)
190195
}
191196
if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {

runsc/cmd/sandboxsetup/gofer_mount.go

Lines changed: 98 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -77,23 +77,52 @@ func WriteMounts(mountsFD int, mounts []specs.Mount) error {
7777
return nil
7878
}
7979

80-
// SetupRootFS prepares the root filesystem for the gofer process. It mounts
81-
// the container root, sets up submounts and /dev, and optionally remounts
82-
// root as read-only. If chroot mode is active, it also performs a
83-
// pivot_root.
80+
// SetupRootFS prepares the filesystem the gofer will serve to the sandbox.
81+
// There are some gVisor-specific quirks:
82+
// - The gofer runs with minimal capabilities (see runsc/cmd/gofer.go).
83+
// However, gofer startup requires more capabilities (for example,
84+
// pivot_root(2) requires CAP_SYS_ADMIN).
85+
// - As a result, the gofer sets up the container rootfs, does pivot_root(2),
86+
// and then re-executes itself while dropping extra capabilities.
87+
// - To re-execute itself, it requires access to `/proc/self/exe` after
88+
// pivot_root(2).
89+
//
90+
// For this reason, we can't just pivot_root(2) into the container rootfs path
91+
// (i.e. spec.Root.Path), as it would mean bind-mounting host /proc inside
92+
// spec.Root.Path, which might be a read-only filesystem.
93+
//
94+
// Furthermore, createContainer hooks from the OCI spec need to run before the
95+
// pivot_root(2) and expect the container rootfs to be prepared at
96+
// spec.Root.Path with all the bind-mounts in place.
97+
//
98+
// To satisfy all of these requirements, this is the approach we take:
99+
// 1. We prepare all the bind-mounts in `spec.Root.Path` and execute the
100+
// createContainer hooks.
101+
// 2. We create a new tmpfs mount at /proc/fs.
102+
// 3. We bind-mount host /proc and spec.Root.Path onto /proc/fs/proc and
103+
// /proc/fs/root, respectively.
104+
// 4. We then pivot_root(2) into /proc/fs. Now host procfs is accessible via
105+
// /proc/ and container rootfs is accessible via /root.
106+
// 5. We re-exec the gofer binary and drop extra capabilities.
107+
// 6. We unmount the /proc bind mount.
108+
// 7. We chroot(2) into /root (note that the gofer runs with CAP_SYS_CHROOT).
109+
//
110+
// This function does steps 1-4.
84111
//
85112
// mountConfs must be indexed such that mountConfs[0] is the root filesystem
86113
// configuration and subsequent entries correspond to spec mounts with
87114
// mount configs.
88-
func SetupRootFS(spec *specs.Spec, conf *config.Config, mountConfs []specutils.GoferMountConf, devIoFD int, mountOpener MountOpener) error {
115+
func SetupRootFS(spec *specs.Spec, conf *config.Config, mountConfs []specutils.GoferMountConf, devIoFD int, mountOpener MountOpener, containerID string, bundleDir string) error {
89116
// Convert all shared mounts into slaves to be sure that nothing will be
90117
// propagated outside of our namespace.
91118
procPath := "/proc"
92119
if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
93120
util.Fatalf("error converting mounts: %v", err)
94121
}
95122

96-
root := spec.Root.Path
123+
goferRootFs := spec.Root.Path
124+
containerRootFs := spec.Root.Path
125+
97126
if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
98127
// runsc can't be re-executed without /proc, so we create a tmpfs mount,
99128
// mount ./proc and ./root there, then move this mount to the root and after
@@ -132,51 +161,87 @@ func SetupRootFS(spec *specs.Spec, conf *config.Config, mountConfs []specutils.G
132161
if err := CopyFile("/proc/fs/etc/localtime", "/etc/localtime"); err != nil {
133162
log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
134163
}
135-
root = "/proc/fs/root"
164+
goferRootFs = "/proc/fs/root"
136165
procPath = "/proc/fs/proc"
137166
}
138167

139168
rootfsConf := mountConfs[0]
140-
if rootfsConf.ShouldUseLisafs() {
141-
// Mount root path followed by submounts.
142-
if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
143-
return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
144-
}
169+
if !rootfsConf.ShouldUseLisafs() {
170+
// When not using gofer mount for rootfs, spec.Root.Path may not be set or may not be a
171+
// writable directory. So set up the container rootfs directly in /proc/fs/root, which
172+
// is a writable directory.
173+
containerRootFs = goferRootFs
174+
}
145175

146-
flags := uint32(unix.MS_SLAVE | unix.MS_REC)
147-
if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
148-
flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
149-
}
150-
if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
151-
return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
152-
}
176+
// Many CDI createContainer hooks will pivot_root(2) into spec.Root.Path.
177+
// pivot_root(2) requires the target to be a mount point, so we must self-bind-mount
178+
// spec.Root.Path here. runc does this step unconditionally in
179+
// libcontainer/rootfs_linux.go:prepareRoot()
180+
if err := unix.Mount(containerRootFs, containerRootFs, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
181+
return fmt.Errorf("self-bind-mounting rootfs %q for hooks: %w", containerRootFs, err)
182+
}
183+
184+
// Ensure the containerRootFs is set to the RootfsPropagation that the user requested.
185+
// Mounts added under SetupMounts inherit the propagation flags of the root.
186+
flags := uint32(unix.MS_SLAVE | unix.MS_REC)
187+
if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
188+
flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
189+
}
190+
if err := specutils.SafeMount("", containerRootFs, "", uintptr(flags), "", procPath); err != nil {
191+
return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", containerRootFs, flags, err)
153192
}
154193

155194
// Replace the current spec, with the clean spec with symlinks resolved.
156-
if err := SetupMounts(conf, spec.Mounts, root, procPath, mountConfs, mountOpener); err != nil {
195+
if err := SetupMounts(conf, spec.Mounts, containerRootFs, procPath, mountConfs, mountOpener); err != nil {
157196
util.Fatalf("error setting up FS: %v", err)
158197
}
159198

160199
// Set up /dev directory if needed.
161200
if devIoFD >= 0 {
162-
if err := SetupDev(spec, conf, root, procPath); err != nil {
201+
if err := SetupDev(spec, conf, containerRootFs, procPath); err != nil {
163202
util.Fatalf("error setting up /dev: %v", err)
164203
}
165204
}
166205

167-
// Check if root needs to be remounted as readonly.
168-
if rootfsConf.ShouldUseLisafs() && (spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs()) {
169-
// If root is a mount point but not read-only, we can change mount options
170-
// to make it read-only for extra safety.
171-
// unix.MS_NOSUID and unix.MS_NODEV are included here not only
172-
// for safety reasons but also because they can be locked and
173-
// any attempts to unset them will fail. See
174-
// mount_namespaces(7) for more details.
175-
log.Infof("Remounting root as readonly: %q", root)
176-
flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV)
177-
if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
178-
return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
206+
if rootfsConf.ShouldUseLisafs() {
207+
if spec.Hooks != nil && len(spec.Hooks.CreateContainer) > 0 {
208+
state := specs.State{
209+
Version: specs.Version,
210+
ID: containerID,
211+
Status: specs.StateCreating,
212+
// The container pid is not easily available at this point. We'll set it to -1 to indicate that it's not available.
213+
// A future improvement could plumb this value through to the gofer if it becomes necessary.
214+
Pid: -1,
215+
Bundle: bundleDir,
216+
Annotations: spec.Annotations,
217+
}
218+
if err := specutils.ExecuteHooks(spec.Hooks.CreateContainer, state); err != nil {
219+
util.Fatalf("error executing CreateContainer hooks: %v", err)
220+
}
221+
}
222+
223+
// Now that spec.Root.Path has been prepared, we can bind-mount it to the new root. This will
224+
// make the container rootfs visible in the gofer root.
225+
if err := unix.Mount(containerRootFs, goferRootFs, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
226+
return fmt.Errorf("binding prepared rootfs to gofer root: %v", err)
227+
}
228+
229+
// Check if root needs to be remounted as readonly.
230+
if spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs() {
231+
// If root is a mount point but not read-only, we can change mount options
232+
// to make it read-only for extra safety.
233+
// unix.MS_NOSUID and unix.MS_NODEV are included here not only
234+
// for safety reasons but also because they can be locked and
235+
// any attempts to unset them will fail. See
236+
// mount_namespaces(7) for more details.
237+
log.Infof("Remounting root as readonly: %q", goferRootFs)
238+
flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV)
239+
if err := specutils.SafeMount(goferRootFs, goferRootFs, "bind", flags, "", procPath); err != nil {
240+
return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", goferRootFs, goferRootFs, flags, err)
241+
}
179242
}
243+
} else if spec.Hooks != nil && len(spec.Hooks.CreateContainer) > 0 {
244+
log.Warningf("CreateContainer hooks are not executed since container rootfs is not on lisafs")
180245
}
181246

182247
if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {

runsc/container/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ go_library(
1010
srcs = [
1111
"container.go",
1212
"gofer_to_host_rpc.go",
13-
"hook.go",
1413
"state_file.go",
1514
"status.go",
1615
],

runsc/container/container.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -311,15 +311,12 @@ func New(conf *config.Config, args Args) (*Container, error) {
311311
// "For runtimes that implement the deprecated prestart hooks as
312312
// createRuntime hooks, createRuntime hooks MUST be called after the
313313
// prestart hooks."
314-
if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
314+
if err := specutils.ExecuteHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
315315
return nil, err
316316
}
317-
if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil {
317+
if err := specutils.ExecuteHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil {
318318
return nil, err
319319
}
320-
if len(c.Spec.Hooks.CreateContainer) > 0 {
321-
log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported")
322-
}
323320
}
324321

325322
// Write the PID file. Containerd considers the call to create complete after
@@ -524,7 +521,7 @@ func (c *Container) startImpl(conf *config.Config, action string, startRoot func
524521
// the remaining hooks and lifecycle continue as if the hook had
525522
// succeeded" -OCI spec.
526523
if c.Spec.Hooks != nil {
527-
executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
524+
specutils.ExecuteHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
528525
}
529526

530527
c.changeStatus(Running)
@@ -999,7 +996,7 @@ func (c *Container) Destroy() error {
999996
// 2) Make sure it only runs once, because the root has been deleted, the
1000997
// container can't be loaded again.
1001998
if c.Spec.Hooks != nil {
1002-
executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
999+
specutils.ExecuteHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
10031000
}
10041001

10051002
if len(errs) == 0 {
@@ -1616,8 +1613,13 @@ func (c *Container) createGoferProcess(conf *config.Config, mountHints *boot.Pod
16161613

16171614
donations.Transfer(cmd, nextFD)
16181615

1616+
// Add container ID as the last argument.
1617+
cmd.Args = append(cmd.Args, c.ID)
1618+
log.Infof("Starting gofer with command: %v", cmd.Args)
1619+
16191620
// Start the gofer in the given namespace.
16201621
donation.LogDonations(cmd)
1622+
16211623
log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args)
16221624
if err := specutils.StartInNS(cmd, nss); err != nil {
16231625
return nil, nil, nil, nil, fmt.Errorf("gofer: %v", err)

runsc/specutils/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ go_library(
1111
"cri.go",
1212
"fs.go",
1313
"gofer_conf.go",
14+
"hooks.go",
1415
"namespace.go",
1516
"nvidia.go",
1617
"restore.go",
Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
package container
15+
package specutils
1616

1717
import (
1818
"bytes"
@@ -39,27 +39,27 @@ import (
3939
// }]
4040
// },
4141

42-
// executeHooksBestEffort executes hooks and logs warning in case they fail.
42+
// ExecuteHooksBestEffort executes hooks and logs warning in case they fail.
4343
// Runs all hooks, always.
44-
func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
44+
func ExecuteHooksBestEffort(hooks []specs.Hook, s specs.State) {
4545
for _, h := range hooks {
46-
if err := executeHook(h, s); err != nil {
46+
if err := ExecuteHook(h, s); err != nil {
4747
log.Warningf("Failure to execute hook %+v, err: %v", h, err)
4848
}
4949
}
5050
}
5151

52-
// executeHooks executes hooks until the first one fails or they all execute.
53-
func executeHooks(hooks []specs.Hook, s specs.State) error {
52+
// ExecuteHooks executes hooks until the first one fails or they all execute.
53+
func ExecuteHooks(hooks []specs.Hook, s specs.State) error {
5454
for _, h := range hooks {
55-
if err := executeHook(h, s); err != nil {
55+
if err := ExecuteHook(h, s); err != nil {
5656
return err
5757
}
5858
}
5959
return nil
6060
}
6161

62-
func executeHook(h specs.Hook, s specs.State) error {
62+
func ExecuteHook(h specs.Hook, s specs.State) error {
6363
log.Debugf("Executing hook %+v, state: %+v", h, s)
6464

6565
if strings.TrimSpace(h.Path) == "" {

0 commit comments

Comments
 (0)