diff --git a/go/common/cgroup_rootless.go b/go/common/cgroup_rootless.go new file mode 100644 index 000000000..c9a2a2ae9 --- /dev/null +++ b/go/common/cgroup_rootless.go @@ -0,0 +1,48 @@ +package common + +import ( + "fmt" + "os" + "strconv" + "strings" +) + +// HostUID returns the *host* uid even if we're uid 0 inside a userns. +func HostUID() int { + // If we're in a user namespace, uid_map will show the host uid + if b, err := os.ReadFile("/proc/self/uid_map"); err == nil { + for _, ln := range strings.Split(string(b), "\n") { + f := strings.Fields(strings.TrimSpace(ln)) + // first mapping looks like: "0 " + if len(f) >= 3 && f[0] == "0" { + if hid, err := strconv.Atoi(f[1]); err == nil { + // If host uid is 0, we're not in a userns yet (identity mapping) + if hid != 0 { + return hid + } + } + } + } + } + if su := os.Getenv("SUDO_UID"); su != "" { + if hid, err := strconv.Atoi(su); err == nil && hid > 0 { + return hid + } + } + return os.Getuid() +} + +// DelegatedUserCgroupBase returns the systemd user slice path for this user. +func DelegatedUserCgroupBase() (string, error) { + uid := HostUID() + base := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/user@%d.service", uid, uid) + if st, err := os.Stat(base); err == nil && st.IsDir() { + return base, nil + } + return "", fmt.Errorf("no delegated user slice at uid %d", uid) +} + +// GetCgroupDelegationInstructions returns instructions for enabling systemd cgroup delegation. +func GetCgroupDelegationInstructions() string { + return "Systemd cgroup delegation is not enabled. Please enable it or run with sudo." +} \ No newline at end of file diff --git a/go/worker/cgroup_rootless.go b/go/worker/cgroup_rootless.go deleted file mode 100644 index b5fd6ad6c..000000000 --- a/go/worker/cgroup_rootless.go +++ /dev/null @@ -1,79 +0,0 @@ -package worker - -import ( - "fmt" - "log" - "os" - "path/filepath" - "strconv" - "strings" -) - -// hostUID returns the *host* uid even if we're uid 0 inside a userns. -func hostUID() int { - if b, err := os.ReadFile("/proc/self/uid_map"); err == nil { - for _, ln := range strings.Split(string(b), "\n") { - f := strings.Fields(strings.TrimSpace(ln)) - // first mapping looks like: "0 " - if len(f) >= 2 && f[0] == "0" { - if hid, err := strconv.Atoi(f[1]); err == nil { - return hid - } - } - } - } - if su := os.Getenv("SUDO_UID"); su != "" { - if hid, err := strconv.Atoi(su); err == nil && hid > 0 { - return hid - } - } - return os.Getuid() -} - -// delegatedUserCgroupBase returns the systemd user slice path for this user. -func delegatedUserCgroupBase() (string, error) { - uid := hostUID() - base := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/user@%d.service/user.slice", uid, uid) - if st, err := os.Stat(base); err == nil && st.IsDir() { - return base, nil - } - return "", fmt.Errorf("no delegated user slice at uid %d", uid) -} - -// ResolveCgroupPoolPath picks where to create the pool. -// Returns (path, disableCgroups). If disableCgroups is true, skip cgroup writes. -func ResolveCgroupPoolPath(clusterName string) (string, bool) { - // try systemd user slice (rootless-friendly) - if base, err := delegatedUserCgroupBase(); err == nil { - p := filepath.Join(base, clusterName+"-sandboxes.slice") - if err := os.MkdirAll(p, 0o755); err == nil { - _ = enableControllersBestEffort(base, []string{"+cpu", "+memory", "+pids"}) - return p, false - } - log.Printf("WARN: cannot create %s (%v); running without cgroups", p, err) - return "", true - } - - // fallback for rootful/legacy - p := filepath.Join("/sys/fs/cgroup", clusterName+"-sandboxes") - if err := os.MkdirAll(p, 0o755); err == nil { - return p, false - } - log.Printf("WARN: cannot create %s; running without cgroups", p) - return "", true -} - -// enableControllersBestEffort tries to enable controllers under the base. -// Failures are fine (depends on systemd delegation). -func enableControllersBestEffort(base string, plus []string) error { - sc := filepath.Join(base, "cgroup.subtree_control") - f, err := os.OpenFile(sc, os.O_WRONLY|os.O_APPEND, 0o644) - if err != nil { - return err - } - defer f.Close() - for _, t := range plus { - _, _ = f.WriteString(t + "\n") - } - return nil -} diff --git a/go/worker/helpers.go b/go/worker/helpers.go index fdbdd6171..311abd628 100644 --- a/go/worker/helpers.go +++ b/go/worker/helpers.go @@ -21,6 +21,34 @@ import ( "github.com/open-lambda/open-lambda/go/worker/embedded" ) +// cleanupMountsInUserNS removes all sandbox mount directories. +// Note: When the worker process (running in a user namespace) dies, all mounts +// created within that namespace are automatically cleaned up by the kernel. +// We just need to remove the empty directories. +func cleanupMountsInUserNS(dirName string) error { + files, err := os.ReadDir(dirName) + if err != nil { + return fmt.Errorf("error reading mount root: %s", err.Error()) + } + + errorCount := 0 + for _, file := range files { + path := filepath.Join(dirName, file.Name()) + fmt.Printf("Removing sandbox directory %s\n", path) + + // Just remove the directory - mounts are already gone when namespace died + if err := os.RemoveAll(path); err != nil { + fmt.Printf("ERROR: Could not remove %s: %s\n", path, err.Error()) + errorCount++ + } + } + + if errorCount > 0 { + return fmt.Errorf("failed to remove %d sandbox directories", errorCount) + } + return nil +} + func initOLBaseDir(baseDir string, dockerBaseImage string) error { if dockerBaseImage == "" { dockerBaseImage = "ol-wasm" @@ -274,13 +302,27 @@ func runningToStoppedClean() error { return fmt.Errorf("worker process did not stop within 60 seconds") } +// getCgRoot returns the cgroup root path for the given olPath. +// It tries the systemd user slice path first (rootless), then falls back to legacy path. +func getCgRoot(olPath string) string { + clusterName := filepath.Base(olPath) + + // Try systemd user slice (rootless-friendly) + if base, err := common.DelegatedUserCgroupBase(); err == nil { + return filepath.Join(base, clusterName+"-sandboxes.slice") + } + + // Fallback for rootful/legacy + return filepath.Join("/sys", "fs", "cgroup", clusterName+"-sandboxes") +} + // This function will transition the StoppedDirty state to StoppedClean state. // It attempts to clean up resources after detecting a dirty shutdown. // It cleans up cgroups and mounts associated with the OpenLambda instance at `olPath`. // Returns errors encountered during cleanup operations. func stoppedDirtyToStoppedClean(olPath string) error { // Clean up cgroups associated with sandboxes - cgRoot := filepath.Join("/sys", "fs", "cgroup", filepath.Base(olPath)+"-sandboxes") + cgRoot := getCgRoot(olPath) fmt.Printf("Attempting to clean up cgroups at %s\n", cgRoot) cgroupErrorCount := 0 @@ -339,24 +381,10 @@ func stoppedDirtyToStoppedClean(olPath string) error { if !sandboxRootStat.IsDir() { return fmt.Errorf("sandbox mount root is not a directory") } - // Perform cleanup - files, err := os.ReadDir(dirName) - if err != nil { - return fmt.Errorf("error reading mount root: %s", err.Error()) - } - for _, file := range files { - path := filepath.Join(dirName, file.Name()) - fmt.Printf("Attempting to unmount %s\n", path) - if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil { - // Print an error if unmounting fails. - fmt.Printf("Could not unmount: %s\n", err.Error()) - sandboxErrorCount += 1 - } - if err := syscall.Rmdir(path); err != nil { - // Print an error if removing the mount directory fails. - fmt.Printf("Could not remove mount dir: %s\n", err.Error()) - sandboxErrorCount += 1 - } + // Perform cleanup - try unmounting in user namespace if needed + if err := cleanupMountsInUserNS(dirName); err != nil { + fmt.Printf("Warning: %v\n", err) + sandboxErrorCount += 1 } } @@ -373,20 +401,14 @@ func stoppedDirtyToStoppedClean(olPath string) error { return fmt.Errorf("%d error(s) while cleaning up cgroup and %d error(s) while cleaning up sandbox", cgroupErrorCount, sandboxErrorCount) } - // Attempt to unmount the main mount directory - fmt.Printf("Attempting to clean up main mount directory at %s\n", dirName) - if err := syscall.Unmount(dirName, syscall.MNT_DETACH); err != nil { - // Log an error if unmounting the main directory fails. - if errors.Is(err, syscall.EINVAL) { - fmt.Printf("Sandbox mount root is not mounted. No need to clean up.\n") - } else { - return fmt.Errorf("could not unmount %s: %s", dirName, err.Error()) - } - } + // Note: root-sandboxes directory itself is usually not a mount point + // Individual subdirectories were mounts, but they're cleaned up when the namespace dies + fmt.Printf("Cleanup complete for %s\n", dirName) // Remove the worker.pid file - if err := os.Remove(filepath.Join(olPath, "worker", "worker.pid")); err != nil { - // Return an error if removing worker.pid fails. + pidPath := filepath.Join(olPath, "worker", "worker.pid") + if err := os.Remove(pidPath); err != nil && !errors.Is(err, os.ErrNotExist) { + // Only fail if file exists but we can't remove it return fmt.Errorf("could not remove worker.pid: %s", err.Error()) } diff --git a/go/worker/sandbox/cgroups/pool.go b/go/worker/sandbox/cgroups/pool.go index ca8a02fc8..3dbd2c0e3 100644 --- a/go/worker/sandbox/cgroups/pool.go +++ b/go/worker/sandbox/cgroups/pool.go @@ -5,7 +5,6 @@ import ( "log/slog" "os" "path" - "strconv" "strings" "syscall" "time" @@ -25,41 +24,6 @@ type CgroupPool struct { nextID int } -// / NOTE (rootless): helpers used only for delegated user-slice resolution. -func hostUIDForCgroups() int { - if b, err := os.ReadFile("/proc/self/uid_map"); err == nil { - for _, ln := range strings.Split(string(b), "\n") { - ln = strings.TrimSpace(ln) - if ln == "" { - continue - } - fs := strings.Fields(ln) - // first mapping: "0 " - if len(fs) >= 2 && fs[0] == "0" { - if hid, err := strconv.Atoi(fs[1]); err == nil && hid >= 0 { - return hid - } - } - } - } - if su := os.Getenv("SUDO_UID"); su != "" { - if hid, err := strconv.Atoi(su); err == nil && hid > 0 { - return hid - } - } - return os.Getuid() -} - -// NOTE (rootless): prefer systemd user slice when present for cgroup pool. -func delegatedUserCgroupBase() (string, error) { - uid := hostUIDForCgroups() - p := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/user@%d.service/user.slice", uid, uid) - if st, err := os.Stat(p); err == nil && st.IsDir() { - return p, nil - } - return "", fmt.Errorf("delegated user cgroup base not found for uid %d", uid) -} - // NOTE (rootless): best-effort guard for controller files. func writeOK(p string) bool { st, err := os.Stat(p) @@ -84,16 +48,55 @@ func NewCgroupPool(name string) (*CgroupPool, error) { nextID: 0, } - // create (or ensure) the pool directory - groupPath := pool.GroupPath() - pool.printf("using cgroup base: %s", groupPath) + // Try to create the pool directory with fallback logic: + // 1. Try systemd user slice (rootless-friendly) + // 2. Fall back to legacy path if that fails + var groupPath string + var createErr error + + // Try systemd user slice first + if base, err := common.DelegatedUserCgroupBase(); err == nil { + poolName := pool.Name + if !strings.HasSuffix(poolName, ".slice") { + poolName += ".slice" + } + groupPath = path.Join(base, poolName) + pool.printf("trying systemd user slice: %s", groupPath) + if err := os.MkdirAll(groupPath, 0o755); err == nil { + pool.printf("using cgroup base: %s", groupPath) + // Best-effort: enable controllers + rpath := fmt.Sprintf("%s/cgroup.subtree_control", groupPath) + if f, err := os.OpenFile(rpath, os.O_WRONLY|os.O_APPEND, 0); err == nil { + _, _ = f.WriteString("+pids +io +memory +cpu\n") + _ = f.Close() + } else { + pool.printf("WARN: could not write %s (%v); continuing without delegating controllers", rpath, err) + } + go pool.cgTask() + return pool, nil + } else { + pool.printf("WARN: cannot create %s (%v); falling back to legacy path", groupPath, err) + createErr = err + } + } + + // Fallback to legacy path + groupPath = fmt.Sprintf("/sys/fs/cgroup/%s", pool.Name) + pool.printf("trying legacy path: %s", groupPath) if err := os.MkdirAll(groupPath, 0o700); err != nil { - return nil, fmt.Errorf("MkdirAll %s: %w", groupPath, err) + // Both paths failed - provide helpful error message + errMsg := fmt.Sprintf("Failed to create cgroup pool at both systemd user slice and legacy path.\n") + if createErr != nil { + errMsg += fmt.Sprintf(" - User slice error: %v\n", createErr) + } + errMsg += fmt.Sprintf(" - Legacy path error: %v\n\n", err) + errMsg += common.GetCgroupDelegationInstructions() + return nil, fmt.Errorf("%s", errMsg) } - // Best-effort: make controllers available to child groups. - // Not all Ubuntu/systemd setups delegate +cpu/+memory/+pids to user slices. - // We ignore failures here and let later code skip writes if delegation is missing. + pool.printf("using cgroup base: %s", groupPath) + + // Best-effort: make controllers available to child groups rpath := fmt.Sprintf("%s/cgroup.subtree_control", groupPath) if f, err := os.OpenFile(rpath, os.O_WRONLY|os.O_APPEND, 0); err == nil { _, _ = f.WriteString("+pids +io +memory +cpu\n") @@ -257,7 +260,7 @@ func (pool *CgroupPool) GetCg(memLimitMB int, moveMemCharge bool, cpuPercent int // GroupPath returns the path to the Cgroup pool for OpenLambda func (pool *CgroupPool) GroupPath() string { - if base, err := delegatedUserCgroupBase(); err == nil { + if base, err := common.DelegatedUserCgroupBase(); err == nil { name := pool.Name if !strings.HasSuffix(name, ".slice") { name += ".slice"