Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions go/common/cgroup_rootless.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package common

import (
"fmt"
"os"
"strconv"
"strings"
)

// HostUID returns the *host* uid even if we're uid 0 inside a userns.
func HostUID() int {
// If we're in a user namespace, uid_map will show the host uid
if b, err := os.ReadFile("/proc/self/uid_map"); err == nil {
for _, ln := range strings.Split(string(b), "\n") {
f := strings.Fields(strings.TrimSpace(ln))
// first mapping looks like: "0 <host_uid> <size>"
if len(f) >= 3 && f[0] == "0" {
if hid, err := strconv.Atoi(f[1]); err == nil {
// If host uid is 0, we're not in a userns yet (identity mapping)
if hid != 0 {
return hid
}
}
}
}
}
if su := os.Getenv("SUDO_UID"); su != "" {
if hid, err := strconv.Atoi(su); err == nil && hid > 0 {
return hid
}
}
return os.Getuid()
}

// DelegatedUserCgroupBase returns the systemd user slice path for this user.
func DelegatedUserCgroupBase() (string, error) {
uid := HostUID()
base := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/user@%d.service", uid, uid)
if st, err := os.Stat(base); err == nil && st.IsDir() {
return base, nil
}
return "", fmt.Errorf("no delegated user slice at uid %d", uid)
}

// GetCgroupDelegationInstructions returns instructions for enabling systemd cgroup delegation.
func GetCgroupDelegationInstructions() string {
return "Systemd cgroup delegation is not enabled. Please enable it or run with sudo."
}
79 changes: 0 additions & 79 deletions go/worker/cgroup_rootless.go

This file was deleted.

84 changes: 53 additions & 31 deletions go/worker/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,34 @@ import (
"github.com/open-lambda/open-lambda/go/worker/embedded"
)

// cleanupMountsInUserNS removes all sandbox mount directories.
// Note: When the worker process (running in a user namespace) dies, all mounts
// created within that namespace are automatically cleaned up by the kernel.
// We just need to remove the empty directories.
func cleanupMountsInUserNS(dirName string) error {
files, err := os.ReadDir(dirName)
if err != nil {
return fmt.Errorf("error reading mount root: %s", err.Error())
}

errorCount := 0
for _, file := range files {
path := filepath.Join(dirName, file.Name())
fmt.Printf("Removing sandbox directory %s\n", path)

// Just remove the directory - mounts are already gone when namespace died
if err := os.RemoveAll(path); err != nil {
fmt.Printf("ERROR: Could not remove %s: %s\n", path, err.Error())
errorCount++
}
}

if errorCount > 0 {
return fmt.Errorf("failed to remove %d sandbox directories", errorCount)
}
return nil
}

func initOLBaseDir(baseDir string, dockerBaseImage string) error {
if dockerBaseImage == "" {
dockerBaseImage = "ol-wasm"
Expand Down Expand Up @@ -274,13 +302,27 @@ func runningToStoppedClean() error {
return fmt.Errorf("worker process did not stop within 60 seconds")
}

// getCgRoot returns the cgroup root path for the given olPath.
// It tries the systemd user slice path first (rootless), then falls back to legacy path.
func getCgRoot(olPath string) string {
clusterName := filepath.Base(olPath)

// Try systemd user slice (rootless-friendly)
if base, err := common.DelegatedUserCgroupBase(); err == nil {
return filepath.Join(base, clusterName+"-sandboxes.slice")
}

// Fallback for rootful/legacy
return filepath.Join("/sys", "fs", "cgroup", clusterName+"-sandboxes")
}

// This function will transition the StoppedDirty state to StoppedClean state.
// It attempts to clean up resources after detecting a dirty shutdown.
// It cleans up cgroups and mounts associated with the OpenLambda instance at `olPath`.
// Returns errors encountered during cleanup operations.
func stoppedDirtyToStoppedClean(olPath string) error {
// Clean up cgroups associated with sandboxes
cgRoot := filepath.Join("/sys", "fs", "cgroup", filepath.Base(olPath)+"-sandboxes")
cgRoot := getCgRoot(olPath)
fmt.Printf("Attempting to clean up cgroups at %s\n", cgRoot)

cgroupErrorCount := 0
Expand Down Expand Up @@ -339,24 +381,10 @@ func stoppedDirtyToStoppedClean(olPath string) error {
if !sandboxRootStat.IsDir() {
return fmt.Errorf("sandbox mount root is not a directory")
}
// Perform cleanup
files, err := os.ReadDir(dirName)
if err != nil {
return fmt.Errorf("error reading mount root: %s", err.Error())
}
for _, file := range files {
path := filepath.Join(dirName, file.Name())
fmt.Printf("Attempting to unmount %s\n", path)
if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil {
// Print an error if unmounting fails.
fmt.Printf("Could not unmount: %s\n", err.Error())
sandboxErrorCount += 1
}
if err := syscall.Rmdir(path); err != nil {
// Print an error if removing the mount directory fails.
fmt.Printf("Could not remove mount dir: %s\n", err.Error())
sandboxErrorCount += 1
}
// Perform cleanup - try unmounting in user namespace if needed
if err := cleanupMountsInUserNS(dirName); err != nil {
fmt.Printf("Warning: %v\n", err)
sandboxErrorCount += 1
}
}

Expand All @@ -373,20 +401,14 @@ func stoppedDirtyToStoppedClean(olPath string) error {
return fmt.Errorf("%d error(s) while cleaning up cgroup and %d error(s) while cleaning up sandbox", cgroupErrorCount, sandboxErrorCount)
}

// Attempt to unmount the main mount directory
fmt.Printf("Attempting to clean up main mount directory at %s\n", dirName)
if err := syscall.Unmount(dirName, syscall.MNT_DETACH); err != nil {
// Log an error if unmounting the main directory fails.
if errors.Is(err, syscall.EINVAL) {
fmt.Printf("Sandbox mount root is not mounted. No need to clean up.\n")
} else {
return fmt.Errorf("could not unmount %s: %s", dirName, err.Error())
}
}
// Note: root-sandboxes directory itself is usually not a mount point
// Individual subdirectories were mounts, but they're cleaned up when the namespace dies
fmt.Printf("Cleanup complete for %s\n", dirName)

// Remove the worker.pid file
if err := os.Remove(filepath.Join(olPath, "worker", "worker.pid")); err != nil {
// Return an error if removing worker.pid fails.
pidPath := filepath.Join(olPath, "worker", "worker.pid")
if err := os.Remove(pidPath); err != nil && !errors.Is(err, os.ErrNotExist) {
// Only fail if file exists but we can't remove it
return fmt.Errorf("could not remove worker.pid: %s", err.Error())
}

Expand Down
91 changes: 47 additions & 44 deletions go/worker/sandbox/cgroups/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"log/slog"
"os"
"path"
"strconv"
"strings"
"syscall"
"time"
Expand All @@ -25,41 +24,6 @@ type CgroupPool struct {
nextID int
}

// / NOTE (rootless): helpers used only for delegated user-slice resolution.
func hostUIDForCgroups() int {
if b, err := os.ReadFile("/proc/self/uid_map"); err == nil {
for _, ln := range strings.Split(string(b), "\n") {
ln = strings.TrimSpace(ln)
if ln == "" {
continue
}
fs := strings.Fields(ln)
// first mapping: "0 <host_uid> <size>"
if len(fs) >= 2 && fs[0] == "0" {
if hid, err := strconv.Atoi(fs[1]); err == nil && hid >= 0 {
return hid
}
}
}
}
if su := os.Getenv("SUDO_UID"); su != "" {
if hid, err := strconv.Atoi(su); err == nil && hid > 0 {
return hid
}
}
return os.Getuid()
}

// NOTE (rootless): prefer systemd user slice when present for cgroup pool.
func delegatedUserCgroupBase() (string, error) {
uid := hostUIDForCgroups()
p := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/user@%d.service/user.slice", uid, uid)
if st, err := os.Stat(p); err == nil && st.IsDir() {
return p, nil
}
return "", fmt.Errorf("delegated user cgroup base not found for uid %d", uid)
}

// NOTE (rootless): best-effort guard for controller files.
func writeOK(p string) bool {
st, err := os.Stat(p)
Expand All @@ -84,16 +48,55 @@ func NewCgroupPool(name string) (*CgroupPool, error) {
nextID: 0,
}

// create (or ensure) the pool directory
groupPath := pool.GroupPath()
pool.printf("using cgroup base: %s", groupPath)
// Try to create the pool directory with fallback logic:
// 1. Try systemd user slice (rootless-friendly)
// 2. Fall back to legacy path if that fails
var groupPath string
var createErr error

// Try systemd user slice first
if base, err := common.DelegatedUserCgroupBase(); err == nil {
poolName := pool.Name
if !strings.HasSuffix(poolName, ".slice") {
poolName += ".slice"
}
groupPath = path.Join(base, poolName)
pool.printf("trying systemd user slice: %s", groupPath)
if err := os.MkdirAll(groupPath, 0o755); err == nil {
pool.printf("using cgroup base: %s", groupPath)
// Best-effort: enable controllers
rpath := fmt.Sprintf("%s/cgroup.subtree_control", groupPath)
if f, err := os.OpenFile(rpath, os.O_WRONLY|os.O_APPEND, 0); err == nil {
_, _ = f.WriteString("+pids +io +memory +cpu\n")
_ = f.Close()
} else {
pool.printf("WARN: could not write %s (%v); continuing without delegating controllers", rpath, err)
}
go pool.cgTask()
return pool, nil
} else {
pool.printf("WARN: cannot create %s (%v); falling back to legacy path", groupPath, err)
createErr = err
}
}

// Fallback to legacy path
groupPath = fmt.Sprintf("/sys/fs/cgroup/%s", pool.Name)
pool.printf("trying legacy path: %s", groupPath)
if err := os.MkdirAll(groupPath, 0o700); err != nil {
return nil, fmt.Errorf("MkdirAll %s: %w", groupPath, err)
// Both paths failed - provide helpful error message
errMsg := fmt.Sprintf("Failed to create cgroup pool at both systemd user slice and legacy path.\n")
if createErr != nil {
errMsg += fmt.Sprintf(" - User slice error: %v\n", createErr)
}
errMsg += fmt.Sprintf(" - Legacy path error: %v\n\n", err)
errMsg += common.GetCgroupDelegationInstructions()
return nil, fmt.Errorf("%s", errMsg)
}

// Best-effort: make controllers available to child groups.
// Not all Ubuntu/systemd setups delegate +cpu/+memory/+pids to user slices.
// We ignore failures here and let later code skip writes if delegation is missing.
pool.printf("using cgroup base: %s", groupPath)

// Best-effort: make controllers available to child groups
rpath := fmt.Sprintf("%s/cgroup.subtree_control", groupPath)
if f, err := os.OpenFile(rpath, os.O_WRONLY|os.O_APPEND, 0); err == nil {
_, _ = f.WriteString("+pids +io +memory +cpu\n")
Expand Down Expand Up @@ -257,7 +260,7 @@ func (pool *CgroupPool) GetCg(memLimitMB int, moveMemCharge bool, cpuPercent int

// GroupPath returns the path to the Cgroup pool for OpenLambda
func (pool *CgroupPool) GroupPath() string {
if base, err := delegatedUserCgroupBase(); err == nil {
if base, err := common.DelegatedUserCgroupBase(); err == nil {
name := pool.Name
if !strings.HasSuffix(name, ".slice") {
name += ".slice"
Expand Down