Skip to content

Commit

Permalink
Merge pull request #7443 from haircommander/cpu-lb-crun-1.27
Browse files Browse the repository at this point in the history
[1.27] highperfhooks: apply cgroup setting to crun cgroup
  • Loading branch information
openshift-ci[bot] committed Oct 26, 2023
2 parents 9a24aae + 88d6cac commit 9b9c375
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 81 deletions.
2 changes: 1 addition & 1 deletion internal/config/cgmgr/cgmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func createSandboxCgroup(sbParent, containerCgroup string) error {
if path == "" {
return fmt.Errorf("failed to find cpuset for newly created cgroup")
}
if err := os.Mkdir(path, 0o755); err != nil && !os.IsNotExist(err) {
if err := os.MkdirAll(path, 0o755); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to create cpuset for newly created cgroup: %w", err)
}
if err := libctr.WriteFile(path, "cpuset.sched_load_balance", "0"); err != nil {
Expand Down
19 changes: 5 additions & 14 deletions internal/runtimehandlerhooks/default_cpu_load_balance_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@ package runtimehandlerhooks

import (
"context"
"fmt"

"github.com/cri-o/cri-o/internal/config/node"
"github.com/cri-o/cri-o/internal/lib/sandbox"
"github.com/cri-o/cri-o/internal/oci"
"github.com/opencontainers/runc/libcontainer/cgroups"
)

// DefaultCPULoadBalanceHooks is used to run additional hooks that will configure containers for CPU load balancing.
Expand All @@ -31,20 +28,14 @@ func (*DefaultCPULoadBalanceHooks) PreStop(context.Context, *oci.Container, *san
func (*DefaultCPULoadBalanceHooks) PostStop(ctx context.Context, c *oci.Container, s *sandbox.Sandbox) error {
// Disable cpuset.sched_load_balance for all stale cgroups.
// This way, cpumanager can ignore stopped containers, but the running ones will still have exclusive access.
if node.CgroupIsV2() || c.Spoofed() {
if c.Spoofed() {
return nil
}
containerCgroup, containerCgroupParent, systemd, err := containerCgroupAndParent(s.CgroupParent(), c)
if err != nil {
return err
}
mgr, err := libctrManager(containerCgroup, containerCgroupParent, systemd)

_, containerManagers, err := libctrManagersForPodAndContainerCgroup(c, s.CgroupParent())
if err != nil {
return err
}
path := mgr.Path("cpuset")
if path == "" {
return fmt.Errorf("failed to find cpuset for newly created cgroup")
}
return cgroups.WriteFile(path, "cpuset.sched_load_balance", "0")

return disableCPULoadBalancing(containerManagers)
}
111 changes: 67 additions & 44 deletions internal/runtimehandlerhooks/high_performance_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"

"github.com/cri-o/cri-o/internal/config/cgmgr"
Expand Down Expand Up @@ -58,9 +57,15 @@ func (h *HighPerformanceHooks) PreStart(ctx context.Context, c *oci.Container, s
return nil
}

// creating libctr managers is expensive on v1. Reuse between CPU load balancing and CPU quota
podManager, containerManagers, err := libctrManagersForPodAndContainerCgroup(c, s.CgroupParent())
if err != nil {
return err
}

// disable the CPU load balancing for the container CPUs
if shouldCPULoadBalancingBeDisabled(s.Annotations()) {
if err := disableCPULoadBalancing(c); err != nil {
if err := disableCPULoadBalancing(containerManagers); err != nil {
return fmt.Errorf("set CPU load balancing: %w", err)
}
}
Expand All @@ -76,7 +81,7 @@ func (h *HighPerformanceHooks) PreStart(ctx context.Context, c *oci.Container, s
// disable the CFS quota for the container CPUs
if shouldCPUQuotaBeDisabled(s.Annotations()) {
log.Infof(ctx, "Disable cpu cfs quota for container %q", c.ID())
if err := setCPUQuota(s.CgroupParent(), c); err != nil {
if err := setCPUQuota(podManager, containerManagers); err != nil {
return fmt.Errorf("set CPU CFS quota: %w", err)
}
}
Expand Down Expand Up @@ -208,34 +213,19 @@ func annotationValueDeprecationWarning(annotation string) string {
// Since CRI-O is the owner of the container cgroup, it must set this value for
// the container. Some other entity (kubelet, external service) must ensure this is the case for all
// other cgroups that intersect (at minimum: all parent cgroups of this cgroup).
func disableCPULoadBalancing(c *oci.Container) error {
lspec := c.Spec().Linux
if lspec == nil ||
lspec.Resources == nil ||
lspec.Resources.CPU == nil ||
lspec.Resources.CPU.Cpus == "" {
return fmt.Errorf("find container %s CPUs", c.ID())
}

func disableCPULoadBalancing(containerManagers []cgroups.Manager) error {
if node.CgroupIsV2() {
return fmt.Errorf("disabling CPU load balancing on cgroupv2 not yet supported")
}

pid, err := c.Pid()
if err != nil {
return fmt.Errorf("failed to get pid of container %s: %w", c.ID(), err)
}
controllers, err := cgroups.ParseCgroupFile("/proc/" + strconv.Itoa(pid) + "/cgroup")
if err != nil {
return fmt.Errorf("failed to get cgroups of container %s: %w", c.ID(), err)
}

cpusetPath, ok := controllers["cpuset"]
if !ok {
return fmt.Errorf("failed to get cpuset of container %s", c.ID())
for i := len(containerManagers) - 1; i >= 0; i-- {
cpusetPath := containerManagers[i].Path("cpuset")
if err := cgroups.WriteFile(cpusetPath, "cpuset.sched_load_balance", "0"); err != nil {
return err
}
}

return cgroups.WriteFile("/sys/fs/cgroup/cpuset"+cpusetPath, "cpuset.sched_load_balance", "0")
return nil
}

func setIRQLoadBalancing(ctx context.Context, c *oci.Container, enable bool, irqSmpAffinityFile, irqBalanceConfigFile string) error {
Expand Down Expand Up @@ -287,21 +277,19 @@ func setIRQLoadBalancing(ctx context.Context, c *oci.Container, enable bool, irq
return nil
}

func setCPUQuota(parentDir string, c *oci.Container) error {
containerCgroup, containerCgroupParent, systemd, err := containerCgroupAndParent(parentDir, c)
if err != nil {
func setCPUQuota(podManager cgroups.Manager, containerManagers []cgroups.Manager) error {
if err := disableCPUQuotaForCgroup(podManager); err != nil {
return err
}
podCgroup := filepath.Base(containerCgroupParent)
podCgroupParent := filepath.Dir(containerCgroupParent)

if err := disableCPUQuotaForCgroup(podCgroup, podCgroupParent, systemd); err != nil {
return err
for _, containerManager := range containerManagers {
if err := disableCPUQuotaForCgroup(containerManager); err != nil {
return err
}
}
return disableCPUQuotaForCgroup(containerCgroup, containerCgroupParent, systemd)
return nil
}

func containerCgroupAndParent(parentDir string, c *oci.Container) (ctrCgroup, parentCgroup string, systemd bool, _ error) {
func libctrManagersForPodAndContainerCgroup(c *oci.Container, parentDir string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, _ error) {
var (
cgroupManager cgmgr.CgroupManager
err error
Expand All @@ -316,25 +304,60 @@ func containerCgroupAndParent(parentDir string, c *oci.Container) (ctrCgroup, pa
// Programming error, this is only possible if the manager string is invalid.
panic(err)
}
cgroupPath, err := cgroupManager.ContainerCgroupAbsolutePath(parentDir, c.ID())

containerCgroupFullPath, err := cgroupManager.ContainerCgroupAbsolutePath(parentDir, c.ID())
if err != nil {
return nil, nil, err
}

podCgroupFullPath := filepath.Dir(containerCgroupFullPath)
podManager, err = libctrManager(filepath.Base(podCgroupFullPath), filepath.Dir(podCgroupFullPath), cgroupManager.IsSystemd())
if err != nil {
return "", "", false, err
return nil, nil, err
}
containerCgroup := filepath.Base(cgroupPath)

containerCgroup := filepath.Base(containerCgroupFullPath)
// A quirk of libcontainer's cgroup driver.
// See explanation in disableCPUQuotaForCgroup function.
if cgroupManager.IsSystemd() {
containerCgroup = c.ID()
}
return containerCgroup, filepath.Dir(cgroupPath), cgroupManager.IsSystemd(), nil
}

func disableCPUQuotaForCgroup(cgroup, parent string, systemd bool) error {
mgr, err := libctrManager(cgroup, parent, systemd)
containerManager, err := libctrManager(containerCgroup, filepath.Dir(containerCgroupFullPath), cgroupManager.IsSystemd())
if err != nil {
return err
return nil, nil, err
}
containerManagers = []cgroups.Manager{containerManager}

// crun actually does the cgroup configuration in a child of the cgroup CRI-O expects to be the container's
extraManager, err := trueContainerCgroupManager(containerCgroupFullPath)
if err != nil {
return nil, nil, err
}
if extraManager != nil {
containerManagers = append(containerManagers, extraManager)
}
return podManager, containerManagers, nil
}

func trueContainerCgroupManager(expectedContainerCgroup string) (cgroups.Manager, error) {
// HACK: There isn't really a better way to check if the actual container cgroup is in a child cgroup of the expected.
// We could check /proc/$pid/cgroup, but we need to be able to query this after the container exits and the process is gone.
// We know the source of this: crun creates a sub cgroup of the container to do the actual management, to enforce systemd's single
// owner rule. Thus, we need to hardcode this check.
actualContainerCgroup := filepath.Join(expectedContainerCgroup, "container")
cgroupRoot := "/sys/fs/cgroup"
// Choose cpuset as the cgroup to check, with little reason.
if !node.CgroupIsV2() {
cgroupRoot += "/cpuset"
}
if _, err := os.Stat(filepath.Join(cgroupRoot, actualContainerCgroup)); err != nil {
return nil, nil
}
// must be crun, make another libctrManager. Regardless of cgroup driver, it will be treated as cgroupfs
return libctrManager(filepath.Base(actualContainerCgroup), filepath.Dir(actualContainerCgroup), false)
}

func disableCPUQuotaForCgroup(mgr cgroups.Manager) error {
return mgr.Set(&configs.Resources{
SkipDevices: true,
CpuQuota: -1,
Expand Down
3 changes: 3 additions & 0 deletions test/cgroups.bats
Original file line number Diff line number Diff line change
Expand Up @@ -219,4 +219,7 @@ EOF
cgroup_file="cpu.cfs_quota_us"
[[ $(cat "$CTR_CGROUP"/"$cgroup_file") == "-1" ]]
[[ $(cat "$POD_CGROUP"/"$cgroup_file") == "-1" ]]
if [[ "$CONTAINER_DEFAULT_RUNTIME" == "crun" ]]; then
[[ $(cat "$CTR_CGROUP"/container/"$cgroup_file") == "-1" ]]
fi
}
33 changes: 11 additions & 22 deletions test/cpu_load_balancing.bats
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,16 @@ EOF
}

function check_sched_load_balance() {
local pid="$1"
local ctr_id="$1"
local is_enabled="$2"

path=$(sched_load_balance_path "$pid")
set_container_pod_cgroup_root "cpuset" "$ctr_id"
cgroup_file="cpuset.sched_load_balance"

[[ "$is_enabled" == $(cat "$path") ]]
}

function sched_load_balance_path() {
local pid="$1"

path="/sys/fs/cgroup/cpuset"
loadbalance_filename="cpuset.sched_load_balance"
cgroup=$(grep cpuset /proc/"$pid"/cgroup | tr ":" " " | awk '{ printf $3 }')

echo "$path$cgroup/$loadbalance_filename"
[[ $(cat "$CTR_CGROUP"/"$cgroup_file") == "$is_enabled" ]]
if [[ "$CONTAINER_DEFAULT_RUNTIME" == "crun" ]]; then
[[ $(cat "$CTR_CGROUP"/container/"$cgroup_file") == "$is_enabled" ]]
fi
}

# Verify the pre start runtime handler hooks run when triggered by annotation and workload.
Expand All @@ -70,26 +64,23 @@ function sched_load_balance_path() {
"$TESTDATA"/container_sleep.json > "$ctrconfig"

ctr_id=$(crictl run "$ctrconfig" "$sboxconfig")
ctr_pid=$(crictl inspect "$ctr_id" | jq .info.pid)

# check for sched_load_balance
check_sched_load_balance "$ctr_pid" 0 # disabled
check_sched_load_balance "$ctr_id" 0 # disabled
}

# Verify the post stop runtime handler hooks run when a container is stopped manually.
@test "test cpu load balance disabled on manual stop" {
start_crio

ctr_id=$(crictl run "$TESTDATA"/container_sleep.json "$TESTDATA"/sandbox_config.json)
ctr_pid=$(crictl inspect "$ctr_id" | jq .info.pid)

# check for sched_load_balance
path=$(sched_load_balance_path "$ctr_pid")
[[ "1" == $(cat "$path") ]]
check_sched_load_balance "$ctr_id" 1 # enabled

# check sched_load_balance is disabled after container stopped
crictl stop "$ctr_id"
[[ "0" == $(cat "$path") ]]
check_sched_load_balance "$ctr_id" 0 # disabled
}

# Verify the post stop runtime handler hooks run when a container exits on its own.
Expand All @@ -99,12 +90,10 @@ function sched_load_balance_path() {
jq ' .command = ["/bin/sh", "-c", "sleep 5 && exit 0"]' \
"$TESTDATA"/container_config.json > "$ctrconfig"
ctr_id=$(crictl run "$ctrconfig" "$TESTDATA"/sandbox_config.json)
ctr_pid=$(crictl inspect "$ctr_id" | jq .info.pid)

path=$(sched_load_balance_path "$ctr_pid")
# wait until container exits naturally
sleep 10

# check for sched_load_balance
[[ "0" == $(cat "$path") ]]
check_sched_load_balance "$ctr_id" 0 # disabled
}

0 comments on commit 9b9c375

Please sign in to comment.