perfruntime: mixed-cpus: add shared cpus to container's cgroup
Given an annotation, CRI-O appends additional CPUs (named shared CPUs)
to the container's cgroup.

The operation is done via a performance runtime hook.

This allows the container to run lightweight tasks
on the shared CPUs, while all other tasks that
require full isolation keep running on the guaranteed CPUs.

It also injects environment variables into the container
so that workloads can distinguish between the isolated and shared CPUs.
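
For example (a sketch: the hook compares the annotation value against its
annotationEnable constant, assumed here to be "enable"; the CPU ids are
hypothetical), annotating a pod with

    cpu-shared.crio.io/<container-name>: "enable"

opts the named container in, after which it sees:

    OPENSHIFT_ISOLATED_CPUS=2-3
    OPENSHIFT_SHARED_CPUS=0-1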

Signed-off-by: Talor Itzhak <titzhak@redhat.com>
Tal-or committed Nov 29, 2023
1 parent 5cad7ab commit 9bdb2a6
Showing 13 changed files with 258 additions and 6 deletions.
1 change: 1 addition & 0 deletions completions/bash/crio
@@ -113,6 +113,7 @@ h
--seccomp-use-default-when-empty
--selinux
--separate-pull-cgroup
--shared-cpuset
--signature-policy
--signature-policy-dir
--stats-collection-period
1 change: 1 addition & 0 deletions completions/fish/crio.fish
@@ -147,6 +147,7 @@ complete -c crio -n '__fish_crio_no_subcommand' -l seccomp-profile -r -d 'Path t
complete -c crio -n '__fish_crio_no_subcommand' -f -l seccomp-use-default-when-empty -d 'Use the default seccomp profile when an empty one is specified. This option is currently deprecated, and will be replaced by the SeccompDefault FeatureGate in Kubernetes.'
complete -c crio -n '__fish_crio_no_subcommand' -f -l selinux -d 'Enable selinux support.'
complete -c crio -n '__fish_crio_no_subcommand' -f -l separate-pull-cgroup -r -d '[EXPERIMENTAL] Pull in new cgroup.'
complete -c crio -n '__fish_crio_no_subcommand' -f -l shared-cpuset -r -d 'CPU set that will be used for guaranteed containers that want access to shared CPUs'
complete -c crio -n '__fish_crio_no_subcommand' -l signature-policy -r -d 'Path to signature policy JSON file.'
complete -c crio -n '__fish_crio_no_subcommand' -l signature-policy-dir -r -d 'Path to the root directory for namespaced signature policies. Must be an absolute path.'
complete -c crio -n '__fish_crio_no_subcommand' -f -l stats-collection-period -r -d 'The number of seconds between collecting pod and container stats. If set to 0, the stats are collected on-demand instead.'
1 change: 1 addition & 0 deletions completions/zsh/_crio
@@ -120,6 +120,7 @@ it later with **--config**. Global options will modify the output.'
'--seccomp-use-default-when-empty'
'--selinux'
'--separate-pull-cgroup'
'--shared-cpuset'
'--signature-policy'
'--signature-policy-dir'
'--stats-collection-period'
3 changes: 3 additions & 0 deletions docs/crio.8.md
@@ -109,6 +109,7 @@ crio
[--seccomp-use-default-when-empty]
[--selinux]
[--separate-pull-cgroup]=[value]
[--shared-cpuset]=[value]
[--signature-policy-dir]=[value]
[--signature-policy]=[value]
[--stats-collection-period]=[value]
@@ -390,6 +391,8 @@ crio [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...]

**--separate-pull-cgroup**="": [EXPERIMENTAL] Pull in new cgroup.

**--shared-cpuset**="": CPU set that will be used for guaranteed containers that want access to shared CPUs

**--signature-policy**="": Path to signature policy JSON file.

**--signature-policy-dir**="": Path to the root directory for namespaced signature policies. Must be an absolute path. (default: "/etc/crio/policies")
6 changes: 6 additions & 0 deletions docs/crio.conf.5.md
@@ -289,6 +289,12 @@ the container runtime configuration.
You can specify CPUs in the Linux CPU list format.
To get better isolation for guaranteed pods, set this parameter to be equal to kubelet reserved-cpus.

**shared_cpuset**=""
Determines the CPU set which is allowed to be shared between guaranteed containers,
regardless of, and in addition to, the exclusiveness of their CPUs.
This field is optional and is not used unless specified.
You can specify CPUs in the Linux CPU list format.

**namespaces_dir**="/var/run"
The directory where the state of the managed namespaces gets tracked. Only used when manage_ns_lifecycle is true

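
A minimal sketch of how the option might look in crio.conf, assuming it sits in
the [crio.runtime] table alongside infra_ctr_cpuset (CPU list values are
hypothetical):

    [crio.runtime]
    # CPUs that guaranteed containers may use in addition to their exclusive set.
    shared_cpuset = "0-1,32-33"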
33 changes: 33 additions & 0 deletions internal/config/node/cgroups_linux.go
@@ -7,6 +7,7 @@ import (
"errors"
"os"
"path/filepath"
"strconv"
"sync"

"github.com/containers/common/pkg/cgroups"
@@ -26,6 +27,38 @@ var (
cgroupIsV2Err error
)

// CgroupHierarchy is a cgroup-version-agnostic struct that provides
// access to controller paths.
type CgroupHierarchy struct {
controllers map[string]string
}

// GetAbsoluteControllerContainerPath returns the absolute cgroupfs path of the
// given controller for the container.
func (ch *CgroupHierarchy) GetAbsoluteControllerContainerPath(controller string) string {
if CgroupIsV2() {
// on v2 there is a single unified hierarchy, keyed by the empty controller name
return filepath.Join("/sys/fs/cgroup", ch.controllers[""])
}
return filepath.Join("/sys/fs/cgroup", controller, ch.controllers[controller])
}

// GetAbsoluteControllerPodPath returns the absolute cgroupfs path of the
// given controller for the pod, i.e. the parent of the container's path.
func (ch *CgroupHierarchy) GetAbsoluteControllerPodPath(controller string) string {
selfPath := ch.GetAbsoluteControllerContainerPath(controller)
return filepath.Dir(selfPath)
}

// CgroupBuildHierarchyFrom builds a CgroupHierarchy for the given container
// PID by parsing /proc/<pid>/cgroup.
func CgroupBuildHierarchyFrom(containerPid int) (*CgroupHierarchy, error) {
controllers, err := libctrcgroups.ParseCgroupFile("/proc/" + strconv.Itoa(containerPid) + "/cgroup")
if err != nil {
return nil, err
}
return &CgroupHierarchy{controllers: controllers}, nil
}

func CgroupIsV2() bool {
var cgroupIsV2 bool
cgroupIsV2, cgroupIsV2Err = cgroups.IsCgroup2UnifiedMode()
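
A hedged usage sketch of the new helpers; since the package is internal it only
compiles inside the cri-o module, and the PID below is hypothetical:

    package example

    import (
    	"fmt"

    	"github.com/cri-o/cri-o/internal/config/node"
    )

    func printCgroupPaths() error {
    	// 12345 is a hypothetical container PID.
    	ch, err := node.CgroupBuildHierarchyFrom(12345)
    	if err != nil {
    		return err
    	}
    	// On cgroup v1 this resolves to /sys/fs/cgroup/cpuset/<container-path>;
    	// on v2 the unified hierarchy is used and the controller name is ignored.
    	fmt.Println(ch.GetAbsoluteControllerContainerPath("cpuset"))
    	// The pod path is simply the parent directory of the container path.
    	fmt.Println(ch.GetAbsoluteControllerPodPath("cpuset"))
    	return nil
    }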
9 changes: 9 additions & 0 deletions internal/criocli/criocli.go
@@ -393,6 +393,9 @@ func mergeConfig(config *libconfig.Config, ctx *cli.Context) error {
if ctx.IsSet("infra-ctr-cpuset") {
config.InfraCtrCPUSet = ctx.String("infra-ctr-cpuset")
}
if ctx.IsSet("shared-cpuset") {
config.SharedCPUSet = ctx.String("shared-cpuset")
}
if ctx.IsSet("stats-collection-period") {
config.StatsCollectionPeriod = ctx.Int("stats-collection-period")
}
@@ -1111,6 +1114,12 @@ func getCrioFlags(defConf *libconfig.Config) []cli.Flag {
EnvVars: []string{"CONTAINER_INFRA_CTR_CPUSET"},
Value: defConf.InfraCtrCPUSet,
},
&cli.StringFlag{
Name: "shared-cpuset",
Usage:   "CPU set that will be used for guaranteed containers that want access to shared CPUs",
EnvVars: []string{"CONTAINER_SHARED_CPUSET"},
Value: defConf.SharedCPUSet,
},
&cli.StringFlag{
Name: "clean-shutdown-file",
Usage: "Location for CRI-O to lay down the clean shutdown file. It indicates whether we've had time to sync changes to disk before shutting down. If not found, crio wipe will clear the storage directory.",
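
A usage sketch for the new flag (the CPU list is hypothetical); per the EnvVars
field above, setting CONTAINER_SHARED_CPUSET is equivalent:

    crio --shared-cpuset "0-1,32-33"
    CONTAINER_SHARED_CPUSET="0-1,32-33" crio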
145 changes: 142 additions & 3 deletions internal/runtimehandlerhooks/high_performance_hooks_linux.go
@@ -24,6 +24,7 @@ import (
libCtrMgr "github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/utils/cpuset"
)
@@ -33,9 +34,6 @@ const (
HighPerformance = "high-performance"
// IrqSmpAffinityProcFile contains the default smp affinity mask configuration
IrqSmpAffinityProcFile = "/proc/irq/default_smp_affinity"

cpusetCpus = "cpuset.cpus"
cpusetCpusExclusive = "cpuset.cpus.exclusive"
)

const (
@@ -48,11 +46,26 @@ const (
irqBalancedName = "irqbalance"
sysCPUDir = "/sys/devices/system/cpu"
sysCPUSaveDir = "/var/run/crio/cpu"
milliCPUToCPU = 1000
)

const (
cpusetPartition = "cpuset.cpus.partition"
cpusetExclusive = "cpuset.cpus.exclusive"
cgroupSubTreeControl = "cgroup.subtree_control"
cpusetCpus = "cpuset.cpus"
cpusetCpusExclusive = "cpuset.cpus.exclusive"
)

const (
IsolatedCPUsEnvVar = "OPENSHIFT_ISOLATED_CPUS"
SharedCPUsEnvVar = "OPENSHIFT_SHARED_CPUS"
)

// HighPerformanceHooks is used to run additional hooks that configure the system for latency-sensitive workloads
type HighPerformanceHooks struct {
irqBalanceConfigFile string
sharedCPUs string
}

func (h *HighPerformanceHooks) PreStart(ctx context.Context, c *oci.Container, s *sandbox.Sandbox) error {
@@ -76,6 +89,15 @@ func (h *HighPerformanceHooks) PreStart(ctx context.Context, c *oci.Container, s
}
}

if requestedSharedCPUs(s.Annotations(), c.CRIContainer().GetMetadata().GetName()) {
if h.sharedCPUs == "" {
return fmt.Errorf("shared CPUs were requested for container %q but none are defined", c.Name())
}
if err = setSharedCPUs(ctx, c, podManager, containerManagers, h.sharedCPUs, shouldCPULoadBalancingBeDisabled(s.Annotations())); err != nil {
return fmt.Errorf("set shared CPUs: %w", err)
}
}

// disable the IRQ smp load balancing for the container CPUs
if shouldIRQLoadBalancingBeDisabled(s.Annotations()) {
log.Infof(ctx, "Disable irq smp balancing for container %q", c.ID())
@@ -220,6 +242,12 @@ func annotationValueDeprecationWarning(annotation string) string {
return fmt.Sprintf("The usage of the annotation %q with value %q will be deprecated under 1.21", annotation, "true")
}

func requestedSharedCPUs(annotations fields.Set, cName string) bool {
key := crioannotations.CPUSharedAnnotation + "/" + cName
v, ok := annotations[key]
return ok && v == annotationEnable
}

// setCPULoadBalancing relies on the cpuset cgroup to disable load balancing for containers.
// The requisite condition to allow this is that the `cpuset.sched_load_balance` field be set to 0 for all cgroups
// that intersect with the `cpuset.cpus` of the container that wants load balancing disabled.
@@ -940,3 +968,114 @@ func convertAnnotationToLatency(annotation string) (maxLatency string, err error) {

return "", fmt.Errorf("invalid annotation value %s", annotation)
}

func setSharedCPUs(ctx context.Context, c *oci.Container, podManager cgroups.Manager, containerManagers []cgroups.Manager, sharedCPUs string, isLoadBalancingDisabled bool) error {
if isContainerCPUsSpecEmpty(c) {
return fmt.Errorf("no cpus found for container %q", c.Name())
}
cpuSpec := c.Spec().Linux.Resources.CPU
isolatedCPUSet, err := cpuset.Parse(cpuSpec.Cpus)
if err != nil {
return fmt.Errorf("failed to parse container %q cpus: %w", c.Name(), err)
}
sharedCPUSet, err := cpuset.Parse(sharedCPUs)
if err != nil {
return fmt.Errorf("failed to parse shared cpus: %w", err)
}
if sharedCPUSet.IsEmpty() {
return fmt.Errorf("shared CPU set is empty")
}
// pod level operations
podCgroup, err := podManager.GetCgroups()
if err != nil {
return err
}
newPodQuota, err := calculatePodQuota(&sharedCPUSet, podCgroup.Resources.CpuQuota, podCgroup.Resources.CpuPeriod)
if err != nil {
return fmt.Errorf("failed to calculate pod quota: %w", err)
}
err = podManager.Set(&configs.Resources{
SkipDevices: true,
CpuQuota: newPodQuota,
})
if err != nil {
return err
}
// container level operations
ctrCPUSet := isolatedCPUSet.Union(sharedCPUSet)
ctrQuota, err := calculateMaximalQuota(&ctrCPUSet, *(cpuSpec.Period))
if err != nil {
return fmt.Errorf("failed to calculate container %s quota: %w", c.ID(), err)
}
err = containerManagers[len(containerManagers)-1].Set(&configs.Resources{
SkipDevices: true,
CpuQuota: ctrQuota,
CpusetCpus: ctrCPUSet.String(),
})
if err != nil {
return err
}
if isLoadBalancingDisabled && node.CgroupIsV2() {
// on v2 all controllers live under the same path, so we move the
// isolated cpus into a separate child cgroup to disable their load balancing
ctrCgroup := containerManagers[len(containerManagers)-1].Path("")
if err := cgroups.WriteFile(ctrCgroup, cgroupSubTreeControl, "+cpu +cpuset"); err != nil {
return err
}
if err := cgroups.WriteFile(ctrCgroup, cpusetPartition, "member"); err != nil {
return err
}
cgroupChildDir := filepath.Join(ctrCgroup, "cgroup-child")
if err := os.Mkdir(cgroupChildDir, 0o755); err != nil {
return err
}
if err := cgroups.WriteFile(cgroupChildDir, cpusetCpus, isolatedCPUSet.String()); err != nil {
return err
}
if err := cgroups.WriteFile(cgroupChildDir, cpusetExclusive, isolatedCPUSet.String()); err != nil {
return err
}
if err := cgroups.WriteFile(cgroupChildDir, cpusetPartition, "isolated"); err != nil {
return err
}
}
injectCpusetEnv(c, &isolatedCPUSet, &sharedCPUSet)
log.Infof(ctx, "Shared cpus ids %s were added to container %q", sharedCPUSet.String(), c.Name())
return nil
}

func isContainerCPUsSpecEmpty(c *oci.Container) bool {
return c.Spec().Linux == nil ||
c.Spec().Linux.Resources == nil ||
c.Spec().Linux.Resources.CPU == nil ||
c.Spec().Linux.Resources.CPU.Cpus == ""
}

func calculateMaximalQuota(cpus *cpuset.CPUSet, period uint64) (quota int64, err error) {
quan, err := resource.ParseQuantity(strconv.Itoa(cpus.Size()))
if err != nil {
return
}
// after dividing by milliCPUToCPU the value is safe to convert to int64
quota = int64((uint64(quan.MilliValue()) * period) / milliCPUToCPU)
return
}

func calculatePodQuota(sharedCpus *cpuset.CPUSet, existingQuota int64, period uint64) (int64, error) {
additionalQuota, err := calculateMaximalQuota(sharedCpus, period)
if err != nil {
return 0, err
}
return existingQuota + additionalQuota, nil
}

func injectCpusetEnv(c *oci.Container, isolated, shared *cpuset.CPUSet) {
spec := c.Spec()
spec.Process.Env = append(spec.Process.Env,
fmt.Sprintf("%s=%s", IsolatedCPUsEnvVar, isolated.String()),
fmt.Sprintf("%s=%s", SharedCPUsEnvVar, shared.String()))
c.SetSpec(&spec)
}
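
Because calculateMaximalQuota feeds whole CPU counts into resource.ParseQuantity,
the math above reduces to size(cpus) * period. A standalone sketch of the
arithmetic (values hypothetical, constant mirrored from the hook):

    package main

    import "fmt"

    // maximalQuota mirrors calculateMaximalQuota for whole-CPU counts:
    // (milliCPUs * period) / milliCPUToCPU.
    func maximalQuota(nCPUs int, period uint64) int64 {
    	const milliCPUToCPU = 1000
    	milli := uint64(nCPUs) * 1000 // e.g. quantity "2" -> 2000 milli-CPUs
    	return int64(milli * period / milliCPUToCPU)
    }

    func main() {
    	// Two shared CPUs with the default 100ms (100000us) CFS period:
    	fmt.Println(maximalQuota(2, 100000)) // 200000, i.e. two full CPUs of quota
    	// calculatePodQuota then adds this on top of the pod's existing quota.
    }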
34 changes: 34 additions & 0 deletions internal/runtimehandlerhooks/high_performance_hooks_test.go
@@ -569,4 +569,38 @@ var _ = Describe("high_performance_hooks", func() {
})
})
})
Describe("setSharedCPUs", func() {
Context("with empty container CPUs list", func() {
container.SetSpec(
&specs.Spec{
Linux: &specs.Linux{
Resources: &specs.LinuxResources{
CPU: &specs.LinuxCPU{
Cpus: "0,1",
},
},
},
},
)
It("should result in error", func() {
Expect(setSharedCPUs(context.TODO(), container, nil, nil, "", false)).To(HaveOccurred())
})
})
Context("with empty shared CPUs list", func() {
container.SetSpec(
&specs.Spec{
Linux: &specs.Linux{
Resources: &specs.LinuxResources{
CPU: &specs.LinuxCPU{
Cpus: "0,1",
},
},
},
},
)
It("should result in error", func() {
Expect(setSharedCPUs(context.TODO(), container, nil, nil, "", false)).To(HaveOccurred())
})
})
})
})
7 changes: 4 additions & 3 deletions internal/runtimehandlerhooks/runtime_handler_hooks_linux.go
@@ -15,11 +15,11 @@ func GetRuntimeHandlerHooks(ctx context.Context, config *libconfig.Config, handl
defer span.End()
if strings.Contains(handler, HighPerformance) {
log.Warnf(ctx, "The usage of the handler %q without adding high-performance feature annotations under allowed_annotations will be deprecated under 1.21", HighPerformance)
return &HighPerformanceHooks{config.IrqBalanceConfigFile}, nil
return &HighPerformanceHooks{irqBalanceConfigFile: config.IrqBalanceConfigFile, sharedCPUs: config.SharedCPUSet}, nil
}
if highPerformanceAnnotationsSpecified(annotations) {
log.Warnf(ctx, "The usage of the handler %q without adding high-performance feature annotations under allowed_annotations will be deprecated under 1.21", HighPerformance)
return &HighPerformanceHooks{config.IrqBalanceConfigFile}, nil
return &HighPerformanceHooks{irqBalanceConfigFile: config.IrqBalanceConfigFile, sharedCPUs: config.SharedCPUSet}, nil
}
if cpuLoadBalancingAllowed(config) {
return &DefaultCPULoadBalanceHooks{}, nil
@@ -34,7 +34,8 @@ func highPerformanceAnnotationsSpecified(annotations map[string]string) bool {
strings.HasPrefix(k, crioann.CPUQuotaAnnotation) ||
strings.HasPrefix(k, crioann.IRQLoadBalancingAnnotation) ||
strings.HasPrefix(k, crioann.CPUCStatesAnnotation) ||
strings.HasPrefix(k, crioann.CPUFreqGovernorAnnotation) {
strings.HasPrefix(k, crioann.CPUFreqGovernorAnnotation) ||
strings.HasPrefix(k, crioann.CPUSharedAnnotation) {
return true
}
}
7 changes: 7 additions & 0 deletions pkg/annotations/annotations.go
@@ -45,6 +45,12 @@ const (
// CPUFreqGovernorAnnotation sets the cpufreq governor for CPUs used by the container
CPUFreqGovernorAnnotation = "cpu-freq-governor.crio.io"

// CPUSharedAnnotation indicates that a container, which is part of a
// guaranteed QoS pod, wants access to shared CPUs.
// The container name should be appended at the end of the annotation,
// for example: cpu-shared.crio.io/containerA
CPUSharedAnnotation = "cpu-shared.crio.io"

// SeccompNotifierActionAnnotation indicates a container is allowed to use the seccomp notifier feature.
SeccompNotifierActionAnnotation = "io.kubernetes.cri-o.seccompNotifierAction"

@@ -87,4 +93,5 @@ var AllAllowedAnnotations = []string{
PodLinuxOverhead,
PodLinuxResources,
LinkLogsAnnotation,
CPUSharedAnnotation,
}
