Skip to content

Commit

Permalink
add runc shim support for sched core
Browse files Browse the repository at this point in the history
In Linux 5.14, and hopefully some backports, core scheduling allows processes to
be co-scheduled within the same domain on SMT-enabled systems.

The containerd impl sets the core sched domain when launching a shim. This
allows a clean way for each shim (container/pod) to be in its own domain, and for
any additional containers (v2 pods) to be launched with the same domain, as well
as any exec'd process added to the container.

kernel docs: https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html

Signed-off-by: Michael Crosby <michael@thepasture.io>
  • Loading branch information
crosbymichael committed Oct 8, 2021
1 parent 88e1cf5 commit e48bbe8
Show file tree
Hide file tree
Showing 99 changed files with 4,329 additions and 3,611 deletions.
1 change: 1 addition & 0 deletions cmd/containerd/command/main.go
Expand Up @@ -184,6 +184,7 @@ can be used and modified as necessary as a custom configuration.`
s *server.Server
err error
}

// run server initialization in a goroutine so we don't end up blocking important things like SIGTERM handling
// while the server is initializing.
// As an example opening the bolt database will block forever if another containerd is already running and containerd
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Expand Up @@ -62,7 +62,7 @@ require (
go.opentelemetry.io/otel/trace v1.0.1
golang.org/x/net v0.0.0-20210520170846-37e1c6afe023
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c
golang.org/x/sys v0.0.0-20210915083310-ed5796bab164
google.golang.org/grpc v1.41.0
google.golang.org/protobuf v1.27.1
gotest.tools/v3 v3.0.3
Expand Down
3 changes: 2 additions & 1 deletion go.sum
Expand Up @@ -762,8 +762,9 @@ golang.org/x/sys v0.0.0-20210423185535-09eb48e85fd7/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210426230700-d19ff857e887/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210915083310-ed5796bab164 h1:7ZDGnxgHAMw7thfC5bEos0RDAccZKxioiWBhfIe+tvw=
golang.org/x/sys v0.0.0-20210915083310-ed5796bab164/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d h1:SZxvLBoTP5yHO3Frd4z4vrF+DBX9vMVanchswa69toE=
Expand Down
49 changes: 49 additions & 0 deletions pkg/schedcore/prctl_linux.go
@@ -0,0 +1,49 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package schedcore

import (
"golang.org/x/sys/unix"
)

// PidType describes how the pid given to a sched core operation should be
// treated. Its numeric value is handed to the PR_SCHED_CORE prctl as the
// pid-type argument.
type PidType int

// Raw pid-type values backing the exported PidType constants.
const (
	pidtypePid = iota
	pidtypeTgid
	pidtypePgid
)

// Scopes that a sched core operation can be applied to.
const (
	// Pid affects the current pid
	Pid PidType = pidtypePid
	// ThreadGroup affects every thread in the group
	ThreadGroup PidType = pidtypeTgid
	// ProcessGroup affects every process in the group
	ProcessGroup PidType = pidtypePgid
)

// Create creates a new sched core domain by issuing the PR_SCHED_CORE prctl
// with the PR_SCHED_CORE_CREATE command.
//
// t is forwarded as the pid-type argument and selects how widely the new
// domain is applied. The error from the prctl call is returned unchanged.
func Create(t PidType) error {
	// The pid argument is left at zero.
	// NOTE(review): per the kernel core-scheduling docs a zero pid presumably
	// addresses the calling task — confirm.
	const pid = 0
	scope := uintptr(t)
	return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_CREATE, pid, scope, 0)
}

// ShareFrom shares the sched core domain from the provided pid by issuing the
// PR_SCHED_CORE prctl with the PR_SCHED_CORE_SHARE_FROM command.
//
// pid identifies the task whose domain is shared and t is forwarded as the
// pid-type argument. The error from the prctl call is returned unchanged.
func ShareFrom(pid uint64, t PidType) error {
	var (
		source = uintptr(pid)
		scope  = uintptr(t)
	)
	return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_SHARE_FROM, source, scope, 0)
}
54 changes: 35 additions & 19 deletions runtime/v2/binary.go
Expand Up @@ -35,19 +35,28 @@ import (
"github.com/sirupsen/logrus"
)

func shimBinary(bundle *Bundle, runtime, containerdAddress string, containerdTTRPCAddress string) *binary {
// shimBinaryConfig groups the settings that shimBinary copies into a new
// binary value.
type shimBinaryConfig struct {
// runtime is stored as binary.runtime.
runtime string
// address is stored as binary.containerdAddress.
address string
// ttrpcAddress is stored as binary.containerdTTRPCAddress.
ttrpcAddress string
// schedCore is stored as binary.schedCore.
schedCore bool
}

func shimBinary(bundle *Bundle, config shimBinaryConfig) *binary {
return &binary{
bundle: bundle,
runtime: runtime,
containerdAddress: containerdAddress,
containerdTTRPCAddress: containerdTTRPCAddress,
runtime: config.runtime,
containerdAddress: config.address,
containerdTTRPCAddress: config.ttrpcAddress,
schedCore: config.schedCore,
}
}

// binary holds the settings used to build shim commands for a bundle.
type binary struct {
// runtime is passed to client.Command as CommandConfig.Runtime.
runtime string
// containerdAddress is passed to client.Command as CommandConfig.Address.
containerdAddress string
// containerdTTRPCAddress is passed to client.Command as CommandConfig.TTRPCAddress.
containerdTTRPCAddress string
// schedCore is passed to client.Command as CommandConfig.SchedCore when starting the shim.
schedCore bool
// bundle is the bundle the shim commands are built for.
bundle *Bundle
}

Expand All @@ -61,13 +70,15 @@ func (b *binary) Start(ctx context.Context, opts *types.Any, onClose func()) (_

cmd, err := client.Command(
ctx,
b.runtime,
b.containerdAddress,
b.containerdTTRPCAddress,
b.bundle.Path,
opts,
args...,
)
&client.CommandConfig{
Runtime: b.runtime,
Address: b.containerdAddress,
TTRPCAddress: b.containerdTTRPCAddress,
Path: b.bundle.Path,
Opts: opts,
Args: args,
SchedCore: b.schedCore,
})
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -138,14 +149,19 @@ func (b *binary) Delete(ctx context.Context) (*runtime.Exit, error) {
}

cmd, err := client.Command(ctx,
b.runtime,
b.containerdAddress,
b.containerdTTRPCAddress,
bundlePath,
nil,
"-id", b.bundle.ID,
"-bundle", b.bundle.Path,
"delete")
&client.CommandConfig{
Runtime: b.runtime,
Address: b.containerdAddress,
TTRPCAddress: b.containerdTTRPCAddress,
Path: bundlePath,
Opts: nil,
Args: []string{
"-id", b.bundle.ID,
"-bundle", b.bundle.Path,
"delete",
},
})

if err != nil {
return nil, err
}
Expand Down
58 changes: 46 additions & 12 deletions runtime/v2/manager.go
Expand Up @@ -41,6 +41,8 @@ import (
type Config struct {
// Supported platforms
Platforms []string `toml:"platforms"`
// SchedCore enables linux core scheduling
SchedCore bool `toml:"sched_core"`
}

func init() {
Expand All @@ -55,7 +57,8 @@ func init() {
Platforms: defaultPlatforms(),
},
InitFn: func(ic *plugin.InitContext) (interface{}, error) {
supportedPlatforms, err := parsePlatforms(ic.Config.(*Config).Platforms)
config := ic.Config.(*Config)
supportedPlatforms, err := parsePlatforms(config.Platforms)
if err != nil {
return nil, err
}
Expand All @@ -78,26 +81,45 @@ func init() {
cs := metadata.NewContainerStore(m.(*metadata.DB))
events := ep.(*exchange.Exchange)

return New(ic.Context, ic.Root, ic.State, ic.Address, ic.TTRPCAddress, events, cs)
return New(ic.Context, &ManagerConfig{
Root: ic.Root,
State: ic.State,
Address: ic.Address,
TTRPCAddress: ic.TTRPCAddress,
Events: events,
Store: cs,
SchedCore: config.SchedCore,
})
},
})
}

// ManagerConfig carries the settings passed to New when creating a TaskManager.
type ManagerConfig struct {
// Root is a directory that New creates if it does not exist.
Root string
// State is a directory that New creates if it does not exist.
State string
// Store is the container store handed to the TaskManager.
Store containers.Store
// Events is the event exchange handed to the TaskManager.
Events *exchange.Exchange
// Address is kept by the TaskManager as the containerd address.
Address string
// TTRPCAddress is kept by the TaskManager as the containerd TTRPC address.
TTRPCAddress string
// SchedCore is kept by the TaskManager; the plugin init sets it from Config.SchedCore.
SchedCore bool
}

// New task manager for v2 shims
func New(ctx context.Context, root, state, containerdAddress, containerdTTRPCAddress string, events *exchange.Exchange, cs containers.Store) (*TaskManager, error) {
for _, d := range []string{root, state} {
func New(ctx context.Context, config *ManagerConfig) (*TaskManager, error) {
for _, d := range []string{config.Root, config.State} {
if err := os.MkdirAll(d, 0711); err != nil {
return nil, err
}
}
m := &TaskManager{
root: root,
state: state,
containerdAddress: containerdAddress,
containerdTTRPCAddress: containerdTTRPCAddress,
root: config.Root,
state: config.State,
containerdAddress: config.Address,
containerdTTRPCAddress: config.TTRPCAddress,
schedCore: config.SchedCore,
tasks: runtime.NewTaskList(),
events: events,
containers: cs,
events: config.Events,
containers: config.Store,
}
if err := m.loadExistingTasks(ctx); err != nil {
return nil, err
Expand All @@ -111,6 +133,7 @@ type TaskManager struct {
state string
containerdAddress string
containerdTTRPCAddress string
schedCore bool

tasks *runtime.TaskList
events *exchange.Exchange
Expand Down Expand Up @@ -167,7 +190,12 @@ func (m *TaskManager) startShim(ctx context.Context, bundle *Bundle, id string,
topts = opts.RuntimeOptions
}

b := shimBinary(bundle, opts.Runtime, m.containerdAddress, m.containerdTTRPCAddress)
b := shimBinary(bundle, shimBinaryConfig{
runtime: opts.Runtime,
address: m.containerdAddress,
ttrpcAddress: m.containerdTTRPCAddress,
schedCore: m.schedCore,
})
shim, err := b.Start(ctx, topts, func() {
log.G(ctx).WithField("id", id).Info("shim disconnected")

Expand Down Expand Up @@ -303,7 +331,13 @@ func (m *TaskManager) loadTasks(ctx context.Context) error {
bundle.Delete()
continue
}
binaryCall := shimBinary(bundle, container.Runtime.Name, m.containerdAddress, m.containerdTTRPCAddress)
binaryCall := shimBinary(bundle,
shimBinaryConfig{
runtime: container.Runtime.Name,
address: m.containerdAddress,
ttrpcAddress: m.containerdTTRPCAddress,
schedCore: m.schedCore,
})
shim, err := loadShim(ctx, bundle, func() {
log.G(ctx).WithField("id", id).Info("shim disconnected")

Expand Down
11 changes: 11 additions & 0 deletions runtime/v2/runc/v1/service.go
Expand Up @@ -24,6 +24,7 @@ import (
"io"
"os"
"path/filepath"
goruntime "runtime"
"sync"
"syscall"
"time"
Expand All @@ -37,6 +38,7 @@ import (
"github.com/containerd/containerd/pkg/oom"
oomv1 "github.com/containerd/containerd/pkg/oom/v1"
"github.com/containerd/containerd/pkg/process"
"github.com/containerd/containerd/pkg/schedcore"
"github.com/containerd/containerd/pkg/stdio"
"github.com/containerd/containerd/runtime/v2/runc"
"github.com/containerd/containerd/runtime/v2/runc/options"
Expand Down Expand Up @@ -166,10 +168,19 @@ func (s *service) StartShim(ctx context.Context, opts shim.StartOpts) (_ string,

cmd.ExtraFiles = append(cmd.ExtraFiles, f)

goruntime.LockOSThread()
if os.Getenv("SCHED_CORE") != "" {
if err := schedcore.Create(schedcore.ProcessGroup); err != nil {
return "", errors.Wrap(err, "enable sched core support")
}
}

if err := cmd.Start(); err != nil {
f.Close()
return "", err
}
goruntime.UnlockOSThread()

defer func() {
if retErr != nil {
cmd.Process.Kill()
Expand Down
12 changes: 12 additions & 0 deletions runtime/v2/runc/v2/service.go
Expand Up @@ -25,6 +25,7 @@ import (
"io"
"os"
"path/filepath"
goruntime "runtime"
"sync"
"syscall"
"time"
Expand All @@ -40,6 +41,7 @@ import (
oomv1 "github.com/containerd/containerd/pkg/oom/v1"
oomv2 "github.com/containerd/containerd/pkg/oom/v2"
"github.com/containerd/containerd/pkg/process"
"github.com/containerd/containerd/pkg/schedcore"
"github.com/containerd/containerd/pkg/stdio"
"github.com/containerd/containerd/pkg/userns"
"github.com/containerd/containerd/runtime/v2/runc"
Expand Down Expand Up @@ -234,10 +236,20 @@ func (s *service) StartShim(ctx context.Context, opts shim.StartOpts) (_ string,

cmd.ExtraFiles = append(cmd.ExtraFiles, f)

goruntime.LockOSThread()
if os.Getenv("SCHED_CORE") != "" {
if err := schedcore.Create(schedcore.ProcessGroup); err != nil {
return "", errors.Wrap(err, "enable sched core support")
}
}

if err := cmd.Start(); err != nil {
f.Close()
return "", err
}

goruntime.UnlockOSThread()

defer func() {
if retErr != nil {
cmd.Process.Kill()
Expand Down

0 comments on commit e48bbe8

Please sign in to comment.