From 38d44d41db2612455ec4a5ea85e2932e884a506a Mon Sep 17 00:00:00 2001 From: Timo Beckers Date: Mon, 13 May 2024 17:40:50 +0200 Subject: [PATCH] loader: remove ctx from uncancellable functions Loading bpf objects used to be done by iproute2, where propagating ctx to the exec.Cmd invocations made sense, since realistically any shellout can hang for arbitrary reasons. Now the loader is fully hosted in the agent process, this no longer makes sense. Once we're blocked in a bpf() syscall, e.g. for loading a program, the verifier can be interrupted by sending a signal to the calling thread. Since the Go runtime routinely sends these signals under normal operation, ebpf-go will retry a few times when bpf() returns EINTR. The API currently doesn't expose a way to cancel program loading/verification, and there's no clear benefit to doing so in the first place. Verification is relatively lightweight compared to datapath compilation, so interrupting it during teardown is of questionable benefit. The agent doesn't expect it to be interruptible, it's bound to leave endpoints in an undefined state. This commit introduces the assumption that, once endpoint loading/attachment is kicked off (after compilation), it cannot be cancelled. This is reflected in the interface exposed to the rest of the system, by removing the ctx parameter on many methods. Only compilation can be interrupted, since it can take a long time on some systems, especially lower-spec. Signed-off-by: Timo Beckers --- pkg/datapath/fake/types/datapath.go | 2 +- pkg/datapath/loader/base.go | 6 +++--- pkg/datapath/loader/loader.go | 32 ++++++++++++++--------------- pkg/datapath/loader/loader_test.go | 4 ++-- pkg/datapath/loader/netlink.go | 8 +------- pkg/datapath/loader/types/types.go | 2 +- pkg/datapath/loader/xdp.go | 2 +- pkg/datapath/types/loader.go | 2 +- pkg/endpoint/bpf.go | 2 +- 9 files changed, 26 insertions(+), 34 deletions(-) diff --git a/pkg/datapath/fake/types/datapath.go b/pkg/datapath/fake/types/datapath.go index 9a371bf0e4ce1..21d1a2baac5a2 100644 --- a/pkg/datapath/fake/types/datapath.go +++ b/pkg/datapath/fake/types/datapath.go @@ -130,7 +130,7 @@ func (f *FakeLoader) CompileOrLoad(ctx context.Context, ep datapath.Endpoint, st panic("implement me") } -func (f *FakeLoader) ReloadDatapath(ctx context.Context, ep datapath.Endpoint, stats *metrics.SpanStat) error { +func (f *FakeLoader) ReloadDatapath(ep datapath.Endpoint, stats *metrics.SpanStat) error { panic("implement me") } diff --git a/pkg/datapath/loader/base.go b/pkg/datapath/loader/base.go index af52504e1932f..e592c682fb716 100644 --- a/pkg/datapath/loader/base.go +++ b/pkg/datapath/loader/base.go @@ -164,7 +164,7 @@ func cleanIngressQdisc(devices []string) error { } // reinitializeIPSec is used to recompile and load encryption network programs. -func (l *loader) reinitializeIPSec(ctx context.Context) error { +func (l *loader) reinitializeIPSec() error { // We need to take care not to load bpf_network and bpf_host onto the same // device. If devices are required, we load bpf_host and hence don't need // the code below, specific to EncryptInterface. Specifically, we will load @@ -200,7 +200,7 @@ func (l *loader) reinitializeIPSec(ctx context.Context) error { return nil } - coll, finalize, err := loadDatapath(ctx, networkObj) + coll, finalize, err := loadDatapath(networkObj) if err != nil { return fmt.Errorf("loading %s: %w", networkObj, err) } @@ -449,7 +449,7 @@ func (l *loader) Reinitialize(ctx context.Context, tunnelConfig tunnel.Config, d log.WithError(err).Fatal("failed to compile encryption programs") } - if err := l.reinitializeIPSec(ctx); err != nil { + if err := l.reinitializeIPSec(); err != nil { return err } } diff --git a/pkg/datapath/loader/loader.go b/pkg/datapath/loader/loader.go index 109ac5a887c25..2286dbe34be71 100644 --- a/pkg/datapath/loader/loader.go +++ b/pkg/datapath/loader/loader.go @@ -399,20 +399,14 @@ func removeObsoleteNetdevPrograms(devices []string) error { // - cilium_host: ingress and egress // - cilium_net: ingress // - native devices: ingress and (optionally) egress if certain features require it -func (l *loader) reloadHostDatapath(ctx context.Context, ep datapath.Endpoint, objPath string, devices []string) error { - // Warning: here be dragons. There used to be a single loop over - // interfaces+objs+progs here from the iproute2 days, but this was never - // correct to begin with. Tail call maps were always reused when possible, - // causing control flow to transition through invalid states as new tail calls - // were sequentially upserted into the array. - +func (l *loader) reloadHostDatapath(ep datapath.Endpoint, objPath string, devices []string) error { // Replace programs on cilium_host. host, err := netlink.LinkByName(ep.InterfaceName()) if err != nil { return fmt.Errorf("retrieving device %s: %w", ep.InterfaceName(), err) } - coll, finalize, err := loadDatapath(ctx, objPath) + coll, finalize, err := loadDatapath(objPath) if err != nil { return err } @@ -442,7 +436,7 @@ func (l *loader) reloadHostDatapath(ctx context.Context, ep datapath.Endpoint, o return err } - coll, finalize, err = loadDatapath(ctx, secondDevObjPath) + coll, finalize, err = loadDatapath(secondDevObjPath) if err != nil { return err } @@ -472,7 +466,7 @@ func (l *loader) reloadHostDatapath(ctx context.Context, ep datapath.Endpoint, o } // Load bpf_netdev_.o. - coll, finalize, err := loadDatapath(ctx, netdevObjPath) + coll, finalize, err := loadDatapath(netdevObjPath) if err != nil { return err } @@ -520,7 +514,7 @@ func (l *loader) reloadHostDatapath(ctx context.Context, ep datapath.Endpoint, o return nil } -func (l *loader) reloadDatapath(ctx context.Context, ep datapath.Endpoint, dirs *directoryInfo) error { +func (l *loader) reloadDatapath(ep datapath.Endpoint, dirs *directoryInfo) error { // Replace the current program objPath := path.Join(dirs.Output, endpointObj) device := ep.InterfaceName() @@ -535,11 +529,11 @@ func (l *loader) reloadDatapath(ctx context.Context, ep datapath.Endpoint, dirs } objPath = path.Join(dirs.Output, hostEndpointObj) - if err := l.reloadHostDatapath(ctx, ep, objPath, devices); err != nil { + if err := l.reloadHostDatapath(ep, objPath, devices); err != nil { return err } } else { - coll, finalize, err := loadDatapath(ctx, objPath) + coll, finalize, err := loadDatapath(objPath) if err != nil { return err } @@ -594,12 +588,16 @@ func (l *loader) replaceOverlayDatapath(ctx context.Context, cArgs []string, ifa return fmt.Errorf("compiling overlay program: %w", err) } + if err := ctx.Err(); err != nil { + return err + } + device, err := netlink.LinkByName(iface) if err != nil { return fmt.Errorf("retrieving device %s: %w", iface, err) } - coll, finalize, err := loadDatapath(ctx, overlayObj) + coll, finalize, err := loadDatapath(overlayObj) if err != nil { return err } @@ -693,11 +691,11 @@ func (l *loader) compileOrLoad(ctx context.Context, ep datapath.Endpoint, dirs * } stats.BpfWriteELF.End(err == nil) - return l.ReloadDatapath(ctx, ep, stats) + return l.ReloadDatapath(ep, stats) } // ReloadDatapath reloads the BPF datapath programs for the specified endpoint. -func (l *loader) ReloadDatapath(ctx context.Context, ep datapath.Endpoint, stats *metrics.SpanStat) (err error) { +func (l *loader) ReloadDatapath(ep datapath.Endpoint, stats *metrics.SpanStat) (err error) { dirs := directoryInfo{ Library: option.Config.BpfDir, Runtime: option.Config.StateDir, @@ -705,7 +703,7 @@ func (l *loader) ReloadDatapath(ctx context.Context, ep datapath.Endpoint, stats Output: ep.StateDir(), } stats.BpfLoadProg.Start() - err = l.reloadDatapath(ctx, ep, &dirs) + err = l.reloadDatapath(ep, &dirs) stats.BpfLoadProg.End(err == nil) return err } diff --git a/pkg/datapath/loader/loader_test.go b/pkg/datapath/loader/loader_test.go index 8ce676cde5acf..927bebd43382f 100644 --- a/pkg/datapath/loader/loader_test.go +++ b/pkg/datapath/loader/loader_test.go @@ -137,7 +137,7 @@ func TestReload(t *testing.T) { linkDir := testutils.TempBPFFS(t) for range 2 { - coll, finalize, err := loadDatapath(ctx, objPath) + coll, finalize, err := loadDatapath(objPath) require.NoError(t, err) require.NoError(t, attachSKBProgram(l, coll.Programs[symbolFromEndpoint], @@ -279,7 +279,7 @@ func BenchmarkReplaceDatapath(b *testing.B) { objPath := fmt.Sprintf("%s/%s", dirInfo.Output, endpointObj) b.ResetTimer() for i := 0; i < b.N; i++ { - coll, finalize, err := loadDatapath(ctx, objPath) + coll, finalize, err := loadDatapath(objPath) if err != nil { b.Fatal(err) } diff --git a/pkg/datapath/loader/netlink.go b/pkg/datapath/loader/netlink.go index 41c193e5c3357..7f7a68a116122 100644 --- a/pkg/datapath/loader/netlink.go +++ b/pkg/datapath/loader/netlink.go @@ -4,7 +4,6 @@ package loader import ( - "context" "errors" "fmt" "net" @@ -50,12 +49,7 @@ func directionToParent(dir string) uint32 { // For example, this is the case with from-netdev and to-netdev. If eth0:to-netdev // gets its program and maps replaced and unpinned, its eth0:from-netdev counterpart // will miss tail calls (and drop packets) until it has been replaced as well. -func loadDatapath(ctx context.Context, obj string) (*ebpf.Collection, func(), error) { - // Avoid unnecessarily loading a prog. - if err := ctx.Err(); err != nil { - return nil, nil, err - } - +func loadDatapath(obj string) (*ebpf.Collection, func(), error) { l := log.WithField("objPath", obj) // Load the ELF from disk. diff --git a/pkg/datapath/loader/types/types.go b/pkg/datapath/loader/types/types.go index 324e250c947ab..9f5c5ec6c5c2e 100644 --- a/pkg/datapath/loader/types/types.go +++ b/pkg/datapath/loader/types/types.go @@ -23,7 +23,7 @@ type Loader interface { HostDatapathInitialized() <-chan struct{} Reinitialize(ctx context.Context, tunnelConfig tunnel.Config, deviceMTU int, iptMgr types.IptablesManager, p types.Proxy) error ReinitializeXDP(ctx context.Context, extraCArgs []string) error - ReloadDatapath(ctx context.Context, ep types.Endpoint, stats *metrics.SpanStat) (err error) + ReloadDatapath(ep types.Endpoint, stats *metrics.SpanStat) (err error) RestoreTemplates(stateDir string) error Unload(ep types.Endpoint) } diff --git a/pkg/datapath/loader/xdp.go b/pkg/datapath/loader/xdp.go index 75471e6a88b96..27e2d39aa84f7 100644 --- a/pkg/datapath/loader/xdp.go +++ b/pkg/datapath/loader/xdp.go @@ -157,7 +157,7 @@ func compileAndLoadXDPProg(ctx context.Context, xdpDev, xdpMode string, extraCAr return fmt.Errorf("retrieving device %s: %w", xdpDev, err) } - coll, finalize, err := loadDatapath(ctx, objPath) + coll, finalize, err := loadDatapath(objPath) if err != nil { return err } diff --git a/pkg/datapath/types/loader.go b/pkg/datapath/types/loader.go index 9ed11396400aa..a0178a0a47089 100644 --- a/pkg/datapath/types/loader.go +++ b/pkg/datapath/types/loader.go @@ -18,7 +18,7 @@ type Loader interface { CallsMapPath(id uint16) string CustomCallsMapPath(id uint16) string CompileOrLoad(ctx context.Context, ep Endpoint, stats *metrics.SpanStat) error - ReloadDatapath(ctx context.Context, ep Endpoint, stats *metrics.SpanStat) error + ReloadDatapath(ep Endpoint, stats *metrics.SpanStat) error ReinitializeXDP(ctx context.Context, extraCArgs []string) error EndpointHash(cfg EndpointConfiguration) (string, error) Unload(ep Endpoint) diff --git a/pkg/endpoint/bpf.go b/pkg/endpoint/bpf.go index a4890707da960..6b84c262f3b8d 100644 --- a/pkg/endpoint/bpf.go +++ b/pkg/endpoint/bpf.go @@ -671,7 +671,7 @@ func (e *Endpoint) realizeBPFState(regenContext *regenerationContext) (compilati } compilationExecuted = true } else { // RegenerateWithDatapathLoad - err = e.owner.Datapath().Loader().ReloadDatapath(datapathRegenCtxt.completionCtx, datapathRegenCtxt.epInfoCache, &stats.datapathRealization) + err = e.owner.Datapath().Loader().ReloadDatapath(datapathRegenCtxt.epInfoCache, &stats.datapathRealization) if err == nil { e.getLogger().Info("Reloaded endpoint BPF program") } else {