Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpf: Add extension for running sock LB on MKE-related containers #17513

Merged
merged 1 commit into from
Oct 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 12 additions & 1 deletion bpf/bpf_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,16 @@ void ctx_set_port(struct bpf_sock_addr *ctx, __be16 dport)
ctx->user_port = (__u32)dport;
}

/* Reports whether the current task counts as part of the extended host
 * namespace. Without ENABLE_MKE this is always false; with it, the value
 * returned by get_cgroup_classid() is compared against MKE_HOST.
 */
static __always_inline __maybe_unused bool task_in_extended_hostns(void)
{
#ifndef ENABLE_MKE
	return false;
#else
	/* Extension for non-Cilium managed containers on MKE. */
	return get_cgroup_classid() == MKE_HOST;
#endif
}

static __always_inline __maybe_unused bool
ctx_in_hostns(void *ctx __maybe_unused, __net_cookie *cookie)
{
Expand All @@ -91,7 +101,8 @@ ctx_in_hostns(void *ctx __maybe_unused, __net_cookie *cookie)

if (cookie)
*cookie = own_cookie;
return own_cookie == HOST_NETNS_COOKIE;
return own_cookie == HOST_NETNS_COOKIE ||
task_in_extended_hostns();
#else
if (cookie)
*cookie = 0;
Expand Down
3 changes: 3 additions & 0 deletions bpf/include/bpf/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ static __u64 BPF_FUNC(jiffies64);
static __sock_cookie BPF_FUNC(get_socket_cookie, void *ctx);
static __net_cookie BPF_FUNC(get_netns_cookie, void *ctx);

/* Legacy cgroups */
static __u32 BPF_FUNC(get_cgroup_classid);

/* Debugging */
static __printf(1, 3) void
BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...);
Expand Down
8 changes: 8 additions & 0 deletions daemon/cmd/daemon_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,14 @@ func init() {
flags.Bool(option.EnableLocalRedirectPolicy, false, "Enable Local Redirect Policy")
option.BindEnv(option.EnableLocalRedirectPolicy)

flags.Bool(option.EnableMKE, false, "Enable BPF kube-proxy replacement for MKE environments")
flags.MarkHidden(option.EnableMKE)
option.BindEnv(option.EnableMKE)

flags.String(option.CgroupPathMKE, "", "Cgroup v1 net_cls mount path for MKE environments")
flags.MarkHidden(option.CgroupPathMKE)
option.BindEnv(option.CgroupPathMKE)

flags.String(option.NodePortMode, option.NodePortModeSNAT, "BPF NodePort mode (\"snat\", \"dsr\", \"hybrid\")")
flags.MarkHidden(option.NodePortMode)
option.BindEnv(option.NodePortMode)
Expand Down
93 changes: 93 additions & 0 deletions daemon/cmd/kube_proxy_replacement.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@ package cmd
import (
"errors"
"fmt"
"io"
"math"
"net"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/cilium/cilium/pkg/bpf"
"github.com/cilium/cilium/pkg/datapath/linux/probes"
"github.com/cilium/cilium/pkg/datapath/loader"
datapathOption "github.com/cilium/cilium/pkg/datapath/option"
"github.com/cilium/cilium/pkg/logging/logfields"
"github.com/cilium/cilium/pkg/mac"
"github.com/cilium/cilium/pkg/maglev"
"github.com/cilium/cilium/pkg/mountinfo"
"github.com/cilium/cilium/pkg/node"
"github.com/cilium/cilium/pkg/option"
"github.com/cilium/cilium/pkg/probe"
Expand Down Expand Up @@ -230,6 +235,28 @@ func initKubeProxyReplacementOptions() (bool, error) {
// be v4-in-v6 connections even if the agent has v6 support disabled.
probe.HaveIPv6Support()

if option.Config.EnableMKE {
foundClassid := false
foundCookie := false
if h := probesManager.GetHelpers("cgroup_sock_addr"); h != nil {
if _, ok := h["bpf_get_cgroup_classid"]; ok {
foundClassid = true
}
if _, ok := h["bpf_get_netns_cookie"]; ok {
foundCookie = true
}
}
if !foundClassid || !foundCookie {
if strict {
log.Fatalf("BPF kube-proxy replacement under MKE with --%s needs kernel 5.7 or newer", option.EnableMKE)
} else {
option.Config.EnableHostServicesTCP = false
option.Config.EnableHostServicesUDP = false
log.Warnf("Disabling host reachable services under MKE with --%s. Needs kernel 5.7 or newer.", option.EnableMKE)
}
}
}

option.Config.EnableHostServicesPeer = true
if option.Config.EnableIPv4 {
if err := bpf.TestDummyProg(bpf.ProgTypeCgroupSockAddr, bpf.BPF_CGROUP_INET4_GETPEERNAME); err != nil {
Expand Down Expand Up @@ -488,6 +515,12 @@ func finishKubeProxyReplacementInit(isKubeProxyReplacementStrict bool) error {
// | After this point, BPF NodePort should not be disabled |
// +-------------------------------------------------------+

// For MKE, we only need to change/extend the socket LB behavior in case
// of kube-proxy replacement. Otherwise, nothing else is needed.
if option.Config.EnableMKE && option.Config.EnableHostReachableServices {
markHostExtension()
}

if !option.Config.EnableHostLegacyRouting {
msg := ""
switch {
Expand Down Expand Up @@ -597,6 +630,66 @@ func disableNodePort() {
option.Config.EnableHostLegacyRouting = true
}

// markHostExtension tells the socket LB that MKE managed containers belong
// to the "hostns" as well despite them residing in their own netns. We use
// net_cls as a marker.
func markHostExtension() {
borkmann marked this conversation as resolved.
Show resolved Hide resolved
prefix := option.Config.CgroupPathMKE
if prefix == "" {
mountInfos, err := mountinfo.GetMountInfo()
if err != nil {
log.WithError(err).Fatal("Cannot retrieve mount infos for MKE")
}
for _, mountInfo := range mountInfos {
if mountInfo.FilesystemType == "cgroup" &&
strings.Contains(mountInfo.SuperOptions, "net_cls") {
// There can be multiple entries with the same mountpoint.
// Assert that there is no conflict.
if prefix != "" && prefix != mountInfo.MountPoint {
log.Fatalf("Multiple cgroup v1 net_cls mounts: %s, %s",
prefix, mountInfo.MountPoint)
}
prefix = mountInfo.MountPoint
}
}
}
if prefix == "" {
log.Fatal("Cannot retrieve cgroup v1 net_cls mount info for MKE")
}
log.WithField(logfields.Path, prefix).Info("Found cgroup v1 net_cls mount on MKE")
err := filepath.Walk(prefix,
func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() || strings.Contains(path, "kubepods") || path == prefix {
return nil
}
log.WithField(logfields.Path, path).Info("Marking as MKE host extension")
f, err := os.OpenFile(path+"/net_cls.classid", os.O_RDWR, 0644)
borkmann marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return err
}
defer f.Close()
valBytes, err := io.ReadAll(f)
if err != nil {
return err
}
class, err := strconv.Atoi(string(valBytes[:len(valBytes)-1]))
if err != nil {
return err
}
if class != 0 && class != option.HostExtensionMKE {
return errors.New("net_cls.classid already in use")
}
_, err = io.WriteString(f, fmt.Sprintf("%d", option.HostExtensionMKE))
return err
})
if err != nil {
log.WithError(err).Fatal("Cannot mark MKE-related container")
}
}

// checkNodePortAndEphemeralPortRanges checks whether the ephemeral port range
// does not clash with the nodeport range to prevent the BPF nodeport from
// hijacking an existing connection on the local host which source port is
Expand Down
4 changes: 4 additions & 0 deletions pkg/datapath/linux/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,10 @@ func (h *HeaderfileWriter) WriteNodeConfig(w io.Writer, cfg *datapath.LocalNodeC
if option.Config.EnableHealthDatapath {
cDefinesMap["ENABLE_HEALTH_CHECK"] = "1"
}
if option.Config.EnableMKE && option.Config.EnableHostReachableServices {
cDefinesMap["ENABLE_MKE"] = "1"
cDefinesMap["MKE_HOST"] = fmt.Sprintf("%d", option.HostExtensionMKE)
}
if option.Config.EnableRecorder {
cDefinesMap["ENABLE_CAPTURE"] = "1"
if option.Config.EnableIPv4 {
Expand Down
14 changes: 14 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,12 @@ const (
// EnableLocalRedirectPolicy enables support for local redirect policy
EnableLocalRedirectPolicy = "enable-local-redirect-policy"

// EnableMKE enables MKE specific 'chaining' for kube-proxy replacement
EnableMKE = "enable-mke"

// CgroupPathMKE points to the cgroupv1 net_cls mount instance
CgroupPathMKE = "mke-cgroup-mount"

// LibDir enables the directory path to store runtime build environment
LibDir = "lib-dir"

Expand Down Expand Up @@ -1729,6 +1735,12 @@ type DaemonConfig struct {
// EnableRecorder enables the datapath pcap recorder
EnableRecorder bool

// EnableMKE enables MKE specific 'chaining' for kube-proxy replacement
EnableMKE bool

// CgroupPathMKE points to the cgroupv1 net_cls mount instance
CgroupPathMKE string

// KubeProxyReplacementHealthzBindAddr is the KubeProxyReplacement healthz server bind addr
KubeProxyReplacementHealthzBindAddr string

Expand Down Expand Up @@ -2443,6 +2455,8 @@ func (c *DaemonConfig) Populate() {
c.EnableSessionAffinity = viper.GetBool(EnableSessionAffinity)
c.EnableBandwidthManager = viper.GetBool(EnableBandwidthManager)
c.EnableRecorder = viper.GetBool(EnableRecorder)
c.EnableMKE = viper.GetBool(EnableMKE)
c.CgroupPathMKE = viper.GetString(CgroupPathMKE)
c.EnableHostFirewall = viper.GetBool(EnableHostFirewall)
c.EnableLocalRedirectPolicy = viper.GetBool(EnableLocalRedirectPolicy)
c.EncryptInterface = viper.GetStringSlice(EncryptInterface)
Expand Down
2 changes: 2 additions & 0 deletions pkg/option/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ const (
ClockSourceKtime BPFClockSource = iota
ClockSourceJiffies
)

// HostExtensionMKE is the cgroup v1 net_cls classid written into the
// net_cls.classid file of MKE-related containers and exported to the
// datapath as MKE_HOST when MKE support is enabled.
const HostExtensionMKE = 0x1bda7a