From 080857bdedca67d58ec39f8f96c5f38b22f6dc0b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 21 Apr 2022 09:19:23 +0000 Subject: [PATCH] cilium: Add scope knob for local address to be considered host id in ipcache In some advanced environments, there may be devices in the hostns which could have a link-scoped 10.x.y.z address. The default behavior of Cilium when populating its local ipcache is to skip all link-local addresses as per listLocalAddresses(). Depending on the datapath configuration, this may cause issues when a Pod wants to talk to such an address in the hostns. For example, when routing doesn't go via the stack, as is the case with BPF host routing, then for such addresses the ipcache will fall back to WORLD id instead of HOST id. The datapath then assumes that this needs to be transmitted to the given device from the tc layer instead of pushing traffic up the local stack, as is the case with HOST id traffic. Then, if such a device is, e.g., a dummy device, such traffic is blackholed. We tested that changing the scope to global for such an address makes traffic flow again, so the culprit really is in listLocalAddresses()'s logic, which unconditionally skips all addr.Scope == int(netlink.SCOPE_LINK). Allow this to be customized, given the kernel also allows many other scope values. The agent gets a new --local-max-addr-scope param for this so that e.g. link-local scope can be included via `--local-max-addr-scope=253` or via `--local-max-addr-scope=link`. To preserve the current default behavior, link scope remains excluded by default. 
Example, default: # ./daemon/cilium-agent --identity-allocation-mode=crd \ --enable-ipv6=true --enable-ipv4=true --disable-envoy-version-check=true \ --tunnel=disabled --k8s-kubeconfig-path=$HOME/.kube/config \ --kube-proxy-replacement=strict --enable-l7-proxy=false \ --auto-direct-node-routes=true --enable-bandwidth-manager=true \ --ipv4-native-routing-cidr=10.91.0.0/16 --ipv6-native-routing-cidr=f00d::a5b:0:0:0/96 \ --enable-ipv4-masquerade=false --enable-ipv6-masquerade=false root@zh-lab-node-1:~/go/src/github.com/cilium/cilium# ./cilium/cilium bpf ipcache list | grep "identity=1 " 2001:1620:665:0:1ac0:4dff:fe09:c164/128 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 10.91.0.191/32 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 f00d::a5b:0:0:a962/128 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 192.168.178.91/32 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 Example, with link scope: # ./daemon/cilium-agent --identity-allocation-mode=crd \ --enable-ipv6=true --enable-ipv4=true --disable-envoy-version-check=true \ --tunnel=disabled --k8s-kubeconfig-path=$HOME/.kube/config \ --kube-proxy-replacement=strict --enable-l7-proxy=false \ --auto-direct-node-routes=true --enable-bandwidth-manager=true \ --ipv4-native-routing-cidr=10.91.0.0/16 --ipv6-native-routing-cidr=f00d::a5b:0:0:0/96 \ --enable-ipv4-masquerade=false --enable-ipv6-masquerade=false \ --local-max-addr-scope=link root@zh-lab-node-1:~/go/src/github.com/cilium/cilium# ./cilium/cilium bpf ipcache list | grep "identity=1 " 192.168.178.91/32 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 2.2.2.2/32 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 2001:1620:665:0:1ac0:4dff:fe09:c164/128 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 f00d::a5b:0:0:a962/128 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 10.91.0.191/32 identity=1 encryptkey=0 tunnelendpoint=0.0.0.0 Where 2.2.2.2/32 is a custom address with link scope on device enp5s0: 3: enp5s0: mtu 1500 qdisc mq state UP group default qlen 1000 link/ether 
18:c0:4d:09:c1:64 brd ff:ff:ff:ff:ff:ff inet 2.2.2.2/32 scope link enp5s0 valid_lft forever preferred_lft forever inet 192.168.178.91/24 brd 192.168.178.255 scope global dynamic enp5s0 valid_lft 592972sec preferred_lft 592972sec inet6 2001:1620:665:0:1ac0:4dff:fe09:c164/64 scope global dynamic mngtmpaddr noprefixroute valid_lft 6805sec preferred_lft 3205sec inet6 fe80::1ac0:4dff:fe09:c164/64 scope link valid_lft forever preferred_lft forever [...] 8: cilium_host@cilium_net: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether fe:b4:96:9d:b4:74 brd ff:ff:ff:ff:ff:ff inet 10.91.0.191/32 scope link cilium_host valid_lft forever preferred_lft forever inet6 2001:1620:665:0:1ac0:4dff:fe09:c164/128 scope global valid_lft forever preferred_lft forever For start of 1.13 cycle we can change the defaults.AddressScopeMax to link. For this, the option --local-max-addr-scope is intentionally a hidden one, so that once we do the switch there is also an optional opt-out. Fixes: #19427 Signed-off-by: Daniel Borkmann --- daemon/cmd/daemon_main.go | 4 ++++ pkg/datapath/linux/node_addressing.go | 27 +++++++++++++++------------ pkg/defaults/defaults.go | 6 ++++++ pkg/ip/ip.go | 21 +++++++++++++++++++++ pkg/option/config.go | 17 +++++++++++++++++ 5 files changed, 63 insertions(+), 12 deletions(-) diff --git a/daemon/cmd/daemon_main.go b/daemon/cmd/daemon_main.go index d1d775207188..2490d4984ad3 100644 --- a/daemon/cmd/daemon_main.go +++ b/daemon/cmd/daemon_main.go @@ -585,6 +585,10 @@ func initializeFlags() { flags.Bool(option.EnableSVCSourceRangeCheck, true, "Enable check of service source ranges (currently, only for LoadBalancer)") option.BindEnv(option.EnableSVCSourceRangeCheck) + flags.String(option.AddressScopeMax, fmt.Sprintf("%d", defaults.AddressScopeMax), "Maximum local address scope for ipcache to consider host addresses") + flags.MarkHidden(option.AddressScopeMax) + option.BindEnv(option.AddressScopeMax) + flags.Bool(option.EnableBandwidthManager, false, "Enable 
BPF bandwidth manager") option.BindEnv(option.EnableBandwidthManager) diff --git a/pkg/datapath/linux/node_addressing.go b/pkg/datapath/linux/node_addressing.go index 22a2f176617e..ca78d4bef641 100644 --- a/pkg/datapath/linux/node_addressing.go +++ b/pkg/datapath/linux/node_addressing.go @@ -13,42 +13,45 @@ import ( "github.com/cilium/cilium/pkg/defaults" "github.com/cilium/cilium/pkg/ip" "github.com/cilium/cilium/pkg/node" + "github.com/cilium/cilium/pkg/option" ) // FIXME: This currently maps to the code in pkg/node/node_address.go. That // code should really move into this package. func listLocalAddresses(family int) ([]net.IP, error) { + var addresses []net.IP + ipsToExclude := node.GetExcludedIPs() addrs, err := netlink.AddrList(nil, family) if err != nil { return nil, err } - var addresses []net.IP - for _, addr := range addrs { - if addr.Scope == int(netlink.SCOPE_LINK) { + if addr.Scope > option.Config.AddressScopeMax { continue } if ip.IsExcluded(ipsToExclude, addr.IP) { continue } - if addr.IP.IsLoopback() { + if addr.IP.IsLoopback() || addr.IP.IsLinkLocalUnicast() { continue } addresses = append(addresses, addr.IP) } - if hostDevice, err := netlink.LinkByName(defaults.HostDevice); hostDevice != nil && err == nil { - addrs, err = netlink.AddrList(hostDevice, family) - if err != nil { - return nil, err - } - for _, addr := range addrs { - if addr.Scope == int(netlink.SCOPE_LINK) { - addresses = append(addresses, addr.IP) + if option.Config.AddressScopeMax < int(netlink.SCOPE_LINK) { + if hostDevice, err := netlink.LinkByName(defaults.HostDevice); hostDevice != nil && err == nil { + addrs, err = netlink.AddrList(hostDevice, family) + if err != nil { + return nil, err + } + for _, addr := range addrs { + if addr.Scope == int(netlink.SCOPE_LINK) { + addresses = append(addresses, addr.IP) + } } } } diff --git a/pkg/defaults/defaults.go b/pkg/defaults/defaults.go index 4c953c19b693..f3de0b9a64a3 100644 --- a/pkg/defaults/defaults.go +++ 
b/pkg/defaults/defaults.go @@ -5,12 +5,18 @@ package defaults import ( "time" + + "github.com/vishvananda/netlink" ) const ( // AgentHealthPort is the default value for option.AgentHealthPort AgentHealthPort = 9876 + // AddressScopeMax controls the maximum address scope for addresses to be + // considered local ones with HOST_ID in the ipcache + AddressScopeMax = int(netlink.SCOPE_LINK) - 1 + // ClusterHealthPort is the default value for option.ClusterHealthPort ClusterHealthPort = 4240 diff --git a/pkg/ip/ip.go b/pkg/ip/ip.go index a64ee04cce66..757b5f26d6b5 100644 --- a/pkg/ip/ip.go +++ b/pkg/ip/ip.go @@ -9,6 +9,9 @@ import ( "math/big" "net" "sort" + "strconv" + + "github.com/vishvananda/netlink" ) const ( @@ -857,6 +860,24 @@ func IsIPv6(ip net.IP) bool { return ip != nil && ip.To4() == nil } +// ParseScope returns the parsed address scope number. +func ParseScope(scope string) (int, error) { + switch scope { + case "global": + return int(netlink.SCOPE_UNIVERSE), nil + case "nowhere": + return int(netlink.SCOPE_NOWHERE), nil + case "host": + return int(netlink.SCOPE_HOST), nil + case "link": + return int(netlink.SCOPE_LINK), nil + case "site": + return int(netlink.SCOPE_SITE), nil + default: + return strconv.Atoi(scope) + } +} + // SortIPList sorts the provided net.IP slice in place. func SortIPList(ipList []net.IP) { sort.Slice(ipList, func(i, j int) bool { diff --git a/pkg/option/config.go b/pkg/option/config.go index 4e4c1e5234cd..964fe6ee7ba1 100644 --- a/pkg/option/config.go +++ b/pkg/option/config.go @@ -316,6 +316,10 @@ const ( // conflicting marks. 
EnableIdentityMark = "enable-identity-mark" + // AddressScopeMax controls the maximum address scope for addresses to be + // considered local ones with HOST_ID in the ipcache + AddressScopeMax = "local-max-addr-scope" + // EnableBandwidthManager enables EDT-based pacing EnableBandwidthManager = "enable-bandwidth-manager" @@ -1855,6 +1859,10 @@ type DaemonConfig struct { // features in BPF datapath KubeProxyReplacement string + // AddressScopeMax controls the maximum address scope for addresses to be + // considered local ones with HOST_ID in the ipcache + AddressScopeMax int + // EnableBandwidthManager enables EDT-based pacing EnableBandwidthManager bool @@ -2866,6 +2874,15 @@ func (c *DaemonConfig) Populate() { } } + if viper.IsSet(AddressScopeMax) { + c.AddressScopeMax, err = ip.ParseScope(viper.GetString(AddressScopeMax)) + if err != nil { + log.WithError(err).Fatalf("Cannot parse scope integer from --%s option", AddressScopeMax) + } + } else { + c.AddressScopeMax = defaults.AddressScopeMax + } + nativeRoutingCIDR := viper.GetString(NativeRoutingCIDR) ipv4NativeRoutingCIDR := viper.GetString(IPv4NativeRoutingCIDR)