Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

daemon: Implement route-based device detection #17219

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions Documentation/operations/upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,11 @@ Annotations:
1.11 Upgrade Notes
------------------

* The Cilium agent will now fail instead of falling back to auto-detection
when device wildcard expansion (``--devices=eth+``) yields no devices.
* Device auto-detection now discovers devices through the routing table and
only considers devices that have a global unicast route in some routing table.

Removed Options
~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions Documentation/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ ui
uid
uint
unencrypted
unicast
uninline
uninstall
uninstallOnExit
Expand Down
12 changes: 11 additions & 1 deletion daemon/cmd/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ type Daemon struct {
monitorAgent *monitoragent.Agent
ciliumHealth *health.CiliumHealth

deviceManager *DeviceManager
Copy link
Contributor Author

@joamaki joamaki Aug 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A more suitable place for DeviceManager might be in pkg/datapath, but I'd like to leave it here for now.


// dnsNameManager tracks which api.FQDNSelector are present in policy which
// apply to locally running endpoints.
dnsNameManager *fqdn.NameManager
Expand Down Expand Up @@ -387,6 +389,7 @@ func NewDaemon(ctx context.Context, cancel context.CancelFunc, epMgr *endpointma
netConf: netConf,
mtuConfig: mtuConfig,
datapath: dp,
deviceManager: NewDeviceManager(),
nodeDiscovery: nd,
endpointCreations: newEndpointCreationManager(),
apiLimiterSet: apiLimiterSet,
Expand Down Expand Up @@ -607,7 +610,14 @@ func NewDaemon(ctx context.Context, cancel context.CancelFunc, epMgr *endpointma
// This is because the device detection requires self (Cilium)Node object,
// and the k8s service watcher depends on option.Config.EnableNodePort flag
// which can be modified after the device detection.
handleNativeDevices(isKubeProxyReplacementStrict)
if err := d.deviceManager.Detect(); err != nil {
if areDevicesRequired() {
// Fail hard if devices are required to function.
return nil, nil, fmt.Errorf("failed to detect devices: %w", err)
}
log.WithError(err).Warn("failed to detect devices, disabling BPF NodePort")
disableNodePort()
}
finishKubeProxyReplacementInit(isKubeProxyReplacementStrict)

if k8s.IsEnabled() {
Expand Down
336 changes: 336 additions & 0 deletions daemon/cmd/devices.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright 2016-2021 Authors of Cilium

// This module implements Cilium's network device detection.

package cmd

import (
"fmt"
"net"
"sort"
"strings"

"github.com/cilium/cilium/pkg/datapath/linux/probes"
"github.com/cilium/cilium/pkg/k8s"
"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/logging/logfields"
"github.com/cilium/cilium/pkg/mac"
"github.com/cilium/cilium/pkg/node"
"github.com/cilium/cilium/pkg/option"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
)

var (
// excludedDevicePrefixes lists device name prefixes that isViableDevice
// rejects. The match is a plain prefix match on the link name, so every
// device whose name merely starts with one of these strings is skipped
// during route-based detection.
excludedDevicePrefixes = []string{
"cilium_",
"lo",
"lxc",
"cni",
"docker",
}

// Route filter to look at all routing tables.
routeFilter = netlink.Route{
Table: unix.RT_TABLE_UNSPEC,
}
// routeFilterMask is passed together with routeFilter to
// netlink.RouteListFiltered in Detect.
routeFilterMask = netlink.RT_FILTER_TABLE
)

// DeviceManager keeps the set of device names selected by Detect.
// Detect and GetDevices take the embedded mutex around their accesses to
// devices.
type DeviceManager struct {
lock.Mutex
// devices is the set of device names; only the keys carry information.
devices map[string]struct{}
}

// NewDeviceManager returns a DeviceManager whose device set is empty but
// already allocated, so it can be written to right away.
func NewDeviceManager() *DeviceManager {
	dm := new(DeviceManager)
	dm.devices = map[string]struct{}{}
	return dm
}

// Detect tries to detect devices to which BPF programs may be loaded.
// See areDevicesRequired() for features that require the device information.
//
// The devices are detected by looking at all the configured global unicast
// routes in the system.
//
// Detect resets the device set, expands wildcards in option.Config.Devices and
// then either derives the devices from the routing tables (when no devices are
// configured and areDevicesRequired() is true) or takes the configured devices
// as-is. Where needed it also chooses option.Config.DirectRoutingDevice and
// option.Config.IPv6MCastDevice, and finally writes the sorted device list back
// to option.Config.Devices.
//
// An error is returned when wildcard expansion fails, when routes cannot be
// listed, when the K8s node IP link cannot be found while K8s is enabled, or
// when a needed direct routing / IPv6 multicast device cannot be determined.
func (dm *DeviceManager) Detect() error {
dm.Lock()
defer dm.Unlock()
// Every call starts from an empty device set.
dm.devices = make(map[string]struct{})

if err := expandDevices(); err != nil {
return err
}

l3DevOK := true
if !option.Config.EnableHostLegacyRouting {
// Probe whether fast redirect is supported for L3 devices. This will
// invoke bpftool and requires root privileges, so we're only probing
// when necessary.
l3DevOK = supportL3Dev()
pchaigno marked this conversation as resolved.
Show resolved Hide resolved
}

if len(option.Config.Devices) == 0 && areDevicesRequired() {
// Detect the devices from the system routing table by finding the devices
// which have global unicast routes.
family := netlink.FAMILY_ALL
if option.Config.EnableIPv4 && !option.Config.EnableIPv6 {
family = netlink.FAMILY_V4
} else if !option.Config.EnableIPv4 && option.Config.EnableIPv6 {
family = netlink.FAMILY_V6
}

routes, err := netlink.RouteListFiltered(family, &routeFilter, routeFilterMask)
if err != nil {
return fmt.Errorf("cannot retrieve routes for device detection: %w", err)
}
dm.updateDevicesFromRoutes(l3DevOK, routes)
} else {
// Devices were configured explicitly, or no feature requires them:
// take the (already expanded) configured list as-is.
for _, dev := range option.Config.Devices {
dm.devices[dev] = struct{}{}
}
}

// The direct routing device is only detected when NodePort is enabled and
// the user did not configure one; a configured one joins the device set.
detectDirectRoutingDev := option.Config.EnableNodePort
if option.Config.DirectRoutingDevice != "" {
dm.devices[option.Config.DirectRoutingDevice] = struct{}{}
detectDirectRoutingDev = false
}

// Same scheme for the IPv6 multicast device, gated on EnableIPv6NDP.
detectIPv6MCastDev := option.Config.EnableIPv6NDP
if option.Config.IPv6MCastDevice != "" {
dm.devices[option.Config.IPv6MCastDevice] = struct{}{}
detectIPv6MCastDev = false
}

if detectDirectRoutingDev || detectIPv6MCastDev {
k8sNodeDev := ""
k8sNodeLink, err := findK8SNodeIPLink()
if err == nil {
// The device holding the K8s node IP is always added to the device set.
k8sNodeDev = k8sNodeLink.Attrs().Name
dm.devices[k8sNodeDev] = struct{}{}
} else if k8s.IsEnabled() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might make sense to move the k8s.IsEnabled() check above the detectDirectRoutingDev || detectIPv6MCastDev check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only require k8s when direct routing device or ipv6 mcast device detection is necessary, so it doesn't make sense to move it there as then device detection won't work without k8s. Though do we ever need device detection without k8s? I'm not sure. If not, then we can just make this fail early when k8s is not enabled.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though do we ever need device detection without k8s?

Yes. In the lb-only mode when user specifies multiple devices, but doesn't specify the direct routing device. We should definitely keep the case |devices| == 1 => dr_device = devices[0] regardless of k8s presence.

return fmt.Errorf("k8s is enabled, but still failed to find node IP: %w", err)
}

if detectDirectRoutingDev {
// If only one device found, use that one. Otherwise use the device with k8s node IP.
if len(dm.devices) == 1 {
for dev := range dm.devices {
option.Config.DirectRoutingDevice = dev
break
}
} else if k8sNodeDev != "" {
option.Config.DirectRoutingDevice = k8sNodeDev
} else {
return fmt.Errorf("Unable to determine direct routing device. Use --%s to specify it",
option.DirectRoutingDevice)
}
log.WithField(option.DirectRoutingDevice, option.Config.DirectRoutingDevice).
Info("Direct routing device detected")
}

if detectIPv6MCastDev {
// Only the K8s node IP device is considered, and only if it has the
// multicast flag set.
if k8sNodeLink != nil && k8sNodeLink.Attrs().Flags&net.FlagMulticast != 0 {
option.Config.IPv6MCastDevice = k8sNodeDev
log.WithField(option.IPv6MCastDevice, option.Config.IPv6MCastDevice).Info("IPv6 multicast device detected")
} else {
return fmt.Errorf("Unable to determine Multicast device. Use --%s to specify it",
option.IPv6MCastDevice)
}
}
}

// Publish the final, sorted device list through the global configuration.
option.Config.Devices = dm.getDevices()
log.WithField(logfields.Devices, option.Config.Devices).Info("Detected devices")

return nil
}

// GetDevices returns the current list of devices Cilium should attach programs to.
// It takes the manager lock and returns a newly built slice that is sorted by
// device name.
func (dm *DeviceManager) GetDevices() []string {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a follow-up with runtime detection of devices the plan is to stop using option.Config.Devices throughout and rather call into device manager to ask for the list of devices as it may mutate over time. Hence I already have the state for the devices and this function here.

See #17187 for the work in progress for the runtime detection and reasoning behind the structure of this file.

dm.Lock()
defer dm.Unlock()
return dm.getDevices()
}

// getDevices returns the names in the device set as a new slice sorted in
// ascending order. It does no locking itself; Detect and GetDevices call it
// while holding the manager lock.
func (dm *DeviceManager) getDevices() []string {
	names := make([]string, len(dm.devices))
	next := 0
	for name := range dm.devices {
		names[next] = name
		next++
	}
	// Map iteration order is random, so sort for a stable result.
	sort.Strings(names)
	return names
}

// isViableDevice reports whether the given link is usable and Cilium should
// attach programs to it.
//
// A link is rejected, with a log message, when:
//   - its name starts with one of excludedDevicePrefixes,
//   - it has the IFF_SLAVE flag set (a lower bond device),
//   - l3DevOK is false and the link has no MAC address (an L3 device), or
//   - hasDefaultRoute is false and the link is a veth device.
//
// Any other link is accepted.
func (dm *DeviceManager) isViableDevice(l3DevOK, hasDefaultRoute bool, link netlink.Link) bool {
	attrs := link.Attrs()
	devName := attrs.Name

	// Devices with an excluded name prefix are never considered.
	for _, prefix := range excludedDevicePrefixes {
		if !strings.HasPrefix(devName, prefix) {
			continue
		}
		log.WithField(logfields.Device, devName).
			Debugf("Skipping device as it has excluded prefix '%s'", prefix)
		return false
	}

	switch {
	case attrs.RawFlags&unix.IFF_SLAVE != 0:
		// Lower bond devices are skipped.
		log.WithField(logfields.Device, devName).Debug("Skipping bonded device")
		return false

	case !l3DevOK && !mac.LinkHasMacAddr(link):
		// L3 devices are only usable when the caller says they are supported.
		log.WithField(logfields.Device, devName).
			Info("Ignoring L3 device; >= 5.8 kernel is required.")
		return false

	case !hasDefaultRoute:
		// Workaround for kubernetes-in-docker: veth devices are avoided in
		// general as they may be leftovers from another CNI, unless they
		// carry a default route.
		if _, isVeth := link.(*netlink.Veth); isVeth {
			log.WithField(logfields.Device, devName).
				Debug("Ignoring veth device as it has no default route")
			return false
		}
	}

	return true
}

// linkInfo carries the per-link facts that updateDevicesFromRoutes gathers
// from a batch of routes.
type linkInfo struct {
// hasDefaultRoute is true when at least one considered route of the link
// has a nil destination.
hasDefaultRoute bool
}

// updateDevicesFromRoutes processes a batch of routes and updates the set of
// devices. Returns true if devices changed.
//
// l3DevOK is forwarded to isViableDevice; routes is the batch to inspect.
// The method reads and writes dm.devices without locking; Detect calls it
// while holding the manager lock.
func (dm *DeviceManager) updateDevicesFromRoutes(l3DevOK bool, routes []netlink.Route) bool {
linkInfos := make(map[int]linkInfo)

// Collect all link indices mentioned in the route update batch
for _, route := range routes {
// Only consider devices that have global unicast routes,
// e.g. skip loopback, multicast and link local routes.
// A nil Dst is kept here and counted as a default route below.
if route.Dst != nil && !route.Dst.IP.IsGlobalUnicast() {
continue
}
pchaigno marked this conversation as resolved.
Show resolved Hide resolved
if route.Table == unix.RT_TABLE_LOCAL {
continue
}
// NOTE(review): routes are keyed by route.LinkIndex only; a route that
// carries no usable LinkIndex (presumably e.g. multipath routes) is
// only noticed through the LinkByIndex warning below — confirm whether
// such routes need explicit handling.
linkInfo := linkInfos[route.LinkIndex]
linkInfo.hasDefaultRoute = linkInfo.hasDefaultRoute || route.Dst == nil
linkInfos[route.LinkIndex] = linkInfo
}

changed := false
for index, info := range linkInfos {
link, err := netlink.LinkByIndex(index)
if err != nil {
// A failed lookup is logged and the link skipped; it does not abort
// the processing of the remaining links.
log.WithError(err).WithField(logfields.LinkIndex, index).
Warn("Failed to get link by index")
continue
}
name := link.Attrs().Name

// Skip devices we already know.
if _, exists := dm.devices[name]; exists {
continue
}

viable := dm.isViableDevice(l3DevOK, info.hasDefaultRoute, link)
if viable {
dm.devices[name] = struct{}{}
changed = true
}
}
return changed
}

// expandDevices expands all wildcard device names to concrete devices.
// e.g. device "eth+" expands to "eth0,eth1" etc. Non-matching wildcards are ignored.
//
// On success option.Config.Devices is replaced with the expanded names,
// de-duplicated and sorted. An error is returned when the links cannot be
// listed, or when devices were configured but the expansion yielded none.
func expandDevices() error {
allLinks, err := netlink.LinkList()
if err != nil {
return fmt.Errorf("Device wildcard expansion failed to fetch devices: %w", err)
}
// A set is used so that overlapping wildcards and explicit names collapse
// into a single entry.
expandedDevices := make(map[string]struct{})
for _, iface := range option.Config.Devices {
if strings.HasSuffix(iface, "+") {
// NOTE(review): TrimRight strips every trailing '+', so "eth++" is
// expanded exactly like "eth+".
prefix := strings.TrimRight(iface, "+")
for _, link := range allLinks {
attrs := link.Attrs()
if strings.HasPrefix(attrs.Name, prefix) {
expandedDevices[attrs.Name] = struct{}{}
}
}
} else {
// Names without a trailing '+' are taken verbatim; they are not
// checked against the existing links here.
expandedDevices[iface] = struct{}{}
}
}
if len(option.Config.Devices) > 0 && len(expandedDevices) == 0 {
// User defined devices, but expansion yielded no devices. Fail here to not
// surprise with auto-detection.
return fmt.Errorf("Device wildcard expansion failed to detect devices. Please verify --%s option.",
pchaigno marked this conversation as resolved.
Show resolved Hide resolved
option.Devices)
}

option.Config.Devices = make([]string, 0, len(expandedDevices))
for dev := range expandedDevices {
option.Config.Devices = append(option.Config.Devices, dev)
}
sort.Strings(option.Config.Devices)
return nil
}

// areDevicesRequired reports whether at least one of the options that depend
// on the device list is enabled: EnableNodePort, EnableHostFirewall or
// EnableBandwidthManager.
func areDevicesRequired() bool {
	switch {
	case option.Config.EnableNodePort:
		return true
	case option.Config.EnableHostFirewall:
		return true
	case option.Config.EnableBandwidthManager:
		return true
	}
	return false
}

// findK8SNodeIPLink returns the link that carries the Kubernetes node IP.
//
// It obtains the node IP from node.GetK8sNodeIP(), lists the addresses of the
// node IP's address family and returns the link owning the first address that
// equals the node IP.
//
// An error is returned when the node IP is not known, when the addresses or
// the owning link cannot be retrieved, or when no address matches the node IP.
func findK8SNodeIPLink() (netlink.Link, error) {
	nodeIP := node.GetK8sNodeIP()
	if nodeIP == nil {
		return nil, fmt.Errorf("Failed to find K8s node device as node IP is not known")
	}

	// Only addresses of the node IP's own family can match.
	family := netlink.FAMILY_V6
	if nodeIP.To4() != nil {
		family = netlink.FAMILY_V4
	}

	// No link is given, so the listing is not restricted to a single link.
	addrs, err := netlink.AddrList(nil, family)
	if err != nil {
		// Report the netlink failure itself rather than masking it as
		// "device not found", which would hide the real cause from the caller.
		return nil, fmt.Errorf("listing addresses to find K8s node device: %w", err)
	}

	for _, a := range addrs {
		if !a.IP.Equal(nodeIP) {
			continue
		}
		link, err := netlink.LinkByIndex(a.LinkIndex)
		if err != nil {
			return nil, err
		}
		return link, nil
	}
	return nil, fmt.Errorf("K8s node device not found")
}

// supportL3Dev reports whether L3 devices can be supported. It asks a fresh
// probe manager for the helpers of the "sched_cls" program type and returns
// true only if that set exists and contains "bpf_skb_change_head".
//
// NOTE(review): the previous comment tied this to bpf_skb_redirect_peer, but
// the helper probed here is bpf_skb_change_head — confirm which is intended.
func supportL3Dev() bool {
	helpers := probes.NewProbeManager().GetHelpers("sched_cls")
	if helpers == nil {
		return false
	}
	_, ok := helpers["bpf_skb_change_head"]
	return ok
}