Skip to content

Commit

Permalink
cilium: add bandwidth manager
Browse files Browse the repository at this point in the history
Base layer which implements setup of BBR + {MQ/FQ, FQ} as well as
EDT based rate-limiting in BPF. Agent code implements map setup and
handling of egress bandwidth label for Pods.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
borkmann committed Aug 25, 2020
1 parent 423e855 commit 29f4654
Show file tree
Hide file tree
Showing 36 changed files with 590 additions and 30 deletions.
2 changes: 2 additions & 0 deletions bpf/Makefile
Expand Up @@ -56,6 +56,8 @@ LB_OPTIONS = \
-DENABLE_IPV4:-DENABLE_IPSEC:-DENABLE_HOST_SERVICES_UDP:-DENABLE_HOST_SERVICES_TCP:-DENABLE_NODEPORT:-DENABLE_EXTERNAL_IP:-DENABLE_NODEPORT_ACCELERATION:-DENABLE_SESSION_AFFINITY \
-DENABLE_IPV6:-DENABLE_IPSEC:-DENABLE_HOST_SERVICES_UDP:-DENABLE_HOST_SERVICES_TCP:-DENABLE_NODEPORT:-DENABLE_EXTERNAL_IP:-DENABLE_NODEPORT_ACCELERATION:-DENABLE_SESSION_AFFINITY \
-DENABLE_IPV6:-DENABLE_IPSEC:-DENABLE_HOST_SERVICES_UDP:-DENABLE_HOST_SERVICES_TCP:-DENABLE_NODEPORT:-DENABLE_EXTERNAL_IP:-DENABLE_NODEPORT_ACCELERATION:-DENABLE_SESSION_AFFINITY:-DENABLE_SRC_RANGE_CHECK \
-DENABLE_IPV6:-DENABLE_IPSEC:-DENABLE_HOST_SERVICES_UDP:-DENABLE_HOST_SERVICES_TCP:-DENABLE_NODEPORT:-DENABLE_EXTERNAL_IP:-DENABLE_NODEPORT_ACCELERATION:-DENABLE_SESSION_AFFINITY:-DENABLE_BANDWIDTH_MANAGER \
-DENABLE_IPV6:-DENABLE_IPSEC:-DENABLE_HOST_SERVICES_UDP:-DENABLE_HOST_SERVICES_TCP:-DENABLE_NODEPORT:-DENABLE_EXTERNAL_IP:-DENABLE_NODEPORT_ACCELERATION:-DENABLE_SESSION_AFFINITY:-DENABLE_BANDWIDTH_MANAGER:-DENABLE_SRC_RANGE_CHECK

# These options are intended to max out the BPF program complexity. It is load
# tested as well.
Expand Down
2 changes: 2 additions & 0 deletions bpf/bpf_alignchecker.c
Expand Up @@ -81,6 +81,8 @@ int main(void)
DECLARE(struct, lb_affinity_match, iter);
DECLARE(struct, lb4_src_range_key, iter);
DECLARE(struct, lb6_src_range_key, iter);
DECLARE(struct, edt_id, iter);
DECLARE(struct, edt_info, iter);

return 0;
}
14 changes: 12 additions & 2 deletions bpf/bpf_host.c
Expand Up @@ -26,6 +26,7 @@
#endif

#include "lib/common.h"
#include "lib/edt.h"
#include "lib/arp.h"
#include "lib/maps.h"
#include "lib/ipv6.h"
Expand Down Expand Up @@ -883,6 +884,10 @@ int from_netdev(struct __ctx_buff *ctx)
/* Program attached to the "from-host" section. It clears the EDT aggregate
 * on the packet and then hands it to handle_netdev().
 */
__section("from-host")
int from_host(struct __ctx_buff *ctx)
{
	/* Packets coming from the host ns via the cilium_host device must not
	 * be EDT rate-limited, so they are tagged with the zero aggregate,
	 * which edt_sched_departure() lets through without throttling.
	 */
	const __u32 no_aggregate = 0;

	edt_set_aggregate(ctx, no_aggregate);

	return handle_netdev(ctx, true);
}

Expand Down Expand Up @@ -927,14 +932,19 @@ int to_netdev(struct __ctx_buff *ctx __maybe_unused)
ret = DROP_UNKNOWN_L3;
break;
}


out:
if (IS_ERR(ret))
return send_drop_notify_error(ctx, src_id, ret, CTX_ACT_DROP,
METRIC_EGRESS);
#endif /* ENABLE_HOST_FIREWALL */

#if defined(ENABLE_BANDWIDTH_MANAGER)
ret = edt_sched_departure(ctx);
/* No send_drop_notify_error() here given we're rate-limiting. */
if (ret == CTX_ACT_DROP)
return ret;
#endif

#if defined(ENABLE_NODEPORT) && \
(!defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
Expand Down
2 changes: 2 additions & 0 deletions bpf/bpf_lxc.c
Expand Up @@ -16,6 +16,7 @@
#include "lib/config.h"
#include "lib/maps.h"
#include "lib/arp.h"
#include "lib/edt.h"
#include "lib/ipv6.h"
#include "lib/ipv4.h"
#include "lib/icmp6.h"
Expand Down Expand Up @@ -801,6 +802,7 @@ int handle_xgress(struct __ctx_buff *ctx)
int ret;

bpf_clear_meta(ctx);
edt_set_aggregate(ctx, LXC_ID);

send_trace_notify(ctx, TRACE_FROM_LXC, SECLABEL, 0, 0, 0, 0,
TRACE_PAYLOAD_LEN);
Expand Down
6 changes: 6 additions & 0 deletions bpf/include/bpf/ctx/skb.h
Expand Up @@ -68,6 +68,12 @@ ctx_full_len(const struct __sk_buff *ctx)
return ctx->len;
}

/* Return the wire_len field of the skb context. */
static __always_inline __maybe_unused __u32
ctx_wire_len(const struct __sk_buff *ctx)
{
	const __u32 wire_len = ctx->wire_len;

	return wire_len;
}

static __always_inline __maybe_unused void
ctx_store_meta(struct __sk_buff *ctx, const __u32 off, __u32 data)
{
Expand Down
6 changes: 6 additions & 0 deletions bpf/include/bpf/ctx/xdp.h
Expand Up @@ -251,6 +251,12 @@ ctx_full_len(const struct xdp_md *ctx)
return ctx_data_end(ctx) - ctx_data(ctx);
}

/* Wire length for an XDP context: reported as whatever ctx_full_len()
 * returns for the same context.
 */
static __always_inline __maybe_unused __u32
ctx_wire_len(const struct xdp_md *ctx)
{
	const __u32 len = ctx_full_len(ctx);

	return len;
}

struct bpf_elf_map __section_maps cilium_xdp_scratch = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.size_key = sizeof(int),
Expand Down
3 changes: 3 additions & 0 deletions bpf/include/bpf/helpers_skb.h
Expand Up @@ -44,6 +44,9 @@ static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb,
const struct bpf_tunnel_key *from, __u32 size,
__u32 flags);

/* Packet classification (egress) */
static __u64 BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb);

/* Events for user space */
static int BPF_FUNC_REMAP(skb_event_output, struct __sk_buff *skb, void *map,
__u64 index, const void *data, __u32 size) =
Expand Down
14 changes: 6 additions & 8 deletions bpf/include/linux/bpf.h
Expand Up @@ -3590,19 +3590,12 @@ struct __sk_buff {
/* ... here. */

__u32 data_meta;

#if 0
/* TODO: compiled out for now since this causes verifier breakage
* on older kernels resulting in invalid bpf_context access. Needs
* follow-up investigation.
*/
__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
__u64 tstamp;
__u32 wire_len;
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
__u32 gso_size;
#endif
};

struct bpf_tunnel_key {
Expand Down Expand Up @@ -3927,7 +3920,12 @@ struct bpf_sock_addr {
__u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
* Stored in network byte order.
*/
#if 0 /* TODO, see comment in __sk_buff */

/* TODO: compiled out for now since this causes verifier breakage
* on older kernels resulting in invalid bpf_context access. Needs
* follow-up investigation.
*/
#if 0
__bpf_md_ptr(struct bpf_sock *, sk);
#endif
};
Expand Down
11 changes: 11 additions & 0 deletions bpf/lib/common.h
Expand Up @@ -208,6 +208,17 @@ struct endpoint_info {
__u32 pad[4];
};

/* Identifies an EDT aggregate. Used as the key type of THROTTLE_MAP; the
 * BPF side fills it from the aggregate stored in the packet context.
 */
struct edt_id {
__u64 id; /* v1 net_cls tag or v2 cgrp id */
};

/* Per-aggregate rate-limiting state. Used as the value type of THROTTLE_MAP
 * and read/updated by edt_sched_departure().
 */
struct edt_info {
/* Configured rate. edt_sched_departure() derives the per-packet delay as
 * wire_len * NSEC_PER_SEC / bps, so this is presumably bytes per second --
 * TODO confirm the unit against the agent code that fills the map.
 */
__u64 bps;
/* Departure timestamp most recently recorded for this aggregate, in the
 * same time base as ktime_get_ns() / ctx->tstamp.
 */
__u64 t_last;
/* Drop horizon: a packet is dropped when its computed departure time is
 * at least this far ahead of the current time.
 */
__u64 t_horizon_drop;
/* Not referenced by the BPF code shown here; presumably reserved space. */
__u64 pad[4];
};

struct remote_endpoint_info {
__u32 sec_label;
__u32 tunnel_endpoint;
Expand Down
80 changes: 80 additions & 0 deletions bpf/lib/edt.h
@@ -0,0 +1,80 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2020 Authors of Cilium */

#ifndef __EDT_H_
#define __EDT_H_

#include "common.h"
#include "time.h"
#include "maps.h"

#ifdef ENABLE_BANDWIDTH_MANAGER
/* Tag the packet with its EDT aggregate by storing the value in the
 * context's queue_mapping field, where edt_get_aggregate() reads it back.
 */
static __always_inline void
edt_set_aggregate(struct __ctx_buff *ctx, __u32 aggregate)
{
	/* Only 16 bits are currently used for the aggregate; the value is
	 * preserved in the host ns.
	 */
	ctx->queue_mapping = aggregate;
}

/* Fetch the EDT aggregate stored in the context's queue_mapping field and
 * clear that field. Returns the value that was stored.
 */
static __always_inline __u32
edt_get_aggregate(struct __ctx_buff *ctx)
{
	const __u32 id = ctx->queue_mapping;

	/* Clear the queue mapping so that a fresh mapping is picked based on
	 * the skb hash; see netdev_pick_tx().
	 */
	ctx->queue_mapping = 0;
	return id;
}

/* Assign an earliest departure time (EDT) to the packet according to the
 * rate configured for its aggregate in THROTTLE_MAP.
 *
 * Only IPv4/IPv6 packets carrying a non-zero aggregate that has a map entry
 * with a non-zero rate are throttled; everything else passes unchanged.
 * Note that the aggregate is consumed (queue_mapping reset) as a side effect
 * of edt_get_aggregate().
 *
 * Returns CTX_ACT_OK when the packet may proceed (ctx->tstamp may have been
 * moved into the future), or CTX_ACT_DROP when the computed departure time
 * lies at or beyond the aggregate's drop horizon.
 */
static __always_inline int edt_sched_departure(struct __ctx_buff *ctx)
{
	__u64 delay, now, t, t_next, bps;
	struct edt_id aggregate;
	struct edt_info *info;
	__u16 proto;

	if (!validate_ethertype(ctx, &proto))
		return CTX_ACT_OK;
	if (proto != bpf_htons(ETH_P_IP) &&
	    proto != bpf_htons(ETH_P_IPV6))
		return CTX_ACT_OK;

	aggregate.id = edt_get_aggregate(ctx);
	if (!aggregate.id)
		return CTX_ACT_OK;

	info = map_lookup_elem(&THROTTLE_MAP, &aggregate);
	if (!info)
		return CTX_ACT_OK;

	/* The rate is the divisor of the delay computation below. Read it
	 * once and bail out on zero so we never divide by zero; a zero rate
	 * is treated as "no limit configured".
	 */
	bps = READ_ONCE(info->bps);
	if (!bps)
		return CTX_ACT_OK;

	/* Never schedule into the past: start from max(ctx->tstamp, now). */
	now = ktime_get_ns();
	t = ctx->tstamp;
	if (t < now)
		t = now;

	/* Time this packet occupies at the configured rate. */
	delay = ((__u64)ctx_wire_len(ctx)) * NSEC_PER_SEC / bps;
	t_next = READ_ONCE(info->t_last) + delay;
	if (t_next <= t) {
		/* Aggregate is under its rate: no extra delay needed. */
		WRITE_ONCE(info->t_last, t);
		return CTX_ACT_OK;
	}

	/* FQ implements a drop horizon, see also 39d010504e6b ("net_sched:
	 * sch_fq: add horizon attribute"). However, we explicitly need the
	 * drop horizon here to i) avoid having t_last messed up and ii) to
	 * potentially allow for per aggregate control.
	 */
	if (t_next - now >= info->t_horizon_drop)
		return CTX_ACT_DROP;

	WRITE_ONCE(info->t_last, t_next);
	ctx->tstamp = t_next;
	return CTX_ACT_OK;
}
#else
/* No-op variant compiled when ENABLE_BANDWIDTH_MANAGER is not defined, so
 * that callers can invoke edt_set_aggregate() unconditionally.
 */
static __always_inline void edt_set_aggregate(struct __ctx_buff *ctx __maybe_unused,
					      __u32 aggregate __maybe_unused)
{
}
#endif /* ENABLE_BANDWIDTH_MANAGER */
#endif /* __EDT_H_ */
11 changes: 11 additions & 0 deletions bpf/lib/maps.h
Expand Up @@ -42,6 +42,17 @@ struct bpf_elf_map __section_maps POLICY_CALL_MAP = {
};
#endif /* SKIP_POLICY_MAP */

#ifdef ENABLE_BANDWIDTH_MANAGER
/* Hash map from an EDT aggregate (struct edt_id) to its rate-limiting state
 * (struct edt_info). It is pinned with PIN_GLOBAL_NS, holds up to
 * THROTTLE_MAP_SIZE entries and is created with BPF_F_NO_PREALLOC.
 */
struct bpf_elf_map __section_maps THROTTLE_MAP = {
.type = BPF_MAP_TYPE_HASH,
.size_key = sizeof(struct edt_id),
.size_value = sizeof(struct edt_info),
.pinning = PIN_GLOBAL_NS,
.max_elem = THROTTLE_MAP_SIZE,
.flags = BPF_F_NO_PREALLOC,
};
#endif /* ENABLE_BANDWIDTH_MANAGER */

/* Map to link endpoint id to per endpoint cilium_policy map */
#ifdef SOCKMAP
struct bpf_elf_map __section_maps EP_POLICY_MAP = {
Expand Down
10 changes: 5 additions & 5 deletions bpf/lib/overloadable_skb.h
Expand Up @@ -9,11 +9,11 @@ bpf_clear_meta(struct __sk_buff *ctx)
{
__u32 zero = 0;

ctx->cb[0] = zero;
ctx->cb[1] = zero;
ctx->cb[2] = zero;
ctx->cb[3] = zero;
ctx->cb[4] = zero;
WRITE_ONCE(ctx->cb[0], zero);
WRITE_ONCE(ctx->cb[1], zero);
WRITE_ONCE(ctx->cb[2], zero);
WRITE_ONCE(ctx->cb[3], zero);
WRITE_ONCE(ctx->cb[4], zero);
}

/**
Expand Down
2 changes: 2 additions & 0 deletions bpf/node_config.h
Expand Up @@ -91,6 +91,8 @@ DEFINE_IPV6(HOST_IP, 0xbe, 0xef, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0xa, 0x
#define LB4_AFFINITY_MAP test_cilium_lb4_affinity
#define LB6_AFFINITY_MAP test_cilium_lb6_affinity
#define LB_AFFINITY_MATCH_MAP test_cilium_lb_affinity_match
#define THROTTLE_MAP test_cilium_throttle
#define THROTTLE_MAP_SIZE 65536
#define ENABLE_ARP_RESPONDER
#define TUNNEL_ENDPOINT_MAP_SIZE 65536
#define ENDPOINTS_MAP_SIZE 65536
Expand Down
3 changes: 3 additions & 0 deletions bugtool/cmd/configuration.go
Expand Up @@ -54,10 +54,13 @@ func defaultCommands(confDir string, cmdDir string, k8sPods []string) []string {
"ip -6 n",
"ss -t -p -a -i -s",
"ss -u -p -a -i -s",
"tc qdisc show",
"tc -d -s qdisc show",
"uname -a",
"top -b -n 1",
"uptime",
"dmesg --time-format=iso",
"sysctl -a",
"bpftool map show",
"bpftool prog show",
// LB and CT map for debugging services; using bpftool for a reliable dump
Expand Down
10 changes: 9 additions & 1 deletion daemon/cmd/daemon.go
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/cilium/cilium/api/v1/models"
health "github.com/cilium/cilium/cilium-health/launch"
"github.com/cilium/cilium/pkg/bandwidth"
"github.com/cilium/cilium/pkg/bpf"
"github.com/cilium/cilium/pkg/clustermesh"
"github.com/cilium/cilium/pkg/controller"
Expand Down Expand Up @@ -201,6 +202,8 @@ func (d *Daemon) init() error {
sockops.SkmsgDisable()

if !option.Config.DryMode {
bandwidth.InitBandwidthManager()

if err := d.createNodeConfigHeaderfile(); err != nil {
return err
}
Expand Down Expand Up @@ -434,12 +437,17 @@ func NewDaemon(ctx context.Context, epMgr *endpointmanager.EndpointManager, dp d
bootstrapStats.k8sInit.End(true)
}

// Perform an early probe on the underlying kernel on whether BandwidthManager
// can be supported or not. This needs to be done before detectNativeDevices()
// as BandwidthManager needs these to be available for setup.
bandwidth.ProbeBandwidthManager()

// The kube-proxy replacement and host-fw devices detection should happen after
// establishing a connection to kube-apiserver, but before starting a k8s watcher.
// This is because the device detection requires self (Cilium)Node object,
// and the k8s service watcher depends on option.Config.EnableNodePort flag
// which can be modified after the device detection.
detectDevicesForNodePortAndHostFirewall(isKubeProxyReplacementStrict)
detectNativeDevices(isKubeProxyReplacementStrict)
finishKubeProxyReplacementInit(isKubeProxyReplacementStrict)

// BPF masquerade depends on BPF NodePort, so the following checks should
Expand Down
3 changes: 3 additions & 0 deletions daemon/cmd/daemon_main.go
Expand Up @@ -520,6 +520,9 @@ func init() {
flags.Bool(option.EnableSVCSourceRangeCheck, true, "Enable check of service source ranges (currently, only for LoadBalancer)")
option.BindEnv(option.EnableSVCSourceRangeCheck)

flags.Bool(option.EnableBandwidthManager, false, "Enable BPF bandwidth manager")
option.BindEnv(option.EnableBandwidthManager)

flags.String(option.NodePortMode, option.NodePortModeSNAT, "BPF NodePort mode (\"snat\", \"dsr\", \"hybrid\")")
option.BindEnv(option.NodePortMode)

Expand Down
9 changes: 8 additions & 1 deletion daemon/cmd/endpoint.go
Expand Up @@ -26,6 +26,7 @@ import (
. "github.com/cilium/cilium/api/v1/server/restapi/endpoint"
"github.com/cilium/cilium/pkg/annotation"
"github.com/cilium/cilium/pkg/api"
"github.com/cilium/cilium/pkg/bandwidth"
"github.com/cilium/cilium/pkg/endpoint"
endpointid "github.com/cilium/cilium/pkg/endpoint/id"
"github.com/cilium/cilium/pkg/endpoint/regeneration"
Expand Down Expand Up @@ -430,9 +431,15 @@ func (d *Daemon) createEndpoint(ctx context.Context, owner regeneration.Owner, e
if err != nil {
return "", err
}

return p.Annotations[annotation.ProxyVisibility], nil
})
ep.UpdateBandwidthPolicy(func(ns, podName string) (bandwidthEgress string, err error) {
p, err := d.k8sWatcher.GetCachedPod(ns, podName)
if err != nil {
return "", err
}
return p.Annotations[bandwidth.EgressBandwidth], nil
})
}

regenTriggered := ep.UpdateLabels(ctx, addLabels, infoLabels, true)
Expand Down
7 changes: 3 additions & 4 deletions daemon/cmd/kube_proxy_replacement.go
Expand Up @@ -233,11 +233,10 @@ func initKubeProxyReplacementOptions() (strict bool) {
return
}

// detectDevicesForNodePortAndHostFirewall tries to detect bpf_host devices
// (if needed).
func detectDevicesForNodePortAndHostFirewall(strict bool) {
// detectNativeDevices tries to detect bpf_host devices (if needed).
func detectNativeDevices(strict bool) {
detectNodePortDevs := len(option.Config.Devices) == 0 &&
(option.Config.EnableNodePort || option.Config.EnableHostFirewall)
(option.Config.EnableNodePort || option.Config.EnableHostFirewall || option.Config.EnableBandwidthManager)
detectDirectRoutingDev := option.Config.EnableNodePort &&
option.Config.DirectRoutingDevice == ""
if detectNodePortDevs || detectDirectRoutingDev {
Expand Down
Expand Up @@ -400,6 +400,7 @@ data:
iptables-lock-timeout: {{ .Values.global.iptablesLockTimeout | quote }}
{{- end }}
auto-direct-node-routes: {{ .Values.global.autoDirectNodeRoutes | quote }}
enable-bandwidth-manager: {{ .Values.global.bandwidthManager | quote }}
{{- if .Values.global.nativeRoutingCIDR }}
native-routing-cidr: {{ .Values.global.nativeRoutingCIDR }}
{{- end }}
Expand Down

0 comments on commit 29f4654

Please sign in to comment.