Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datapath: Support NodePort BPF on L2-less devices #14858

Merged
merged 5 commits into from
Mar 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bpf/include/bpf/helpers_skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ static int BPF_FUNC(skb_change_proto, struct __sk_buff *skb, __u32 proto,
__u32 flags);
static int BPF_FUNC(skb_change_tail, struct __sk_buff *skb, __u32 nlen,
__u32 flags);
static int BPF_FUNC(skb_change_head, struct __sk_buff *skb, __u32 head_room,
__u64 flags);

static int BPF_FUNC(skb_pull_data, struct __sk_buff *skb, __u32 len);

Expand Down
7 changes: 6 additions & 1 deletion bpf/include/linux/if_ether.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
*/

#define ETH_ALEN 6 /* Octets in one ethernet addr */
#define ETH_HLEN 14 /* Total octets in header. */
/* The kernel's if_ether.h names this constant ETH_HLEN; this copy deliberately
 * renames it to __ETH_HLEN. In the Cilium datapath, ETH_HLEN can instead be
 * loaded via static data, and for L2-less devices it is 0. Prefixing the
 * kernel's constant avoids replacing every occurrence of ETH_HLEN in the
 * datapath.
 */
#define __ETH_HLEN 14 /* Total octets in header. */
brb marked this conversation as resolved.
Show resolved Hide resolved
#define ETH_ZLEN 60 /* Min. octets in frame sans FCS */
#define ETH_DATA_LEN 1500 /* Max. octets in payload */
#define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */
Expand Down
2 changes: 1 addition & 1 deletion bpf/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ case "${MODE}" in
__mac; })"
for NATIVE_DEV in ${NATIVE_DEVS//;/ }; do
IDX=$(cat /sys/class/net/${NATIVE_DEV}/ifindex)
MAC=$(ip link show $NATIVE_DEV | grep ether | awk '{print $2}')
MAC=$(ip link show $NATIVE_DEV | grep ether | awk '{print $2}' || echo "00:00:00:00:00:00")
MAC=$(mac2array $MAC)
MAC_BY_IFINDEX_MACRO="${MAC_BY_IFINDEX_MACRO} case ${IDX}: {union macaddr __tmp = {.addr = ${MAC}}; __mac=__tmp;} break; \\\\\n"
done
Expand Down
30 changes: 26 additions & 4 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <linux/in.h>
#include <linux/socket.h>

#include "eth.h"
#include "endian.h"
#include "mono.h"
#include "config.h"
Expand Down Expand Up @@ -118,6 +119,14 @@ static __always_inline bool validate_ethertype(struct __ctx_buff *ctx,
void *data_end = ctx_data_end(ctx);
struct ethhdr *eth = data;

if (ETH_HLEN == 0) {
/* The packet is received on L2-less device. Determine L3
* protocol from skb->protocol.
*/
*proto = ctx_get_protocol(ctx);
return true;
}

if (data + ETH_HLEN > data_end)
return false;
*proto = eth->h_proto;
Expand All @@ -127,10 +136,11 @@ static __always_inline bool validate_ethertype(struct __ctx_buff *ctx,
}

static __always_inline __maybe_unused bool
__revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
void **l3, const __u32 l3_len, const bool pull)
____revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
void **l3, const __u32 l3_len, const bool pull,
__u8 eth_hlen)
{
const __u32 tot_len = ETH_HLEN + l3_len;
const __u32 tot_len = eth_hlen + l3_len;
void *data_end;
void *data;

Expand All @@ -146,10 +156,18 @@ __revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
*data_ = data;
*data_end_ = data_end;

*l3 = data + ETH_HLEN;
*l3 = data + eth_hlen;
return true;
}

/* __revalidate_data_pull() forwards ctx, data, data_end, l3, l3_len and pull
 * unchanged to ____revalidate_data_pull(), passing ETH_HLEN as the L2 header
 * length, and returns that function's result.
 */
static __always_inline __maybe_unused bool
__revalidate_data_pull(struct __ctx_buff *ctx, void **data, void **data_end,
void **l3, const __u32 l3_len, const bool pull)
{
return ____revalidate_data_pull(ctx, data, data_end, l3, l3_len, pull,
ETH_HLEN);
}

/* revalidate_data_pull() initializes the provided pointers from the ctx and
* ensures that the data is pulled in for access. Should be used the first
* time that the ctx data is accessed, subsequent calls can be made to
Expand All @@ -174,6 +192,10 @@ __revalidate_data_pull(struct __ctx_buff *ctx, void **data_, void **data_end_,
#define revalidate_data(ctx, data, data_end, ip) \
__revalidate_data_pull(ctx, data, data_end, (void **)ip, sizeof(**ip), false)

/* revalidate_data_with_eth_hlen() expands to a ____revalidate_data_pull() call
 * with pull set to false, an L3 length of sizeof(**ip), and the L2 header
 * length taken from the caller-supplied eth_len argument.
 */
#define revalidate_data_with_eth_hlen(ctx, data, data_end, ip, eth_len) \
____revalidate_data_pull(ctx, data, data_end, (void **)ip, \
sizeof(**ip), false, eth_len)

/* Macros for working with L3 cilium defined IPV6 addresses */
#define BPF_V6(dst, ...) BPF_V6_1(dst, fetch_ipv6(__VA_ARGS__))
#define BPF_V6_1(dst, ...) BPF_V6_4(dst, __VA_ARGS__)
Expand Down
9 changes: 8 additions & 1 deletion bpf/lib/eth.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <linux/if_ether.h>

#ifndef ETH_HLEN
#define ETH_HLEN 14
#define ETH_HLEN __ETH_HLEN
#endif

#ifndef ETH_ALEN
Expand Down Expand Up @@ -109,4 +109,11 @@ static __always_inline int eth_store_daddr(struct __ctx_buff *ctx,
#endif
}

/* eth_store_proto() writes the two bytes of proto into the packet at the
 * position that follows two ETH_ALEN-sized fields, counted from off.
 * The return value is whatever ctx_store_bytes() returns.
 */
static __always_inline int eth_store_proto(struct __ctx_buff *ctx,
					   const __u16 proto, int off)
{
	/* Skip the two ETH_ALEN-byte fields that precede the proto field. */
	const int proto_off = off + ETH_ALEN + ETH_ALEN;

	return ctx_store_bytes(ctx, proto_off, &proto, sizeof(proto), 0);
}

#endif /* __LIB_ETH__ */
96 changes: 94 additions & 2 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,32 @@ static __always_inline bool dsr_is_too_big(struct __ctx_buff *ctx __maybe_unused
return false;
}

/* maybe_add_l2_hdr() prepares the packet for a redirect to ifindex.
 *
 * - If IS_L3_DEV(ifindex) holds, *l2_hdr_required is set to false and the
 *   packet is left untouched.
 * - Otherwise, if ETH_HLEN is 0, __ETH_HLEN bytes of head room are added via
 *   ctx_change_head() and the protocol read from ctx_get_protocol() is
 *   written with eth_store_proto() at offset 0.
 * - In every other case nothing is changed; *l2_hdr_required keeps the value
 *   the caller initialised it with.
 *
 * Returns 0 on success, DROP_INVALID if ctx_change_head() fails, or
 * DROP_WRITE_ERROR if eth_store_proto() fails.
 */
static __always_inline int
maybe_add_l2_hdr(struct __ctx_buff *ctx __maybe_unused,
		 __u32 ifindex __maybe_unused,
		 bool *l2_hdr_required __maybe_unused)
{
	__u16 l3_proto;

	if (IS_L3_DEV(ifindex)) {
		/* NodePort request is going to be redirected to L3 dev, so
		 * skip L2 addr settings.
		 */
		*l2_hdr_required = false;
		return 0;
	}

	if (ETH_HLEN != 0)
		return 0;

	/* NodePort request is going to be redirected from L3 to L2 dev, so we
	 * need to create L2 hdr first. Read the protocol before growing the
	 * head room, as the original code does.
	 */
	l3_proto = ctx_get_protocol(ctx);

	if (ctx_change_head(ctx, __ETH_HLEN, 0) != 0)
		return DROP_INVALID;

	if (eth_store_proto(ctx, l3_proto, 0) < 0)
		return DROP_WRITE_ERROR;

	return 0;
}

#ifdef ENABLE_IPV6
static __always_inline bool nodeport_uses_dsr6(const struct ipv6_ct_tuple *tuple)
{
Expand Down Expand Up @@ -492,6 +518,7 @@ int tail_nodeport_ipv6_dsr(struct __ctx_buff *ctx)
struct ipv6hdr *ip6;
union v6addr addr;
int ret, ohead = 0;
bool l2_hdr_required = true;

if (!revalidate_data(ctx, &data, &data_end, &ip6)) {
ret = DROP_INVALID;
Expand Down Expand Up @@ -522,6 +549,16 @@ int tail_nodeport_ipv6_dsr(struct __ctx_buff *ctx)
goto drop_err;
}

ret = maybe_add_l2_hdr(ctx, DIRECT_ROUTING_DEV_IFINDEX,
&l2_hdr_required);
if (ret != 0)
goto drop_err;
if (!l2_hdr_required)
goto out_send;
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end, &ip6,
__ETH_HLEN))
return DROP_INVALID;

if (nodeport_lb_hairpin())
dmac = map_lookup_elem(&NODEPORT_NEIGH6, &ip6->daddr);
if (dmac) {
Expand Down Expand Up @@ -560,6 +597,7 @@ int tail_nodeport_ipv6_dsr(struct __ctx_buff *ctx)
}
}

out_send:
return ctx_redirect(ctx, fib_params.l.ifindex, 0);
drop_err:
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
Expand All @@ -585,6 +623,7 @@ int tail_nodeport_nat_ipv6(struct __ctx_buff *ctx)
union macaddr *dmac = NULL;
void *data, *data_end;
struct ipv6hdr *ip6;
bool l2_hdr_required = true;

target.addr = tmp;
#ifdef ENCAP_IFINDEX
Expand Down Expand Up @@ -653,6 +692,16 @@ int tail_nodeport_nat_ipv6(struct __ctx_buff *ctx)
goto drop_err;
}

ret = maybe_add_l2_hdr(ctx, DIRECT_ROUTING_DEV_IFINDEX,
&l2_hdr_required);
if (ret != 0)
goto drop_err;
if (!l2_hdr_required)
goto out_send;
brb marked this conversation as resolved.
Show resolved Hide resolved
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end, &ip6,
__ETH_HLEN))
return DROP_INVALID;

if (nodeport_lb_hairpin())
dmac = map_lookup_elem(&NODEPORT_NEIGH6, &ip6->daddr);
if (dmac) {
Expand Down Expand Up @@ -691,7 +740,7 @@ int tail_nodeport_nat_ipv6(struct __ctx_buff *ctx)
goto drop_err;
}
}
out_send: __maybe_unused
out_send:
return ctx_redirect(ctx, fib_params.l.ifindex, 0);
drop_err:
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP,
Expand Down Expand Up @@ -875,6 +924,7 @@ static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, int *ifindex
struct bpf_fib_lookup fib_params = {};
union macaddr *dmac = NULL;
__u32 monitor = 0;
bool l2_hdr_required = true;

if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;
Expand Down Expand Up @@ -930,6 +980,15 @@ static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, int *ifindex
}
#endif

ret = maybe_add_l2_hdr(ctx, *ifindex, &l2_hdr_required);
if (ret != 0)
return ret;
if (!l2_hdr_required)
return CTX_ACT_OK;
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end,
&ip6, __ETH_HLEN))
return DROP_INVALID;

if (fib_lookup_bypass())
dmac = map_lookup_elem(&NODEPORT_NEIGH6, &tuple.daddr);
if (dmac) {
Expand Down Expand Up @@ -1422,6 +1481,7 @@ int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx)
void *data, *data_end;
int ret, ohead = 0;
struct iphdr *ip4;
bool l2_hdr_required = true;

if (!revalidate_data(ctx, &data, &data_end, &ip4)) {
ret = DROP_INVALID;
Expand Down Expand Up @@ -1449,6 +1509,16 @@ int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx)
goto drop_err;
}

ret = maybe_add_l2_hdr(ctx, DIRECT_ROUTING_DEV_IFINDEX,
&l2_hdr_required);
if (ret != 0)
goto drop_err;
if (!l2_hdr_required)
goto out_send;
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end, &ip4,
__ETH_HLEN))
return DROP_INVALID;

if (nodeport_lb_hairpin())
dmac = map_lookup_elem(&NODEPORT_NEIGH4, &ip4->daddr);
if (dmac) {
Expand Down Expand Up @@ -1485,6 +1555,7 @@ int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx)
}
}

out_send:
return ctx_redirect(ctx, fib_params.l.ifindex, 0);
drop_err:
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
Expand All @@ -1509,6 +1580,7 @@ int tail_nodeport_nat_ipv4(struct __ctx_buff *ctx)
union macaddr *dmac = NULL;
void *data, *data_end;
struct iphdr *ip4;
bool l2_hdr_required = true;

target.addr = IPV4_DIRECT_ROUTING;
#ifdef ENCAP_IFINDEX
Expand Down Expand Up @@ -1575,6 +1647,16 @@ int tail_nodeport_nat_ipv4(struct __ctx_buff *ctx)
goto drop_err;
}

ret = maybe_add_l2_hdr(ctx, DIRECT_ROUTING_DEV_IFINDEX,
&l2_hdr_required);
if (ret != 0)
goto drop_err;
if (!l2_hdr_required)
goto out_send;
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end, &ip4,
__ETH_HLEN))
return DROP_INVALID;

if (nodeport_lb_hairpin())
dmac = map_lookup_elem(&NODEPORT_NEIGH4, &ip4->daddr);
if (dmac) {
Expand Down Expand Up @@ -1611,7 +1693,7 @@ int tail_nodeport_nat_ipv4(struct __ctx_buff *ctx)
goto drop_err;
}
}
out_send: __maybe_unused
out_send:
return ctx_redirect(ctx, fib_params.l.ifindex, 0);
drop_err:
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP,
Expand Down Expand Up @@ -1806,6 +1888,7 @@ static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex
struct bpf_fib_lookup fib_params = {};
union macaddr *dmac = NULL;
__u32 monitor = 0;
bool l2_hdr_required = true;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
Expand Down Expand Up @@ -1857,6 +1940,15 @@ static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex
}
#endif

ret = maybe_add_l2_hdr(ctx, *ifindex, &l2_hdr_required);
if (ret != 0)
return ret;
if (!l2_hdr_required)
return CTX_ACT_OK;
else if (!revalidate_data_with_eth_hlen(ctx, &data, &data_end,
&ip4, __ETH_HLEN))
return DROP_INVALID;

if (fib_lookup_bypass())
dmac = map_lookup_elem(&NODEPORT_NEIGH4, &ip4->daddr);
if (dmac) {
Expand Down
6 changes: 6 additions & 0 deletions bpf/lib/overloadable_skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,10 @@ ctx_set_xfer(struct __sk_buff *ctx __maybe_unused, __u32 meta __maybe_unused)
/* Only possible from XDP -> SKB. */
}

/* ctx_change_head() for a struct __sk_buff context: passes ctx, head_room and
 * flags straight to skb_change_head() and returns its result.
 */
static __always_inline __maybe_unused int
ctx_change_head(struct __sk_buff *ctx, __u32 head_room, __u64 flags)
{
return skb_change_head(ctx, head_room, flags);
}

#endif /* __LIB_OVERLOADABLE_SKB_H_ */
8 changes: 8 additions & 0 deletions bpf/lib/overloadable_xdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,12 @@ static __always_inline __maybe_unused void ctx_set_xfer(struct xdp_md *ctx,
ctx_store_meta(ctx, XFER_MARKER, meta);
}

/* ctx_change_head() for a struct xdp_md context: a stub that ignores ctx,
 * head_room and flags and always returns 0, leaving the packet unmodified.
 */
static __always_inline __maybe_unused int
ctx_change_head(struct xdp_md *ctx __maybe_unused,
__u32 head_room __maybe_unused,
__u64 flags __maybe_unused)
{
return 0; /* Only intended for SKB context. */
}

#endif /* __LIB_OVERLOADABLE_XDP_H_ */
1 change: 1 addition & 0 deletions bpf/node_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ DEFINE_IPV6(HOST_IP, 0xbe, 0xef, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0xa, 0x
# define IPV6_RSS_PREFIX IPV6_DIRECT_ROUTING
# define IPV6_RSS_PREFIX_BITS 128
# endif
#define IS_L3_DEV(ifindex) false
#endif

#ifdef ENABLE_SRC_RANGE_CHECK
Expand Down
Loading