bpf: avoid SNAT tracking for overlay traffic #31082

Merged · 5 commits · Apr 5, 2024
46 changes: 18 additions & 28 deletions bpf/bpf_host.c
@@ -1311,9 +1311,8 @@ int cil_from_host(struct __ctx_buff *ctx)

#if defined(ENABLE_ENCRYPTED_OVERLAY)
/*
* If the traffic is indeed overlay traffic and it should be encrypted
* CTX_ACT_REDIRECT is returned, unless an error occurred, and the caller can
* return this code to TC.
If the traffic should be encrypted, CTX_ACT_REDIRECT is returned (unless an
error occurred), and the caller can return this code to TC.
*
* CTX_ACT_OK is returned if the traffic should continue normal processing.
*
@@ -1322,23 +1321,12 @@ int cil_from_host(struct __ctx_buff *ctx)
static __always_inline int do_encrypt_overlay(struct __ctx_buff *ctx)
{
int ret = CTX_ACT_OK;
__u16 proto = 0;
struct iphdr __maybe_unused *ipv4;
void __maybe_unused *data, *data_end = NULL;

/* we require a valid layer 2 to proceed */
if (!validate_ethertype(ctx, &proto))
return ret;

if (proto != bpf_htons(ETH_P_IP))
return ret;

if (!revalidate_data(ctx, &data, &data_end, &ipv4))
return DROP_INVALID;

if (!vxlan_skb_is_vxlan_v4(data, data_end, ipv4, TUNNEL_PORT))
return ret;

if (vxlan_get_vni(data, data_end, ipv4) == ENCRYPTED_OVERLAY_ID)
ret = encrypt_overlay_and_redirect(ctx, data, data_end, ipv4);

@@ -1443,21 +1431,23 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
#endif

#if defined(ENABLE_ENCRYPTED_OVERLAY)
/* Determine if this is overlay traffic that should be recirculated
* to the stack for XFRM encryption.
*/
ret = do_encrypt_overlay(ctx);
if (ret == CTX_ACT_REDIRECT) {
/* we are redirecting back into the stack, so TRACE_TO_STACK
* for tracepoint
if (ctx_is_overlay(ctx)) {
/* Determine if this is overlay traffic that should be recirculated
* to the stack for XFRM encryption.
*/
send_trace_notify(ctx, TRACE_TO_STACK, 0, 0, 0,
0, TRACE_REASON_ENCRYPT_OVERLAY, 0);
return ret;
ret = do_encrypt_overlay(ctx);
if (ret == CTX_ACT_REDIRECT) {
/* we are redirecting back into the stack, so TRACE_TO_STACK
* for tracepoint
*/
send_trace_notify(ctx, TRACE_TO_STACK, 0, 0, 0,
0, TRACE_REASON_ENCRYPT_OVERLAY, 0);
return ret;
}
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP,
METRIC_EGRESS);
}
else if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP,
METRIC_EGRESS);
#endif /* ENABLE_ENCRYPTED_OVERLAY */

#ifdef ENABLE_WIREGUARD
@@ -1492,7 +1482,7 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
#endif

#ifdef ENABLE_NODEPORT
if (!ctx_snat_done(ctx)) {
if (!ctx_snat_done(ctx) && !ctx_is_overlay(ctx)) {
/*
* handle_nat_fwd tail calls in the majority of cases,
* so control might never return to this program.
16 changes: 10 additions & 6 deletions bpf/bpf_overlay.c
@@ -734,6 +734,7 @@ int cil_from_overlay(struct __ctx_buff *ctx)
__section_entry
int cil_to_overlay(struct __ctx_buff *ctx)
{
bool snat_done __maybe_unused = ctx_snat_done(ctx);
struct trace_ctx __maybe_unused trace;
int ret = TC_ACT_OK;
__u32 cluster_id __maybe_unused = 0;
@@ -755,19 +756,22 @@ int cil_to_overlay(struct __ctx_buff *ctx)
}
#endif

#ifdef ENABLE_NODEPORT
if (ctx_snat_done(ctx)) {
ret = CTX_ACT_OK;
goto out;
}

/* This must come after the ctx_snat_done() check above, since MARK_MAGIC_CLUSTER_ID
* is a superset of MARK_MAGIC_SNAT_DONE. They will never be used together,
* but the SNAT check should always take precedence.
*/
#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
cluster_id = ctx_get_cluster_id_mark(ctx);
#endif

ctx_set_overlay_mark(ctx);

#ifdef ENABLE_NODEPORT
if (snat_done) {
ret = CTX_ACT_OK;
goto out;
}

ret = handle_nat_fwd(ctx, cluster_id, &trace, &ext_err);
out:
#endif
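Note on the cil_to_overlay() change above: the SNAT-done state and the new overlay mark live in the same magic byte of skb->mark, so setting one overwrites the other. This is why the flag is now sampled into a local `snat_done` at the top of the function, before ctx_set_overlay_mark() runs. A minimal user-space sketch of that interaction (the mask value and helper bodies are assumed from common.h / overloadable_skb.h, not part of this diff):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MARK_MAGIC_HOST_MASK 0x0F00 /* assumed magic-byte mask from common.h */
#define MARK_MAGIC_SNAT_DONE 0x0300
#define MARK_MAGIC_OVERLAY   0x0400

struct skb_stub { uint32_t mark; }; /* stand-in for struct __sk_buff */

static bool ctx_snat_done(const struct skb_stub *ctx)
{
	return (ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_SNAT_DONE;
}

static void ctx_set_overlay_mark(struct skb_stub *ctx)
{
	ctx->mark &= ~MARK_MAGIC_HOST_MASK; /* clears SNAT_DONE (or CLUSTER_ID) */
	ctx->mark |= MARK_MAGIC_OVERLAY;
}

int main(void)
{
	struct skb_stub ctx = { .mark = MARK_MAGIC_SNAT_DONE };

	/* cil_to_overlay() must sample the flag before marking the packet ... */
	bool snat_done = ctx_snat_done(&ctx);

	/* ... because this overwrites the magic byte and the flag is lost. */
	ctx_set_overlay_mark(&ctx);

	printf("sampled=%d, after overlay mark=%d\n",
	       snat_done, ctx_snat_done(&ctx)); /* prints: sampled=1, after overlay mark=0 */
	return 0;
}
```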
3 changes: 2 additions & 1 deletion bpf/lib/common.h
@@ -55,7 +55,7 @@

#if defined(ENCAP_IFINDEX) || defined(ENABLE_EGRESS_GATEWAY_COMMON) || \
(defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE)
#define HAVE_ENCAP
#define HAVE_ENCAP 1

/* NOT_VTEP_DST is passed to an encapsulation function when the
* destination of the tunnel is not a VTEP.
@@ -727,6 +727,7 @@ enum metric_dir {
#define MARK_MAGIC_IDENTITY 0x0F00 /* mark carries identity */
#define MARK_MAGIC_TO_PROXY 0x0200
#define MARK_MAGIC_SNAT_DONE 0x0300
#define MARK_MAGIC_OVERLAY 0x0400

#define MARK_MAGIC_KEY_MASK 0xFF00

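Why the bare `#define HAVE_ENCAP` becomes `#define HAVE_ENCAP 1`: the new ctx_is_overlay() helper (in overloadable_skb.h below) guards itself with is_defined(HAVE_ENCAP), and the kernel-style is_defined() trick only reports 1 for macros that expand to 1. A rough sketch of that preprocessor trick, with illustrative definitions rather than the ones in Cilium's headers:

```c
/* Kernel-style "is this macro defined to 1?" trick; definitions are illustrative. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define is_defined(option) ___is_defined(option)

#define OLD_STYLE_ENCAP    /* defined, but empty: is_defined() yields 0 */
#define NEW_STYLE_ENCAP 1  /* defined to 1: is_defined() yields 1 */

_Static_assert(is_defined(NEW_STYLE_ENCAP) == 1, "defined to 1 is detected");
_Static_assert(is_defined(OLD_STYLE_ENCAP) == 0, "an empty define is not detected");
_Static_assert(is_defined(NEVER_DEFINED) == 0, "an undefined macro is not detected");

int main(void) { return 0; }
```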
46 changes: 32 additions & 14 deletions bpf/lib/nodeport.h
@@ -218,7 +218,8 @@ static __always_inline int nodeport_snat_fwd_ipv6(struct __ctx_buff *ctx,
goto out;

/* See the equivalent v4 path for comment */
ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

out:
if (ret == NAT_PUNT_TO_STACK)
@@ -1201,7 +1202,9 @@ int tail_nodeport_nat_egress_ipv6(struct __ctx_buff *ctx)
if (IS_ERR(ret))
goto drop_err;

ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#ifdef TUNNEL_MODE
if (tunnel_endpoint) {
__be16 src_port;
@@ -1476,8 +1479,8 @@ static __always_inline int nodeport_lb6(struct __ctx_buff *ctx,
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err __maybe_unused)
nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, bool *snat_done,
struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
struct bpf_fib_lookup_padded fib_params __maybe_unused = {};
struct lb6_reverse_nat *nat_info;
@@ -1523,7 +1526,7 @@ nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
*snat_done = true;
}

return CTX_ACT_OK;
@@ -1570,20 +1573,24 @@ static __always_inline int
__handle_nat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err)
{
bool snat_done = false;
int ret;

ret = nodeport_rev_dnat_fwd_ipv6(ctx, trace, ext_err);
ret = nodeport_rev_dnat_fwd_ipv6(ctx, &snat_done, trace, ext_err);
if (ret != CTX_ACT_OK)
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE_IPV6)
if (!ctx_snat_done(ctx))
if (!snat_done)
ret = tail_call_internal(ctx, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD,
ext_err);
#endif

if (is_defined(IS_BPF_HOST) && snat_done)
ctx_snat_done_set(ctx);

return ret;
}

@@ -1708,8 +1715,13 @@ static __always_inline int nodeport_snat_fwd_ipv4(struct __ctx_buff *ctx,
/* If multiple netdevs process an outgoing packet, then this packet will
* be handled multiple times by the "to-netdev" section. This can lead
* to multiple SNATs. To prevent that, set the SNAT-done flag.
*
* XDP doesn't need the flag (there's no egress prog that would utilize it),
* and for overlay traffic it makes no difference whether the inner packet
* was SNATed.
*/
ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#if defined(ENABLE_EGRESS_GATEWAY_COMMON) && defined(IS_BPF_HOST)
if (target.egress_gateway)
@@ -2728,7 +2740,9 @@ int tail_nodeport_nat_egress_ipv4(struct __ctx_buff *ctx)
if (IS_ERR(ret))
goto drop_err;

ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#ifdef TUNNEL_MODE
if (tunnel_endpoint) {
__be16 src_port;
@@ -3041,8 +3055,8 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err __maybe_unused)
nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, bool *snat_done,
struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
struct bpf_fib_lookup_padded fib_params __maybe_unused = {};
int ret, l3_off = ETH_HLEN, l4_off;
@@ -3102,7 +3116,7 @@ nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
*snat_done = true;

#ifdef ENABLE_DSR
#if defined(ENABLE_HIGH_SCALE_IPCACHE) && \
@@ -3170,23 +3184,27 @@ static __always_inline int
__handle_nat_fwd_ipv4(struct __ctx_buff *ctx, __u32 cluster_id __maybe_unused,
struct trace_ctx *trace, __s8 *ext_err)
{
bool snat_done = false;
int ret;

ret = nodeport_rev_dnat_fwd_ipv4(ctx, trace, ext_err);
ret = nodeport_rev_dnat_fwd_ipv4(ctx, &snat_done, trace, ext_err);
if (ret != CTX_ACT_OK)
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE_IPV4) || \
(defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT))
if (!ctx_snat_done(ctx)) {
if (!snat_done) {
ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id);
ret = tail_call_internal(ctx, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD,
ext_err);
}
#endif

if (is_defined(IS_BPF_HOST) && snat_done)
ctx_snat_done_set(ctx);

return ret;
}

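The nodeport.h changes all follow one pattern: the rev-DNAT forward path reports its result through a local `snat_done` out-parameter, and the skb mark is only written when the object is built as bpf_host (where another to-netdev program may see the packet again); bpf_overlay builds leave the mark alone, so the overlay mark set earlier in cil_to_overlay() is not clobbered. A condensed user-space sketch of the pattern (names, values, and the simplified is_defined() stand-in are illustrative, not the real datapath):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IS_BPF_HOST 0        /* assumption: 1 in bpf_host builds, 0 in bpf_overlay */
#define is_defined(x) (x)    /* simplified stand-in for the real macro */

#define MARK_MAGIC_HOST_MASK 0x0F00 /* assumed values, as in the earlier sketch */
#define MARK_MAGIC_SNAT_DONE 0x0300
#define MARK_MAGIC_OVERLAY   0x0400

struct skb_stub { uint32_t mark; };

static void ctx_snat_done_set(struct skb_stub *ctx)
{
	ctx->mark &= ~MARK_MAGIC_HOST_MASK;
	ctx->mark |= MARK_MAGIC_SNAT_DONE;
}

/* Stand-in for nodeport_rev_dnat_fwd_ipv4(): reports via *snat_done only. */
static int rev_dnat_fwd(struct skb_stub *ctx, bool *snat_done)
{
	(void)ctx;
	*snat_done = true; /* pretend a reverse-NAT entry matched */
	return 0;
}

/* Stand-in for __handle_nat_fwd_ipv4(). */
static int handle_nat_fwd(struct skb_stub *ctx)
{
	bool snat_done = false;
	int ret = rev_dnat_fwd(ctx, &snat_done);

	/* ... if !snat_done, the real code would tail-call the SNAT program ... */

	if (is_defined(IS_BPF_HOST) && snat_done)
		ctx_snat_done_set(ctx); /* only bpf_host persists it in the mark */
	return ret;
}

int main(void)
{
	struct skb_stub ctx = { .mark = MARK_MAGIC_OVERLAY }; /* set by cil_to_overlay */

	handle_nat_fwd(&ctx);
	/* With IS_BPF_HOST == 0 the overlay mark is preserved: prints 0x0400. */
	printf("mark magic: 0x%04x\n", ctx.mark & MARK_MAGIC_HOST_MASK);
	return 0;
}
```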
14 changes: 14 additions & 0 deletions bpf/lib/overloadable_skb.h
@@ -234,6 +234,20 @@ static __always_inline bool ctx_snat_done(const struct __sk_buff *ctx)
return (ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_SNAT_DONE;
}

static __always_inline void ctx_set_overlay_mark(struct __sk_buff *ctx)
{
ctx->mark &= ~MARK_MAGIC_HOST_MASK;
ctx->mark |= MARK_MAGIC_OVERLAY;
}

static __always_inline bool ctx_is_overlay(const struct __sk_buff *ctx)
{
if (!is_defined(HAVE_ENCAP))
return false;

return (ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_OVERLAY;
}

#ifdef HAVE_ENCAP
static __always_inline __maybe_unused int
ctx_set_encap_info(struct __sk_buff *ctx, __u32 src_ip,
31 changes: 0 additions & 31 deletions bpf/lib/vxlan.h
@@ -10,37 +10,6 @@
#include <linux/if_ether.h>
#include "lib/csum.h"

/*
* Returns true if the skb associated with data pointers is a vxlan encapsulated
* packet.
*
* The determination is made by comparing the UDP destination port with
* the tunnel_port provided to the function.
*/
static __always_inline bool
vxlan_skb_is_vxlan_v4(const void *data, const void *data_end,
const struct iphdr *ipv4, const __u16 tunnel_port)
{
struct udphdr *udp = NULL;
__u32 l3_size = 0;

if (ipv4->protocol != IPPROTO_UDP)
return false;

l3_size = ipv4->ihl * 4;

if (data + sizeof(struct ethhdr) + l3_size + sizeof(struct udphdr)
+ sizeof(struct vxlanhdr) > data_end)
return false;

udp = (struct udphdr *)(data + sizeof(struct ethhdr) + l3_size);

if (udp->dest == bpf_htons(tunnel_port))
return true;

return false;
}

/*
* Returns the VNI, in the host's native endian format, of a vxlan encap'd packet.
*
3 changes: 3 additions & 0 deletions bpf/lib/wireguard.h
@@ -72,6 +72,9 @@ wg_maybe_redirect_to_encrypt(struct __ctx_buff *ctx)
*
* This also handles IPv6, as IPv6 pkts are encapsulated w/
* IPv4 tunneling.
*
* TODO: in v1.17, we can trust that to-overlay will mark all
* traffic. Then replace this with ctx_is_overlay().
*/
if (ip4->protocol == IPPROTO_UDP) {
int l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
40 changes: 0 additions & 40 deletions bpf/tests/vxlan_helpers_tests.c
@@ -74,46 +74,6 @@ mk_packet(struct __ctx_buff *ctx) {
return 0;
}

PKTGEN("tc", "vxlan_skb_is_vxlan_v4_success")
static __always_inline int
pktgen_vxlan_mock_check1(struct __ctx_buff *ctx) {
return mk_packet(ctx);
}

CHECK("tc", "vxlan_skb_is_vxlan_v4_success")
int check1(struct __ctx_buff *ctx)
{
test_init();

void *data, *data_end = NULL;
struct iphdr *ipv4 = NULL;

assert(revalidate_data(ctx, &data, &data_end, &ipv4));
assert(vxlan_skb_is_vxlan_v4(data, data_end, ipv4, TUNNEL_PORT));

test_finish();
}

PKTGEN("tc", "vxlan_skb_is_vxlan_v4_failure")
static __always_inline int
pktgen_vxlan_mock_check2(struct __ctx_buff *ctx) {
return mk_packet(ctx);
}

CHECK("tc", "vxlan_skb_is_vxlan_v4_failure")
int check2(struct __ctx_buff *ctx)
{
test_init();

void *data, *data_end = NULL;
struct iphdr *ipv4 = NULL;

assert(revalidate_data(ctx, &data, &data_end, &ipv4));
assert(!vxlan_skb_is_vxlan_v4(data, data_end, ipv4, TUNNEL_PORT_BAD));

test_finish();
}

PKTGEN("tc", "vxlan_get_vni_success")
static __always_inline int
pktgen_vxlan_mock_check3(struct __ctx_buff *ctx) {