[v1.15] bpf: avoid SNAT tracking for overlay traffic #31785

Merged · 6 commits · Apr 10, 2024
Changes from all commits
64 changes: 35 additions & 29 deletions bpf/bpf_host.c
@@ -485,7 +485,8 @@ int tail_handle_ipv6_from_netdev(struct __ctx_buff *ctx)

# ifdef ENABLE_HOST_FIREWALL
static __always_inline int
handle_to_netdev_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, __s8 *ext_err)
handle_to_netdev_ipv6(struct __ctx_buff *ctx, __u32 src_sec_identity,
struct trace_ctx *trace, __s8 *ext_err)
{
void *data, *data_end;
struct ipv6hdr *ip6;
@@ -509,9 +510,8 @@ handle_to_netdev_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace, __s8 *ext
return ret;
}

if ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_HOST)
srcid = HOST_ID;
srcid = resolve_srcid_ipv6(ctx, ip6, srcid, &ipcache_srcid, true);
srcid = resolve_srcid_ipv6(ctx, ip6, src_sec_identity,
&ipcache_srcid, true);

/* to-netdev is attached to the egress path of the native device. */
return ipv6_host_policy_egress(ctx, srcid, ipcache_srcid, ip6, trace, ext_err);
@@ -941,19 +941,18 @@ int tail_handle_ipv4_from_netdev(struct __ctx_buff *ctx)

#ifdef ENABLE_HOST_FIREWALL
static __always_inline int
handle_to_netdev_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace, __s8 *ext_err)
handle_to_netdev_ipv4(struct __ctx_buff *ctx, __u32 src_sec_identity,
struct trace_ctx *trace, __s8 *ext_err)
{
void *data, *data_end;
struct iphdr *ip4;
__u32 src_id = 0, ipcache_srcid = 0;

if ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_HOST)
src_id = HOST_ID;

if (!revalidate_data_pull(ctx, &data, &data_end, &ip4))
return DROP_INVALID;

src_id = resolve_srcid_ipv4(ctx, ip4, src_id, &ipcache_srcid, true);
src_id = resolve_srcid_ipv4(ctx, ip4, src_sec_identity,
&ipcache_srcid, true);

/* We need to pass the srcid from ipcache to host firewall. See
* comment in ipv4_host_policy_egress() for details.
@@ -1318,17 +1317,22 @@ int cil_from_host(struct __ctx_buff *ctx)
__section_entry
int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
{
__u32 magic = ctx->mark & MARK_MAGIC_HOST_MASK;
struct trace_ctx trace = {
.reason = TRACE_REASON_UNKNOWN,
.monitor = 0,
};
__u32 __maybe_unused vlan_id;
__u32 src_sec_identity = 0;
int ret = CTX_ACT_OK;
__s8 ext_err = 0;
#ifdef ENABLE_HOST_FIREWALL
__u16 proto = 0;
#endif

if (magic == MARK_MAGIC_HOST || magic == MARK_MAGIC_OVERLAY)
src_sec_identity = HOST_ID;

/* Filter allowed vlan id's and pass them back to kernel.
*/
if (ctx->vlan_present) {
@@ -1337,23 +1341,21 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
if (allow_vlan(ctx->ifindex, vlan_id))
return CTX_ACT_OK;
else
return send_drop_notify_error(ctx, 0, DROP_VLAN_FILTERED,
return send_drop_notify_error(ctx, src_sec_identity,
DROP_VLAN_FILTERED,
CTX_ACT_DROP, METRIC_EGRESS);
}
}

#if defined(ENABLE_L7_LB)
{
__u32 magic = ctx->mark & MARK_MAGIC_HOST_MASK;
if (magic == MARK_MAGIC_PROXY_EGRESS_EPID) {
__u32 lxc_id = get_epid(ctx);

if (magic == MARK_MAGIC_PROXY_EGRESS_EPID) {
__u32 lxc_id = get_epid(ctx);

ctx->mark = 0;
tail_call_dynamic(ctx, &POLICY_EGRESSCALL_MAP, lxc_id);
return send_drop_notify_error(ctx, 0, DROP_MISSED_TAIL_CALL,
CTX_ACT_DROP, METRIC_EGRESS);
}
ctx->mark = 0;
tail_call_dynamic(ctx, &POLICY_EGRESSCALL_MAP, lxc_id);
return send_drop_notify_error(ctx, src_sec_identity,
DROP_MISSED_TAIL_CALL,
CTX_ACT_DROP, METRIC_EGRESS);
}
#endif

@@ -1376,12 +1378,14 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
# endif
# ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
ret = handle_to_netdev_ipv6(ctx, &trace, &ext_err);
ret = handle_to_netdev_ipv6(ctx, src_sec_identity,
&trace, &ext_err);
break;
# endif
# ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP): {
ret = handle_to_netdev_ipv4(ctx, &trace, &ext_err);
ret = handle_to_netdev_ipv4(ctx, src_sec_identity,
&trace, &ext_err);
break;
}
# endif
@@ -1391,8 +1395,9 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
}
out:
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, 0, ret, ext_err,
CTX_ACT_DROP, METRIC_EGRESS);
return send_drop_notify_error_ext(ctx, src_sec_identity,
ret, ext_err, CTX_ACT_DROP,
METRIC_EGRESS);

skip_host_firewall:
#endif /* ENABLE_HOST_FIREWALL */
@@ -1422,12 +1427,13 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
if (ret == CTX_ACT_REDIRECT)
return ret;
else if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP,
METRIC_EGRESS);
return send_drop_notify_error(ctx, src_sec_identity, ret,
CTX_ACT_DROP, METRIC_EGRESS);

#if defined(ENCRYPTION_STRICT_MODE)
if (!strict_allow(ctx))
return send_drop_notify_error(ctx, 0, DROP_UNENCRYPTED_TRAFFIC,
return send_drop_notify_error(ctx, src_sec_identity,
DROP_UNENCRYPTED_TRAFFIC,
CTX_ACT_DROP, METRIC_EGRESS);
#endif /* ENCRYPTION_STRICT_MODE */
#endif /* ENABLE_WIREGUARD */
@@ -1439,7 +1445,7 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
#endif

#ifdef ENABLE_NODEPORT
if (!ctx_snat_done(ctx)) {
if (!ctx_snat_done(ctx) && !ctx_is_overlay(ctx)) {
/*
* handle_nat_fwd tail calls in the majority of cases,
* so control might never return to this program.
@@ -1454,7 +1460,7 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
exit:
#endif
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, 0, ret, ext_err,
return send_drop_notify_error_ext(ctx, src_sec_identity, ret, ext_err,
CTX_ACT_DROP, METRIC_EGRESS);
send_trace_notify(ctx, TRACE_TO_NETWORK, 0, 0, 0,
0, trace.reason, trace.monitor);
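
Summary of the bpf_host.c changes: cil_to_netdev() now classifies the packet once at entry, treating both MARK_MAGIC_HOST and the new MARK_MAGIC_OVERLAY as host-originated (HOST_ID), and threads that identity through the host-firewall handlers and every drop notification instead of a hard-coded 0; overlay-marked packets additionally skip the nodeport SNAT pass. A minimal sketch of the classification, assuming MARK_MAGIC_HOST_MASK isolates the magic byte of skb->mark:

	/* Sketch only -- mirrors the check added at the top of cil_to_netdev(). */
	__u32 magic = ctx->mark & MARK_MAGIC_HOST_MASK;
	__u32 src_sec_identity = 0;

	if (magic == MARK_MAGIC_HOST || magic == MARK_MAGIC_OVERLAY)
		src_sec_identity = HOST_ID;	/* traffic originated by this node */
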
16 changes: 10 additions & 6 deletions bpf/bpf_overlay.c
@@ -735,6 +735,7 @@ int cil_from_overlay(struct __ctx_buff *ctx)
__section_entry
int cil_to_overlay(struct __ctx_buff *ctx)
{
bool snat_done __maybe_unused = ctx_snat_done(ctx);
struct trace_ctx __maybe_unused trace;
int ret = TC_ACT_OK;
__u32 cluster_id __maybe_unused = 0;
@@ -756,19 +757,22 @@ int cil_to_overlay(struct __ctx_buff *ctx)
}
#endif

#ifdef ENABLE_NODEPORT
if (ctx_snat_done(ctx)) {
ret = CTX_ACT_OK;
goto out;
}

/* This must be after above ctx_snat_done, since the MARK_MAGIC_CLUSTER_ID
* is a superset of the MARK_MAGIC_SNAT_DONE. They will never be used together,
* but the SNAT check should always take precedence.
*/
#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
cluster_id = ctx_get_cluster_id_mark(ctx);
#endif

ctx_set_overlay_mark(ctx);

#ifdef ENABLE_NODEPORT
if (snat_done) {
ret = CTX_ACT_OK;
goto out;
}

ret = handle_nat_fwd(ctx, cluster_id, &trace, &ext_err);
out:
#endif
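
The ordering in cil_to_overlay() matters: MARK_MAGIC_OVERLAY lives in the same magic byte of skb->mark as MARK_MAGIC_SNAT_DONE (and the cluster-ID mark), so the SNAT state and the cluster ID are read before ctx_set_overlay_mark() overwrites that byte. A worked sketch with the values from bpf/lib/common.h, assuming the host-mark mask is 0xFF00:

	/* Sketch of the mark collision that forces the new ordering. */
	__u32 mark = MARK_MAGIC_SNAT_DONE;	/* 0x0300: SNAT already performed */
	bool snat_done = (mark & 0xFF00) == MARK_MAGIC_SNAT_DONE;	/* true */

	mark &= ~0xFF00;		/* clear the magic byte ...            */
	mark |= MARK_MAGIC_OVERLAY;	/* ... and stamp 0x0400 for overlay    */

	/* The mark no longer reads as "SNAT done", hence the local snat_done
	 * captured at the top of cil_to_overlay() before the mark is rewritten.
	 */
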
3 changes: 2 additions & 1 deletion bpf/lib/common.h
@@ -53,7 +53,7 @@

#if defined(ENCAP_IFINDEX) || defined(ENABLE_EGRESS_GATEWAY_COMMON) || \
(defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE)
#define HAVE_ENCAP
#define HAVE_ENCAP 1

/* NOT_VTEP_DST is passed to an encapsulation function when the
* destination of the tunnel is not a VTEP.
@@ -706,6 +706,7 @@ enum metric_dir {
#define MARK_MAGIC_IDENTITY 0x0F00 /* mark carries identity */
#define MARK_MAGIC_TO_PROXY 0x0200
#define MARK_MAGIC_SNAT_DONE 0x0300
#define MARK_MAGIC_OVERLAY 0x0400

#define MARK_MAGIC_KEY_MASK 0xFF00

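
HAVE_ENCAP gains an explicit value because the new ctx_is_overlay() tests it via is_defined(HAVE_ENCAP), and that style of check only evaluates to 1 for macros literally defined to 1. A hypothetical re-derivation of the trick, modeled on the kernel's IS_ENABLED() pattern (the actual is_defined() helper is defined elsewhere in Cilium's headers):

/* Hypothetical sketch: with "#define HAVE_ENCAP" (empty) the placeholder
 * concatenation below fails and the result is 0; with "#define HAVE_ENCAP 1"
 * it expands to 1.
 */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define is_defined(option) ___is_defined(option)
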
46 changes: 32 additions & 14 deletions bpf/lib/nodeport.h
@@ -227,7 +227,8 @@ static __always_inline int nodeport_snat_fwd_ipv6(struct __ctx_buff *ctx,
goto out;

/* See the equivalent v4 path for comment */
ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

out:
if (ret == NAT_PUNT_TO_STACK)
@@ -1210,7 +1211,9 @@ int tail_nodeport_nat_egress_ipv6(struct __ctx_buff *ctx)
if (IS_ERR(ret))
goto drop_err;

ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#ifdef TUNNEL_MODE
if (tunnel_endpoint) {
__be16 src_port;
@@ -1469,8 +1472,8 @@ static __always_inline int nodeport_lb6(struct __ctx_buff *ctx,
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err __maybe_unused)
nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, bool *snat_done,
struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
struct bpf_fib_lookup_padded fib_params __maybe_unused = {};
struct lb6_reverse_nat *nat_info;
@@ -1518,7 +1521,7 @@ nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
*snat_done = true;
}

return CTX_ACT_OK;
@@ -1565,21 +1568,25 @@ static __always_inline int
__handle_nat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err)
{
bool snat_done = false;
int ret;

ret = nodeport_rev_dnat_fwd_ipv6(ctx, trace, ext_err);
ret = nodeport_rev_dnat_fwd_ipv6(ctx, &snat_done, trace, ext_err);
if (ret != CTX_ACT_OK)
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE_IPV6)
if (!ctx_snat_done(ctx)) {
if (!snat_done) {
ep_tail_call(ctx, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD);
ret = DROP_MISSED_TAIL_CALL;
}
#endif

if (is_defined(IS_BPF_HOST) && snat_done)
ctx_snat_done_set(ctx);

return ret;
}

@@ -1704,8 +1711,13 @@ static __always_inline int nodeport_snat_fwd_ipv4(struct __ctx_buff *ctx,
/* If multiple netdevs process an outgoing packet, then these packets will
* be handled multiple times by the "to-netdev" section. This can lead
* to multiple SNATs. To prevent that, set the SNAT done flag.
*
* XDP doesn't need the flag (there's no egress prog that would utilize it),
* and for overlay traffic it makes no difference whether the inner packet
* was SNATed.
*/
ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#if defined(ENABLE_EGRESS_GATEWAY_COMMON) && defined(IS_BPF_HOST)
if (target.egress_gateway)
@@ -2704,7 +2716,9 @@ int tail_nodeport_nat_egress_ipv4(struct __ctx_buff *ctx)
if (IS_ERR(ret))
goto drop_err;

ctx_snat_done_set(ctx);
if (is_defined(IS_BPF_HOST))
ctx_snat_done_set(ctx);

#ifdef TUNNEL_MODE
if (tunnel_endpoint) {
__be16 src_port;
@@ -2984,8 +2998,8 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
__s8 *ext_err __maybe_unused)
nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, bool *snat_done,
struct trace_ctx *trace, __s8 *ext_err __maybe_unused)
{
struct bpf_fib_lookup_padded fib_params __maybe_unused = {};
int ret, l3_off = ETH_HLEN, l4_off;
@@ -3046,7 +3060,7 @@ nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
*snat_done = true;

#ifdef ENABLE_DSR
#if defined(ENABLE_HIGH_SCALE_IPCACHE) && \
@@ -3116,23 +3130,27 @@ static __always_inline int
__handle_nat_fwd_ipv4(struct __ctx_buff *ctx, __u32 cluster_id __maybe_unused,
struct trace_ctx *trace, __s8 *ext_err)
{
bool snat_done = false;
int ret;

ret = nodeport_rev_dnat_fwd_ipv4(ctx, trace, ext_err);
ret = nodeport_rev_dnat_fwd_ipv4(ctx, &snat_done, trace, ext_err);
if (ret != CTX_ACT_OK)
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE_IPV4) || \
(defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT))
if (!ctx_snat_done(ctx)) {
if (!snat_done) {
ctx_store_meta(ctx, CB_CLUSTER_ID_EGRESS, cluster_id);
ep_tail_call(ctx, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD);
ret = DROP_MISSED_TAIL_CALL;
}
#endif

if (is_defined(IS_BPF_HOST) && snat_done)
ctx_snat_done_set(ctx);

return ret;
}

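
Across nodeport.h, the rev-DNAT/SNAT helpers now report their state through a local snat_done out-parameter, and the packet mark is only stamped when compiling bpf_host (is_defined(IS_BPF_HOST)), where a later to-netdev pass could otherwise SNAT the packet again; the overlay and XDP datapaths no longer set it. For reference, a hypothetical sketch of ctx_snat_done_set(), assumed to mirror the ctx_snat_done() getter shown in overloadable_skb.h below:

	/* Assumed counterpart of ctx_snat_done(): stamp MARK_MAGIC_SNAT_DONE
	 * into the magic byte of skb->mark. The real helper lives in
	 * bpf/lib/overloadable_skb.h (and its XDP counterpart).
	 */
	static __always_inline void ctx_snat_done_set(struct __sk_buff *ctx)
	{
		ctx->mark &= ~MARK_MAGIC_HOST_MASK;
		ctx->mark |= MARK_MAGIC_SNAT_DONE;
	}
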
14 changes: 14 additions & 0 deletions bpf/lib/overloadable_skb.h
@@ -234,6 +234,20 @@ static __always_inline bool ctx_snat_done(const struct __sk_buff *ctx)
return (ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_SNAT_DONE;
}

static __always_inline void ctx_set_overlay_mark(struct __sk_buff *ctx)
{
ctx->mark &= ~MARK_MAGIC_HOST_MASK;
ctx->mark |= MARK_MAGIC_OVERLAY;
}

static __always_inline bool ctx_is_overlay(const struct __sk_buff *ctx)
{
if (!is_defined(HAVE_ENCAP))
return false;

return (ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_OVERLAY;
}

#ifdef HAVE_ENCAP
static __always_inline __maybe_unused int
ctx_set_encap_info(struct __sk_buff *ctx, __u32 src_ip,
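
ctx_is_overlay() short-circuits to false whenever HAVE_ENCAP is not compiled in, so the mark comparison disappears on non-tunnel datapaths. Usage sketch, mirroring the guard added to cil_to_netdev() in bpf_host.c above (arguments to handle_nat_fwd() are assumed here):

#ifdef ENABLE_NODEPORT
	/* Packets already marked as overlay skip the host-side SNAT pass. */
	if (!ctx_snat_done(ctx) && !ctx_is_overlay(ctx))
		ret = handle_nat_fwd(ctx, 0, &trace, &ext_err);
#endif
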
1 change: 0 additions & 1 deletion bpf/tests/ipsec_from_host_tunnel.c
@@ -2,7 +2,6 @@
/* Copyright Authors of Cilium */

#define TUNNEL_MODE
#define HAVE_ENCAP
#define ENABLE_ROUTING

#define EXPECTED_STATUS_CODE CTX_ACT_REDIRECT