Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-introduce 2005 route table to fix L7 proxy issues #29530

Merged
merged 4 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
57 changes: 49 additions & 8 deletions bpf/bpf_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ handle_ipv6_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
struct remote_endpoint_info *info = NULL;
struct endpoint_info *ep;
int ret;
__u8 encrypt_key __maybe_unused = 0;
bool from_ingress_proxy = tc_index_from_ingress_proxy(ctx);

if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;
Expand Down Expand Up @@ -345,10 +347,16 @@ handle_ipv6_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
dst = (union v6addr *) &ip6->daddr;
info = lookup_ip6_remote_endpoint(dst, 0);

#ifdef ENABLE_IPSEC
/* See IPv4 comment. */
if (from_ingress_proxy && info)
encrypt_key = get_min_encrypt_key(info->key);
#endif

#ifdef TUNNEL_MODE
if (info != NULL && info->tunnel_endpoint != 0) {
return encap_and_redirect_with_nodeid(ctx, info->tunnel_endpoint,
secctx, info->sec_identity,
encrypt_key, secctx, info->sec_identity,
&trace);
} else {
struct tunnel_key key = {};
Expand All @@ -360,16 +368,25 @@ handle_ipv6_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
key.ip6.p4 = 0;
key.family = ENDPOINT_KEY_IPV6;

ret = encap_and_redirect_netdev(ctx, &key, secctx, &trace);
ret = encap_and_redirect_netdev(ctx, &key, encrypt_key, secctx, &trace);
if (ret != DROP_NO_TUNNEL_ENDPOINT)
return ret;
}
#endif

if (!info || identity_is_world_ipv6(info->sec_identity)) {
if (!info || (!from_ingress_proxy &&
identity_is_world_ipv6(info->sec_identity))) {
/* See IPv4 comment. */
return DROP_UNROUTABLE;
}

#if defined(ENABLE_IPSEC) && !defined(TUNNEL_MODE)
/* See IPv4 comment. */
if (from_ingress_proxy && info->tunnel_endpoint && encrypt_key)
return set_ipsec_encrypt(ctx, encrypt_key, info->tunnel_endpoint,
info->sec_identity, true);
#endif

return CTX_ACT_OK;
}

Expand Down Expand Up @@ -638,6 +655,8 @@ handle_ipv4_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
struct remote_endpoint_info *info;
struct endpoint_info *ep;
int ret;
__u8 encrypt_key __maybe_unused = 0;
bool from_ingress_proxy = tc_index_from_ingress_proxy(ctx);

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
Expand Down Expand Up @@ -762,10 +781,16 @@ handle_ipv4_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,

info = lookup_ip4_remote_endpoint(ip4->daddr, 0);

#ifdef ENABLE_IPSEC
/* We encrypt host to remote pod packets only if they are from ingress proxy. */
if (from_ingress_proxy && info)
encrypt_key = get_min_encrypt_key(info->key);
#endif

#ifdef TUNNEL_MODE
if (info != NULL && info->tunnel_endpoint != 0) {
return encap_and_redirect_with_nodeid(ctx, info->tunnel_endpoint,
secctx, info->sec_identity,
encrypt_key, secctx, info->sec_identity,
&trace);
} else {
/* IPv4 lookup key: daddr & IPV4_MASK */
Expand All @@ -775,13 +800,14 @@ handle_ipv4_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
key.family = ENDPOINT_KEY_IPV4;

cilium_dbg(ctx, DBG_NETDEV_ENCAP4, key.ip4, secctx);
ret = encap_and_redirect_netdev(ctx, &key, secctx, &trace);
ret = encap_and_redirect_netdev(ctx, &key, encrypt_key, secctx, &trace);
if (ret != DROP_NO_TUNNEL_ENDPOINT)
return ret;
}
#endif

if (!info || identity_is_world_ipv4(info->sec_identity)) {
if (!info || (!from_ingress_proxy &&
identity_is_world_ipv4(info->sec_identity))) {
/* We have received a packet for which no ipcache entry exists,
* we do not know what to do with this packet, drop it.
*
Expand All @@ -790,9 +816,21 @@ handle_ipv4_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
* entry. Therefore we need to test for WORLD_ID. It is clearly
* wrong to route a ctx to cilium_host for which we don't know
* anything about it as otherwise we'll run into a routing loop.
*
* Note that we do not drop packets from ingress proxy even if
* they are going to WORLD_ID. This is to avoid
* https://github.com/cilium/cilium/issues/21954.
*/
return DROP_UNROUTABLE;
}

#if defined(ENABLE_IPSEC) && !defined(TUNNEL_MODE)
/* We encrypt host to remote pod packets only if they are from ingress proxy. */
if (from_ingress_proxy && info->tunnel_endpoint && encrypt_key)
return set_ipsec_encrypt(ctx, encrypt_key, info->tunnel_endpoint,
info->sec_identity, true);
#endif

return CTX_ACT_OK;
}

Expand Down Expand Up @@ -936,7 +974,7 @@ static __always_inline int do_netdev_encrypt_encap(struct __ctx_buff *ctx, __u32

ctx->mark = 0;
bpf_clear_meta(ctx);
return encap_and_redirect_with_nodeid(ctx, ep->tunnel_endpoint,
return encap_and_redirect_with_nodeid(ctx, ep->tunnel_endpoint, 0,
src_id, 0, &trace);
}
#endif /* ENABLE_IPSEC && TUNNEL_MODE */
Expand Down Expand Up @@ -1417,7 +1455,10 @@ int cil_to_host(struct __ctx_buff *ctx)
__u32 src_id = 0;
__s8 ext_err = 0;

if ((magic & 0xFFFF) == MARK_MAGIC_TO_PROXY) {
if ((magic & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_ENCRYPT) {
ctx->mark = magic; /* CB_ENCRYPT_MAGIC */
src_id = ctx_load_meta(ctx, CB_ENCRYPT_IDENTITY);
} else if ((magic & 0xFFFF) == MARK_MAGIC_TO_PROXY) {
/* Upper 16 bits may carry proxy port number */
__be16 port = magic >> 16;

Expand Down
14 changes: 6 additions & 8 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -692,10 +692,9 @@ static __always_inline int handle_ipv6_from_lxc(struct __ctx_buff *ctx, __u32 *d
#ifndef TUNNEL_MODE
# ifdef ENABLE_IPSEC
if (encrypt_key && tunnel_endpoint) {
ret = set_ipsec_encrypt_mark(ctx, encrypt_key, tunnel_endpoint);
ret = set_ipsec_encrypt(ctx, encrypt_key, tunnel_endpoint, SECLABEL_IPV6, false);
if (unlikely(ret != CTX_ACT_OK))
return ret;
set_identity_meta(ctx, SECLABEL_IPV6);
} else
# endif /* ENABLE_IPSEC */
#endif /* TUNNEL_MODE */
Expand Down Expand Up @@ -1251,10 +1250,9 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx, __u32 *d
#ifndef TUNNEL_MODE
# ifdef ENABLE_IPSEC
if (encrypt_key && tunnel_endpoint) {
ret = set_ipsec_encrypt_mark(ctx, encrypt_key, tunnel_endpoint);
ret = set_ipsec_encrypt(ctx, encrypt_key, tunnel_endpoint, SECLABEL_IPV4, false);
if (unlikely(ret != CTX_ACT_OK))
return ret;
set_identity_meta(ctx, SECLABEL_IPV4);
} else
# endif /* ENABLE_IPSEC */
#endif /* TUNNEL_MODE */
Expand Down Expand Up @@ -1465,7 +1463,7 @@ ipv6_policy(struct __ctx_buff *ctx, struct ipv6hdr *ip6, int ifindex, __u32 src_
/* If packet is coming from the ingress proxy we have to skip
* redirection to the ingress proxy as we would loop forever.
*/
skip_ingress_proxy = tc_index_skip_ingress_proxy(ctx);
skip_ingress_proxy = tc_index_from_ingress_proxy(ctx);

ct_buffer = map_lookup_elem(&CT_TAIL_CALL_BUFFER6, &zero);
if (!ct_buffer)
Expand All @@ -1488,7 +1486,7 @@ ipv6_policy(struct __ctx_buff *ctx, struct ipv6hdr *ip6, int ifindex, __u32 src_
* Always redirect connections that originated from L7 LB.
*/
if (ct_state_is_from_l7lb(ct_state) ||
(ct_state->proxy_redirect && !tc_index_skip_egress_proxy(ctx))) {
(ct_state->proxy_redirect && !tc_index_from_egress_proxy(ctx))) {
/* This is a reply, the proxy port does not need to be embedded
* into ctx->mark and *proxy_port can be left unset.
*/
Expand Down Expand Up @@ -1792,7 +1790,7 @@ ipv4_policy(struct __ctx_buff *ctx, struct iphdr *ip4, int ifindex, __u32 src_la
/* If packet is coming from the ingress proxy we have to skip
* redirection to the ingress proxy as we would loop forever.
*/
skip_ingress_proxy = tc_index_skip_ingress_proxy(ctx);
skip_ingress_proxy = tc_index_from_ingress_proxy(ctx);

orig_sip = ip4->saddr;

Expand Down Expand Up @@ -1825,7 +1823,7 @@ ipv4_policy(struct __ctx_buff *ctx, struct iphdr *ip4, int ifindex, __u32 src_la
/* Skip policy enforcement for return traffic. */
if (ret == CT_REPLY || ret == CT_RELATED) {
if (ct_state_is_from_l7lb(ct_state) ||
(ct_state->proxy_redirect && !tc_index_skip_egress_proxy(ctx))) {
(ct_state->proxy_redirect && !tc_index_from_egress_proxy(ctx))) {
/* This is a reply, the proxy port does not need to be embedded
* into ctx->mark and *proxy_port can be left unset.
*/
Expand Down
5 changes: 3 additions & 2 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -776,8 +776,8 @@ static __always_inline __u32 or_encrypt_key(__u8 key)
* cilium_host @egress
* bpf_host -> bpf_lxc
*/
#define TC_INDEX_F_SKIP_INGRESS_PROXY 1
#define TC_INDEX_F_SKIP_EGRESS_PROXY 2
#define TC_INDEX_F_FROM_INGRESS_PROXY 1
#define TC_INDEX_F_FROM_EGRESS_PROXY 2
#define TC_INDEX_F_SKIP_NODEPORT 4
#define TC_INDEX_F_SKIP_RECIRCULATION 8
#define TC_INDEX_F_SKIP_HOST_FIREWALL 16
Expand All @@ -797,6 +797,7 @@ enum {
#define CB_PORT CB_SRC_LABEL /* Alias, non-overlapping */
#define CB_HINT CB_SRC_LABEL /* Alias, non-overlapping */
#define CB_PROXY_MAGIC CB_SRC_LABEL /* Alias, non-overlapping */
#define CB_ENCRYPT_MAGIC CB_SRC_LABEL /* Alias, non-overlapping */
#define CB_DST_ENDPOINT_ID CB_SRC_LABEL /* Alias, non-overlapping */
#define CB_SRV6_SID_1 CB_SRC_LABEL /* Alias, non-overlapping */
CB_IFINDEX,
Expand Down
24 changes: 19 additions & 5 deletions bpf/lib/encap.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,16 @@ __encap_and_redirect_with_nodeid(struct __ctx_buff *ctx, __u32 src_ip __maybe_un
*/
static __always_inline int
encap_and_redirect_with_nodeid(struct __ctx_buff *ctx, __be32 tunnel_endpoint,
__u8 encrypt_key __maybe_unused,
__u32 seclabel, __u32 dstid,
const struct trace_ctx *trace)
{
#ifdef ENABLE_IPSEC
if (encrypt_key)
return set_ipsec_encrypt(ctx, encrypt_key, tunnel_endpoint,
seclabel, true);
#endif

return __encap_and_redirect_with_nodeid(ctx, 0, tunnel_endpoint,
seclabel, dstid, NOT_VTEP_DST,
trace);
Expand All @@ -90,7 +97,7 @@ __encap_and_redirect_lxc(struct __ctx_buff *ctx, __be32 tunnel_endpoint,
#ifdef ENABLE_IPSEC
if (encrypt_key)
return set_ipsec_encrypt(ctx, encrypt_key, tunnel_endpoint,
seclabel);
seclabel, false);
#endif

#if !defined(ENABLE_NODEPORT) && defined(ENABLE_HOST_FIREWALL)
Expand All @@ -110,7 +117,7 @@ __encap_and_redirect_lxc(struct __ctx_buff *ctx, __be32 tunnel_endpoint,
/* tell caller that this packet needs to go through the stack: */
return CTX_ACT_OK;
#else
return encap_and_redirect_with_nodeid(ctx, tunnel_endpoint, seclabel,
return encap_and_redirect_with_nodeid(ctx, tunnel_endpoint, 0, seclabel,
dstid, trace);
#endif /* !ENABLE_NODEPORT && ENABLE_HOST_FIREWALL */
}
Expand Down Expand Up @@ -158,16 +165,17 @@ encap_and_redirect_lxc(struct __ctx_buff *ctx,
__u8 min_encrypt_key = get_min_encrypt_key(tunnel->key);

return set_ipsec_encrypt(ctx, min_encrypt_key, tunnel->ip4,
seclabel);
seclabel, false);
}
# endif
return encap_and_redirect_with_nodeid(ctx, tunnel->ip4, seclabel, dstid,
return encap_and_redirect_with_nodeid(ctx, tunnel->ip4, 0, seclabel, dstid,
trace);
#endif /* ENABLE_HIGH_SCALE_IPCACHE */
}

static __always_inline int
encap_and_redirect_netdev(struct __ctx_buff *ctx, struct tunnel_key *k,
__u8 encrypt_key __maybe_unused,
__u32 seclabel, const struct trace_ctx *trace)
{
struct tunnel_value *tunnel;
Expand All @@ -176,7 +184,13 @@ encap_and_redirect_netdev(struct __ctx_buff *ctx, struct tunnel_key *k,
if (!tunnel)
return DROP_NO_TUNNEL_ENDPOINT;

return encap_and_redirect_with_nodeid(ctx, tunnel->ip4, seclabel, 0,
#ifdef ENABLE_IPSEC
if (encrypt_key)
return set_ipsec_encrypt(ctx, encrypt_key, tunnel->ip4,
seclabel, true);
#endif

return encap_and_redirect_with_nodeid(ctx, tunnel->ip4, 0, seclabel, 0,
trace);
}
#endif /* TUNNEL_MODE || ENABLE_HIGH_SCALE_IPCACHE */
Expand Down
32 changes: 15 additions & 17 deletions bpf/lib/encrypt.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,17 @@ static __always_inline __u8 get_min_encrypt_key(__u8 peer_key __maybe_unused)

#ifdef ENABLE_IPSEC
static __always_inline int
set_ipsec_encrypt_mark(struct __ctx_buff *ctx, __u8 key, __u32 tunnel_endpoint)
set_ipsec_encrypt(struct __ctx_buff *ctx, __u8 key, __u32 tunnel_endpoint,
__u32 seclabel, bool use_meta)
{
/* IPSec is performed by the stack on any packets with the
* MARK_MAGIC_ENCRYPT bit set. During the process though we
* lose the lxc context (seclabel and tunnel endpoint). The
* tunnel endpoint can be looked up from daddr but the sec
* label is stashed in the mark or cb, and extracted in
* bpf_host to send ctx onto tunnel for encap.
*/

struct node_key node_ip = {};
__u16 *node_id;

Expand All @@ -54,23 +63,12 @@ set_ipsec_encrypt_mark(struct __ctx_buff *ctx, __u8 key, __u32 tunnel_endpoint)
if (!node_id)
return DROP_NO_NODE_ID;

set_encrypt_key_mark(ctx, key, *node_id);
return CTX_ACT_OK;
}

static __always_inline int
set_ipsec_encrypt(struct __ctx_buff *ctx, __u8 key, __u32 tunnel_endpoint,
__u32 seclabel)
{
/* IPSec is performed by the stack on any packets with the
* MARK_MAGIC_ENCRYPT bit set. During the process though we
* lose the lxc context (seclabel and tunnel endpoint). The
* tunnel endpoint can be looked up from daddr but the sec
* label is stashed in the mark and extracted in bpf_host
* to send ctx onto tunnel for encap.
*/
set_identity_meta(ctx, seclabel);
return set_ipsec_encrypt_mark(ctx, key, tunnel_endpoint);
if (use_meta)
set_encrypt_key_meta(ctx, key, *node_id);
else
set_encrypt_key_mark(ctx, key, *node_id);
return CTX_ACT_OK;
}

static __always_inline int
Expand Down
4 changes: 2 additions & 2 deletions bpf/lib/identity.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,14 +157,14 @@ static __always_inline __u32 inherit_identity_from_host(struct __ctx_buff *ctx,
*/
if (magic == MARK_MAGIC_PROXY_INGRESS) {
*identity = get_identity(ctx);
ctx->tc_index |= TC_INDEX_F_SKIP_INGRESS_PROXY;
ctx->tc_index |= TC_INDEX_F_FROM_INGRESS_PROXY;
/* (Return) packets from the egress proxy must skip the redirection to
* the proxy, as the packet would loop and/or the connection be reset
* otherwise.
*/
} else if (magic == MARK_MAGIC_PROXY_EGRESS) {
*identity = get_identity(ctx);
ctx->tc_index |= TC_INDEX_F_SKIP_EGRESS_PROXY;
ctx->tc_index |= TC_INDEX_F_FROM_EGRESS_PROXY;
} else if (magic == MARK_MAGIC_IDENTITY) {
*identity = get_identity(ctx);
} else if (magic == MARK_MAGIC_HOST) {
Expand Down
6 changes: 6 additions & 0 deletions bpf/lib/overloadable_skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ set_encrypt_key_mark(struct __sk_buff *ctx, __u8 key, __u32 node_id)
ctx->mark = or_encrypt_key(key) | node_id << 16;
}

/* set_encrypt_key_meta - stash the IPsec encrypt key and node ID in skb cb.
 *
 * Encodes key and node_id exactly like set_encrypt_key_mark() above, but
 * writes the value into ctx->cb[CB_ENCRYPT_MAGIC] instead of ctx->mark, so
 * the information survives until a later program extracts it from the cb.
 */
static __always_inline __maybe_unused void
set_encrypt_key_meta(struct __sk_buff *ctx, __u8 key, __u32 node_id)
{
	__u32 meta = or_encrypt_key(key);

	meta |= node_id << 16;
	ctx->cb[CB_ENCRYPT_MAGIC] = meta;
}

/**
* set_cluster_id_mark - sets the cluster_id mark.
*/
Expand Down
6 changes: 6 additions & 0 deletions bpf/lib/overloadable_xdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ set_encrypt_key_mark(struct xdp_md *ctx __maybe_unused, __u8 key __maybe_unused,
{
}

/* set_encrypt_key_meta - no-op stub for the XDP datapath.
 *
 * Fix: take struct xdp_md * like the sibling set_encrypt_key_mark() stub in
 * this file, not struct __sk_buff * (xdp_md is the XDP program context and
 * is what callers on this path hold). Kept as a no-op so code shared with
 * the TC datapath compiles for XDP programs as well.
 */
static __always_inline __maybe_unused void
set_encrypt_key_meta(struct xdp_md *ctx __maybe_unused, __u8 key __maybe_unused,
		     __u32 node_id __maybe_unused)
{
}

static __always_inline __maybe_unused void
ctx_set_cluster_id_mark(struct xdp_md *ctx __maybe_unused, __u32 cluster_id __maybe_unused)
{
Expand Down