Skip to content

Commit

Permalink
bpf: Don't encrypt on path hostns -> remote pod
Browse files Browse the repository at this point in the history
In pod-to-pod encryption with IPsec and tunneling, Cilium currently
encrypts traffic on the path hostns -> remote pod even though traffic
is in plain-text on the path remote pod -> hostns. When using native
routing, neither of those paths is encrypted because traffic from the
hostns doesn't go through the bpf_host BPF program.

Cilium's Transparent Encryption with IPsec aims at encrypting pod-to-pod
traffic. It is therefore unclear why we are encrypting traffic from the
hostns. The simple fact that only one direction of the connection is
encrypted raises the question of its usefulness. It's possible that this
traffic was encrypted by mistake: some of this logic is necessary for
node-to-node encryption with IPsec (not supported anymore) and
pod-to-pod encryption may have been somewhat simplified to encrypt
*-to-pod traffic.

Encrypting traffic from the hostns nevertheless creates several issues.
First, this situation creates a path asymmetry between the forward and
reply paths of hostns<>remote pod connections. Path asymmetry issues are
well known to be a source of bugs, from '--ctstate INVALID -j DROP'
iptables rules to NAT issues.

Second, Gray recently uncovered a separate bug which, when combined with
this encryption from hostns, can prevent Cilium from starting. That
separate bug is still being investigated but it seems to cause the
reload of bpf_host to depend on Cilium connecting to etcd in a
clustermesh context. If this etcd is a remote pod, Cilium connects to it
on hostns -> remote pod path. The bpf_host program being unloaded[1], it
fails. We end up in a cyclic dependency: bpf_host requires connectivity
to etcd, connectivity to etcd requires bpf_host.

This commit therefore removes encryption with IPsec for the path hostns
-> remote pod when using tunneling (already unencrypted when using
native routing).

1 - More specifically, in Gray's case, the bpf_host program is already
    loaded, but it needs to be reloaded because the IPsec XFRM config
    changed. Without this reload, encryption fails.
Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
  • Loading branch information
pchaigno committed May 22, 2023
1 parent fd6fa25 commit 5fe2b2d
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 67 deletions.
22 changes: 6 additions & 16 deletions bpf/bpf_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@
/* Pass unknown ICMPv6 NS to stack */
#define ACTION_UNKNOWN_ICMP6_NS CTX_ACT_OK

/* CB_PROXY_MAGIC overlaps with CB_ENCRYPT_MAGIC */
#define ENCRYPT_OR_PROXY_MAGIC 0

#ifndef VLAN_FILTER
# define VLAN_FILTER(ifindex, vlan_id) return false;
#endif
Expand Down Expand Up @@ -344,13 +341,9 @@ handle_ipv6_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,

#ifdef TUNNEL_MODE
if (info != NULL && info->tunnel_endpoint != 0) {
/* If IPSEC is needed recirc through ingress to use xfrm stack
* and then result will routed back through bpf_netdev on egress
* but with encrypt marks.
*/
return encap_and_redirect_with_nodeid(ctx, info->tunnel_endpoint,
info->key, info->node_id,
secctx, info->sec_identity,
info->node_id, secctx,
info->sec_identity,
&trace);
} else {
struct tunnel_key key = {};
Expand Down Expand Up @@ -762,8 +755,8 @@ handle_ipv4_cont(struct __ctx_buff *ctx, __u32 secctx, const bool from_host,
#ifdef TUNNEL_MODE
if (info != NULL && info->tunnel_endpoint != 0) {
return encap_and_redirect_with_nodeid(ctx, info->tunnel_endpoint,
info->key, info->node_id,
secctx, info->sec_identity,
info->node_id, secctx,
info->sec_identity,
&trace);
} else {
/* IPv4 lookup key: daddr & IPV4_MASK */
Expand Down Expand Up @@ -1464,7 +1457,7 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
__section("to-host")
int cil_to_host(struct __ctx_buff *ctx)
{
__u32 magic = ctx_load_meta(ctx, ENCRYPT_OR_PROXY_MAGIC);
__u32 magic = ctx_load_meta(ctx, CB_PROXY_MAGIC);
__u16 __maybe_unused proto = 0;
struct trace_ctx trace = {
.reason = TRACE_REASON_UNKNOWN,
Expand All @@ -1475,10 +1468,7 @@ int cil_to_host(struct __ctx_buff *ctx)
__u32 src_id = 0;
__s8 ext_err = 0;

if ((magic & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_ENCRYPT) {
ctx->mark = magic; /* CB_ENCRYPT_MAGIC */
src_id = ctx_load_meta(ctx, CB_ENCRYPT_IDENTITY);
} else if ((magic & 0xFFFF) == MARK_MAGIC_TO_PROXY) {
if ((magic & 0xFFFF) == MARK_MAGIC_TO_PROXY) {
/* Upper 16 bits may carry proxy port number */
__be16 port = magic >> 16;

Expand Down
41 changes: 2 additions & 39 deletions bpf/lib/encap.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,6 @@

#ifdef HAVE_ENCAP
#ifdef ENABLE_IPSEC
/* Prepare a host-originated packet for IPsec without touching skb->mark:
 * stash the encrypt key + node_id and the source security identity in the
 * ctx metadata (cb[] slots) and return CTX_ACT_OK so the caller passes the
 * packet up the stack; bpf_host reads the hints back on its second
 * traversal and populates mark/cb as the xfrm layer expects.
 *
 * @ctx:      packet context
 * @key:      IPsec encrypt key index to encode into the metadata
 * @node_id:  remote node identifier (packed into the same cb slot as key)
 * @seclabel: source security identity, stored via set_identity_meta()
 *            (helper defined elsewhere — presumably writes another cb slot;
 *             confirm against lib/overloadable_skb.h)
 */
static __always_inline int
encap_and_redirect_nomark_ipsec(struct __ctx_buff *ctx, __u8 key,
__u16 node_id, __u32 seclabel)
{
/* Traffic from local host in tunnel mode will be passed to
* cilium_host. In non-IPSec case traffic with non-local dst
* will then be redirected to tunnel device. In IPSec case
* though we need to traverse xfrm path still. The mark +
* cb[4] hints will not survive a veth pair xmit to ingress
* however so below encap_and_redirect_ipsec will not work.
* Instead pass hints via cb[0], cb[4] (cb is not cleared
* by dev_ctx_forward) and catch hints with bpf_host
* prog that will populate mark/cb as expected by xfrm and 2nd
* traversal into bpf_host. Remember we can't use cb[0-3]
* in both cases because xfrm layer would overwrite them. We
* use cb[4] here so it doesn't need to be reset by
* bpf_host.
*/
set_encrypt_key_meta(ctx, key, node_id);
set_identity_meta(ctx, seclabel);
return CTX_ACT_OK;
}

static __always_inline int
encap_and_redirect_ipsec(struct __ctx_buff *ctx, __u8 key, __u16 node_id,
__u32 seclabel)
Expand Down Expand Up @@ -112,21 +89,16 @@ __encap_and_redirect_with_nodeid(struct __ctx_buff *ctx, __be32 tunnel_endpoint,
}

/* encap_and_redirect_with_nodeid returns CTX_ACT_OK after ctx meta-data is
* set (eg. when IPSec is enabled). Caller should pass the ctx to the stack at this
* point. Otherwise returns CTX_ACT_REDIRECT on successful redirect to tunnel device.
* set. Caller should pass the ctx to the stack at this point. Otherwise
* returns CTX_ACT_REDIRECT on successful redirect to tunnel device.
* On error returns CTX_ACT_DROP or DROP_WRITE_ERROR.
*/
static __always_inline int
encap_and_redirect_with_nodeid(struct __ctx_buff *ctx, __be32 tunnel_endpoint,
__u8 key __maybe_unused,
__u16 node_id __maybe_unused,
__u32 seclabel, __u32 dstid,
const struct trace_ctx *trace)
{
#ifdef ENABLE_IPSEC
if (key)
return encap_and_redirect_nomark_ipsec(ctx, key, node_id, seclabel);
#endif
return __encap_and_redirect_with_nodeid(ctx, tunnel_endpoint, seclabel, dstid, NOT_VTEP_DST,
trace);
}
Expand Down Expand Up @@ -219,15 +191,6 @@ encap_and_redirect_netdev(struct __ctx_buff *ctx, struct tunnel_key *k,
if (!tunnel)
return DROP_NO_TUNNEL_ENDPOINT;

#ifdef ENABLE_IPSEC
if (tunnel->key) {
__u8 key = get_min_encrypt_key(tunnel->key);

return encap_and_redirect_nomark_ipsec(ctx, key,
tunnel->node_id,
seclabel);
}
#endif
return __encap_and_redirect_with_nodeid(ctx, tunnel->ip4, seclabel,
0, NOT_VTEP_DST, trace);
}
Expand Down
6 changes: 0 additions & 6 deletions bpf/lib/overloadable_skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,6 @@ set_encrypt_key_mark(struct __sk_buff *ctx, __u8 key, __u32 node_id)
ctx->mark = or_encrypt_key(key) | node_id << 16;
}

/* Encode the IPsec encrypt key and the remote node_id into the
 * CB_ENCRYPT_MAGIC slot of skb->cb: key bits come from or_encrypt_key()
 * and node_id occupies bits 16+ of the same word. Metadata (cb) variant
 * of the mark-based setter above — used when skb->mark would not survive
 * the path to the next BPF program.
 */
static __always_inline __maybe_unused void
set_encrypt_key_meta(struct __sk_buff *ctx, __u8 key, __u32 node_id)
{
ctx->mark = or_encrypt_key(key) | node_id << 16;
}

/**
* set_cluster_id_mark - sets the cluster_id mark.
*/
Expand Down
6 changes: 0 additions & 6 deletions bpf/lib/overloadable_xdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ set_encrypt_key_mark(struct xdp_md *ctx __maybe_unused, __u8 key __maybe_unused,
{
}

/* No-op stub for the XDP layer: encrypt-key metadata is not written here
 * (NOTE(review): presumably because the skb cb[] slots used by the skb
 * variant do not exist for xdp_md — confirm). Kept so shared code can
 * call set_encrypt_key_meta() unconditionally.
 */
static __always_inline __maybe_unused void
set_encrypt_key_meta(struct xdp_md *ctx __maybe_unused, __u8 key __maybe_unused,
__u32 node_id __maybe_unused)
{
}

static __always_inline __maybe_unused void
ctx_set_cluster_id_mark(struct xdp_md *ctx __maybe_unused, __u32 cluster_id __maybe_unused)
{
Expand Down

0 comments on commit 5fe2b2d

Please sign in to comment.