Skip to content

Commit

Permalink
bpf: Support proxy using original source address and port.
Browse files Browse the repository at this point in the history
Add a 'proxy_redirect' bit to the conntrack entry so that the reply
direction packets on proxy upstream connections using the original
source address and port in addition to the original destination
address and port can be redirected back to the local stack for local
delivery.

iptables rules are added to mark packets matching a transparent socket
as going to the host proxy.

Signed-off-by: Jarno Rajahalme <jarno@covalent.io>
  • Loading branch information
jrajahalme authored and ianvernon committed Aug 15, 2019
1 parent 0c7f066 commit 830adba
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 37 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# cilium-envoy from github.com/cilium/proxy
#
FROM quay.io/cilium/cilium-envoy:d68c2561fae4c83960969a7aaa2a186c3b30e17a as cilium-envoy
FROM quay.io/cilium/cilium-envoy:4922a4bb7f6a76eb8bbbae9afa38564633aee263 as cilium-envoy

#
# Cilium incremental build. Should be fast given builder-deps is up-to-date!
Expand Down
48 changes: 38 additions & 10 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,

reason = ret;

// Check if this is return traffic to an ingress proxy.
if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect) {
// Stack will do a socket match and deliver locally
return skb_redirect_to_proxy(skb, 0);
}

if (!revalidate_data(skb, &data, &data_end, &ip6))
return DROP_INVALID;

Expand Down Expand Up @@ -186,7 +192,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
* reverse NAT.
*/
ct_state_new.src_sec_id = SECLABEL;
ret = ct_create6(get_ct_map6(tuple), tuple, skb, CT_EGRESS, &ct_state_new);
ret = ct_create6(get_ct_map6(tuple), tuple, skb, CT_EGRESS, &ct_state_new, verdict > 0);
if (IS_ERR(ret))
return ret;
monitor = TRACE_PAYLOAD_LEN;
Expand Down Expand Up @@ -460,6 +466,12 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)

reason = ret;

// Check if this is return traffic to an ingress proxy.
if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect) {
// Stack will do a socket match and deliver locally
return skb_redirect_to_proxy(skb, 0);
}

/* Determine the destination category for policy fallback. */
if (1) {
struct remote_endpoint_info *info;
Expand Down Expand Up @@ -500,7 +512,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
*/
ct_state_new.src_sec_id = SECLABEL;
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS,
&ct_state_new);
&ct_state_new, verdict > 0);
if (IS_ERR(ret))
return ret;
break;
Expand Down Expand Up @@ -739,7 +751,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s
int ret, l4_off, verdict, hdrlen;
struct ct_state ct_state = {};
struct ct_state ct_state_new = {};
bool skip_proxy = false;
bool skip_ingress_proxy = false;
union v6addr orig_dip = {};
__u32 monitor = 0;

Expand All @@ -755,7 +767,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s

/* If packet is coming from the ingress proxy we have to skip
* redirection to the ingress proxy as we would loop forever. */
skip_proxy = tc_index_skip_proxy(skb);
skip_ingress_proxy = tc_index_skip_ingress_proxy(skb);

hdrlen = ipv6_hdrlen(skb, ETH_HLEN, &tuple.nexthdr);
if (hdrlen < 0)
Expand Down Expand Up @@ -790,6 +802,14 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s

*reason = ret;

// Check if this is return traffic to an egress proxy.
// Do not redirect again if the packet is coming from the egress proxy.
if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect &&
!tc_index_skip_egress_proxy(skb)) {
// Stack will do a socket match and deliver locally
return skb_redirect_to_proxy(skb, 0);
}

if (unlikely(ct_state.rev_nat_index)) {
int ret2;

Expand All @@ -816,14 +836,14 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s
return verdict;
}

if (skip_proxy)
if (skip_ingress_proxy)
verdict = 0;

if (ret == CT_NEW) {
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ct_state_new.node_port = ct_state.node_port;
ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new);
ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new, verdict > 0);
if (IS_ERR(ret))
return ret;

Expand Down Expand Up @@ -948,7 +968,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s
int ret, verdict, l3_off = ETH_HLEN, l4_off;
struct ct_state ct_state = {};
struct ct_state ct_state_new = {};
bool skip_proxy = false;
bool skip_ingress_proxy = false;
__be32 orig_dip, orig_sip;
bool is_fragment = false;
__u32 monitor = 0;
Expand All @@ -961,7 +981,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s

/* If packet is coming from the ingress proxy we have to skip
* redirection to the ingress proxy as we would loop forever. */
skip_proxy = tc_index_skip_proxy(skb);
skip_ingress_proxy = tc_index_skip_ingress_proxy(skb);

tuple.daddr = ip4->daddr;
tuple.saddr = ip4->saddr;
Expand All @@ -979,6 +999,14 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s

*reason = ret;

// Check if this is return traffic to an egress proxy.
// Do not redirect again if the packet is coming from the egress proxy.
if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect &&
!tc_index_skip_egress_proxy(skb)) {
// Stack will do a socket match and deliver locally
return skb_redirect_to_proxy(skb, 0);
}

#ifdef ENABLE_NAT46
if (skb->cb[CB_NAT46_STATE] == NAT46) {
ep_tail_call(skb, CILIUM_CALL_NAT46);
Expand Down Expand Up @@ -1015,14 +1043,14 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s
return verdict;
}

if (skip_proxy)
if (skip_ingress_proxy)
verdict = 0;

if (ret == CT_NEW) {
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ct_state_new.node_port = ct_state.node_port;
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new);
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new, verdict > 0);
if (IS_ERR(ret))
return ret;

Expand Down
11 changes: 7 additions & 4 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,9 @@ static inline void __inline__ set_encrypt_key_cb(struct __sk_buff *skb, __u8 key
* cilium_host @egress
* bpf_host -> bpf_lxc
*/
#define TC_INDEX_F_SKIP_PROXY 1
#define TC_INDEX_F_SKIP_NODEPORT 2
/* Flags kept in skb->tc_index. They are set with '|=' and tested with '&',
 * so each flag must occupy its own bit; a value such as 3 would alias both
 * proxy-skip flags. */
#define TC_INDEX_F_SKIP_INGRESS_PROXY 1
#define TC_INDEX_F_SKIP_EGRESS_PROXY 2
#define TC_INDEX_F_SKIP_NODEPORT 4

/* skb->cb[] usage: */
enum {
Expand Down Expand Up @@ -506,7 +507,8 @@ struct ct_entry {
lb_loopback:1,
seen_non_syn:1,
node_port:1,
reserve:10;
proxy_redirect:1, // Connection is redirected to a proxy
reserved:9;
__u16 rev_nat_index;
__u16 backend_id; /* Populated only in v1.6+ BPF code. */

Expand Down Expand Up @@ -608,7 +610,8 @@ struct ct_state {
__u16 rev_nat_index;
__u16 loopback:1,
node_port:1,
reserved:14;
proxy_redirect:1, // Connection is redirected to a proxy
reserved:13;
__be16 orig_dport;
__be32 addr;
__be32 svc_addr;
Expand Down
15 changes: 11 additions & 4 deletions bpf/lib/conntrack.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ static inline __u8 __inline__ __ct_lookup(void *map, struct __sk_buff *skb,
ct_state->rev_nat_index = entry->rev_nat_index;
ct_state->loopback = entry->lb_loopback;
ct_state->node_port = entry->node_port;
ct_state->proxy_redirect = entry->proxy_redirect;
/* To support seamless upgrade from an earlier service
* implementation, we store references to the backend
* in the "ct_entry.rx_bytes" field.
Expand Down Expand Up @@ -647,13 +648,16 @@ ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,
/* Offset must point to IPv6 */
static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
struct __sk_buff *skb, int dir,
struct ct_state *ct_state)
struct ct_state *ct_state, bool proxy_redirect)
{
/* Create entry in original direction */
struct ct_entry entry = { };
bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
union tcp_flags seen_flags = { .value = 0 };

/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
entry.proxy_redirect = proxy_redirect;

/* See the ct_create4 comments re the rx_bytes hack */
if (dir == CT_SERVICE) {
entry.backend_id = 0;
Expand Down Expand Up @@ -737,13 +741,16 @@ ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
struct __sk_buff *skb, int dir,
struct ct_state *ct_state)
struct ct_state *ct_state, bool proxy_redirect)
{
/* Create entry in original direction */
struct ct_entry entry = { };
bool is_tcp = tuple->nexthdr == IPPROTO_TCP;
union tcp_flags seen_flags = { .value = 0 };

/* Note if this is a proxy connection so that replies can be redirected back to the proxy. */
entry.proxy_redirect = proxy_redirect;

entry.lb_loopback = ct_state->loopback;
entry.node_port = ct_state->node_port;

Expand Down Expand Up @@ -859,7 +866,7 @@ ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple,

static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple,
struct __sk_buff *skb, int dir,
struct ct_state *ct_state)
struct ct_state *ct_state, bool from_proxy)
{
return 0;
}
Expand All @@ -878,7 +885,7 @@ ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple,

static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple,
struct __sk_buff *skb, int dir,
struct ct_state *ct_state)
struct ct_state *ct_state, bool from_proxy)
{
return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions bpf/lib/lb.h
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ static inline int __inline__ lb6_local(void *map, struct __sk_buff *skb,
}
state->backend_id = slave_svc->backend_id;
state->rev_nat_index = svc_v2->rev_nat_index;
ret = ct_create6(map, tuple, skb, CT_SERVICE, state);
ret = ct_create6(map, tuple, skb, CT_SERVICE, state, false);
/* Fail closed, if the conntrack entry create fails drop
* service lookup.
*/
Expand Down Expand Up @@ -895,7 +895,7 @@ static inline int __inline__ lb4_local(void *map, struct __sk_buff *skb,
}
state->backend_id = slave_svc->backend_id;
state->rev_nat_index = svc_v2->rev_nat_index;
ret = ct_create4(map, tuple, skb, CT_SERVICE, state);
ret = ct_create4(map, tuple, skb, CT_SERVICE, state, false);
/* Fail closed, if the conntrack entry create fails drop
* service lookup.
*/
Expand Down
25 changes: 19 additions & 6 deletions bpf/lib/lxc.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,14 @@ skb_redirect_to_proxy(struct __sk_buff *skb, __be16 proxy_port)
{
skb->mark = MARK_MAGIC_TO_PROXY | proxy_port << 16;


#ifdef HOST_REDIRECT_TO_INGRESS
cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_PRE, proxy_port);
/* In this case, the DBG_CAPTURE_PROXY_POST will be sent from the
* program attached to HOST_IFINDEX. */
return redirect(HOST_IFINDEX, BPF_F_INGRESS);
#else
cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_POST, proxy_port);
skb_change_type(skb, PACKET_HOST); // Required ingress packets from overlay
skb_change_type(skb, PACKET_HOST); // Required for ingress packets from overlay
return TC_ACT_OK;
#endif
}
Expand Down Expand Up @@ -142,16 +141,30 @@ skb_redirect_to_proxy_hairpin(struct __sk_buff *skb, __be16 proxy_port)
}

/**
* tc_index_is_from_proxy - returns true if packet originates from ingress proxy
* tc_index_skip_ingress_proxy - returns true if packet originates from ingress proxy
*/
/* Reports whether the TC_INDEX_F_SKIP_INGRESS_PROXY bit is set in
 * skb->tc_index. The result is non-zero (true) when the bit is set and
 * false otherwise. In DEBUG builds a DBG_SKIP_PROXY debug message carrying
 * the tc_index value is emitted when the bit is set. */
static inline bool __inline__ tc_index_skip_ingress_proxy(struct __sk_buff *skb)
{
/* NOTE(review): the local copy is volatile, presumably to stop the compiler
 * from reshaping the skb->tc_index load in a way the BPF verifier rejects —
 * confirm before changing. */
volatile __u32 tc_index = skb->tc_index;
#ifdef DEBUG
/* Debug-only trace; does not affect the return value. */
if (tc_index & TC_INDEX_F_SKIP_INGRESS_PROXY)
cilium_dbg(skb, DBG_SKIP_PROXY, tc_index, 0);
#endif

/* Masked flag value is converted to bool on return. */
return tc_index & TC_INDEX_F_SKIP_INGRESS_PROXY;
}

/**
* tc_index_skip_egress_proxy - returns true if packet originates from egress proxy
*/
static inline bool __inline__ tc_index_skip_proxy(struct __sk_buff *skb)
static inline bool __inline__ tc_index_skip_egress_proxy(struct __sk_buff *skb)
{
volatile __u32 tc_index = skb->tc_index;
#ifdef DEBUG
if (tc_index & TC_INDEX_F_SKIP_PROXY)
if (tc_index & TC_INDEX_F_SKIP_EGRESS_PROXY)
cilium_dbg(skb, DBG_SKIP_PROXY, tc_index, 0);
#endif

return tc_index & TC_INDEX_F_SKIP_PROXY;
return tc_index & TC_INDEX_F_SKIP_EGRESS_PROXY;
}
#endif /* __LIB_LXC_H_ */
4 changes: 2 additions & 2 deletions bpf/lib/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ static __always_inline int snat_v4_track_local(struct __sk_buff *skb,
return ret;
} else if (ret == CT_NEW) {
ret = ct_create4(get_ct_map4(&tmp), &tmp, skb, where,
&ct_state);
&ct_state, false);
if (IS_ERR(ret))
return ret;
}
Expand Down Expand Up @@ -707,7 +707,7 @@ static __always_inline int snat_v6_track_local(struct __sk_buff *skb,
return ret;
} else if (ret == CT_NEW) {
ret = ct_create6(get_ct_map6(&tmp), &tmp, skb, where,
&ct_state);
&ct_state, false);
if (IS_ERR(ret))
return ret;
}
Expand Down
8 changes: 4 additions & 4 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,14 +294,14 @@ static inline int nodeport_lb6(struct __sk_buff *skb, __u32 src_identity)
ct_state_new.src_sec_id = SECLABEL;
ct_state_new.node_port = 1;
ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_EGRESS,
&ct_state_new);
&ct_state_new, false);
if (IS_ERR(ret))
return ret;
if (backend_local) {
ct_flip_tuple_dir6(&tuple);
ct_state_new.rev_nat_index = 0;
ret = ct_create6(get_ct_map6(&tuple), &tuple, skb,
CT_INGRESS, &ct_state_new);
CT_INGRESS, &ct_state_new, false);
if (IS_ERR(ret))
return ret;
}
Expand Down Expand Up @@ -660,14 +660,14 @@ static inline int nodeport_lb4(struct __sk_buff *skb, __u32 src_identity)
ct_state_new.src_sec_id = SECLABEL;
ct_state_new.node_port = 1;
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS,
&ct_state_new);
&ct_state_new, false);
if (IS_ERR(ret))
return ret;
if (backend_local) {
ct_flip_tuple_dir4(&tuple);
ct_state_new.rev_nat_index = 0;
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb,
CT_INGRESS, &ct_state_new);
CT_INGRESS, &ct_state_new, false);
if (IS_ERR(ret))
return ret;
}
Expand Down
8 changes: 6 additions & 2 deletions bpf/lib/policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@ static inline bool __inline__ inherit_identity_from_host(struct __sk_buff *skb,

/* Packets from the ingress proxy must skip the proxy when the
* destination endpoint evaluates the policy. As the packet
* would loop otherwise. */
* would loop and/or the connection be reset otherwise. */
if (magic == MARK_MAGIC_PROXY_INGRESS) {
*identity = get_identity(skb);
skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
skb->tc_index |= TC_INDEX_F_SKIP_INGRESS_PROXY;
from_proxy = true;
/* (Return) packets from the egress proxy must skip the
* redirection to the proxy, as the packet would loop and/or
* the connection be reset otherwise. */
} else if (magic == MARK_MAGIC_PROXY_EGRESS) {
*identity = get_identity(skb);
skb->tc_index |= TC_INDEX_F_SKIP_EGRESS_PROXY;
from_proxy = true;
} else if (magic == MARK_MAGIC_IDENTITY) {
*identity = get_identity(skb);
Expand Down
2 changes: 1 addition & 1 deletion daemon/bpf.sha
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
GO_BINDATA_SHA1SUM=5f74bda1ce6ddce5d28a8ecc568855518285fa3e
GO_BINDATA_SHA1SUM=837bf5028d36a597ee3681acc6dcbea1cb5eb90f
BPF_FILES=../bpf/COPYING ../bpf/Makefile ../bpf/Makefile.bpf ../bpf/bpf_alignchecker.c ../bpf/bpf_features.h ../bpf/bpf_hostdev_ingress.c ../bpf/bpf_ipsec.c ../bpf/bpf_lb.c ../bpf/bpf_lxc.c ../bpf/bpf_netdev.c ../bpf/bpf_network.c ../bpf/bpf_overlay.c ../bpf/bpf_sock.c ../bpf/bpf_xdp.c ../bpf/cilium-map-migrate.c ../bpf/filter_config.h ../bpf/include/bpf/api.h ../bpf/include/elf/elf.h ../bpf/include/elf/gelf.h ../bpf/include/elf/libelf.h ../bpf/include/iproute2/bpf_elf.h ../bpf/include/linux/bpf.h ../bpf/include/linux/bpf_common.h ../bpf/include/linux/byteorder.h ../bpf/include/linux/byteorder/big_endian.h ../bpf/include/linux/byteorder/little_endian.h ../bpf/include/linux/icmp.h ../bpf/include/linux/icmpv6.h ../bpf/include/linux/if_arp.h ../bpf/include/linux/if_ether.h ../bpf/include/linux/if_packet.h ../bpf/include/linux/in.h ../bpf/include/linux/in6.h ../bpf/include/linux/ioctl.h ../bpf/include/linux/ip.h ../bpf/include/linux/ipv6.h ../bpf/include/linux/perf_event.h ../bpf/include/linux/swab.h ../bpf/include/linux/tcp.h ../bpf/include/linux/type_mapper.h ../bpf/include/linux/udp.h ../bpf/init.sh ../bpf/lib/arp.h ../bpf/lib/common.h ../bpf/lib/config.h ../bpf/lib/conntrack.h ../bpf/lib/conntrack_map.h ../bpf/lib/csum.h ../bpf/lib/dbg.h ../bpf/lib/drop.h ../bpf/lib/encap.h ../bpf/lib/eps.h ../bpf/lib/eth.h ../bpf/lib/events.h ../bpf/lib/icmp6.h ../bpf/lib/ipv4.h ../bpf/lib/ipv6.h ../bpf/lib/l3.h ../bpf/lib/l4.h ../bpf/lib/lb.h ../bpf/lib/lxc.h ../bpf/lib/maps.h ../bpf/lib/metrics.h ../bpf/lib/nat.h ../bpf/lib/nat46.h ../bpf/lib/nodeport.h ../bpf/lib/policy.h ../bpf/lib/tailcall.h ../bpf/lib/trace.h ../bpf/lib/utils.h ../bpf/lib/xdp.h ../bpf/lxc_config.h ../bpf/netdev_config.h ../bpf/node_config.h ../bpf/probes/raw_change_tail.t ../bpf/probes/raw_fib_lookup.t ../bpf/probes/raw_insn.h ../bpf/probes/raw_invalidate_hash.t ../bpf/probes/raw_lpm_map.t ../bpf/probes/raw_lru_map.t ../bpf/probes/raw_main.c ../bpf/probes/raw_map_val_adj.t ../bpf/probes/raw_mark_map_val.t 
../bpf/probes/raw_sock_cookie.t ../bpf/run_probes.sh ../bpf/sockops/Makefile ../bpf/sockops/bpf_redir.c ../bpf/sockops/bpf_sockops.c ../bpf/sockops/bpf_sockops.h ../bpf/sockops/sockops_config.h

0 comments on commit 830adba

Please sign in to comment.