From 830adba1c0245a2f144b5fdd9dee5977e751491d Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 15 Aug 2019 11:40:03 -0700 Subject: [PATCH] bpf: Support proxy using original source address and port. Add a 'proxy_redirect' bit to the conntrack entry so that the reply direction packets on proxy upstream connections using the original source address and port in addition to the original destination address and port can be redirected back to the local stack for local delivery. iptables rules are added to mark packets matching a transparent socket as going to the host proxy. Signed-off-by: Jarno Rajahalme --- Dockerfile | 2 +- bpf/bpf_lxc.c | 48 ++++++++++++++++++++++++------- bpf/lib/common.h | 11 ++++--- bpf/lib/conntrack.h | 15 +++++++--- bpf/lib/lb.h | 4 +-- bpf/lib/lxc.h | 25 ++++++++++++---- bpf/lib/nat.h | 4 +-- bpf/lib/nodeport.h | 8 +++--- bpf/lib/policy.h | 8 ++++-- daemon/bpf.sha | 2 +- pkg/datapath/iptables/iptables.go | 36 ++++++++++++++++++++++- 11 files changed, 126 insertions(+), 37 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4d6698ccd1476..e99bb1dcb12ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # # cilium-envoy from github.com/cilium/proxy # -FROM quay.io/cilium/cilium-envoy:d68c2561fae4c83960969a7aaa2a186c3b30e17a as cilium-envoy +FROM quay.io/cilium/cilium-envoy:4922a4bb7f6a76eb8bbbae9afa38564633aee263 as cilium-envoy # # Cilium incremental build. Should be fast given builder-deps is up-to-date! diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c index 98d84b25123de..73c7c41343e6b 100644 --- a/bpf/bpf_lxc.c +++ b/bpf/bpf_lxc.c @@ -144,6 +144,12 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb, reason = ret; + // Check it this is return traffic to an ingress proxy. 
+ if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect) { + // Stack will do a socket match and deliver locally + return skb_redirect_to_proxy(skb, 0); + } + if (!revalidate_data(skb, &data, &data_end, &ip6)) return DROP_INVALID; @@ -186,7 +192,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb, * reverse NAT. */ ct_state_new.src_sec_id = SECLABEL; - ret = ct_create6(get_ct_map6(tuple), tuple, skb, CT_EGRESS, &ct_state_new); + ret = ct_create6(get_ct_map6(tuple), tuple, skb, CT_EGRESS, &ct_state_new, verdict > 0); if (IS_ERR(ret)) return ret; monitor = TRACE_PAYLOAD_LEN; @@ -460,6 +466,12 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID) reason = ret; + // Check it this is return traffic to an ingress proxy. + if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect) { + // Stack will do a socket match and deliver locally + return skb_redirect_to_proxy(skb, 0); + } + /* Determine the destination category for policy fallback. */ if (1) { struct remote_endpoint_info *info; @@ -500,7 +512,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID) */ ct_state_new.src_sec_id = SECLABEL; ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS, - &ct_state_new); + &ct_state_new, verdict > 0); if (IS_ERR(ret)) return ret; break; @@ -739,7 +751,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s int ret, l4_off, verdict, hdrlen; struct ct_state ct_state = {}; struct ct_state ct_state_new = {}; - bool skip_proxy = false; + bool skip_ingress_proxy = false; union v6addr orig_dip = {}; __u32 monitor = 0; @@ -755,7 +767,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s /* If packet is coming from the ingress proxy we have to skip * redirection to the ingress proxy as we would loop forever. 
*/ - skip_proxy = tc_index_skip_proxy(skb); + skip_ingress_proxy = tc_index_skip_ingress_proxy(skb); hdrlen = ipv6_hdrlen(skb, ETH_HLEN, &tuple.nexthdr); if (hdrlen < 0) @@ -790,6 +802,14 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s *reason = ret; + // Check it this is return traffic to an egress proxy. + // Do not redirect again if the packet is coming from the egress proxy. + if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect && + !tc_index_skip_egress_proxy(skb)) { + // Stack will do a socket match and deliver locally + return skb_redirect_to_proxy(skb, 0); + } + if (unlikely(ct_state.rev_nat_index)) { int ret2; @@ -816,14 +836,14 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s return verdict; } - if (skip_proxy) + if (skip_ingress_proxy) verdict = 0; if (ret == CT_NEW) { ct_state_new.orig_dport = tuple.dport; ct_state_new.src_sec_id = src_label; ct_state_new.node_port = ct_state.node_port; - ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new); + ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new, verdict > 0); if (IS_ERR(ret)) return ret; @@ -948,7 +968,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s int ret, verdict, l3_off = ETH_HLEN, l4_off; struct ct_state ct_state = {}; struct ct_state ct_state_new = {}; - bool skip_proxy = false; + bool skip_ingress_proxy = false; __be32 orig_dip, orig_sip; bool is_fragment = false; __u32 monitor = 0; @@ -961,7 +981,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s /* If packet is coming from the ingress proxy we have to skip * redirection to the inggress proxy as we would loop forever. 
*/ - skip_proxy = tc_index_skip_proxy(skb); + skip_ingress_proxy = tc_index_skip_ingress_proxy(skb); tuple.daddr = ip4->daddr; tuple.saddr = ip4->saddr; @@ -979,6 +999,14 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s *reason = ret; + // Check if this is return traffic to an egress proxy. + // Do not redirect again if the packet is coming from the egress proxy. + if ((ret == CT_REPLY || ret == CT_RELATED) && ct_state.proxy_redirect && + !tc_index_skip_egress_proxy(skb)) { + // Stack will do a socket match and deliver locally + return skb_redirect_to_proxy(skb, 0); + } + #ifdef ENABLE_NAT46 if (skb->cb[CB_NAT46_STATE] == NAT46) { ep_tail_call(skb, CILIUM_CALL_NAT46); @@ -1015,14 +1043,14 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, __u8 *reason, s return verdict; } - if (skip_proxy) + if (skip_ingress_proxy) verdict = 0; if (ret == CT_NEW) { ct_state_new.orig_dport = tuple.dport; ct_state_new.src_sec_id = src_label; ct_state_new.node_port = ct_state.node_port; - ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new); + ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new, verdict > 0); if (IS_ERR(ret)) return ret; diff --git a/bpf/lib/common.h b/bpf/lib/common.h index 863f3965bd5a4..a035f9cf5d2cb 100644 --- a/bpf/lib/common.h +++ b/bpf/lib/common.h @@ -426,8 +426,9 @@ static inline void __inline__ set_encrypt_key_cb(struct __sk_buff *skb, __u8 key * cilium_host @egress * bpf_host -> bpf_lxc */ -#define TC_INDEX_F_SKIP_PROXY 1 -#define TC_INDEX_F_SKIP_NODEPORT 2 +#define TC_INDEX_F_SKIP_INGRESS_PROXY 1 +#define TC_INDEX_F_SKIP_EGRESS_PROXY 2 +#define TC_INDEX_F_SKIP_NODEPORT 4 /* skb->cb[] usage: */ enum { @@ -506,7 +507,8 @@ struct ct_entry { lb_loopback:1, seen_non_syn:1, node_port:1, - reserve:10; + proxy_redirect:1, // Connection is redirected to a proxy + reserved:9; __u16 rev_nat_index; __u16 backend_id; /* Populated only in v1.6+ BPF code. 
*/ @@ -608,7 +610,8 @@ struct ct_state { __u16 rev_nat_index; __u16 loopback:1, node_port:1, - reserved:14; + proxy_redirect:1, // Connection is redirected to a proxy + reserved:13; __be16 orig_dport; __be32 addr; __be32 svc_addr; diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h index 5f74ba332584f..0efebc009cdc5 100644 --- a/bpf/lib/conntrack.h +++ b/bpf/lib/conntrack.h @@ -231,6 +231,7 @@ static inline __u8 __inline__ __ct_lookup(void *map, struct __sk_buff *skb, ct_state->rev_nat_index = entry->rev_nat_index; ct_state->loopback = entry->lb_loopback; ct_state->node_port = entry->node_port; + ct_state->proxy_redirect = entry->proxy_redirect; /* To support seamless upgrade from an earlier service * implementation, we store references to the backend * in the "ct_entry.rx_bytes" field. @@ -647,13 +648,16 @@ ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple, /* Offset must point to IPv6 */ static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple, struct __sk_buff *skb, int dir, - struct ct_state *ct_state) + struct ct_state *ct_state, bool proxy_redirect) { /* Create entry in original direction */ struct ct_entry entry = { }; bool is_tcp = tuple->nexthdr == IPPROTO_TCP; union tcp_flags seen_flags = { .value = 0 }; + /* Note if this is a proxy connection so that replies can be redirected back to the proxy. 
*/ + entry.proxy_redirect = proxy_redirect; + /* See the ct_create4 comments re the rx_bytes hack */ if (dir == CT_SERVICE) { entry.backend_id = 0; @@ -737,13 +741,16 @@ ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple, static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple, struct __sk_buff *skb, int dir, - struct ct_state *ct_state) + struct ct_state *ct_state, bool proxy_redirect) { /* Create entry in original direction */ struct ct_entry entry = { }; bool is_tcp = tuple->nexthdr == IPPROTO_TCP; union tcp_flags seen_flags = { .value = 0 }; + /* Note if this is a proxy connection so that replies can be redirected back to the proxy. */ + entry.proxy_redirect = proxy_redirect; + entry.lb_loopback = ct_state->loopback; entry.node_port = ct_state->node_port; @@ -859,7 +866,7 @@ ct_update6_rev_nat_index(void *map, struct ipv6_ct_tuple *tuple, static inline int __inline__ ct_create6(void *map, struct ipv6_ct_tuple *tuple, struct __sk_buff *skb, int dir, - struct ct_state *ct_state) + struct ct_state *ct_state, bool from_proxy) { return 0; } @@ -878,7 +885,7 @@ ct_update4_rev_nat_index(void *map, struct ipv4_ct_tuple *tuple, static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple, struct __sk_buff *skb, int dir, - struct ct_state *ct_state) + struct ct_state *ct_state, bool from_proxy) { return 0; } diff --git a/bpf/lib/lb.h b/bpf/lib/lb.h index 7125569b5318c..cb4d3e746ec10 100644 --- a/bpf/lib/lb.h +++ b/bpf/lib/lb.h @@ -534,7 +534,7 @@ static inline int __inline__ lb6_local(void *map, struct __sk_buff *skb, } state->backend_id = slave_svc->backend_id; state->rev_nat_index = svc_v2->rev_nat_index; - ret = ct_create6(map, tuple, skb, CT_SERVICE, state); + ret = ct_create6(map, tuple, skb, CT_SERVICE, state, false); /* Fail closed, if the conntrack entry create fails drop * service lookup. 
*/ @@ -895,7 +895,7 @@ static inline int __inline__ lb4_local(void *map, struct __sk_buff *skb, } state->backend_id = slave_svc->backend_id; state->rev_nat_index = svc_v2->rev_nat_index; - ret = ct_create4(map, tuple, skb, CT_SERVICE, state); + ret = ct_create4(map, tuple, skb, CT_SERVICE, state, false); /* Fail closed, if the conntrack entry create fails drop * service lookup. */ diff --git a/bpf/lib/lxc.h b/bpf/lib/lxc.h index 712911bc704ba..3080f8a877690 100644 --- a/bpf/lib/lxc.h +++ b/bpf/lib/lxc.h @@ -99,7 +99,6 @@ skb_redirect_to_proxy(struct __sk_buff *skb, __be16 proxy_port) { skb->mark = MARK_MAGIC_TO_PROXY | proxy_port << 16; - #ifdef HOST_REDIRECT_TO_INGRESS cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_PRE, proxy_port); /* In this case, the DBG_CAPTURE_PROXY_POST will be sent from the @@ -107,7 +106,7 @@ skb_redirect_to_proxy(struct __sk_buff *skb, __be16 proxy_port) return redirect(HOST_IFINDEX, BPF_F_INGRESS); #else cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_POST, proxy_port); - skb_change_type(skb, PACKET_HOST); // Required ingress packets from overlay + skb_change_type(skb, PACKET_HOST); // Required for ingress packets from overlay return TC_ACT_OK; #endif } @@ -142,16 +141,30 @@ skb_redirect_to_proxy_hairpin(struct __sk_buff *skb, __be16 proxy_port) } /** - * tc_index_is_from_proxy - returns true if packet originates from ingress proxy + * tc_index_skip_ingress_proxy - returns true if packet originates from ingress proxy + */ +static inline bool __inline__ tc_index_skip_ingress_proxy(struct __sk_buff *skb) +{ + volatile __u32 tc_index = skb->tc_index; +#ifdef DEBUG + if (tc_index & TC_INDEX_F_SKIP_INGRESS_PROXY) + cilium_dbg(skb, DBG_SKIP_PROXY, tc_index, 0); +#endif + + return tc_index & TC_INDEX_F_SKIP_INGRESS_PROXY; +} + +/** + * tc_index_skip_egress_proxy - returns true if packet originates from egress proxy */ -static inline bool __inline__ tc_index_skip_proxy(struct __sk_buff *skb) +static inline bool __inline__ tc_index_skip_egress_proxy(struct 
__sk_buff *skb) { volatile __u32 tc_index = skb->tc_index; #ifdef DEBUG - if (tc_index & TC_INDEX_F_SKIP_PROXY) + if (tc_index & TC_INDEX_F_SKIP_EGRESS_PROXY) cilium_dbg(skb, DBG_SKIP_PROXY, tc_index, 0); #endif - return tc_index & TC_INDEX_F_SKIP_PROXY; + return tc_index & TC_INDEX_F_SKIP_EGRESS_PROXY; } #endif /* __LIB_LXC_H_ */ diff --git a/bpf/lib/nat.h b/bpf/lib/nat.h index d598c5a629ead..8312d066ac29d 100644 --- a/bpf/lib/nat.h +++ b/bpf/lib/nat.h @@ -287,7 +287,7 @@ static __always_inline int snat_v4_track_local(struct __sk_buff *skb, return ret; } else if (ret == CT_NEW) { ret = ct_create4(get_ct_map4(&tmp), &tmp, skb, where, - &ct_state); + &ct_state, false); if (IS_ERR(ret)) return ret; } @@ -707,7 +707,7 @@ static __always_inline int snat_v6_track_local(struct __sk_buff *skb, return ret; } else if (ret == CT_NEW) { ret = ct_create6(get_ct_map6(&tmp), &tmp, skb, where, - &ct_state); + &ct_state, false); if (IS_ERR(ret)) return ret; } diff --git a/bpf/lib/nodeport.h b/bpf/lib/nodeport.h index 82e7630277c12..1fc9236e58f1a 100644 --- a/bpf/lib/nodeport.h +++ b/bpf/lib/nodeport.h @@ -294,14 +294,14 @@ static inline int nodeport_lb6(struct __sk_buff *skb, __u32 src_identity) ct_state_new.src_sec_id = SECLABEL; ct_state_new.node_port = 1; ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_EGRESS, - &ct_state_new); + &ct_state_new, false); if (IS_ERR(ret)) return ret; if (backend_local) { ct_flip_tuple_dir6(&tuple); ct_state_new.rev_nat_index = 0; ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, - CT_INGRESS, &ct_state_new); + CT_INGRESS, &ct_state_new, false); if (IS_ERR(ret)) return ret; } @@ -660,14 +660,14 @@ static inline int nodeport_lb4(struct __sk_buff *skb, __u32 src_identity) ct_state_new.src_sec_id = SECLABEL; ct_state_new.node_port = 1; ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS, - &ct_state_new); + &ct_state_new, false); if (IS_ERR(ret)) return ret; if (backend_local) { ct_flip_tuple_dir4(&tuple); ct_state_new.rev_nat_index = 
0; ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, - CT_INGRESS, &ct_state_new); + CT_INGRESS, &ct_state_new, false); if (IS_ERR(ret)) return ret; } diff --git a/bpf/lib/policy.h b/bpf/lib/policy.h index 1e675335ae4da..5ad0520edab6f 100644 --- a/bpf/lib/policy.h +++ b/bpf/lib/policy.h @@ -30,13 +30,17 @@ static inline bool __inline__ inherit_identity_from_host(struct __sk_buff *skb, /* Packets from the ingress proxy must skip the proxy when the * destination endpoint evaluates the policy. As the packet - * would loop otherwise. */ + * would loop and/or the connection be reset otherwise. */ if (magic == MARK_MAGIC_PROXY_INGRESS) { *identity = get_identity(skb); - skb->tc_index |= TC_INDEX_F_SKIP_PROXY; + skb->tc_index |= TC_INDEX_F_SKIP_INGRESS_PROXY; from_proxy = true; + /* (Return) packets from the egress proxy must skip the + * redirection to the proxy, as the packet would loop and/or + * the connection be reset otherwise. */ } else if (magic == MARK_MAGIC_PROXY_EGRESS) { *identity = get_identity(skb); + skb->tc_index |= TC_INDEX_F_SKIP_EGRESS_PROXY; from_proxy = true; } else if (magic == MARK_MAGIC_IDENTITY) { *identity = get_identity(skb); diff --git a/daemon/bpf.sha b/daemon/bpf.sha index 8da5a519efd07..3ece105c3d760 100644 --- a/daemon/bpf.sha +++ b/daemon/bpf.sha @@ -1,2 +1,2 @@ -GO_BINDATA_SHA1SUM=5f74bda1ce6ddce5d28a8ecc568855518285fa3e +GO_BINDATA_SHA1SUM=837bf5028d36a597ee3681acc6dcbea1cb5eb90f BPF_FILES=../bpf/COPYING ../bpf/Makefile ../bpf/Makefile.bpf ../bpf/bpf_alignchecker.c ../bpf/bpf_features.h ../bpf/bpf_hostdev_ingress.c ../bpf/bpf_ipsec.c ../bpf/bpf_lb.c ../bpf/bpf_lxc.c ../bpf/bpf_netdev.c ../bpf/bpf_network.c ../bpf/bpf_overlay.c ../bpf/bpf_sock.c ../bpf/bpf_xdp.c ../bpf/cilium-map-migrate.c ../bpf/filter_config.h ../bpf/include/bpf/api.h ../bpf/include/elf/elf.h ../bpf/include/elf/gelf.h ../bpf/include/elf/libelf.h ../bpf/include/iproute2/bpf_elf.h ../bpf/include/linux/bpf.h ../bpf/include/linux/bpf_common.h 
../bpf/include/linux/byteorder.h ../bpf/include/linux/byteorder/big_endian.h ../bpf/include/linux/byteorder/little_endian.h ../bpf/include/linux/icmp.h ../bpf/include/linux/icmpv6.h ../bpf/include/linux/if_arp.h ../bpf/include/linux/if_ether.h ../bpf/include/linux/if_packet.h ../bpf/include/linux/in.h ../bpf/include/linux/in6.h ../bpf/include/linux/ioctl.h ../bpf/include/linux/ip.h ../bpf/include/linux/ipv6.h ../bpf/include/linux/perf_event.h ../bpf/include/linux/swab.h ../bpf/include/linux/tcp.h ../bpf/include/linux/type_mapper.h ../bpf/include/linux/udp.h ../bpf/init.sh ../bpf/lib/arp.h ../bpf/lib/common.h ../bpf/lib/config.h ../bpf/lib/conntrack.h ../bpf/lib/conntrack_map.h ../bpf/lib/csum.h ../bpf/lib/dbg.h ../bpf/lib/drop.h ../bpf/lib/encap.h ../bpf/lib/eps.h ../bpf/lib/eth.h ../bpf/lib/events.h ../bpf/lib/icmp6.h ../bpf/lib/ipv4.h ../bpf/lib/ipv6.h ../bpf/lib/l3.h ../bpf/lib/l4.h ../bpf/lib/lb.h ../bpf/lib/lxc.h ../bpf/lib/maps.h ../bpf/lib/metrics.h ../bpf/lib/nat.h ../bpf/lib/nat46.h ../bpf/lib/nodeport.h ../bpf/lib/policy.h ../bpf/lib/tailcall.h ../bpf/lib/trace.h ../bpf/lib/utils.h ../bpf/lib/xdp.h ../bpf/lxc_config.h ../bpf/netdev_config.h ../bpf/node_config.h ../bpf/probes/raw_change_tail.t ../bpf/probes/raw_fib_lookup.t ../bpf/probes/raw_insn.h ../bpf/probes/raw_invalidate_hash.t ../bpf/probes/raw_lpm_map.t ../bpf/probes/raw_lru_map.t ../bpf/probes/raw_main.c ../bpf/probes/raw_map_val_adj.t ../bpf/probes/raw_mark_map_val.t ../bpf/probes/raw_sock_cookie.t ../bpf/run_probes.sh ../bpf/sockops/Makefile ../bpf/sockops/bpf_redir.c ../bpf/sockops/bpf_sockops.c ../bpf/sockops/bpf_sockops.h ../bpf/sockops/sockops_config.h diff --git a/pkg/datapath/iptables/iptables.go b/pkg/datapath/iptables/iptables.go index e2cb10c5680cb..8a97c2715c787 100644 --- a/pkg/datapath/iptables/iptables.go +++ b/pkg/datapath/iptables/iptables.go @@ -621,7 +621,41 @@ func (m *IptablesManager) InstallRules(ifName string) error { return err } + toProxyMark := fmt.Sprintf("%#08x", 
linux_defaults.MagicMarkIsToProxy) + + if option.Config.EnableIPv6 { + // Mark host proxy transparent connections to be routed to the local stack. + // This comes before the TPROXY rules in the chain, and setting the mark + // without the proxy port number will make the TPROXY rule not match, + // as we do not want to try to tproxy packets that are going to the stack + // already. + // This rule is needed for a couple of reasons: + // 1. route return traffic to the proxy + // 2. route original direction traffic that would otherwise be intercepted + // by ip_early_demux + if err := runProg("ip6tables", append( + m.waitArgs, + "-t", "mangle", + "-A", ciliumPreMangleChain, + "-m", "socket", "--transparent", "--nowildcard", + "-m", "comment", "--comment", "cilium: mark transparent proxy traffic to be routed locally", + "-j", "MARK", "--set-mark", toProxyMark), false); err != nil { + return err + } + } + if option.Config.EnableIPv4 { + // See comment above for the IPv6 case. + if err := runProg("iptables", append( + m.waitArgs, + "-t", "mangle", + "-A", ciliumPreMangleChain, + "-m", "socket", "--transparent", "--nowildcard", + "-m", "comment", "--comment", "cilium: mark transparent proxy traffic to be routed locally", + "-j", "MARK", "--set-mark", toProxyMark), false); err != nil { + return err + } + // Clear the Kubernetes masquerading mark bit to skip source PAT // performed by kube-proxy for all packets destined for Cilium. 
Cilium // installs a dedicated rule which does the source PAT to the right @@ -634,7 +668,7 @@ func (m *IptablesManager) InstallRules(ifName string) error { "-o", localDeliveryInterface, "-m", "mark", "!", "--mark", matchFromIPSecDecrypt, // Don't match ipsec traffic "-m", "mark", "!", "--mark", matchFromIPSecEncrypt, // Don't match ipsec traffic - "-m", "comment", "--comment", "cilium: clear masq bit for pkts to "+ifName, + "-m", "comment", "--comment", "cilium: clear masq bit for pkts to "+localDeliveryInterface, "-j", "MARK", "--set-xmark", clearMasqBit), false); err != nil { return err }