proxy: Distinguish between proxy and other local processes

The datapath used a simple sip == HOST_IP check to detect local proxy traffic and bypass the proxy on the second pass through. This logic is flawed and causes the proxy to be bypassed in the following legitimate situations:

* a local process uses HOST_IP as source and talks to a local endpoint
* both an egress and an ingress proxy are injected between two local endpoints

This commit fixes this situation by:

* Introducing a new custom dialer for the proxy. This is required because net.Dial() does not allow setting a socket option between creating the socket and calling connect(). Access to the socket is required to set the SO_MARK before the connect() to ensure that the first SYN packet contains the proper SO_MARK.
* Replacing net.Listen() with custom code so we can set the SO_MARK before calling syscall.Listen() to ensure that all child sockets inherit the SO_MARK. This ensures that even the SYN+ACK generated by the kernel will have proper packet markings.
* Extending the SO_MARK to contain a magic marker in the lower 12 bits of skb->mark (0xFEA for ingress proxy, 0xFEB for egress proxy). This allows detecting packets from the proxy by matching against the magic marker. The identity is moved to the upper 16 bits of skb->mark.

  The skb->mark is cleared on veth traversal, so we match against the magic marker at egress on the way out to cilium_host and set a flag in tc_index to indicate skipping the proxy. tc_index is preserved across veth boundaries.

  The mark is matched in an IP routing rule and causes packets from the proxies to use a different routing table. This allows routing all packets from proxies through cilium_host regardless of their destination. The BPF program attached to cilium_host can then perform the proxy reverse translation and route accordingly.
* Only skipping the ingress proxy if the packet is coming from the ingress proxy. If the packet is coming from an egress proxy, it may still need to go through the ingress proxy of the destination endpoint.
* Fixing the BPF program attached to cilium_host to recognize host IPs which are outside of the cluster prefix so it can route back into the host.
* Adding a test to tests/10-proxy.sh which covers both an egress and ingress proxy in the path.

Signed-off-by: Thomas Graf <thomas@cilium.io>
cilium · Sep 5, 2017 · f89a22c · f89a22c
1 parent 3ab2ea9
commit f89a22c
Show file tree

Hide file tree

Showing 10 changed files with 616 additions and 144 deletions.
diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c
@@ -801,11 +801,10 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
 	void *data_end = (void *) (long) skb->data_end;
 	struct ipv6hdr *ip6 = data + ETH_HLEN;
 	struct csum_offset csum_off = {};
-	union v6addr host_ip = {};
 	int ret, l4_off, verdict;
 	struct ct_state ct_state = {};
 	struct ct_state ct_state_new = {};
-	bool orig_was_proxy;
+	bool skip_proxy;
 	union v6addr orig_dip = {};
 
 	if (data + sizeof(struct ipv6hdr) + ETH_HLEN > data_end)
@@ -818,8 +817,9 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
 	ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr);
 	ipv6_addr_copy(&orig_dip, (union v6addr *) &ip6->daddr);
 
-	BPF_V6(host_ip, HOST_IP);
-	orig_was_proxy = ipv6_addrcmp((union v6addr *) &ip6->saddr, &host_ip) == 0;
+	/* If packet is coming from the egress proxy we have to skip
+	 * redirection to the egress proxy as we would loop forever. */
+	skip_proxy = tc_index_skip_proxy(skb);
 
 	l4_off = ETH_HLEN + ipv6_hdrlen(skb, ETH_HLEN, &tuple.nexthdr);
 	csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);
@@ -869,7 +869,7 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
 		ct_state_new.orig_dport = tuple.dport;
 		ct_state_new.src_sec_id = src_label;
 		ret = ct_create6(&CT_MAP6, &tuple, skb, CT_INGRESS, &ct_state_new,
-				 orig_was_proxy);
+				 skip_proxy);
 		if (IS_ERR(ret))
 			return ret;
 
@@ -879,6 +879,9 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
 	if (ct_state.proxy_port && (ret == CT_NEW || ret == CT_ESTABLISHED)) {
 		union macaddr host_mac = HOST_IFINDEX_MAC;
 		union macaddr router_mac = NODE_MAC;
+		union v6addr host_ip = {};
+
+		BPF_V6(host_ip, HOST_IP);
 
 		ret = ipv6_redirect_to_host_port(skb, &csum_off, l4_off,
 						 ct_state.proxy_port, tuple.dport,
@@ -909,7 +912,7 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
 	int ret, verdict, l4_off;
 	struct ct_state ct_state = {};
 	struct ct_state ct_state_new = {};
-	bool orig_was_proxy;
+	bool skip_proxy;
 	__be32 orig_dip;
 
 	if (data + sizeof(*ip4) + ETH_HLEN > data_end)
@@ -918,7 +921,10 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
 	policy_clear_mark(skb);
 	tuple.nexthdr = ip4->protocol;
 
-	orig_was_proxy = ip4->saddr == IPV4_GATEWAY;
+	/* If packet is coming from the egress proxy we have to skip
+	 * redirection to the egress proxy as we would loop forever. */
+	skip_proxy = tc_index_skip_proxy(skb);
+
 	tuple.daddr = ip4->daddr;
 	tuple.saddr = ip4->saddr;
 	orig_dip = ip4->daddr;
@@ -959,7 +965,7 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
 		ct_state_new.orig_dport = tuple.dport;
 		ct_state_new.src_sec_id = src_label;
 		ret = ct_create4(&CT_MAP4, &tuple, skb, CT_INGRESS, &ct_state_new,
-				 orig_was_proxy);
+				 skip_proxy);
 		if (IS_ERR(ret))
 			return ret;
 

diff --git a/bpf/bpf_netdev.c b/bpf/bpf_netdev.c
@@ -59,6 +59,7 @@ static inline __u32 derive_sec_ctx(struct __sk_buff *skb, const union v6addr *no
 #endif
 }
 
+#ifdef FROM_HOST
 static inline int __inline__
 reverse_proxy6(struct __sk_buff *skb, int l4_off, struct ipv6hdr *ip6, __u8 nh)
 {
@@ -110,21 +111,67 @@ reverse_proxy6(struct __sk_buff *skb, int l4_off, struct ipv6hdr *ip6, __u8 nh)
 			return DROP_CSUM_L4;
 	}
 
+	/* Packets which have been translated back from the proxy must
+	 * skip any potential ingress proxy at the endpoint
+	 */
+	skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
+
 	return 0;
 }
+#endif
+
+#ifdef FROM_HOST
+/*
+ * handle_identity_from_proxy - pick up the identity carried by proxy traffic
+ * @skb:      packet being processed
+ * @identity: in/out; overwritten only when the packet is recognised as
+ *            coming from a proxy, otherwise left untouched
+ *
+ * May set TC_INDEX_F_SKIP_PROXY in skb->tc_index and always clears
+ * skb->mark before returning.
+ */
+static inline void __inline__ handle_identity_from_proxy(struct __sk_buff *skb, __u32 *identity)
+{
+	int ret;
+
+	/* A non-zero result from mark_is_from_proxy() means the packet was
+	 * sent by a proxy; in that case the identity is taken from
+	 * skb->mark instead of the value the caller derived. */
+	if ((ret = mark_is_from_proxy(skb))) {
+		*identity = get_identity_via_proxy(skb);
+
+		/* Packets from the ingress proxy must skip the proxy when the
+		 * destination endpoint evaluates the policy, as the packet
+		 * would loop otherwise. Packets from the egress proxy do not
+		 * get the flag here, so an ingress proxy at the destination
+		 * endpoint can still be applied to them. */
+		if (ret == SOURCE_INGRESS_PROXY)
+			skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
+	}
+
+	/* Reset packet mark to avoid hitting routing rules again. This is
+	 * done unconditionally, also for packets not sent by a proxy. */
+	skb->mark = 0;
+}
+#endif
+
+#ifdef FROM_HOST
+/*
+ * rewrite_dmac_to_host - overwrite the destination MAC with CILIUM_NET_MAC
+ * @skb: packet being processed
+ *
+ * Returns TC_ACT_OK on success. If the store fails, returns the result of
+ * send_drop_notify_error().
+ *
+ * NOTE(review): send_drop_notify_error() is handed TC_ACT_OK as its last
+ * argument. If it returns that value, the IS_ERR(ret) checks in the callers
+ * will not see the failure -- confirm against its definition.
+ */
+static inline int rewrite_dmac_to_host(struct __sk_buff *skb)
+{
+	/* When attached to cilium_host, we rewrite the DMAC to the MAC of
+	 * cilium_net (the peer of cilium_host) to ensure the packet is
+	 * considered to be addressed to the host (PACKET_HOST) */
+	union macaddr cilium_net_mac = CILIUM_NET_MAC;
+
+	/* Rewrite to destination MAC of cilium_net (remote peer).
+	 * The store modifies packet data, so callers re-read skb->data and
+	 * skb->data_end afterwards. */
+	if (eth_store_daddr(skb, (__u8 *) &cilium_net_mac.addr, 0) < 0)
+		return send_drop_notify_error(skb, DROP_WRITE_ERROR, TC_ACT_OK);
+
+	return TC_ACT_OK;
+}
+#endif
 
 static inline int handle_ipv6(struct __sk_buff *skb)
 {
 	union v6addr node_ip = { };
-	void *data = (void *) (long) skb->data;
-	void *data_end = (void *) (long) skb->data_end;
-	struct ipv6hdr *ip6 = data + ETH_HLEN;
-	union v6addr *dst = (union v6addr *) &ip6->daddr;
+	void *data, *data_end;
+	struct ipv6hdr *ip6;
+	union v6addr *dst;
 	int l4_off, l3_off = ETH_HLEN;
 	struct endpoint_info *ep;
 	__u8 nexthdr;
 	__u32 flowlabel;
-	int ret;
+
+	data = (void *) (long) skb->data;
+	data_end = (void *) (long) skb->data_end;
+	ip6 = data + ETH_HLEN;
 
 	if (data + l3_off + sizeof(*ip6) > data_end)
 		return DROP_INVALID;
@@ -141,50 +188,61 @@ static inline int handle_ipv6(struct __sk_buff *skb)
 #endif
 
 	BPF_V6(node_ip, ROUTER_IP);
-
 	flowlabel = derive_sec_ctx(skb, &node_ip, ip6);
+
 #ifdef FROM_HOST
-	/* For packets from the host, the identity can be specified via skb->mark */
-	if (skb->mark) {
-		flowlabel = skb->mark;
-	}
-#endif
+	if (1) {
+		int ret;
+
+		handle_identity_from_proxy(skb, &flowlabel);
 
-	if (likely(ipv6_match_prefix_96(dst, &node_ip))) {
 		ret = reverse_proxy6(skb, l4_off, ip6, ip6->nexthdr);
+		/* DIRECT PACKET READ INVALID */
 		if (IS_ERR(ret))
 			return ret;
 
-		data = (void *) (long) skb->data;
-		data_end = (void *) (long) skb->data_end;
-		ip6 = data + ETH_HLEN;
-		if (data + sizeof(*ip6) + ETH_HLEN > data_end)
-			return DROP_INVALID;
-
-		/* Lookup IPv4 address in list of local endpoints */
-		if ((ep = lookup_ip6_endpoint(ip6)) != NULL) {
-			/* Let through packets to the node-ip so they are
-			 * processed by the local ip stack */
-			if (ep->flags & ENDPOINT_F_HOST)
-				return TC_ACT_OK;
-
-			return ipv6_local_delivery(skb, l3_off, l4_off, flowlabel, ip6, nexthdr, ep);
-		} else {
-#ifdef ENCAP_IFINDEX
-			struct endpoint_key key = {};
+		/* If we are attached to cilium_host at egress, this will
+		 * rewrite the destination mac address to the MAC of cilium_net */
+		ret = rewrite_dmac_to_host(skb);
+		/* DIRECT PACKET READ INVALID */
+		if (IS_ERR(ret))
+			return ret;
+	}
 
-			/* IPv6 lookup key: daddr/96 */
-			dst = (union v6addr *) &ip6->daddr;
-			key.ip6.p1 = dst->p1;
-			key.ip6.p2 = dst->p2;
-			key.ip6.p3 = dst->p3;
-			key.ip6.p4 = 0;
-			key.family = ENDPOINT_KEY_IPV6;
+	data = (void *) (long) skb->data;
+	data_end = (void *) (long) skb->data_end;
+	ip6 = data + ETH_HLEN;
 
-			return encap_and_redirect(skb, &key, flowlabel);
+	if (data + sizeof(*ip6) + ETH_HLEN > data_end)
+		return DROP_INVALID;
 #endif
-		}
+
+	/* Lookup IPv6 address in list of local endpoints */
+	if ((ep = lookup_ip6_endpoint(ip6)) != NULL) {
+		/* Let through packets to the node-ip so they are
+		 * processed by the local ip stack */
+		if (ep->flags & ENDPOINT_F_HOST)
+			return TC_ACT_OK;
+
+		return ipv6_local_delivery(skb, l3_off, l4_off, flowlabel, ip6, nexthdr, ep);
+	}
+
+#ifdef ENCAP_IFINDEX
+	dst = (union v6addr *) &ip6->daddr;
+	if (likely(ipv6_match_prefix_96(dst, &node_ip))) {
+		struct endpoint_key key = {};
+
+		/* IPv6 lookup key: daddr/96 */
+		dst = (union v6addr *) &ip6->daddr;
+		key.ip6.p1 = dst->p1;
+		key.ip6.p2 = dst->p2;
+		key.ip6.p3 = dst->p3;
+		key.ip6.p4 = 0;
+		key.family = ENDPOINT_KEY_IPV6;
+
+		return encap_and_redirect(skb, &key, flowlabel);
 	}
+#endif
 
 	return TC_ACT_OK;
 }
@@ -204,6 +262,7 @@ static inline __u32 derive_ipv4_sec_ctx(struct __sk_buff *skb, struct iphdr *ip4
 #endif
 }
 
+#ifdef FROM_HOST
 static inline int __inline__
 reverse_proxy(struct __sk_buff *skb, int l4_off, struct iphdr *ip4,
 	      struct ipv4_ct_tuple *tuple)
@@ -258,71 +317,85 @@ reverse_proxy(struct __sk_buff *skb, int l4_off, struct iphdr *ip4,
 	    csum_l4_replace(skb, l4_off, &csum, old_saddr, new_saddr, 4 | BPF_F_PSEUDO_HDR) < 0)
 		return DROP_CSUM_L4;
 
+	/* Packets which have been translated back from the proxy must
+	 * skip any potential ingress proxy at the endpoint
+	 */
+	skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
+
 	cilium_trace_capture(skb, DBG_CAPTURE_PROXY_POST, 0);
 
 	return 0;
 }
+#endif
 
 static inline int handle_ipv4(struct __sk_buff *skb)
 {
-	void *data = (void *) (long) skb->data;
-	void *data_end = (void *) (long) skb->data_end;
-	struct iphdr *ip4 = data + ETH_HLEN;
+	struct ipv4_ct_tuple tuple = {};
+	struct endpoint_info *ep;
+	void *data, *data_end;
+	struct iphdr *ip4;
+	int l4_off;
+	__u32 secctx;
+
+	data = (void *) (long) skb->data;
+	data_end = (void *) (long) skb->data_end;
+	ip4 = data + ETH_HLEN;
 
 	if (data + sizeof(*ip4) + ETH_HLEN > data_end)
 		return DROP_INVALID;
 
-#ifdef ENABLE_IPV4
-	/* Check if destination is within our cluster prefix */
-	if ((ip4->daddr & IPV4_CLUSTER_MASK) == IPV4_CLUSTER_RANGE) {
-		struct ipv4_ct_tuple tuple = {};
-		struct endpoint_info *ep;
-		__u32 secctx;
-		int ret, l4_off;
+	l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
+	secctx = derive_ipv4_sec_ctx(skb, ip4);
+	tuple.nexthdr = ip4->protocol;
 
-		l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
-		secctx = derive_ipv4_sec_ctx(skb, ip4);
 #ifdef FROM_HOST
-		if (skb->mark) {
-			/* For packets from the host, the identity can be specified via skb->mark */
-			secctx = skb->mark;
-		}
-#endif
-		tuple.nexthdr = ip4->protocol;
+	if (1) {
+		int ret;
 
-		cilium_trace(skb, DBG_NETDEV_IN_CLUSTER, secctx, 0);
+		handle_identity_from_proxy(skb, &secctx);
 
 		ret = reverse_proxy(skb, l4_off, ip4, &tuple);
 		/* DIRECT PACKET READ INVALID */
 		if (IS_ERR(ret))
 			return ret;
 
-		data = (void *) (long) skb->data;
-		data_end = (void *) (long) skb->data_end;
-		ip4 = data + ETH_HLEN;
-		if (data + sizeof(*ip4) + ETH_HLEN > data_end)
-			return DROP_INVALID;
-
-		/* Lookup IPv4 address in list of local endpoints */
-		if ((ep = lookup_ip4_endpoint(ip4)) != NULL) {
-			/* Let through packets to the node-ip so they are
-			 * processed by the local ip stack */
-			if (ep->flags & ENDPOINT_F_HOST)
-				return TC_ACT_OK;
-
-			return ipv4_local_delivery(skb, ETH_HLEN, l4_off, secctx, ip4, ep);
-		} else {
-#ifdef ENCAP_IFINDEX
-			/* IPv4 lookup key: daddr & IPV4_MASK */
-			struct endpoint_key key = {};
+		/* If we are attached to cilium_host at egress, this will
+		 * rewrite the destination mac address to the MAC of cilium_net */
+		ret = rewrite_dmac_to_host(skb);
+		/* DIRECT PACKET READ INVALID */
+		if (IS_ERR(ret))
+			return ret;
+	}
 
-			key.ip4 = ip4->daddr & IPV4_MASK;
-			key.family = ENDPOINT_KEY_IPV4;
+	data = (void *) (long) skb->data;
+	data_end = (void *) (long) skb->data_end;
+	ip4 = data + ETH_HLEN;
 
-			cilium_trace(skb, DBG_NETDEV_ENCAP4, key.ip4, secctx);
-			return encap_and_redirect(skb, &key, secctx);
+	if (data + sizeof(*ip4) + ETH_HLEN > data_end)
+		return DROP_INVALID;
 #endif
-		}
+
+	/* Lookup IPv4 address in list of local endpoints and host IPs */
+	if ((ep = lookup_ip4_endpoint(ip4)) != NULL) {
+		/* Let through packets to the node-ip so they are
+		 * processed by the local ip stack */
+		if (ep->flags & ENDPOINT_F_HOST)
+			return TC_ACT_OK;
+
+		return ipv4_local_delivery(skb, ETH_HLEN, l4_off, secctx, ip4, ep);
+	}
+
+#ifdef ENCAP_IFINDEX
+	/* Check if destination is within our cluster prefix */
+	if ((ip4->daddr & IPV4_CLUSTER_MASK) == IPV4_CLUSTER_RANGE) {
+		/* IPv4 lookup key: daddr & IPV4_MASK */
+		struct endpoint_key key = {};
+
+		key.ip4 = ip4->daddr & IPV4_MASK;
+		key.family = ENDPOINT_KEY_IPV4;
+
+		cilium_trace(skb, DBG_NETDEV_ENCAP4, key.ip4, secctx);
+		return encap_and_redirect(skb, &key, secctx);
 	}
 #endif