Skip to content

Commit

Permalink
proxy: Distinguish between proxy and other local processes
Browse files Browse the repository at this point in the history
The datapath used a simple sip == HOST_IP to detect local proxy traffic
to bypass the proxy on the second pass through. This logic is flawed and
causes the proxy to be bypassed in the following legitimate situations:
 * local process uses HOST_IP as source and talks to local endpoint
 * both egress and ingress proxy are injected between two local
   endpoints

This commit fixes this situation by:

 * Introducing a new custom dialer for the proxy. This is required
   because net.Dial() does not allow setting a socket option between
   creating the socket and calling connect(). Access to the socket
   is required to set the SO_MARK before the connect() to ensure
   that the first SYN packet contains the proper SO_MARK.

 * Replacing net.Listen() with custom code so we can set the SO_MARK
   before calling syscall.Listen() to ensure that all child sockets
   inherit the SO_MARK. This ensures that even the SYN+ACK generated
   by the kernel will have proper packet markings.

 * Extending the SO_MARK to contain a magic marker in the lower 12 bits
   of skb->mark (0xFEA for ingress proxy, 0xFEB for egress proxy). This
   allows detecting packets from the proxy by matching against the
   magic marker. The identity is moved to the upper 16 bits of skb->mark.

   The skb->mark is cleared on veth traversal so we match against the
   magic marker at egress on the way out to cilium_host and set a flag
   in tc_index to indicate skipping the proxy. tc_index is preserved
   across veth boundaries.

   The mark is matched in an IP routing rule and causes packets from the
   proxies to use a different routing table. This allows routing all
   packets from proxies through cilium_host regardless of their
   destination. The BPF program attached to cilium_host can then perform
   the proxy reverse translation and route accordingly.

 * Only skipping the ingress proxy if the packet is coming from the
   ingress proxy. If the packet is coming from an egress proxy,
   it may still need to go through the ingress proxy of the destination
   endpoint.

 * Fixing the BPF program attached to cilium_host to recognize host IPs
   which are outside of the cluster prefix so it can route back into the
   host.

 * Adding a test to tests/10-proxy.sh which covers both an egress and
   ingress proxy in the path.

Signed-off-by: Thomas Graf <thomas@cilium.io>
  • Loading branch information
tgraf committed Sep 5, 2017
1 parent 3ab2ea9 commit f89a22c
Show file tree
Hide file tree
Showing 10 changed files with 616 additions and 144 deletions.
22 changes: 14 additions & 8 deletions bpf/bpf_lxc.c
Expand Up @@ -801,11 +801,10 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
void *data_end = (void *) (long) skb->data_end;
struct ipv6hdr *ip6 = data + ETH_HLEN;
struct csum_offset csum_off = {};
union v6addr host_ip = {};
int ret, l4_off, verdict;
struct ct_state ct_state = {};
struct ct_state ct_state_new = {};
bool orig_was_proxy;
bool skip_proxy;
union v6addr orig_dip = {};

if (data + sizeof(struct ipv6hdr) + ETH_HLEN > data_end)
Expand All @@ -818,8 +817,9 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
ipv6_addr_copy(&tuple.saddr, (union v6addr *) &ip6->saddr);
ipv6_addr_copy(&orig_dip, (union v6addr *) &ip6->daddr);

BPF_V6(host_ip, HOST_IP);
orig_was_proxy = ipv6_addrcmp((union v6addr *) &ip6->saddr, &host_ip) == 0;
/* If packet is coming from the egress proxy we have to skip
* redirection to the egress proxy as we would loop forever. */
skip_proxy = tc_index_skip_proxy(skb);

l4_off = ETH_HLEN + ipv6_hdrlen(skb, ETH_HLEN, &tuple.nexthdr);
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);
Expand Down Expand Up @@ -869,7 +869,7 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ret = ct_create6(&CT_MAP6, &tuple, skb, CT_INGRESS, &ct_state_new,
orig_was_proxy);
skip_proxy);
if (IS_ERR(ret))
return ret;

Expand All @@ -879,6 +879,9 @@ static inline int __inline__ ipv6_policy(struct __sk_buff *skb, int ifindex, __u
if (ct_state.proxy_port && (ret == CT_NEW || ret == CT_ESTABLISHED)) {
union macaddr host_mac = HOST_IFINDEX_MAC;
union macaddr router_mac = NODE_MAC;
union v6addr host_ip = {};

BPF_V6(host_ip, HOST_IP);

ret = ipv6_redirect_to_host_port(skb, &csum_off, l4_off,
ct_state.proxy_port, tuple.dport,
Expand Down Expand Up @@ -909,7 +912,7 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
int ret, verdict, l4_off;
struct ct_state ct_state = {};
struct ct_state ct_state_new = {};
bool orig_was_proxy;
bool skip_proxy;
__be32 orig_dip;

if (data + sizeof(*ip4) + ETH_HLEN > data_end)
Expand All @@ -918,7 +921,10 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
policy_clear_mark(skb);
tuple.nexthdr = ip4->protocol;

orig_was_proxy = ip4->saddr == IPV4_GATEWAY;
/* If packet is coming from the egress proxy we have to skip
* redirection to the egress proxy as we would loop forever. */
skip_proxy = tc_index_skip_proxy(skb);

tuple.daddr = ip4->daddr;
tuple.saddr = ip4->saddr;
orig_dip = ip4->daddr;
Expand Down Expand Up @@ -959,7 +965,7 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ret = ct_create4(&CT_MAP4, &tuple, skb, CT_INGRESS, &ct_state_new,
orig_was_proxy);
skip_proxy);
if (IS_ERR(ret))
return ret;

Expand Down
233 changes: 153 additions & 80 deletions bpf/bpf_netdev.c
Expand Up @@ -59,6 +59,7 @@ static inline __u32 derive_sec_ctx(struct __sk_buff *skb, const union v6addr *no
#endif
}

#ifdef FROM_HOST
static inline int __inline__
reverse_proxy6(struct __sk_buff *skb, int l4_off, struct ipv6hdr *ip6, __u8 nh)
{
Expand Down Expand Up @@ -110,21 +111,67 @@ reverse_proxy6(struct __sk_buff *skb, int l4_off, struct ipv6hdr *ip6, __u8 nh)
return DROP_CSUM_L4;
}

/* Packets which have been translated back from the proxy must
* skip any potential ingress proxy at the endpoint
*/
skb->tc_index |= TC_INDEX_F_SKIP_PROXY;

return 0;
}
#endif

#ifdef FROM_HOST
/*
 * handle_identity_from_proxy - take over the identity carried in skb->mark
 * @skb:      packet being processed
 * @identity: in/out; overwritten with get_identity_via_proxy(skb) only when
 *            mark_is_from_proxy(skb) returns a non-zero value, otherwise
 *            left untouched
 *
 * When the mark identifies the ingress proxy as the origin
 * (SOURCE_INGRESS_PROXY), TC_INDEX_F_SKIP_PROXY is additionally set in
 * skb->tc_index so that the destination endpoint's policy evaluation does
 * not redirect the packet to the proxy again, which would loop.
 *
 * skb->mark is always cleared before returning, whether or not the packet
 * came from a proxy.
 */
static inline void __inline__ handle_identity_from_proxy(struct __sk_buff *skb, __u32 *identity)
{
	/* Non-zero when the mark says the packet was emitted by a proxy */
	int proxy_source = mark_is_from_proxy(skb);

	if (proxy_source != 0) {
		/* Proxy traffic carries the identity inside skb->mark */
		*identity = get_identity_via_proxy(skb);

		/* Only traffic from the ingress proxy must bypass the proxy
		 * at the destination endpoint; otherwise it would loop. */
		if (proxy_source == SOURCE_INGRESS_PROXY)
			skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
	}

	/* Clear the mark so the packet does not hit the routing rules again */
	skb->mark = 0;
}
#endif

#ifdef FROM_HOST
static inline int rewrite_dmac_to_host(struct __sk_buff *skb)
{
/* When attached to cilium_host, we rewrite the DMAC to the mac of
* cilium_host (peer) to ensure the packet is being considered to be
* addressed to the host (PACKET_HOST) */
union macaddr cilium_net_mac = CILIUM_NET_MAC;

/* Rewrite to destination MAC of cilium_net (remote peer) */
if (eth_store_daddr(skb, (__u8 *) &cilium_net_mac.addr, 0) < 0)
return send_drop_notify_error(skb, DROP_WRITE_ERROR, TC_ACT_OK);

return TC_ACT_OK;
}
#endif

static inline int handle_ipv6(struct __sk_buff *skb)
{
union v6addr node_ip = { };
void *data = (void *) (long) skb->data;
void *data_end = (void *) (long) skb->data_end;
struct ipv6hdr *ip6 = data + ETH_HLEN;
union v6addr *dst = (union v6addr *) &ip6->daddr;
void *data, *data_end;
struct ipv6hdr *ip6;
union v6addr *dst;
int l4_off, l3_off = ETH_HLEN;
struct endpoint_info *ep;
__u8 nexthdr;
__u32 flowlabel;
int ret;

data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip6 = data + ETH_HLEN;

if (data + l3_off + sizeof(*ip6) > data_end)
return DROP_INVALID;
Expand All @@ -141,50 +188,61 @@ static inline int handle_ipv6(struct __sk_buff *skb)
#endif

BPF_V6(node_ip, ROUTER_IP);

flowlabel = derive_sec_ctx(skb, &node_ip, ip6);

#ifdef FROM_HOST
/* For packets from the host, the identity can be specified via skb->mark */
if (skb->mark) {
flowlabel = skb->mark;
}
#endif
if (1) {
int ret;

handle_identity_from_proxy(skb, &flowlabel);

if (likely(ipv6_match_prefix_96(dst, &node_ip))) {
ret = reverse_proxy6(skb, l4_off, ip6, ip6->nexthdr);
/* DIRECT PACKET READ INVALID */
if (IS_ERR(ret))
return ret;

data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip6 = data + ETH_HLEN;
if (data + sizeof(*ip6) + ETH_HLEN > data_end)
return DROP_INVALID;

/* Lookup IPv4 address in list of local endpoints */
if ((ep = lookup_ip6_endpoint(ip6)) != NULL) {
/* Let through packets to the node-ip so they are
* processed by the local ip stack */
if (ep->flags & ENDPOINT_F_HOST)
return TC_ACT_OK;

return ipv6_local_delivery(skb, l3_off, l4_off, flowlabel, ip6, nexthdr, ep);
} else {
#ifdef ENCAP_IFINDEX
struct endpoint_key key = {};
/* If we are attached to cilium_host at egress, this will
* rewrite the destination mac address to the MAC of cilium_net */
ret = rewrite_dmac_to_host(skb);
/* DIRECT PACKET READ INVALID */
if (IS_ERR(ret))
return ret;
}

/* IPv6 lookup key: daddr/96 */
dst = (union v6addr *) &ip6->daddr;
key.ip6.p1 = dst->p1;
key.ip6.p2 = dst->p2;
key.ip6.p3 = dst->p3;
key.ip6.p4 = 0;
key.family = ENDPOINT_KEY_IPV6;
data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip6 = data + ETH_HLEN;

return encap_and_redirect(skb, &key, flowlabel);
if (data + sizeof(*ip6) + ETH_HLEN > data_end)
return DROP_INVALID;
#endif
}

/* Lookup IPv6 address in list of local endpoints */
if ((ep = lookup_ip6_endpoint(ip6)) != NULL) {
/* Let through packets to the node-ip so they are
* processed by the local ip stack */
if (ep->flags & ENDPOINT_F_HOST)
return TC_ACT_OK;

return ipv6_local_delivery(skb, l3_off, l4_off, flowlabel, ip6, nexthdr, ep);
}

#ifdef ENCAP_IFINDEX
dst = (union v6addr *) &ip6->daddr;
if (likely(ipv6_match_prefix_96(dst, &node_ip))) {
struct endpoint_key key = {};

/* IPv6 lookup key: daddr/96 */
dst = (union v6addr *) &ip6->daddr;
key.ip6.p1 = dst->p1;
key.ip6.p2 = dst->p2;
key.ip6.p3 = dst->p3;
key.ip6.p4 = 0;
key.family = ENDPOINT_KEY_IPV6;

return encap_and_redirect(skb, &key, flowlabel);
}
#endif

return TC_ACT_OK;
}
Expand All @@ -204,6 +262,7 @@ static inline __u32 derive_ipv4_sec_ctx(struct __sk_buff *skb, struct iphdr *ip4
#endif
}

#ifdef FROM_HOST
static inline int __inline__
reverse_proxy(struct __sk_buff *skb, int l4_off, struct iphdr *ip4,
struct ipv4_ct_tuple *tuple)
Expand Down Expand Up @@ -258,71 +317,85 @@ reverse_proxy(struct __sk_buff *skb, int l4_off, struct iphdr *ip4,
csum_l4_replace(skb, l4_off, &csum, old_saddr, new_saddr, 4 | BPF_F_PSEUDO_HDR) < 0)
return DROP_CSUM_L4;

/* Packets which have been translated back from the proxy must
* skip any potential ingress proxy at the endpoint
*/
skb->tc_index |= TC_INDEX_F_SKIP_PROXY;

cilium_trace_capture(skb, DBG_CAPTURE_PROXY_POST, 0);

return 0;
}
#endif

static inline int handle_ipv4(struct __sk_buff *skb)
{
void *data = (void *) (long) skb->data;
void *data_end = (void *) (long) skb->data_end;
struct iphdr *ip4 = data + ETH_HLEN;
struct ipv4_ct_tuple tuple = {};
struct endpoint_info *ep;
void *data, *data_end;
struct iphdr *ip4;
int l4_off;
__u32 secctx;

data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip4 = data + ETH_HLEN;

if (data + sizeof(*ip4) + ETH_HLEN > data_end)
return DROP_INVALID;

#ifdef ENABLE_IPV4
/* Check if destination is within our cluster prefix */
if ((ip4->daddr & IPV4_CLUSTER_MASK) == IPV4_CLUSTER_RANGE) {
struct ipv4_ct_tuple tuple = {};
struct endpoint_info *ep;
__u32 secctx;
int ret, l4_off;
l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
secctx = derive_ipv4_sec_ctx(skb, ip4);
tuple.nexthdr = ip4->protocol;

l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
secctx = derive_ipv4_sec_ctx(skb, ip4);
#ifdef FROM_HOST
if (skb->mark) {
/* For packets from the host, the identity can be specified via skb->mark */
secctx = skb->mark;
}
#endif
tuple.nexthdr = ip4->protocol;
if (1) {
int ret;

cilium_trace(skb, DBG_NETDEV_IN_CLUSTER, secctx, 0);
handle_identity_from_proxy(skb, &secctx);

ret = reverse_proxy(skb, l4_off, ip4, &tuple);
/* DIRECT PACKET READ INVALID */
if (IS_ERR(ret))
return ret;

data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip4 = data + ETH_HLEN;
if (data + sizeof(*ip4) + ETH_HLEN > data_end)
return DROP_INVALID;

/* Lookup IPv4 address in list of local endpoints */
if ((ep = lookup_ip4_endpoint(ip4)) != NULL) {
/* Let through packets to the node-ip so they are
* processed by the local ip stack */
if (ep->flags & ENDPOINT_F_HOST)
return TC_ACT_OK;

return ipv4_local_delivery(skb, ETH_HLEN, l4_off, secctx, ip4, ep);
} else {
#ifdef ENCAP_IFINDEX
/* IPv4 lookup key: daddr & IPV4_MASK */
struct endpoint_key key = {};
/* If we are attached to cilium_host at egress, this will
* rewrite the destination mac address to the MAC of cilium_net */
ret = rewrite_dmac_to_host(skb);
/* DIRECT PACKET READ INVALID */
if (IS_ERR(ret))
return ret;
}

key.ip4 = ip4->daddr & IPV4_MASK;
key.family = ENDPOINT_KEY_IPV4;
data = (void *) (long) skb->data;
data_end = (void *) (long) skb->data_end;
ip4 = data + ETH_HLEN;

cilium_trace(skb, DBG_NETDEV_ENCAP4, key.ip4, secctx);
return encap_and_redirect(skb, &key, secctx);
if (data + sizeof(*ip4) + ETH_HLEN > data_end)
return DROP_INVALID;
#endif
}

/* Lookup IPv4 address in list of local endpoints and host IPs */
if ((ep = lookup_ip4_endpoint(ip4)) != NULL) {
/* Let through packets to the node-ip so they are
* processed by the local ip stack */
if (ep->flags & ENDPOINT_F_HOST)
return TC_ACT_OK;

return ipv4_local_delivery(skb, ETH_HLEN, l4_off, secctx, ip4, ep);
}

#ifdef ENCAP_IFINDEX
/* Check if destination is within our cluster prefix */
if ((ip4->daddr & IPV4_CLUSTER_MASK) == IPV4_CLUSTER_RANGE) {
/* IPv4 lookup key: daddr & IPV4_MASK */
struct endpoint_key key = {};

key.ip4 = ip4->daddr & IPV4_MASK;
key.family = ENDPOINT_KEY_IPV4;

cilium_trace(skb, DBG_NETDEV_ENCAP4, key.ip4, secctx);
return encap_and_redirect(skb, &key, secctx);
}
#endif

Expand Down

0 comments on commit f89a22c

Please sign in to comment.