Skip to content

Commit

Permalink
bpf: Add to-container section to bpf_lxc
Browse files Browse the repository at this point in the history
This makes the datapath logic previously available via cilium_host also
available via an egress hook on the host-facing veth peer. The section is
called to-container and provides the following capabilities:

 * ingress policy
 * LB reverse NAT

This enables the Linux routing layer to route directly into a veth pair or
similar device, which then forwards into another network namespace.

Signed-off-by: Thomas Graf <thomas@cilium.io>
  • Loading branch information
tgraf committed May 9, 2019
1 parent e0baae7 commit 25a80df
Show file tree
Hide file tree
Showing 10 changed files with 299 additions and 46 deletions.
2 changes: 1 addition & 1 deletion bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ include ../Makefile.defs

SUBDIRS = sockops

BPF_SIMPLE = bpf_netdev.o bpf_overlay.o bpf_xdp.o bpf_ipsec.o bpf_network.o bpf_alignchecker.o
# List object (.o) targets here, not sources, like the other entries and $(BPF).
BPF_SIMPLE = bpf_netdev.o bpf_overlay.o bpf_xdp.o bpf_ipsec.o bpf_network.o bpf_alignchecker.o bpf_hostdev_ingress.o
BPF = bpf_lxc.o bpf_lb.o bpf_sock.o $(BPF_SIMPLE)
SCRIPTS = init.sh join_ep.sh run_probes.sh spawn_netns.sh

Expand Down
41 changes: 41 additions & 0 deletions bpf/bpf_hostdev_ingress.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (C) 2019 Authors of Cilium
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/if_packet.h>

#include <node_config.h>
#include <netdev_config.h>
#include <bpf/api.h>

#include <stdint.h>
#include <stdio.h>

#include "lib/common.h"

/*
 * "to-host" section: if the low 16 bits of skb->cb[0] equal
 * MARK_MAGIC_TO_PROXY, copy the whole cb[0] word into skb->mark and zero
 * cb[0]. The packet is always passed on with TC_ACT_OK.
 */
__section("to-host")
int to_host(struct __sk_buff *skb)
{
	__u32 cb0 = skb->cb[0];

	/*
	 * The upper 16 bits of cb[0] may carry a proxy port number; mask them
	 * off before comparing against the magic value.
	 */
	if ((cb0 & 0xFFFF) != MARK_MAGIC_TO_PROXY)
		return TC_ACT_OK;

	/* The mark receives the unmasked word, upper 16 bits included. */
	skb->mark = cb0;
	skb->cb[0] = 0;

	return TC_ACT_OK;
}

BPF_LICENSE("GPL");
197 changes: 189 additions & 8 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
return TC_ACT_OK;
}

declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV6_TO_LXC)
declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY)
int tail_ipv6_policy(struct __sk_buff *skb)
{
int ret, ifindex = skb->cb[CB_IFINDEX];
Expand All @@ -822,11 +822,73 @@ int tail_ipv6_policy(struct __sk_buff *skb)
skb->cb[0] = skb->mark; // essential for proxy ingress, see bpf_ipsec.c
return ret;
}

/*
 * Tail call target (CILIUM_CALL_IPV6_TO_ENDPOINT) for IPv6 packets headed to
 * this endpoint.
 *
 * Takes the source identity from skb->cb[CB_SRC_LABEL], optionally refines it
 * through the ipcache, then runs ipv6_policy(). Any error verdict is reported
 * through send_drop_notify(); otherwise the policy verdict is returned.
 */
declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV6_TO_ENDPOINT)
int tail_ipv6_to_endpoint(struct __sk_buff *skb)
{
	__u32 src_identity = skb->cb[CB_SRC_LABEL];
	struct ep_config *cfg = lookup_ep_config();
	void *data, *data_end;
	struct ipv6hdr *ip6;
	int forwarding_reason;
	int ret;

	if (!revalidate_data(skb, &data, &data_end, &ip6)) {
		ret = DROP_INVALID;
		goto out;
	}

	if (!cfg) {
		ret = DROP_NO_CONFIG;
		goto out;
	}

	/* A packet coming from the proxy already carries a real identity. */
	if (identity_is_reserved(src_identity)) {
		union v6addr *saddr = (union v6addr *) &ip6->saddr;
		struct remote_endpoint_info *remote;

		remote = ipcache_lookup6(&IPCACHE_MAP, saddr, V6_CACHE_KEY_LEN);
		if (remote != NULL) {
			__u32 label = remote->sec_label;

			/*
			 * With SNAT enabled on traffic ingressing into Cilium,
			 * traffic from the world shows up with the host's
			 * source IP. It is only really from the host when the
			 * incoming src_identity already says so, hence an
			 * ipcache answer of HOST_ID is ignored here.
			 */
			if (label && label != HOST_ID)
				src_identity = label;
		}

		cilium_dbg(skb, remote ? DBG_IP_ID_MAP_SUCCEED6 : DBG_IP_ID_MAP_FAILED6,
			   ((__u32 *) saddr)[3], src_identity);
	}

	cilium_dbg(skb, DBG_LOCAL_DELIVERY, LXC_ID, SECLABEL);

#ifdef LOCAL_DELIVERY_METRICS
	update_metrics(skb->len, METRIC_INGRESS, REASON_FORWARDED);
#endif

	/* The label slot is consumed; reset it before running the policy. */
	skb->cb[CB_SRC_LABEL] = 0;
	ret = ipv6_policy(skb, 0, src_identity, &forwarding_reason, cfg);

out:
	if (IS_ERR(ret))
		return send_drop_notify(skb, src_identity, SECLABEL, LXC_ID,
					ret, TC_ACT_SHOT, METRIC_INGRESS);

	return ret;
}

#endif /* ENABLE_IPV6 */

#ifdef ENABLE_IPV4
static inline int __inline__
ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding_reason, struct ep_config *cfg)
ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding_reason, struct ep_config *cfg, __u16 *proxy_port)
{
struct ipv4_ct_tuple tuple = {};
void *data, *data_end;
Expand Down Expand Up @@ -919,10 +981,11 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
return DROP_INVALID;

if (redirect_to_proxy(verdict, *forwarding_reason)) {
*proxy_port = verdict;
// Trace the packet before its forwarded to proxy
send_trace_notify(skb, TRACE_TO_PROXY, src_label, SECLABEL,
0, ifindex, *forwarding_reason, monitor);
return skb_redirect_to_proxy(skb, verdict);
return TC_ACT_OK;
} else { // Not redirected to host / proxy.
send_trace_notify(skb, TRACE_TO_LXC, src_label, SECLABEL,
LXC_ID, ifindex, *forwarding_reason, monitor);
Expand All @@ -935,26 +998,96 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
return TC_ACT_OK;
}

/*
 * Tail call entry that applies this endpoint's IPv4 ingress policy.
 *
 * Reads the ingress ifindex and the source label from skb->cb[], clears the
 * source-label slot and runs ipv4_policy(). An error verdict is reported via
 * send_drop_notify(). If ipv4_policy() stored a non-zero proxy port, the
 * packet is handed to skb_redirect_to_proxy() and that result is returned.
 *
 * NOTE(review): the two consecutive declare_tailcall_if() lines and the two
 * consecutive ipv4_policy() calls below look like the removed and added sides
 * of a diff rendered without +/- markers; presumably only the second of each
 * pair belongs in the final source -- confirm against the repository.
 */
declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV4_TO_LXC)
declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY)
int tail_ipv4_policy(struct __sk_buff *skb)
{
struct ep_config *cfg = lookup_ep_config();
int ret, ifindex = skb->cb[CB_IFINDEX];
__u32 src_label = skb->cb[CB_SRC_LABEL];
int forwarding_reason = 0;
/* Stays 0 unless ipv4_policy() decides on a proxy redirect. */
__u16 proxy_port = 0;

/* The label has been copied to src_label; reset the cb slot. */
skb->cb[CB_SRC_LABEL] = 0;
if (cfg)
ret = ipv4_policy(skb, ifindex, src_label, &forwarding_reason, cfg);
ret = ipv4_policy(skb, ifindex, src_label, &forwarding_reason, cfg, &proxy_port);
else
ret = DROP_NO_CONFIG;
if (IS_ERR(ret))
return send_drop_notify(skb, src_label, SECLABEL, LXC_ID,
ret, TC_ACT_SHOT, METRIC_INGRESS);

/* The redirect is performed here by the caller, using the reported port. */
if (proxy_port != 0) {
ret = skb_redirect_to_proxy(skb, proxy_port);
}

skb->cb[0] = skb->mark; // essential for proxy ingress, see bpf_ipsec.c
return ret;
}

/*
 * Tail call target (CILIUM_CALL_IPV4_TO_ENDPOINT) for IPv4 packets headed to
 * this endpoint.
 *
 * Takes the source identity from skb->cb[CB_SRC_LABEL], optionally refines it
 * through the ipcache, then runs ipv4_policy(). When the policy reports a
 * proxy port, the packet is passed to skb_redirect_to_proxy_hairpin(). Any
 * error verdict is reported through send_drop_notify().
 */
declare_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)), CILIUM_CALL_IPV4_TO_ENDPOINT)
int tail_ipv4_to_endpoint(struct __sk_buff *skb)
{
	__u32 src_identity = skb->cb[CB_SRC_LABEL];
	struct ep_config *cfg = lookup_ep_config();
	void *data, *data_end;
	struct iphdr *ip4;
	__u16 proxy_port = 0;
	int forwarding_reason;
	int ret;

	if (!revalidate_data(skb, &data, &data_end, &ip4)) {
		ret = DROP_INVALID;
		goto out;
	}

	if (!cfg) {
		ret = DROP_NO_CONFIG;
		goto out;
	}

	/* A packet coming from the proxy already carries a real identity. */
	if (identity_is_reserved(src_identity)) {
		struct remote_endpoint_info *remote;

		remote = ipcache_lookup4(&IPCACHE_MAP, ip4->saddr, V4_CACHE_KEY_LEN);
		if (remote != NULL) {
			__u32 label = remote->sec_label;

			/*
			 * With SNAT enabled on traffic ingressing into Cilium,
			 * traffic from the world shows up with the host's
			 * source IP. It is only really from the host when the
			 * incoming src_identity already says so, hence an
			 * ipcache answer of HOST_ID is ignored here.
			 */
			if (label && label != HOST_ID)
				src_identity = label;
		}

		cilium_dbg(skb, remote ? DBG_IP_ID_MAP_SUCCEED4 : DBG_IP_ID_MAP_FAILED4,
			   ip4->saddr, src_identity);
	}

	cilium_dbg(skb, DBG_LOCAL_DELIVERY, LXC_ID, SECLABEL);

#ifdef LOCAL_DELIVERY_METRICS
	update_metrics(skb->len, METRIC_INGRESS, REASON_FORWARDED);
#endif

	/* The label slot is consumed; reset it before running the policy. */
	skb->cb[CB_SRC_LABEL] = 0;
	ret = ipv4_policy(skb, 0, src_identity, &forwarding_reason, cfg, &proxy_port);

	/* A non-zero port means the policy asked for a proxy redirect. */
	if (proxy_port != 0)
		ret = skb_redirect_to_proxy_hairpin(skb, proxy_port);

out:
	if (IS_ERR(ret))
		return send_drop_notify(skb, src_identity, SECLABEL, LXC_ID,
					ret, TC_ACT_SHOT, METRIC_INGRESS);

	return ret;
}
#endif /* ENABLE_IPV4 */

/* Handle policy decisions as the packet makes its way towards the endpoint.
Expand All @@ -979,13 +1112,13 @@ __section_tail(CILIUM_MAP_POLICY, TEMPLATE_LXC_ID) int handle_policy(struct __sk
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
invoke_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)),
CILIUM_CALL_IPV6_TO_LXC, tail_ipv6_policy);
CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY, tail_ipv6_policy);
break;
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
invoke_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)),
CILIUM_CALL_IPV4_TO_LXC, tail_ipv4_policy);
CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY, tail_ipv4_policy);
break;
#endif /* ENABLE_IPV4 */
default:
Expand Down Expand Up @@ -1043,8 +1176,56 @@ __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_NAT46) int tail_ipv4_to_ipv6(struct
cilium_dbg_capture(skb, DBG_CAPTURE_AFTER_V46, skb->ingress_ifindex);

invoke_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)),
CILIUM_CALL_IPV6_TO_LXC, tail_ipv6_policy);
CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY, tail_ipv6_policy);
return ret;
}
#endif
BPF_LICENSE("GPL");

/*
 * "to-container" section entry point.
 *
 * Validates the ethertype, resets the skb control buffer, derives the source
 * identity via inherit_identity_from_host(), emits a trace notification and
 * then dispatches on the L3 protocol to the IPv6 / IPv4 "to endpoint" tail
 * calls. Unsupported L2 or unknown L3 protocols, and any other error code
 * left in ret, are reported through send_drop_notify().
 */
__section("to-container")
int handle_to_container(struct __sk_buff *skb)
{
int ret, trace = TRACE_FROM_STACK;
__u32 identity = 0;
__u16 proto;

/* proto is passed by pointer; presumably filled in on success -- confirm. */
if (!validate_ethertype(skb, &proto)) {
ret = DROP_UNSUPPORTED_L2;
goto out;
}

bpf_clear_cb(skb);

/* A true return switches the trace point to TRACE_FROM_PROXY. */
if (inherit_identity_from_host(skb, &identity))
trace = TRACE_FROM_PROXY;

send_trace_notify(skb, trace, identity, 0, 0,
skb->ingress_ifindex, 0, TRACE_PAYLOAD_LEN);

/* Hand the identity to the tail call, which reads cb[CB_SRC_LABEL]. */
skb->cb[CB_SRC_LABEL] = identity;

switch (proto) {
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
/*
 * NOTE(review): ret is not assigned on this path in the visible code;
 * presumably invoke_tailcall_if() either returns or sets ret when the
 * tail call does not happen -- confirm against the macro definition.
 */
invoke_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)),
CILIUM_CALL_IPV6_TO_ENDPOINT, tail_ipv6_to_endpoint);
break;
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
/* NOTE(review): same assumption about ret as in the IPv6 case. */
invoke_tailcall_if(__and(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6)),
CILIUM_CALL_IPV4_TO_ENDPOINT, tail_ipv4_to_endpoint);
break;
#endif /* ENABLE_IPV4 */
default:
ret = DROP_UNKNOWN_L3;
break;
}

out:
/* Error codes become a drop notification with TC_ACT_SHOT. */
if (IS_ERR(ret))
return send_drop_notify(skb, identity, SECLABEL, LXC_ID,
ret, TC_ACT_SHOT, METRIC_INGRESS);

return ret;
}
31 changes: 1 addition & 30 deletions bpf/bpf_netdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,35 +386,6 @@ __section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_LXC) int tail_handle_ipv4

#endif /* ENABLE_IPV4 */

#ifdef FROM_HOST
/*
 * Derive the source identity of a packet from the magic value in skb->mark.
 *
 * Writes the identity to *identity, resets skb->mark to 0 and returns true
 * when the mark carried one of the two proxy magic values.
 */
static inline bool __inline__ handle_identity_from_host(struct __sk_buff *skb, __u32 *identity)
{
	__u32 magic = skb->mark & MARK_MAGIC_HOST_MASK;
	bool from_proxy = magic == MARK_MAGIC_PROXY_INGRESS ||
			  magic == MARK_MAGIC_PROXY_EGRESS;

	if (from_proxy) {
		*identity = get_identity(skb);

		/*
		 * Packets from the ingress proxy must skip the proxy when
		 * the destination endpoint evaluates the policy, otherwise
		 * the packet would loop.
		 */
		if (magic == MARK_MAGIC_PROXY_INGRESS)
			skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
	} else {
		*identity = (magic == MARK_MAGIC_HOST) ? HOST_ID : WORLD_ID;
	}

	/* Reset the packet mark so routing rules are not hit again. */
	skb->mark = 0;

	return from_proxy;
}
#endif

static __always_inline int do_netdev(struct __sk_buff *skb, __u16 proto)
{
__u32 identity = 0;
Expand Down Expand Up @@ -454,7 +425,7 @@ static __always_inline int do_netdev(struct __sk_buff *skb, __u16 proto)
int trace = TRACE_FROM_HOST;
bool from_proxy;

from_proxy = handle_identity_from_host(skb, &identity);
from_proxy = inherit_identity_from_host(skb, &identity);
if (from_proxy)
trace = TRACE_FROM_PROXY;
send_trace_notify(skb, trace, identity, 0, 0,
Expand Down
1 change: 1 addition & 0 deletions bpf/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ if [ "$MODE" == "ipvlan" ]; then
COPTS+=" -DENABLE_EXTRA_HOST_DEV"
fi
bpf_load $HOST_DEV1 "$COPTS" "egress" bpf_netdev.c bpf_host.o from-netdev $CALLS_MAP
bpf_load $HOST_DEV1 "" "ingress" bpf_hostdev_ingress.c bpf_hostdev_ingress.o to-host $CALLS_MAP "no_qdisc_reset"

# bpf_ipsec.o is also needed by proxy redirects, so we load it unconditionally
bpf_load $HOST_DEV2 "" "ingress" bpf_ipsec.c bpf_ipsec.o from-netdev $CALLS_MAP
Expand Down
8 changes: 5 additions & 3 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,11 @@
#define CILIUM_CALL_NAT64 8
#define CILIUM_CALL_NAT46 9
#define CILIUM_CALL_IPV6_FROM_LXC 10
#define CILIUM_CALL_IPV4_TO_LXC 11
#define CILIUM_CALL_IPV6_TO_LXC 12
#define CILIUM_CALL_SIZE 13
#define CILIUM_CALL_IPV4_TO_LXC_POLICY_ONLY 11
#define CILIUM_CALL_IPV6_TO_LXC_POLICY_ONLY 12
#define CILIUM_CALL_IPV4_TO_ENDPOINT 13
#define CILIUM_CALL_IPV6_TO_ENDPOINT 14
#define CILIUM_CALL_SIZE 15

typedef __u64 mac_t;

Expand Down
Loading

0 comments on commit 25a80df

Please sign in to comment.