diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c index 041f8485bc05..e95a523cf832 100644 --- a/bpf/bpf_lxc.c +++ b/bpf/bpf_lxc.c @@ -106,7 +106,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb, { union macaddr router_mac = NODE_MAC; union v6addr host_ip = HOST_IP; - int do_nat46 = 0, ret, l4_off; + int ret, l4_off; struct csum_offset csum_off = {}; struct lb6_service *svc; struct lb6_key key = {}; @@ -238,12 +238,11 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb, return ipv6_local_delivery(skb, l3_off, l4_off, SECLABEL, ip6, tuple->nexthdr); } else { -#ifdef ENABLE_NAT46 - /* FIXME: Derive from prefix constant */ - if (unlikely((tuple->addr.p1 & 0xffff) == 0xadde)) { - do_nat46 = 1; - goto to_host; - } +#ifdef LXC_NAT46 + if (unlikely(ipv6_addr_is_mapped(&tuple->addr))) { + tail_call(skb, &cilium_calls, CILIUM_CALL_NAT64); + return DROP_MISSED_TAIL_CALL; + } #endif #ifdef ALLOW_TO_WORLD @@ -263,14 +262,6 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb, if (ret != TC_ACT_OK) return ret; - if (do_nat46) { - union v6addr dp = NAT46_DST_PREFIX; - - ret = ipv6_to_ipv4(skb, 14, &dp, IPV4_RANGE | (LXC_ID_NB <<16)); - if (IS_ERR(ret)) - return ret; - } - #ifndef POLICY_ENFORCEMENT cilium_trace_capture(skb, DBG_CAPTURE_DELIVERY, HOST_IFINDEX); return redirect(HOST_IFINDEX, 0); @@ -688,6 +679,13 @@ static inline int __inline__ ipv4_policy(struct __sk_buff *skb, int ifindex, __u if (ret < 0) return ret; +#ifdef LXC_NAT46 + if (skb->cb[CB_NAT46_STATE] == NAT46) { + tail_call(skb, &cilium_calls, CILIUM_CALL_NAT46); + return DROP_MISSED_TAIL_CALL; + } +#endif + if (unlikely(src_rev_nat)) { int ret2; @@ -745,4 +743,38 @@ __section_tail(CILIUM_MAP_POLICY, LXC_ID) int handle_policy(struct __sk_buff *sk return redirect(ifindex, 0); } +__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_NAT64) int tail_ipv6_to_ipv4(struct __sk_buff *skb) +{ + int ret = ipv6_to_ipv4(skb, 14, htonl(LXC_IPV4)); + if (IS_ERR(ret)) + return ret; + + cilium_trace_capture(skb, 
DBG_CAPTURE_AFTER_V64, skb->ingress_ifindex); + + skb->cb[CB_NAT46_STATE] = NAT64; + + tail_call(skb, &cilium_calls, CILIUM_CALL_IPV4); + return DROP_MISSED_TAIL_CALL; +} + +__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_NAT46) int tail_ipv4_to_ipv6(struct __sk_buff *skb) +{ + union v6addr dp = LXC_IP; + void *data = (void *) (long) skb->data; + void *data_end = (void *) (long) skb->data_end; + struct iphdr *ip4 = data + ETH_HLEN; + int ret; + + if (data + sizeof(*ip4) + ETH_HLEN > data_end) + return DROP_INVALID; + + ret = ipv4_to_ipv6(skb, ip4, 14, &dp); + if (IS_ERR(ret)) + return ret; + + cilium_trace_capture(skb, DBG_CAPTURE_AFTER_V46, skb->ingress_ifindex); + + tail_call(skb, &cilium_policy, LXC_ID); + return DROP_MISSED_TAIL_CALL; +} BPF_LICENSE("GPL"); diff --git a/bpf/bpf_netdev.c b/bpf/bpf_netdev.c index 217db6df9762..a41c972fcc83 100644 --- a/bpf/bpf_netdev.c +++ b/bpf/bpf_netdev.c @@ -37,7 +37,6 @@ #include "lib/eth.h" #include "lib/dbg.h" #include "lib/l3.h" -#include "lib/nat46.h" #include "lib/policy.h" #include "lib/drop.h" @@ -130,27 +129,6 @@ static inline int handle_ipv4(struct __sk_buff *skb) } #endif -#ifdef ENABLE_NAT46 - if (1) { - union v6addr sp = NAT46_SRC_PREFIX; - union v6addr dp = HOST_IP; - int ret; - - if (data + sizeof(*ip) + ETH_HLEN > data_end) - return DROP_INVALID; - - if ((ip->daddr & IPV4_MASK) != IPV4_RANGE) - return TC_ACT_OK; - - ret = ipv4_to_ipv6(skb, ip4, 14, &sp, &dp); - if (IS_ERR(ret)) - return ret; - - proto = __constant_htons(ETH_P_IPV6); - skb->tc_index = 1; - } -#endif - return TC_ACT_OK; } @@ -179,7 +157,7 @@ int from_netdev(struct __sk_buff *skb) ret = handle_ipv6(skb); break; -#if defined ENABLE_IPV4 || defined ENABLE_NAT46 +#ifdef ENABLE_IPV4 case __constant_htons(ETH_P_IP): tail_call(skb, &cilium_calls, CILIUM_CALL_IPV4); ret = DROP_MISSED_TAIL_CALL; diff --git a/bpf/lib/common.h b/bpf/lib/common.h index 35713f3c8484..571f0ff7d691 100644 --- a/bpf/lib/common.h +++ b/bpf/lib/common.h @@ -40,7 +40,9 @@ #define 
CILIUM_CALL_SEND_ICMP6_TIME_EXCEEDED 5 #define CILIUM_CALL_ARP 6 #define CILIUM_CALL_IPV4 7 -#define CILIUM_CALL_SIZE 8 +#define CILIUM_CALL_NAT64 8 +#define CILIUM_CALL_NAT46 9 +#define CILIUM_CALL_SIZE 10 typedef __u64 mac_t; @@ -143,6 +145,14 @@ enum { CB_SRC_LABEL, CB_IFINDEX, CB_POLICY, + CB_NAT46_STATE, +}; + +/* State values for NAT46 */ +enum { + NAT46_CLEAR, + NAT64, + NAT46, }; /* Flag values for CB_POLICY */ @@ -177,7 +187,8 @@ struct ct_entry { __u16 lifetime; __u16 rx_closing:1, tx_closing:1, - reserve:14; + nat46:1, + reserve:13; __u16 rev_nat_index; }; diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h index 18203a79aaae..ac2e63676e8b 100644 --- a/bpf/lib/conntrack.h +++ b/bpf/lib/conntrack.h @@ -60,6 +60,12 @@ static inline int __inline__ __ct_lookup(void *map, struct __sk_buff *skb, if (rev_nat_index) *rev_nat_index = entry->rev_nat_index;; +#ifdef LXC_NAT46 + /* This packet needs nat46 translation */ + if (entry->nat46 && !skb->cb[CB_NAT46_STATE]) + skb->cb[CB_NAT46_STATE] = NAT46; +#endif + #ifdef CONNTRACK_ACCOUNTING /* FIXME: This is slow, per-cpu counters? 
*/ + if (in) { @@ -232,6 +238,10 @@ static inline int __inline__ ct_lookup6(void *map, struct ipv6_ct_tuple *tuple, (tuple->nexthdr << 8) | tuple->flags); ret = __ct_lookup(map, skb, tuple, action, in, NULL); +#ifdef LXC_NAT46 + skb->cb[CB_NAT46_STATE] = NAT46_CLEAR; +#endif + /* No entries found, packet must be eligible for creating a CT entry */ if (ret == CT_NEW && action != ACTION_CREATE) ret = DROP_CT_CANT_CREATE; @@ -436,6 +446,11 @@ static inline int __inline__ ct_create4(void *map, struct ipv4_ct_tuple *tuple, entry.tx_bytes = skb->len; } +#ifdef LXC_NAT46 + if (skb->cb[CB_NAT46_STATE] == NAT64) + entry.nat46 = !in; +#endif + cilium_trace(skb, DBG_CT_CREATED, (ntohs(tuple->sport) << 16) | ntohs(tuple->dport), (tuple->nexthdr << 8) | tuple->flags); cilium_trace(skb, DBG_CT_CREATED2, tuple->addr, rev_nat_index); diff --git a/bpf/lib/dbg.h b/bpf/lib/dbg.h index 081dc18a73f8..eb0d56a96b04 100644 --- a/bpf/lib/dbg.h +++ b/bpf/lib/dbg.h @@ -63,6 +63,8 @@ enum { DBG_CAPTURE_FROM_OVERLAY, DBG_CAPTURE_DELIVERY, DBG_CAPTURE_FROM_LB, + DBG_CAPTURE_AFTER_V46, + DBG_CAPTURE_AFTER_V64, }; #ifdef DEBUG diff --git a/bpf/lib/ipv6.h b/bpf/lib/ipv6.h index 33e8587ccf48..282aba788ed7 100644 --- a/bpf/lib/ipv6.h +++ b/bpf/lib/ipv6.h @@ -270,4 +270,11 @@ static inline __be32 ipv6_pseudohdr_checksum(struct ipv6hdr *hdr, return sum; } +/* + * IPv4-mapped IPv6 address - 0:0:0:0:0:FFFF::/96. Returns non-zero when the first 64 bits are zero and bytes 8..11 are 00 00 ff ff; those bytes are compared individually so the result does not depend on host byte order. + */ +static inline int ipv6_addr_is_mapped(union v6addr *addr) +{ + return addr->p1 == 0 && addr->p2 == 0 && addr->addr[8] == 0 && addr->addr[9] == 0 && addr->addr[10] == 0xff && addr->addr[11] == 0xff; +} #endif /* __LIB_IPV6__ */ diff --git a/bpf/lib/nat46.h b/bpf/lib/nat46.h index 6d57b87b11bb..61463763338d 100644 --- a/bpf/lib/nat46.h +++ b/bpf/lib/nat46.h @@ -26,6 +26,17 @@ #include "eth.h" #include "dbg.h" +#if defined ENABLE_NAT46 +#if defined ENABLE_IPV4 && defined CONNTRACK +#define LXC_NAT46 +#else +#warning "ENABLE_NAT46 requires ENABLE_IPV4 and CONNTRACK" +#undef LXC_NAT46 +#endif +#else +#undef LXC_NAT46 +#endif + static inline int get_csum_offset(__u8 
protocol) { int csum_off; @@ -224,12 +235,11 @@ static inline int ipv6_prefix_match(struct in6_addr *addr, /* * ipv4 to ipv6 stateless nat * (s4,d4) -> (s6,d6) - * s6 = v6prefix_src - * d6 = v6prefix_dst + * s6 = nat46_prefix + * d6 = nat46_prefix or v6_dst if non null */ static inline int ipv4_to_ipv6(struct __sk_buff *skb, struct iphdr *ip4, - int nh_off, union v6addr *v6prefix_src, - union v6addr *v6predix_dst) + int nh_off, union v6addr *v6_dst) { struct ipv6hdr v6 = {}; struct iphdr v4; @@ -238,6 +248,7 @@ static inline int ipv4_to_ipv6(struct __sk_buff *skb, struct iphdr *ip4, __be16 v4hdr_len; __be16 protocol = htons(ETH_P_IPV6); __u64 csum_flags = BPF_F_PSEUDO_HDR; + union v6addr nat46_prefix = NAT46_PREFIX; if (skb_load_bytes(skb, nh_off, &v4, sizeof(v4)) < 0) return DROP_INVALID; @@ -247,15 +258,22 @@ static inline int ipv4_to_ipv6(struct __sk_buff *skb, struct iphdr *ip4, /* build v6 header */ v6.version = 0x6; - v6.saddr.in6_u.u6_addr32[0] = v6prefix_src->p1; - v6.saddr.in6_u.u6_addr32[1] = v6prefix_src->p2; - v6.saddr.in6_u.u6_addr32[2] = v6prefix_src->p3; + v6.saddr.in6_u.u6_addr32[0] = nat46_prefix.p1; + v6.saddr.in6_u.u6_addr32[1] = nat46_prefix.p2; + v6.saddr.in6_u.u6_addr32[2] = nat46_prefix.p3; v6.saddr.in6_u.u6_addr32[3] = v4.saddr; - v6.daddr.in6_u.u6_addr32[0] = v6predix_dst->p1; - v6.daddr.in6_u.u6_addr32[1] = v6predix_dst->p2; - v6.daddr.in6_u.u6_addr32[2] = v6predix_dst->p3; - v6.daddr.in6_u.u6_addr32[3] = htonl((ntohl(v6predix_dst->p4) & 0xFFFF0000) | (ntohl(v4.daddr) & 0xFFFF)); + if (v6_dst) { + v6.daddr.in6_u.u6_addr32[0] = v6_dst->p1; + v6.daddr.in6_u.u6_addr32[1] = v6_dst->p2; + v6.daddr.in6_u.u6_addr32[2] = v6_dst->p3; + v6.daddr.in6_u.u6_addr32[3] = v6_dst->p4; + } else { + v6.daddr.in6_u.u6_addr32[0] = nat46_prefix.p1; + v6.daddr.in6_u.u6_addr32[1] = nat46_prefix.p2; + v6.daddr.in6_u.u6_addr32[2] = nat46_prefix.p3; + v6.daddr.in6_u.u6_addr32[3] = htonl((ntohl(nat46_prefix.p4) & 0xFFFF0000) | (ntohl(v4.daddr) & 0xFFFF)); + } if 
(v4.protocol == IPPROTO_ICMP) v6.nexthdr = IPPROTO_ICMPV6; @@ -314,9 +332,7 @@ static inline int ipv4_to_ipv6(struct __sk_buff *skb, struct iphdr *ip4, * s4 = . * d4 = d6[96 .. 127] */ -static inline int ipv6_to_ipv4(struct __sk_buff *skb, int nh_off, - union v6addr *v6prefix_dst, - __u32 saddr) +static inline int ipv6_to_ipv4(struct __sk_buff *skb, int nh_off, __be32 saddr) { struct ipv6hdr v6; struct iphdr v4 = {}; @@ -332,13 +348,6 @@ static inline int ipv6_to_ipv4(struct __sk_buff *skb, int nh_off, if (ipv6_hdrlen(skb, nh_off, &v6.nexthdr) != sizeof(v6)) return DROP_INVALID_EXTHDR; - if (!ipv6_prefix_match(&v6.daddr, v6prefix_dst)) { -#ifdef DEBUG_NAT46 - printk("v64 nat dst prefix mismatch\n"); -#endif - return 0; - } - /* build v4 header */ v4.ihl = 0x5; v4.version = 0x4; @@ -400,5 +409,4 @@ static inline int ipv6_to_ipv4(struct __sk_buff *skb, int nh_off, return 0; } - #endif /* __LIB_NAT46__ */ diff --git a/bpf/node_config.h b/bpf/node_config.h index 31649b4bfde8..656f6ea08454 100644 --- a/bpf/node_config.h +++ b/bpf/node_config.h @@ -28,8 +28,7 @@ #define HOST_ID 1 #define WORLD_ID 2 #define HOST_IFINDEX_MAC { .addr = { 0xce, 0x72, 0xa7, 0x03, 0x88, 0x56 } } -#define NAT46_SRC_PREFIX { .addr = { 0xbe, 0xef, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0 } } -#define NAT46_DST_PREFIX { .addr = { 0xbe, 0xef, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0 } } +#define NAT46_PREFIX { .addr = { 0xbe, 0xef, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0 } } #define IPV4_RANGE 0xf50a #define IPV4_MASK 0xffff #define IPV4_CLUSTER_MASK 0xff0000 diff --git a/common/addressing/defaults.go b/common/addressing/defaults.go index d5ad13f18dba..65303e6fa957 100644 --- a/common/addressing/defaults.go +++ b/common/addressing/defaults.go @@ -32,7 +32,7 @@ const ( // Default IPv4 prefix length of entire cluster DefaultIPv4ClusterPrefixLen = 8 // Default IPv6 prefix to represent NATed IPv4 addresses - 
DefaultNAT46Prefix = "aa46::/48" + DefaultNAT46Prefix = "0:0:0:0:0:FFFF::/96" ) var ( diff --git a/common/bpf/debug.go b/common/bpf/debug.go index f6062ae75abe..f665d81808af 100644 --- a/common/bpf/debug.go +++ b/common/bpf/debug.go @@ -28,6 +28,8 @@ const ( DBG_CAPTURE_FROM_OVERLAY DBG_CAPTURE_DELIVERY DBG_CAPTURE_FROM_LB + DBG_CAPTURE_AFTER_V46 + DBG_CAPTURE_AFTER_V64 ) const ( @@ -200,6 +202,10 @@ func (n *DebugCapture) Dump(dissect bool, data []byte, prefix string) { fmt.Printf("Delivery to ifindex %d\n", n.Arg1) case DBG_CAPTURE_FROM_LB: fmt.Printf("Incoming packet to load balancer on ifindex %d\n", n.Arg1) + case DBG_CAPTURE_AFTER_V46: + fmt.Printf("Packet after nat46 ifindex %d\n", n.Arg1) + case DBG_CAPTURE_AFTER_V64: + fmt.Printf("Packet after nat64 ifindex %d\n", n.Arg1) default: fmt.Printf("Unknown message type=%d arg1=%d\n", n.SubType, n.Arg1) } diff --git a/daemon/daemon/daemon.go b/daemon/daemon/daemon.go index 749f145089ce..e49992946d4f 100644 --- a/daemon/daemon/daemon.go +++ b/daemon/daemon/daemon.go @@ -205,8 +205,7 @@ func (d *Daemon) init() error { fmt.Fprintf(fw, "#define IPV4_CLUSTER_MASK %#x\n", binary.LittleEndian.Uint32(ipv4ClusterRange.Mask)) if nat46Range := d.conf.NAT46Prefix; nat46Range != nil { - fw.WriteString(common.FmtDefineAddress("NAT46_SRC_PREFIX", nat46Range.IP)) - fw.WriteString(common.FmtDefineAddress("NAT46_DST_PREFIX", nat46Range.IP)) + fw.WriteString(common.FmtDefineAddress("NAT46_PREFIX", nat46Range.IP)) } fw.WriteString(common.FmtDefineAddress("HOST_IP", hostIP)) diff --git a/daemon/main.go b/daemon/main.go index 67284ea7ba31..4671ec5dd3e0 100644 --- a/daemon/main.go +++ b/daemon/main.go @@ -355,7 +355,7 @@ func initEnv(ctx *cli.Context) error { } config.Opts.Set(types.OptionDropNotify, true) - config.Opts.Set(types.OptionNAT46, false) + config.Opts.Set(types.OptionNAT46, true) config.Opts.Set(daemon.OptionPolicyTracing, enableTracing) config.Opts.Set(types.OptionConntrack, !disableConntrack) 
config.Opts.Set(types.OptionConntrackAccounting, !disableConntrack) diff --git a/tests/08-nat46.sh b/tests/08-nat46.sh new file mode 100755 index 000000000000..12b84e4eee83 --- /dev/null +++ b/tests/08-nat46.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +source "./helpers.bash" + +set -e + +TEST_NET="cilium" + +function cleanup { + docker rm -f server client 2> /dev/null || true + monitor_stop +} + +trap cleanup EXIT + +SERVER_LABEL="io.cilium.server" +CLIENT_LABEL="io.cilium.client" +NETPERF_IMAGE="noironetworks/netperf" + +monitor_start + +docker network inspect $TEST_NET 2> /dev/null || { + docker network create --ipv6 --subnet ::1/112 --ipam-driver cilium --driver cilium $TEST_NET +} + +docker run -dt -ti --net=$TEST_NET --name server -l $SERVER_LABEL $NETPERF_IMAGE +docker run -dt -ti --net=$TEST_NET --name client -l $CLIENT_LABEL $NETPERF_IMAGE + +CLIENT_IP=$(docker inspect --format '{{ .NetworkSettings.Networks.cilium.GlobalIPv6Address }}' client) +CLIENT_IP4=$(docker inspect --format '{{ .NetworkSettings.Networks.cilium.IPAddress }}' client) +CLIENT_ID=$(cilium endpoint list | grep $CLIENT_IP | awk '{ print $1}') +SERVER_IP=$(docker inspect --format '{{ .NetworkSettings.Networks.cilium.GlobalIPv6Address }}' server) +SERVER_IP4=$(docker inspect --format '{{ .NetworkSettings.Networks.cilium.IPAddress }}' server) +SERVER_ID=$(cilium endpoint list | grep $SERVER_IP | awk '{ print $1}') + +echo CLIENT_IP=$CLIENT_IP +echo CLIENT_IP4=$CLIENT_IP4 +echo CLIENT_ID=$CLIENT_ID +echo SERVER_IP=$SERVER_IP +echo SERVER_IP4=$SERVER_IP4 +echo SERVER_ID=$SERVER_ID + +# FIXME IPv6 DAD period +sleep 5 +set -x + +cat <