Skip to content

Commit

Permalink
bpf: Split connection tracking for TCP and non-TCP
Browse files Browse the repository at this point in the history
Split the connection tracking of TCP connections into a separate table
from non-TCP, to improve the isolation between the flows. This means
that a burst of UDP traffic will not prevent new TCP connections from
being established.

Related: #5048

Signed-off-by: Joe Stringer <joe@covalent.io>
  • Loading branch information
joestringer authored and tgraf committed Sep 17, 2018
1 parent 56ccb2a commit 750b3f9
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 91 deletions.
79 changes: 56 additions & 23 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,62 @@

#define POLICY_ID ((LXC_ID << 16) | SECLABEL)

struct bpf_elf_map __section_maps CT_MAP6 = {
#ifdef HAVE_LRU_MAP_TYPE
.type = BPF_MAP_TYPE_LRU_HASH,
#define CT_MAP_TYPE BPF_MAP_TYPE_LRU_HASH
#else
.type = BPF_MAP_TYPE_HASH,
#define CT_MAP_TYPE BPF_MAP_TYPE_HASH
#endif

/* Connection-tracking map for IPv6 TCP flows. get_ct_map6() selects this map
 * when the tuple's nexthdr is IPPROTO_TCP. Entries are keyed by
 * struct ipv6_ct_tuple and hold a struct ct_entry. */
struct bpf_elf_map __section_maps CT_MAP_TCP6 = {
/* CT_MAP_TYPE is the LRU hash type when HAVE_LRU_MAP_TYPE is defined,
 * otherwise the plain hash type. */
.type = CT_MAP_TYPE,
.size_key = sizeof(struct ipv6_ct_tuple),
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
/* Same capacity as the non-TCP IPv6 map; both use CT_MAP_SIZE. */
.max_elem = CT_MAP_SIZE,
};

struct bpf_elf_map __section_maps CT_MAP4 = {
#ifdef HAVE_LRU_MAP_TYPE
.type = BPF_MAP_TYPE_LRU_HASH,
#else
.type = BPF_MAP_TYPE_HASH,
#endif
/* Connection-tracking map for IPv6 flows of any protocol other than TCP.
 * get_ct_map6() falls back to this map when the tuple's nexthdr is not
 * IPPROTO_TCP. Entries are keyed by struct ipv6_ct_tuple and hold a
 * struct ct_entry. */
struct bpf_elf_map __section_maps CT_MAP_ANY6 = {
/* CT_MAP_TYPE is the LRU hash type when HAVE_LRU_MAP_TYPE is defined,
 * otherwise the plain hash type. */
.type = CT_MAP_TYPE,
.size_key = sizeof(struct ipv6_ct_tuple),
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
/* Same capacity as the TCP IPv6 map; both use CT_MAP_SIZE. */
.max_elem = CT_MAP_SIZE,
};

/* Connection-tracking map for IPv4 TCP flows. get_ct_map4() selects this map
 * when the tuple's nexthdr is IPPROTO_TCP. Entries are keyed by
 * struct ipv4_ct_tuple and hold a struct ct_entry. */
struct bpf_elf_map __section_maps CT_MAP_TCP4 = {
/* CT_MAP_TYPE is the LRU hash type when HAVE_LRU_MAP_TYPE is defined,
 * otherwise the plain hash type. */
.type = CT_MAP_TYPE,
.size_key = sizeof(struct ipv4_ct_tuple),
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
/* Same capacity as the non-TCP IPv4 map; both use CT_MAP_SIZE. */
.max_elem = CT_MAP_SIZE,
};

/* Connection-tracking map for IPv4 flows of any protocol other than TCP.
 * get_ct_map4() falls back to this map when the tuple's nexthdr is not
 * IPPROTO_TCP. Entries are keyed by struct ipv4_ct_tuple and hold a
 * struct ct_entry. */
struct bpf_elf_map __section_maps CT_MAP_ANY4 = {
/* CT_MAP_TYPE is the LRU hash type when HAVE_LRU_MAP_TYPE is defined,
 * otherwise the plain hash type. */
.type = CT_MAP_TYPE,
.size_key = sizeof(struct ipv4_ct_tuple),
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
/* Same capacity as the TCP IPv4 map; both use CT_MAP_SIZE. */
.max_elem = CT_MAP_SIZE,
};

/* Pick the IPv6 connection-tracking map for a flow.
 *
 * @tuple: flow tuple; only its nexthdr field is inspected.
 *
 * Returns &CT_MAP_TCP6 when the tuple's protocol is TCP, and &CT_MAP_ANY6
 * for every other protocol. */
static inline struct bpf_elf_map *
get_ct_map6(struct ipv6_ct_tuple *tuple)
{
	return (tuple->nexthdr == IPPROTO_TCP) ? &CT_MAP_TCP6 : &CT_MAP_ANY6;
}

/* Pick the IPv4 connection-tracking map for a flow.
 *
 * @tuple: flow tuple; only its nexthdr field is inspected.
 *
 * Returns &CT_MAP_TCP4 when the tuple's protocol is TCP, and &CT_MAP_ANY4
 * for every other protocol. */
static inline struct bpf_elf_map *
get_ct_map4(struct ipv4_ct_tuple *tuple)
{
	return (tuple->nexthdr == IPPROTO_TCP) ? &CT_MAP_TCP4 : &CT_MAP_ANY4;
}

static inline bool redirect_to_proxy(int verdict)
{
return verdict > 0;
Expand Down Expand Up @@ -132,7 +164,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
* address.
*/
if ((svc = lb6_lookup_service(skb, &key)) != NULL) {
ret = lb6_local(&CT_MAP6, skb, l3_off, l4_off,
ret = lb6_local(get_ct_map6(tuple), skb, l3_off, l4_off,
&csum_off, &key, tuple, svc, &ct_state_new);
if (IS_ERR(ret))
return ret;
Expand All @@ -155,7 +187,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
* entry to allow reverse packets and return set cb[CB_POLICY] to
* POLICY_SKIP if the packet is a reply packet to an existing
* incoming connection. */
ret = ct_lookup6(&CT_MAP6, tuple, skb, l4_off, CT_EGRESS,
ret = ct_lookup6(get_ct_map6(tuple), tuple, skb, l4_off, CT_EGRESS,
&ct_state, &monitor);
if (ret < 0) {
relax_verifier();
Expand Down Expand Up @@ -197,7 +229,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
/* If the connection was previously known and packet is now
* denied, remove the connection tracking entry */
if (ret == CT_ESTABLISHED)
ct_delete6(&CT_MAP6, tuple, skb);
ct_delete6(get_ct_map6(tuple), tuple, skb);

return verdict;
}
Expand All @@ -210,7 +242,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
* reverse NAT.
*/
ct_state_new.src_sec_id = SECLABEL;
ret = ct_create6(&CT_MAP6, tuple, skb, CT_EGRESS, &ct_state_new);
ret = ct_create6(get_ct_map6(tuple), tuple, skb, CT_EGRESS, &ct_state_new);
if (IS_ERR(ret))
return ret;
monitor = TRACE_PAYLOAD_LEN;
Expand Down Expand Up @@ -452,7 +484,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
ct_state_new.orig_dport = key.dport;
#ifdef ENABLE_IPV4
if ((svc = lb4_lookup_service(skb, &key)) != NULL) {
ret = lb4_local(&CT_MAP4, skb, l3_off, l4_off, &csum_off,
ret = lb4_local(get_ct_map4(&tuple), skb, l3_off, l4_off, &csum_off,
&key, &tuple, svc, &ct_state_new, ip4->saddr);
if (IS_ERR(ret))
return ret;
Expand All @@ -474,7 +506,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
* entry to allow reverse packets and return set cb[CB_POLICY] to
* POLICY_SKIP if the packet is a reply packet to an existing
* incoming connection. */
ret = ct_lookup4(&CT_MAP4, &tuple, skb, l4_off, CT_EGRESS,
ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_EGRESS,
&ct_state, &monitor);
if (ret < 0)
return ret;
Expand Down Expand Up @@ -507,7 +539,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
/* If the connection was previously known and packet is now
* denied, remove the connection tracking entry */
if (ret == CT_ESTABLISHED)
ct_delete4(&CT_MAP4, &tuple, skb);
ct_delete4(get_ct_map4(&tuple), &tuple, skb);

return verdict;
}
Expand All @@ -520,7 +552,8 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
* reverse NAT.
*/
ct_state_new.src_sec_id = SECLABEL;
ret = ct_create4(&CT_MAP4, &tuple, skb, CT_EGRESS, &ct_state_new);
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_EGRESS,
&ct_state_new);
if (IS_ERR(ret))
return ret;
break;
Expand Down Expand Up @@ -771,7 +804,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
}
}

ret = ct_lookup6(&CT_MAP6, &tuple, skb, l4_off, CT_INGRESS,
ret = ct_lookup6(get_ct_map6(&tuple), &tuple, skb, l4_off, CT_INGRESS,
&ct_state, &monitor);
if (ret < 0)
return ret;
Expand All @@ -797,7 +830,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
/* If the connection was previously known and packet is now
* denied, remove the connection tracking entry */
if (ret == CT_ESTABLISHED)
ct_delete6(&CT_MAP6, &tuple, skb);
ct_delete6(get_ct_map6(&tuple), &tuple, skb);

return DROP_POLICY;
}
Expand All @@ -808,7 +841,7 @@ ipv6_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
if (ret == CT_NEW) {
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ret = ct_create6(&CT_MAP6, &tuple, skb, CT_INGRESS, &ct_state_new);
ret = ct_create6(get_ct_map6(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new);
if (IS_ERR(ret))
return ret;

Expand Down Expand Up @@ -896,7 +929,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);
is_fragment = ipv4_is_fragment(ip4);

ret = ct_lookup4(&CT_MAP4, &tuple, skb, l4_off, CT_INGRESS, &ct_state,
ret = ct_lookup4(get_ct_map4(&tuple), &tuple, skb, l4_off, CT_INGRESS, &ct_state,
&monitor);
if (ret < 0)
return ret;
Expand Down Expand Up @@ -931,7 +964,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
/* If the connection was previously known and packet is now
* denied, remove the connection tracking entry */
if (ret == CT_ESTABLISHED)
ct_delete4(&CT_MAP4, &tuple, skb);
ct_delete4(get_ct_map4(&tuple), &tuple, skb);

return DROP_POLICY;
}
Expand All @@ -942,7 +975,7 @@ ipv4_policy(struct __sk_buff *skb, int ifindex, __u32 src_label, int *forwarding
if (ret == CT_NEW) {
ct_state_new.orig_dport = tuple.dport;
ct_state_new.src_sec_id = src_label;
ret = ct_create4(&CT_MAP4, &tuple, skb, CT_INGRESS, &ct_state_new);
ret = ct_create4(get_ct_map4(&tuple), &tuple, skb, CT_INGRESS, &ct_state_new);
if (IS_ERR(ret))
return ret;

Expand Down
6 changes: 4 additions & 2 deletions bpf/lxc_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@
#endif
#define DROP_NOTIFY
#define TRACE_NOTIFY
#define CT_MAP6 cilium_ct6_111
#define CT_MAP4 cilium_ct4_111
#define CT_MAP_TCP6 cilium_ct_tcp6_111
#define CT_MAP_ANY6 cilium_ct_any6_111
#define CT_MAP_TCP4 cilium_ct_tcp4_111
#define CT_MAP_ANY4 cilium_ct_any4_111
#define CT_MAP_SIZE 4096
#define CALLS_MAP cilium_calls_111
#define LB_L3
Expand Down
2 changes: 1 addition & 1 deletion daemon/bpf.sha
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
GO_BINDATA_SHA1SUM=034c8207c4d8bf10b01cecd6c7add6e16eedb149
GO_BINDATA_SHA1SUM=4f3bc614b4c4456be0d038f88cc0d0846ec23be0
BPF_FILES=../bpf/.gitignore ../bpf/COPYING ../bpf/Makefile ../bpf/bpf_features.h ../bpf/bpf_lb.c ../bpf/bpf_lxc.c ../bpf/bpf_netdev.c ../bpf/bpf_overlay.c ../bpf/bpf_xdp.c ../bpf/cilium-map-migrate.c ../bpf/filter_config.h ../bpf/include/bpf/api.h ../bpf/include/elf/elf.h ../bpf/include/elf/gelf.h ../bpf/include/elf/libelf.h ../bpf/include/iproute2/bpf_elf.h ../bpf/include/linux/bpf.h ../bpf/include/linux/bpf_common.h ../bpf/include/linux/byteorder.h ../bpf/include/linux/byteorder/big_endian.h ../bpf/include/linux/byteorder/little_endian.h ../bpf/include/linux/icmp.h ../bpf/include/linux/icmpv6.h ../bpf/include/linux/if_arp.h ../bpf/include/linux/if_ether.h ../bpf/include/linux/in.h ../bpf/include/linux/in6.h ../bpf/include/linux/ioctl.h ../bpf/include/linux/ip.h ../bpf/include/linux/ipv6.h ../bpf/include/linux/perf_event.h ../bpf/include/linux/swab.h ../bpf/include/linux/tcp.h ../bpf/include/linux/type_mapper.h ../bpf/include/linux/udp.h ../bpf/init.sh ../bpf/lib/arp.h ../bpf/lib/common.h ../bpf/lib/conntrack.h ../bpf/lib/csum.h ../bpf/lib/dbg.h ../bpf/lib/drop.h ../bpf/lib/encap.h ../bpf/lib/eps.h ../bpf/lib/eth.h ../bpf/lib/events.h ../bpf/lib/icmp6.h ../bpf/lib/ipv4.h ../bpf/lib/ipv6.h ../bpf/lib/l3.h ../bpf/lib/l4.h ../bpf/lib/lb.h ../bpf/lib/lxc.h ../bpf/lib/maps.h ../bpf/lib/metrics.h ../bpf/lib/nat46.h ../bpf/lib/policy.h ../bpf/lib/trace.h ../bpf/lib/utils.h ../bpf/lib/xdp.h ../bpf/lxc_config.h ../bpf/netdev_config.h ../bpf/node_config.h ../bpf/probes/raw_change_tail.t ../bpf/probes/raw_insn.h ../bpf/probes/raw_invalidate_hash.t ../bpf/probes/raw_lpm_map.t ../bpf/probes/raw_lru_map.t ../bpf/probes/raw_main.c ../bpf/probes/raw_map_val_adj.t ../bpf/probes/raw_mark_map_val.t ../bpf/run_probes.sh ../bpf/spawn_netns.sh
7 changes: 2 additions & 5 deletions daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -1351,9 +1351,7 @@ func (d *Daemon) checkStaleMap(path string, filename string, id string) {
func (d *Daemon) checkStaleGlobalMap(path string, filename string) {
globalCTinUse := endpointmanager.HasGlobalCT()

if !globalCTinUse &&
(filename == ctmap.MapName6Global ||
filename == ctmap.MapName4Global) {
if !globalCTinUse && ctmap.NameIsGlobal(filename) {
d.removeStaleMap(path)
}
}
Expand All @@ -1363,8 +1361,7 @@ func (d *Daemon) staleMapWalker(path string) error {

mapPrefix := []string{
policymap.MapName,
ctmap.MapName6,
ctmap.MapName4,
ctmap.MapNamePrefix,
endpoint.CallsMapName,
}

Expand Down
110 changes: 69 additions & 41 deletions pkg/maps/ctmap/ctmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,48 +45,27 @@ var (
metrics.LabelDatapathFamily: "ipv4",
}

mapInfo = map[MapType]struct {
keySize int
maxEntries int
parser bpf.DumpParser
bpfDefine string
}{
MapTypeIPv4Local: {
keySize: int(unsafe.Sizeof(CtKey4{})),
maxEntries: MapNumEntriesLocal,
parser: ct4DumpParser,
bpfDefine: "CT_MAP4",
},
MapTypeIPv6Local: {
keySize: int(unsafe.Sizeof(CtKey6{})),
maxEntries: MapNumEntriesLocal,
parser: ct6DumpParser,
bpfDefine: "CT_MAP6",
},
MapTypeIPv4Global: {
keySize: int(unsafe.Sizeof(CtKey4{})),
maxEntries: MapNumEntriesGlobal,
parser: ct4DumpParser,
bpfDefine: "CT_MAP4",
},
MapTypeIPv6Global: {
keySize: int(unsafe.Sizeof(CtKey6{})),
maxEntries: MapNumEntriesGlobal,
parser: ct6DumpParser,
bpfDefine: "CT_MAP6",
},
}
mapInfo = make(map[MapType]mapAttributes)
)

const (
// mapCount counts the maximum number of CT maps that one endpoint may
// access at once.
mapCount = 2

MapName6 = "cilium_ct6_"
MapName4 = "cilium_ct4_"
MapName6Global = MapName6 + "global"
MapName4Global = MapName4 + "global"
mapCount = 4

// Map names for TCP CT tables are retained from Cilium 1.0 naming
// scheme to minimize disruption of ongoing connections during upgrade.
MapNamePrefix = "cilium_ct"
MapNameTCP6 = MapNamePrefix + "6_"
MapNameTCP4 = MapNamePrefix + "4_"
MapNameTCP6Global = MapNameTCP6 + "global"
MapNameTCP4Global = MapNameTCP4 + "global"

// Map names for "any" protocols indicate CT for non-TCP protocols.
MapNameAny6 = MapNamePrefix + "_any6_"
MapNameAny4 = MapNamePrefix + "_any4_"
MapNameAny6Global = MapNameAny6 + "global"
MapNameAny4Global = MapNameAny4 + "global"

MapNumEntriesLocal = 64000
MapNumEntriesGlobal = 1000000
Expand All @@ -106,6 +85,41 @@ const (
metricsDeleted = "deleted"
)

// mapAttributes holds the per-MapType properties that are stored in mapInfo.
type mapAttributes struct {
// keySize is the size of the map key, taken from unsafe.Sizeof of the
// matching key struct (CtKey4 or CtKey6).
keySize int
// maxEntries is the map capacity (MapNumEntriesLocal or MapNumEntriesGlobal).
maxEntries int
// parser is the dump parser for this map's entries (ct4DumpParser or
// ct6DumpParser).
parser bpf.DumpParser
// bpfDefine is the name of the BPF-side macro for this map, e.g.
// "CT_MAP_TCP4" or "CT_MAP_ANY6".
bpfDefine string
}

// setupMapInfo stores the attributes for one conntrack map type in mapInfo,
// overwriting any entry already present under mapType.
//
// define is the BPF macro name for the map, keySize the size of its key,
// maxEntries its capacity, and parser the dump parser for its entries.
func setupMapInfo(mapType MapType, define string, keySize, maxEntries int, parser bpf.DumpParser) {
	attrs := mapAttributes{
		keySize:    keySize,
		maxEntries: maxEntries,
		parser:     parser,
		bpfDefine:  define,
	}
	mapInfo[mapType] = attrs
}

// initMapInfo populates mapInfo with one entry per conntrack map type.
//
// Entries are registered under consecutive MapType values starting at
// MapTypeIPv4TCPLocal: first all maps sized MapNumEntriesLocal, then all maps
// sized MapNumEntriesGlobal. Within each size the order is TCP IPv4, TCP IPv6,
// ANY IPv4, ANY IPv6.
//
// NOTE(review): this presumably has to match the declaration order of the
// MapType constants, which are defined elsewhere; confirm before reordering
// either side.
func initMapInfo() {
mapType := MapTypeIPv4TCPLocal
for _, maxEntries := range []int{MapNumEntriesLocal, MapNumEntriesGlobal} {
// Registered in this order: CT_MAP_TCP4, CT_MAP_TCP6, CT_MAP_ANY4, CT_MAP_ANY6
for _, proto := range []string{"TCP", "ANY"} {
setupMapInfo(MapType(mapType), fmt.Sprintf("CT_MAP_%s4", proto),
int(unsafe.Sizeof(CtKey4{})), maxEntries, ct4DumpParser)
// Advance to the next MapType value for the IPv6 variant.
mapType++
setupMapInfo(MapType(mapType), fmt.Sprintf("CT_MAP_%s6", proto),
int(unsafe.Sizeof(CtKey6{})), maxEntries, ct6DumpParser)
mapType++
}
}
}

// init populates mapInfo when the package is initialized.
func init() {
initMapInfo()
}

// CtEndpoint represents an endpoint for the functions required to manage
// conntrack maps for the endpoint.
type CtEndpoint interface {
Expand Down Expand Up @@ -380,17 +394,21 @@ func maps(e CtEndpoint, ipv4, ipv6 bool) []*Map {
result := make([]*Map, 0, mapCount)
if e == nil {
if ipv4 {
result = append(result, NewMap(MapName4Global, MapTypeIPv4Global))
result = append(result, NewMap(MapNameTCP4Global, MapTypeIPv4TCPGlobal))
result = append(result, NewMap(MapNameAny4Global, MapTypeIPv4AnyGlobal))
}
if ipv6 {
result = append(result, NewMap(MapName6Global, MapTypeIPv6Global))
result = append(result, NewMap(MapNameTCP6Global, MapTypeIPv6TCPGlobal))
result = append(result, NewMap(MapNameAny6Global, MapTypeIPv6AnyGlobal))
}
} else {
if ipv4 {
result = append(result, NewMap(MapName4+e.StringID(), MapTypeIPv4Local))
result = append(result, NewMap(MapNameTCP4+e.StringID(), MapTypeIPv4TCPLocal))
result = append(result, NewMap(MapNameAny4+e.StringID(), MapTypeIPv4AnyLocal))
}
if ipv6 {
result = append(result, NewMap(MapName6+e.StringID(), MapTypeIPv6Local))
result = append(result, NewMap(MapNameTCP6+e.StringID(), MapTypeIPv6TCPLocal))
result = append(result, NewMap(MapNameAny6+e.StringID(), MapTypeIPv6AnyLocal))
}
}
return result
Expand All @@ -414,6 +432,16 @@ func GlobalMaps(ipv4, ipv6 bool) []*Map {
return maps(nil, ipv4, ipv6)
}

// NameIsGlobal reports whether filename (a basename) is exactly the name of
// one of the four global conntrack maps: TCP or "any", IPv4 or IPv6.
func NameIsGlobal(filename string) bool {
	return filename == MapNameTCP4Global ||
		filename == MapNameAny4Global ||
		filename == MapNameTCP6Global ||
		filename == MapNameAny6Global
}

// WriteBPFMacros writes the map names for conntrack maps into the specified
// writer, defining usage of the global map or local maps depending on whether
// the specified CtEndpoint is nil.
Expand Down
5 changes: 5 additions & 0 deletions pkg/maps/ctmap/ctmap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ func (t *CTMapTestSuite) TestInit(c *C) {
c.Assert(info.keySize, Equals, int(unsafe.Sizeof(CtKey4{})))
c.Assert(strings.Contains(info.bpfDefine, "4"), Equals, true)
}
if mapType.isTCP() {
c.Assert(strings.Contains(info.bpfDefine, "TCP"), Equals, true)
} else {
c.Assert(strings.Contains(info.bpfDefine, "ANY"), Equals, true)
}
if mapType.isLocal() {
c.Assert(info.maxEntries, Equals, MapNumEntriesLocal)
}
Expand Down

0 comments on commit 750b3f9

Please sign in to comment.