Skip to content

Commit

Permalink
ctmap: GC orphan SNAT entries
Browse files Browse the repository at this point in the history
This commit adds a mechanism to remove orphan SNAT entries.

We call an SNAT entry an orphan if it lacks either a corresponding CT
entry or the SNAT entry for the reverse direction. Both cases can happen
due to LRU eviction heuristics (both CT and NAT maps are of the LRU type).

The mechanism for the removal is based on the GC signaling in the
datapath. When the datapath SNAT routine fails to find a free mapping
after SNAT_SIGNAL_THRES attempts, it sends the signal via the perf ring
buffer. The consumer of the buffer is the daemon. After receiving the
signal it invokes the CT GC.

The newly implemented GC addition iterates over all SNAT entries and
checks whether a corresponding CT entry is found, and if not, it tries
to remove both SNAT entries (for original and reverse flows).

For now, I didn't add GC of orphan SNAT entries created by DSR to keep
complexity of changes as low as possible. This will come as a follow up.

Signed-off-by: Martynas Pumputis <m@lambda.lt>
  • Loading branch information
brb authored and borkmann committed Nov 12, 2020
1 parent ec64380 commit c9810bf
Show file tree
Hide file tree
Showing 7 changed files with 457 additions and 8 deletions.
84 changes: 84 additions & 0 deletions pkg/maps/ctmap/ctmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package ctmap

import (
"bytes"
"errors"
"fmt"
"io"
"math"
Expand All @@ -34,8 +35,10 @@ import (
"github.com/cilium/cilium/pkg/metrics"
"github.com/cilium/cilium/pkg/option"
"github.com/cilium/cilium/pkg/tuple"
"github.com/cilium/cilium/pkg/u8proto"

"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

var (
Expand Down Expand Up @@ -103,6 +106,10 @@ type NatMap interface {
Open() error
Close() error
DeleteMapping(key tuple.TupleKey) error
DumpWithCallback(bpf.DumpCallback) error
DumpReliablyWithCallback(bpf.DumpCallback, *bpf.DumpStats) error
Delete(bpf.MapKey) error
DumpStats() *bpf.DumpStats
}

type mapAttributes struct {
Expand Down Expand Up @@ -510,6 +517,83 @@ func GC(m *Map, filter *GCFilter) int {
return doGC(m, filter)
}

// PurgeOrphanNATEntries removes orphan SNAT entries. An SNAT entry is called
// an orphan if it does not have a corresponding CT entry.
//
// This can happen when the CT entry is removed by the LRU eviction which
// happens when the CT map becomes full.
//
// PurgeOrphanNATEntries() is triggered by the datapath via the GC signaling
// mechanism. When the datapath SNAT fails to find a free mapping after
// SNAT_SIGNAL_THRES attempts, it sends the signal via the perf ring buffer.
// The consumer of the buffer invokes the function.
//
// The SNAT is being used for the following cases:
// 1. By NodePort BPF on an intermediate node before fwd'ing request from outside
//    to a destination node.
// 2. A packet from local endpoint sent to outside (BPF-masq).
// 3. A packet from a host local application (i.e. running in the host netns)
//    This is needed to prevent SNAT from hijacking such connections.
// 4. By DSR on a backend node to SNAT responses with service IP+port before
//    sending to a client.
//
// In the case of 1-3, we always create a CT_EGRESS CT entry. This allows the
// CT GC to remove corresponding SNAT entries. In the case of 4, a CT_INGRESS
// CT entry is created. See the unit test TestOrphanNatGC for more examples.
//
// The function only handles cases 1-3; case 4 is TODO(brb).
//
// It returns nil when NodePort runs in DSR or hybrid mode, or when no NAT map
// is associated with the type of ctMap. Otherwise it returns the statistics
// collected while walking the NAT map.
func PurgeOrphanNATEntries(ctMap *Map) *NatGCStats {
	// Presumably skipped because orphans created by DSR (case 4) are not
	// handled yet.
	if option.Config.NodePortMode == option.NodePortModeDSR ||
		option.Config.NodePortMode == option.NodePortModeHybrid {
		return nil
	}

	natMap := mapInfo[ctMap.mapType].natMap
	if natMap == nil {
		return nil
	}

	isCTMapTCP := ctMap.mapType == mapTypeIPv4TCPLocal ||
		ctMap.mapType == mapTypeIPv6TCPLocal ||
		ctMap.mapType == mapTypeIPv4TCPGlobal ||
		ctMap.mapType == mapTypeIPv6TCPGlobal
	stats := newNatGCStats(natMap)

	cb := func(key bpf.MapKey, value bpf.MapValue) {
		natKey := key.(nat.NatKey)
		natVal := value.(nat.NatEntry)

		// In opposite to the CT maps, TCP and UDP entries are stored in the same
		// SNAT map. Therefore, to avoid a case when the given ctMap does not
		// store entries of the given natKey.NextHeader proto, we should return
		// early. Otherwise, the natKey entries will be removed, which is wrong.
		if (natKey.GetNextHeader() == u8proto.TCP) != isCTMapTCP {
			return
		}

		// Only r(everse)tuple entries are inspected; the entry of the original
		// flow is removed together with its reverse counterpart below. Test
		// the flag as a bit mask instead of comparing against a literal 1, so
		// that the check does not depend on the numeric value of TUPLE_F_IN.
		if natKey.GetFlags()&tuple.TUPLE_F_IN == 0 {
			return
		}

		ctKey := egressCTKeyFromIngressNatKeyAndVal(natKey, natVal)
		if _, err := ctMap.Lookup(ctKey); !errors.Is(err, unix.ENOENT) {
			// Either the CT entry exists or the lookup failed for another
			// reason. In both cases keep the SNAT entries.
			stats.IngressAlive++
			return
		}

		// No CT entry is found, so delete SNAT for both original and
		// reverse flows. A failed delete is simply not counted.
		oNatKey := oNatKeyFromReverse(natKey, natVal)
		if err := natMap.Delete(oNatKey); err == nil {
			stats.EgressDeleted++
		}
		if err := natMap.Delete(natKey); err == nil {
			stats.IngressDeleted++
		}
	}

	// The dump error is deliberately discarded: the GC is best effort and
	// entries missed by a failed dump stay in the map for a later run.
	// NOTE(review): consider logging the error with the package logger.
	_ = natMap.DumpReliablyWithCallback(cb, stats.DumpStats)

	return &stats
}

// Flush runs garbage collection for map m with the name mapType, deleting all
// entries. The specified map must be already opened using bpf.OpenMap().
func (m *Map) Flush() int {
Expand Down
236 changes: 231 additions & 5 deletions pkg/maps/ctmap/ctmap_privileged_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ func Test(t *testing.T) {
TestingT(t)
}

// SetUpSuite prepares the environment shared by all tests of the suite: it
// calls bpf.CheckOrMountFS and asserts that configuring the resource limits
// succeeds.
func (k *CTMapTestSuite) SetUpSuite(c *C) {
	bpf.CheckOrMountFS("", false)
	c.Assert(bpf.ConfigureResourceLimits(), IsNil)
}

func (k *CTMapTestSuite) Benchmark_MapUpdate(c *C) {
m := newMap(MapNameTCP4Global+"_test", mapTypeIPv4TCPGlobal)
_, err := m.OpenOrCreate()
Expand Down Expand Up @@ -98,13 +104,9 @@ func (k *CTMapTestSuite) Benchmark_MapUpdate(c *C) {
// TestCtGcIcmp tests whether ICMP NAT entries are removed upon a removal of
// their CT entry (GH#12625).
func (k *CTMapTestSuite) TestCtGcIcmp(c *C) {
bpf.CheckOrMountFS("", false)
err := bpf.ConfigureResourceLimits()
c.Assert(err, IsNil)

// Init maps
natMap := nat.NewMap("cilium_nat_any4_test", true, 1000)
_, err = natMap.OpenOrCreate()
_, err := natMap.OpenOrCreate()
c.Assert(err, IsNil)
defer natMap.Map.Unpin()

Expand Down Expand Up @@ -211,3 +213,227 @@ func (k *CTMapTestSuite) TestCtGcIcmp(c *C) {
c.Assert(err, IsNil)
c.Assert(len(buf), Equals, 0)
}

// TestOrphanNatGC checks whether dangling NAT entries are GC'd (GH#12686).
//
// The test first verifies, for IPv4, that SNAT entries backed by a CT entry
// survive PurgeOrphanNATEntries, that they are removed once the CT entry is
// gone, and that a lone reverse (TUPLE_F_IN) SNAT entry is removed as well.
// Afterwards the lone reverse entry case is repeated for IPv6.
func (k *CTMapTestSuite) TestOrphanNatGC(c *C) {
	// Init maps
	natMap := nat.NewMap("cilium_nat_any4_test", true, 1000)
	_, err := natMap.OpenOrCreate()
	c.Assert(err, IsNil)
	defer natMap.Map.Unpin()

	ctMapName := MapNameAny4Global + "_test"
	setupMapInfo(mapTypeIPv4AnyGlobal, ctMapName,
		&CtKey4Global{}, int(unsafe.Sizeof(CtKey4Global{})),
		100, natMap)
	ctMap := newMap(ctMapName, mapTypeIPv4AnyGlobal)
	_, err = ctMap.OpenOrCreate()
	c.Assert(err, IsNil)
	defer ctMap.Map.Unpin()

	ctTCPMapName := MapNameTCP4Global + "_test"
	setupMapInfo(mapTypeIPv4TCPGlobal, ctTCPMapName,
		&CtKey4Global{}, int(unsafe.Sizeof(CtKey4Global{})),
		100, natMap)
	ctTCPMap := newMap(ctTCPMapName, mapTypeIPv4TCPGlobal)
	_, err = ctTCPMap.OpenOrCreate()
	c.Assert(err, IsNil)
	defer ctTCPMap.Map.Unpin()

	// Create the following entries and check that SNAT entries are NOT GC-ed
	// (as we have the CT entry which they belong to):
	//
	// - Host local traffic (no SNAT):
	//		CT:		UDP OUT 10.23.32.45:54864 -> 10.23.53.48:8472
	//		NAT:	UDP IN 10.23.53.48:8472 -> 10.23.32.45:54865 XLATE_DST 10.23.32.45:54864
	//				UDP OUT 10.23.32.45:54864 -> 10.23.53.48:8472 XLATE_SRC 10.23.32.45:54865
	//
	// The example above covers other SNAT cases. E.g. (not used in unit tests below, just
	// to show for completion):
	//
	// - NodePort request from outside (subject to NodePort SNAT):
	//		CT:		TCP OUT 192.168.34.1:63000 -> 10.0.1.99:80
	//		NAT:	TCP IN 10.0.1.99:80 -> 10.0.0.134:63000 XLATE_DST 192.168.34.1:63000
	//		NAT:	TCP OUT 192.168.34.1:63000 -> 10.0.1.99:80 XLATE_SRC 10.0.0.134:63000
	//
	// - Local endpoint request to outside (subject to BPF-masq):
	//		CT:		TCP OUT 10.0.1.99:34520 -> 1.1.1.1:80
	//		NAT:	TCP IN 1.1.1.1:80 -> 10.0.2.15:34520 XLATE_DST 10.0.1.99:34520
	//				TCP OUT 10.0.1.99:34520 -> 1.1.1.1:80 XLATE_SRC 10.0.2.15:34520

	ctKey := &CtKey4Global{
		tuple.TupleKey4Global{
			tuple.TupleKey4{
				DestAddr:   types.IPv4{10, 23, 32, 45},
				SourceAddr: types.IPv4{10, 23, 53, 48},
				SourcePort: 0x50d6,
				DestPort:   0x1821,
				NextHeader: u8proto.UDP,
				Flags:      tuple.TUPLE_F_OUT,
			},
		},
	}
	ctVal := &CtEntry{
		TxPackets: 1,
		TxBytes:   216,
		Lifetime:  37459,
	}
	err = bpf.UpdateElement(ctMap.Map.GetFd(), unsafe.Pointer(ctKey),
		unsafe.Pointer(ctVal), 0)
	c.Assert(err, IsNil)

	// SNAT entry of the original (egress) flow
	natKey := &nat.NatKey4{
		tuple.TupleKey4Global{
			tuple.TupleKey4{
				SourceAddr: types.IPv4{10, 23, 32, 45},
				DestAddr:   types.IPv4{10, 23, 53, 48},
				SourcePort: 0x50d6,
				DestPort:   0x1821,
				NextHeader: u8proto.UDP,
				Flags:      tuple.TUPLE_F_OUT,
			},
		},
	}
	natVal := &nat.NatEntry4{
		Created:   37400,
		HostLocal: 1,
		Addr:      types.IPv4{10, 23, 32, 45},
		Port:      0x51d6,
	}
	err = bpf.UpdateElement(natMap.Map.GetFd(), unsafe.Pointer(natKey),
		unsafe.Pointer(natVal), 0)
	c.Assert(err, IsNil)
	// SNAT entry of the reverse (ingress) flow. natKey and natVal keep
	// pointing to this entry; it is re-inserted further below.
	natKey = &nat.NatKey4{
		tuple.TupleKey4Global{
			tuple.TupleKey4{
				DestAddr:   types.IPv4{10, 23, 32, 45},
				SourceAddr: types.IPv4{10, 23, 53, 48},
				DestPort:   0x51d6,
				SourcePort: 0x1821,
				NextHeader: u8proto.UDP,
				Flags:      tuple.TUPLE_F_IN,
			},
		},
	}
	natVal = &nat.NatEntry4{
		Created:   37400,
		HostLocal: 1,
		Addr:      types.IPv4{10, 23, 32, 45},
		Port:      0x50d6,
	}
	err = bpf.UpdateElement(natMap.Map.GetFd(), unsafe.Pointer(natKey),
		unsafe.Pointer(natVal), 0)
	c.Assert(err, IsNil)

	ctKeyTCP := &CtKey4Global{
		tuple.TupleKey4Global{
			tuple.TupleKey4{
				DestAddr:   types.IPv4{10, 23, 32, 45},
				SourceAddr: types.IPv4{10, 23, 53, 48},
				SourcePort: 0x50d6,
				DestPort:   0x1821,
				NextHeader: u8proto.TCP,
				Flags:      tuple.TUPLE_F_OUT,
			},
		},
	}
	err = bpf.UpdateElement(ctTCPMap.Map.GetFd(), unsafe.Pointer(ctKeyTCP),
		unsafe.Pointer(ctVal), 0)
	c.Assert(err, IsNil)

	// UDP SNAT entries should not be removed when the TCP CT map is given
	stats := PurgeOrphanNATEntries(ctTCPMap)
	c.Assert(stats.IngressDeleted, Equals, uint32(0))
	c.Assert(stats.EgressDeleted, Equals, uint32(0))

	stats = PurgeOrphanNATEntries(ctMap)
	c.Assert(stats.IngressAlive, Equals, uint32(1))
	c.Assert(stats.IngressDeleted, Equals, uint32(0))
	c.Assert(stats.EgressDeleted, Equals, uint32(0))
	// Check that neither of the entries has been removed
	buf := make(map[string][]string)
	err = natMap.Map.Dump(buf)
	c.Assert(err, IsNil)
	c.Assert(len(buf), Equals, 2)

	// Now remove the CT entry which should remove both NAT entries
	err = bpf.DeleteElement(ctMap.Map.GetFd(), unsafe.Pointer(ctKey))
	c.Assert(err, IsNil)
	stats = PurgeOrphanNATEntries(ctMap)
	c.Assert(stats.IngressDeleted, Equals, uint32(1))
	c.Assert(stats.IngressAlive, Equals, uint32(0))
	c.Assert(stats.EgressDeleted, Equals, uint32(1))
	// Check that both orphan NAT entries have been removed
	buf = make(map[string][]string)
	err = natMap.Map.Dump(buf)
	c.Assert(err, IsNil)
	c.Assert(len(buf), Equals, 0)

	// Create only CT_INGRESS NAT entry which should be removed
	err = bpf.UpdateElement(natMap.Map.GetFd(), unsafe.Pointer(natKey),
		unsafe.Pointer(natVal), 0)
	c.Assert(err, IsNil)

	stats = PurgeOrphanNATEntries(ctMap)
	c.Assert(stats.IngressDeleted, Equals, uint32(1))
	c.Assert(stats.EgressDeleted, Equals, uint32(0))
	buf = make(map[string][]string)
	err = natMap.Map.Dump(buf)
	c.Assert(err, IsNil)
	c.Assert(len(buf), Equals, 0)

	// Let's check IPv6

	natMapV6 := nat.NewMap("cilium_nat_any6_test", false, 1000)
	_, err = natMapV6.OpenOrCreate()
	c.Assert(err, IsNil)
	defer natMapV6.Map.Unpin()

	ctMapAnyName := MapNameAny6Global + "_test"
	setupMapInfo(mapTypeIPv6AnyGlobal, ctMapAnyName,
		&CtKey6Global{}, int(unsafe.Sizeof(CtKey6Global{})),
		100, natMapV6)
	ctMapAnyV6 := newMap(ctMapAnyName, mapTypeIPv6AnyGlobal)
	_, err = ctMapAnyV6.OpenOrCreate()
	c.Assert(err, IsNil)
	defer ctMapAnyV6.Map.Unpin()

	ctMapTCPName := MapNameTCP6Global + "_test"
	setupMapInfo(mapTypeIPv6TCPGlobal, ctMapTCPName,
		&CtKey6Global{}, int(unsafe.Sizeof(CtKey6Global{})),
		100, natMapV6)
	ctMapTCPV6 := newMap(ctMapTCPName, mapTypeIPv6TCPGlobal)
	_, err = ctMapTCPV6.OpenOrCreate()
	c.Assert(err, IsNil)
	defer ctMapTCPV6.Map.Unpin()

	// A lone reverse UDP SNAT entry without any CT entry
	natKeyV6 := &nat.NatKey6{
		tuple.TupleKey6Global{
			tuple.TupleKey6{
				SourceAddr: types.IPv6{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
				DestAddr:   types.IPv6{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
				SourcePort: 0x50d6,
				DestPort:   0x1821,
				NextHeader: u8proto.UDP,
				Flags:      tuple.TUPLE_F_IN,
			},
		},
	}
	natValV6 := &nat.NatEntry6{
		Created:   37400,
		HostLocal: 1,
		Addr:      types.IPv6{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
		Port:      0x51d6,
	}
	err = bpf.UpdateElement(natMapV6.Map.GetFd(), unsafe.Pointer(natKeyV6),
		unsafe.Pointer(natValV6), 0)
	c.Assert(err, IsNil)

	// The UDP SNAT entry should not be removed when the TCP CT map is given
	stats = PurgeOrphanNATEntries(ctMapTCPV6)
	c.Assert(stats.IngressDeleted, Equals, uint32(0))
	c.Assert(stats.EgressDeleted, Equals, uint32(0))

	// With the matching CT map the orphan entry has to be removed. Only the
	// reverse entry exists, so no egress entry can be deleted.
	stats = PurgeOrphanNATEntries(ctMapAnyV6)
	c.Assert(stats.IngressDeleted, Equals, uint32(1))
	c.Assert(stats.EgressDeleted, Equals, uint32(0))
	buf = make(map[string][]string)
	err = natMapV6.Map.Dump(buf)
	c.Assert(err, IsNil)
	c.Assert(len(buf), Equals, 0)
}
Loading

0 comments on commit c9810bf

Please sign in to comment.