Skip to content

Commit

Permalink
remove ARP entries left from previous Cilium run
Browse files Browse the repository at this point in the history
In certain configurations, when node neighbor discovery is enabled [1],
the neighbor table is populated with PERMANENT entries. If the agent is
then configured to not use neighbor discovery, those entries are left
behind, without being garbage collected. This can cause connectivity
issues across nodes, where it's more likely to happen in the same L2
network, if a new node reuses an IP address from a previous node and its
MAC address changes. In a L3 network it is unlikely to happen since the
ARP entry will be associated with a L3 router and it is less likely to
change its MAC address.

[1]
```
n.enableNeighDiscovery = n.nodeConfig.EnableIPv4 &&
	(option.Config.EnableNodePort ||
		(n.nodeConfig.EnableIPSec && option.Config.Tunnel == option.TunnelDisabled))
```

Signed-off-by: André Martins <andre@cilium.io>
  • Loading branch information
aanm authored and nathanjsweet committed Jun 2, 2021
1 parent 516a20b commit e80cc75
Show file tree
Hide file tree
Showing 9 changed files with 216 additions and 5 deletions.
5 changes: 5 additions & 0 deletions daemon/cmd/daemon_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1555,6 +1555,11 @@ func runDaemon() {
log.WithError(err).Warn("Failed to send agent start monitor message")
}

// clean up all arp PERM entries that might have previously set by
// a Cilium instance
if !d.datapath.Node().NodeNeighDiscoveryEnabled() {
d.datapath.Node().NodeCleanNeighbors()
}
// Start periodical arping to refresh neighbor table
if d.datapath.Node().NodeNeighDiscoveryEnabled() && option.Config.ARPPingRefreshPeriod != 0 {
d.nodeDiscovery.Manager.StartNeighborRefresh(d.datapath.Node())
Expand Down
7 changes: 6 additions & 1 deletion daemon/cmd/status.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2016-2020 Authors of Cilium
// Copyright 2016-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -472,6 +472,11 @@ func (c *clusterNodesClient) NodeNeighborRefresh(ctx context.Context, node nodeT
return
}

func (c *clusterNodesClient) NodeCleanNeighbors() {
// no-op
return
}

func (h *getNodes) cleanupClients() {
past := time.Now().Add(-clientGCTimeout)
for k, v := range h.clients {
Expand Down
6 changes: 5 additions & 1 deletion pkg/datapath/fake/node.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2018-2019 Authors of Cilium
// Copyright 2018-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -55,3 +55,7 @@ func (n *fakeNodeHandler) NodeNeighDiscoveryEnabled() bool {
func (n *fakeNodeHandler) NodeNeighborRefresh(ctx context.Context, node nodeTypes.Node) {
return
}

func (n *fakeNodeHandler) NodeCleanNeighbors() {
return
}
136 changes: 135 additions & 1 deletion pkg/datapath/linux/node.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2018-2019 Authors of Cilium
// Copyright 2018-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -16,10 +16,12 @@ package linux

import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"reflect"
"time"

Expand Down Expand Up @@ -52,6 +54,15 @@ const (
failed = "failed"
)

const (
neighFileName = "neigh-link.json"
)

// NeighLink contains the details of a NeighLink
type NeighLink struct {
Name string `json:"link-name"`
}

type linuxNodeHandler struct {
mutex lock.Mutex
isInitialized bool
Expand Down Expand Up @@ -1363,6 +1374,17 @@ func (n *linuxNodeHandler) NodeConfigurationChanged(newConfig datapath.LocalNode
return fmt.Errorf("cannot find link by name %s for neigh discovery: %w",
ifaceName, err)
}

// Store neighDiscoveryLink so that we can remove the ARP
// PERM entries when cilium-agent starts with neigh discovery
// disabled next time.
err = storeNeighLink(option.Config.StateDir, ifaceName)
if err != nil {
log.WithError(err).Warning("Unable to store neigh discovery iface." +
" Removing ARP PERM entries upon cilium-agent init when neigh" +
" discovery is disabled will not work.")
}

// neighDiscoveryLink can be accessed by a concurrent insertNeighbor
// goroutine.
n.neighLock.Lock()
Expand Down Expand Up @@ -1453,6 +1475,118 @@ func (n *linuxNodeHandler) NodeNeighborRefresh(ctx context.Context, nodeToRefres
}
}

// NodeCleanNeighbors cleans all neighbor entries of previously used neighbor
// discovery link interfaces. It should be used when the agent changes the state
// from `n.enableNeighDiscovery = true` to `n.enableNeighDiscovery = false`.
func (n *linuxNodeHandler) NodeCleanNeighbors() {
linkName, err := loadNeighLink(option.Config.StateDir)
if err != nil {
log.WithError(err).Error("Unable to load neigh discovery iface name" +
" for removing ARP PERM entries")
return
}
if len(linkName) == 0 {
return
}

// Delete the file after cleaning up neighbor list if we were able to clean
// up all neighbors.
successClean := true
defer func() {
if successClean {
os.Remove(filepath.Join(option.Config.StateDir, neighFileName))
}
}()

l, err := netlink.LinkByName(linkName)
if err != nil {
// If the link is not found we don't need to keep retrying cleaning
// up the neihbor entries so we can keep successClean=true
if _, ok := err.(netlink.LinkNotFoundError); !ok {
log.WithError(err).WithFields(logrus.Fields{
logfields.Device: linkName,
}).Error("Unable to remove PERM ARP entries of network device")
successClean = false
}
return
}

neighList, err := netlink.NeighListExecute(netlink.Ndmsg{
Family: netlink.FAMILY_V4,
Index: uint32(l.Attrs().Index),
State: netlink.NUD_PERMANENT,
})
if err != nil {
log.WithError(err).WithFields(logrus.Fields{
logfields.Device: linkName,
logfields.LinkIndex: l.Attrs().Index,
}).Error("Unable to list PERM ARP entries for removal of network device")
successClean = false
return
}

var successRemoval, errRemoval int
for _, neigh := range neighList {
err := netlink.NeighDel(&neigh)
if err != nil {
log.WithError(err).WithFields(logrus.Fields{
logfields.Device: linkName,
logfields.LinkIndex: l.Attrs().Index,
"neighbor": neigh.String(),
}).Errorf("Unable to remove PERM ARP entry of network device. "+
"Consider removing this entry manually with 'ip neigh del %s dev %s'", neigh.IP.String(), linkName)
errRemoval++
successClean = false
} else {
successRemoval++
}
}
if successRemoval != 0 {
log.WithFields(logrus.Fields{
logfields.Count: successRemoval,
}).Info("Removed PERM ARP entries previously installed by cilium-agent")
}
if errRemoval != 0 {
log.WithFields(logrus.Fields{
logfields.Count: errRemoval,
}).Warning("Unable to remove PERM ARP entries previously installed by cilium-agent")
}
}

func storeNeighLink(dir string, name string) error {
configFileName := filepath.Join(dir, neighFileName)
f, err := os.Create(configFileName)
if err != nil {
return fmt.Errorf("unable to create '%s': %w", configFileName, err)
}
defer f.Close()
nl := NeighLink{Name: name}
err = json.NewEncoder(f).Encode(nl)
if err != nil {
return fmt.Errorf("unable to encode '%+v': %w", nl, err)
}
return nil
}

func loadNeighLink(dir string) (string, error) {
configFileName := filepath.Join(dir, neighFileName)
f, err := os.Open(configFileName)
if err != nil {
if os.IsNotExist(err) {
return "", nil
}
return "", fmt.Errorf("unable to open '%s': %w", configFileName, err)
}
defer f.Close()
var nl NeighLink

err = json.NewDecoder(f).Decode(&nl)
if err != nil {
return "", fmt.Errorf("unable to decode '%s': %w", configFileName, err)
}
return nl.Name, nil
}

// NodeDeviceNameWithDefaultRoute returns the node's device name which
// handles the default route in the current namespace
func NodeDeviceNameWithDefaultRoute() (string, error) {
Expand Down
39 changes: 39 additions & 0 deletions pkg/datapath/linux/node_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,12 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
runtime.LockOSThread()
defer runtime.UnlockOSThread()

prevStateDir := option.Config.StateDir
defer func() { option.Config.StateDir = prevStateDir }()

tmpDir := c.MkDir()
option.Config.StateDir = tmpDir

// 1. Test whether another node in the same L2 subnet can be arpinged.
// The other node is in the different netns reachable via the veth pair.
//
Expand Down Expand Up @@ -1402,6 +1408,39 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
}
}
c.Assert(found, check.Equals, false)

c.Assert(linuxNodeHandler.NodeAdd(nodev3), check.IsNil)
time.Sleep(100 * time.Millisecond) // insertNeighbor is invoked async

nextHop = net.ParseIP("9.9.9.250")
// Check that both node{2,3} are via nextHop (gw)
neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
c.Assert(err, check.IsNil)
found = false
for _, n := range neighs {
if n.IP.Equal(nextHop) && n.State == netlink.NUD_PERMANENT {
found = true
} else if n.IP.Equal(node2IP) || n.IP.Equal(node3IP) {
c.ExpectFailure("node{2,3} should not be in the same L2")
}
}
c.Assert(found, check.Equals, true)

// We have stored the devices in NodeConfigurationChanged
linuxNodeHandler.NodeCleanNeighbors()

neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
c.Assert(err, check.IsNil)
found = false
for _, n := range neighs {
if n.IP.Equal(nextHop) && n.State == netlink.NUD_PERMANENT {
found = true
} else if n.IP.Equal(node2IP) || n.IP.Equal(node3IP) {
c.ExpectFailure("node{2,3} should not be in the same L2")
}
}
c.Assert(found, check.Equals, false)

}

func (s *linuxPrivilegedBaseTestSuite) benchmarkNodeUpdate(c *check.C, config datapath.LocalNodeConfiguration) {
Expand Down
13 changes: 12 additions & 1 deletion pkg/datapath/linux/node_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2018-2019 Authors of Cilium
// Copyright 2018-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -108,3 +108,14 @@ func (s *linuxTestSuite) TestCreateNodeRouteSpecMtu(c *check.C) {
c.Assert(err, check.IsNil)
c.Assert(generatedRoute.MTU, check.Equals, 0)
}

func (s *linuxTestSuite) TestStoreLoadNeighLinks(c *check.C) {
tmpDir := c.MkDir()
devExpected := "dev1"
err := storeNeighLink(tmpDir, devExpected)
c.Assert(err, check.IsNil)

devsActual, err := loadNeighLink(tmpDir)
c.Assert(err, check.IsNil)
c.Assert(devExpected, checker.DeepEquals, devsActual)
}
4 changes: 4 additions & 0 deletions pkg/datapath/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,8 @@ type NodeHandler interface {

// NodeNeighborRefresh is called to refresh node neighbor table
NodeNeighborRefresh(ctx context.Context, node nodeTypes.Node)

// NodeCleanNeighbors cleans all neighbor entries for the direct routing device
// and the encrypt interface.
NodeCleanNeighbors()
}
7 changes: 6 additions & 1 deletion pkg/hubble/peer/handler.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020 Authors of Cilium
// Copyright 2020-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,6 +131,11 @@ func (h handler) NodeNeighborRefresh(_ context.Context, _ types.Node) {
return
}

func (h handler) NodeCleanNeighbors() {
// no-op
return
}

// Close frees handler resources.
func (h *handler) Close() {
close(h.stop)
Expand Down
4 changes: 4 additions & 0 deletions pkg/node/manager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ func (n *signalNodeHandler) NodeNeighborRefresh(ctx context.Context, node nodeTy
return
}

func (n *signalNodeHandler) NodeCleanNeighbors() {
return
}

func (s *managerTestSuite) TestNodeLifecycle(c *check.C) {
dp := newSignalNodeHandler()
dp.EnableNodeAddEvent = true
Expand Down

0 comments on commit e80cc75

Please sign in to comment.