Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

node-neigh: Fix unit test flake #16072

Merged
merged 3 commits into from
May 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 8 additions & 2 deletions pkg/datapath/linux/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,7 @@ func (n *linuxNodeHandler) insertNeighbor(ctx context.Context, newNode *nodeType

// nextHop hasn't been arpinged before OR we are refreshing neigh entry
var hwAddr net.HardwareAddr
var now time.Time
if nextHopIsNew || refresh {
hwAddr, err = arp.PingOverLink(link, srcIPv4, nextHopIPv4)
if err != nil {
Expand All @@ -760,13 +761,19 @@ func (n *linuxNodeHandler) insertNeighbor(ctx context.Context, newNode *nodeType
return
}
metrics.ArpingRequestsTotal.WithLabelValues(success).Inc()
now = time.Now()
}

n.neighLock.Lock()
n.neighLastPingByNextHop[nextHopStr] = time.Now()
defer n.neighLock.Unlock()

if hwAddr != nil {
if prev, found := n.neighLastPingByNextHop[nextHopStr]; found && prev.After(now) {
// Do not update the neigh entry if there was another goroutine which
// issued arping after us, as it might have a more recent hwAddr value.
return
}
n.neighLastPingByNextHop[nextHopStr] = time.Now()
if prevHwAddr, found := n.neighByNextHop[nextHopStr]; found && prevHwAddr.String() == hwAddr.String() {
// Nothing to update, return early to avoid calling to netlink. This
// is based on the assumption that n.neighByNextHop gets populated
Expand Down Expand Up @@ -820,7 +827,6 @@ func (n *linuxNodeHandler) deleteNeighbor(oldNode *nodeTypes.Node) {
defer func() { delete(n.neighNextHopByNode, oldNode.Identity()) }()

if n.neighNextHopRefCount.Delete(nextHopStr) {

neigh, found := n.neighByNextHop[nextHopStr]
delete(n.neighByNextHop, nextHopStr)
delete(n.neighLastPingByNextHop, nextHopStr)
Expand Down
48 changes: 40 additions & 8 deletions pkg/datapath/linux/node_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
nodeaddressing "github.com/cilium/cilium/pkg/node/addressing"
nodeTypes "github.com/cilium/cilium/pkg/node/types"
"github.com/cilium/cilium/pkg/option"
"github.com/cilium/cilium/pkg/testutils"

"github.com/containernetworking/plugins/pkg/ns"
"github.com/vishvananda/netlink"
Expand Down Expand Up @@ -1014,7 +1015,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
dpConfig := DatapathConfiguration{HostDevice: "veth0"}
prevARPPeriod := option.Config.ARPPingRefreshPeriod
defer func() { option.Config.ARPPingRefreshPeriod = prevARPPeriod }()
option.Config.ARPPingRefreshPeriod = time.Duration(10 * time.Millisecond)
option.Config.ARPPingRefreshPeriod = time.Duration(1 * time.Nanosecond)

linuxNodeHandler := NewNodeHandler(dpConfig, s.nodeAddressing, nil).(*linuxNodeHandler)
c.Assert(linuxNodeHandler, check.Not(check.IsNil))
Expand All @@ -1026,18 +1027,42 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
})
c.Assert(err, check.IsNil)

// wait waits for neigh entry update or waits for removal if waitForDelete=true
wait := func(nodeID nodeTypes.Identity, before *time.Time, waitForDelete bool) {
err := testutils.WaitUntil(func() bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my reading of the code now is a lower bound for when the operation happened.

However, it is used only for insertion and not deletion (since for the latter, we do not have a deletion timestamp to check).
Given this, and the overall structure of the function, I wonder if it would be better to have two versions of this function one for deletes and one for inserts.

I think this would also improve readability on the caller side (at the cost of some code duplication).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my reading of the code now is a lower bound for when the operation happened.

Did s/now/before/. Hope it's more clear.

However, it is used only for insertion and not deletion

Made before as *time.Time, so that nil can be passed for the wait for the deletion.

Given this, and the overall structure of the function, I wonder if it would be better to have two versions of this function one for deletes and one for inserts.

I want this waiting routine to be concise to avoid polluting the test case. Also, to follow DRY principle.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sounds good.

linuxNodeHandler.neighLock.Lock()
defer linuxNodeHandler.neighLock.Unlock()
nextHop, found := linuxNodeHandler.neighNextHopByNode[nodeID]
if !found {
return waitForDelete
}
lastPing, found := linuxNodeHandler.neighLastPingByNextHop[nextHop]
if !found {
return false
}
if waitForDelete {
return false
}
return before.Before(lastPing)
}, 5*time.Second)
c.Assert(err, check.IsNil)
}

nodev1 := nodeTypes.Node{
Name: "node1",
IPAddresses: []nodeTypes.Address{{nodeaddressing.NodeInternalIP, ip1}},
}
now := time.Now()
err = linuxNodeHandler.NodeAdd(nodev1)
c.Assert(err, check.IsNil)
// insertNeighbor is invoked async
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same comment appears a few lines below. Was this maybe forgotten from an earlier version?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed redundant comments.

// Insert the same node second time. This should not increment refcount for
// the same nextHop. We test it by checking that NodeDelete has removed the
// related neigh entry.
err = linuxNodeHandler.NodeAdd(nodev1)
c.Assert(err, check.IsNil)
time.Sleep(100 * time.Millisecond) // insertNeighbor is invoked async
// insertNeighbor is invoked async, so thus this wait based on last ping
wait(nodev1.Identity(), &now, false)

// Check whether an arp entry for nodev1 IP addr (=veth1) was added
neighs, err := netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
Expand All @@ -1064,10 +1089,12 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
return nil
})

now = time.Now()
err = netlink.LinkSetHardwareAddr(veth0, veth1HwAddr)
c.Assert(err, check.IsNil)

linuxNodeHandler.NodeNeighborRefresh(context.TODO(), nodev1)
wait(nodev1.Identity(), &now, false)
neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
c.Assert(err, check.IsNil)
found = false
Expand All @@ -1084,7 +1111,8 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
// Remove nodev1, and check whether the arp entry was removed
err = linuxNodeHandler.NodeDelete(nodev1)
c.Assert(err, check.IsNil)
time.Sleep(100 * time.Millisecond) // deleteNeighbor is invoked async
// deleteNeighbor is invoked async too
wait(nodev1.Identity(), nil, true)

neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
c.Assert(err, check.IsNil)
Expand All @@ -1100,9 +1128,10 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
// Create multiple goroutines which call insertNeighbor and check whether
// MAC changes of veth1 are properly handled. This is a basic randomized
// testing of insertNeighbor() fine-grained locking.
now = time.Now()
err = linuxNodeHandler.NodeAdd(nodev1)
c.Assert(err, check.IsNil)
time.Sleep(100 * time.Millisecond)
wait(nodev1.Identity(), &now, false)

rndHWAddr := func() net.HardwareAddr {
mac := make([]byte, 6)
Expand Down Expand Up @@ -1172,10 +1201,10 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
}
// Cleanup
close(done)
wg.Wait()
now = time.Now()
err = linuxNodeHandler.NodeDelete(nodev1)
c.Assert(err, check.IsNil)
time.Sleep(100 * time.Millisecond) // deleteNeighbor is invoked async
wait(nodev1.Identity(), nil, true)

// Setup routine for the 2. test
setupRemoteNode := func(vethName, vethPeerName, netnsName, vethCIDR, vethIPAddr,
Expand Down Expand Up @@ -1320,15 +1349,17 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {
Name: "node2",
IPAddresses: []nodeTypes.Address{{nodeaddressing.NodeInternalIP, node2IP}},
}
now = time.Now()
c.Assert(linuxNodeHandler.NodeAdd(nodev2), check.IsNil)
wait(nodev2.Identity(), &now, false)

node3IP := net.ParseIP("7.7.7.250")
nodev3 := nodeTypes.Node{
Name: "node3",
IPAddresses: []nodeTypes.Address{{nodeaddressing.NodeInternalIP, node3IP}},
}
c.Assert(linuxNodeHandler.NodeAdd(nodev3), check.IsNil)
time.Sleep(100 * time.Millisecond) // insertNeighbor is invoked async
wait(nodev3.Identity(), &now, false)

nextHop := net.ParseIP("9.9.9.250")
// Check that both node{2,3} are via nextHop (gw)
Expand All @@ -1346,6 +1377,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {

// Check that removing node2 will not remove nextHop, as it is still used by node3
c.Assert(linuxNodeHandler.NodeDelete(nodev2), check.IsNil)
wait(nodev2.Identity(), nil, true)
neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
found = false
for _, n := range neighs {
Expand All @@ -1358,7 +1390,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandling(c *check.C) {

// However, removing node3 should remove the neigh entry for nextHop
c.Assert(linuxNodeHandler.NodeDelete(nodev3), check.IsNil)
time.Sleep(100 * time.Millisecond) // deleteNeighbor is invoked async
wait(nodev3.Identity(), nil, true)

found = false
neighs, err = netlink.NeighList(veth0.Attrs().Index, netlink.FAMILY_V4)
Expand Down