Skip to content

Commit

Permalink
nodediscovery: Fix bug where CiliumInternalIP was flapping
Browse files Browse the repository at this point in the history
[ upstream commit 263e689 ]

[ backporter's notes: Various conflicts. net.IPFamilyOfString(x) doesn't
  exist so it had to be replaced with ParseIPSloppy(x).To4().
  slices.DeleteFunc doesn't exist so it had to be defined in our own
  slices package as done on v1.14. mutateNodeResource's logic was
  significantly refactored so the new code to clean node IP addresses
  had to be moved before all appending of node IP addresses. ]

This fixes a bug in `UpdateCiliumNodeResource` where the
`CiliumInternalIP` (aka `cilium_host` IP, aka router IP) was flapping in
the node manager during restoration (i.e. during cilium-agent restarts).

In particular in `cluster-pool` mode, `UpdateCiliumNodeResource` is
called before the `cilium_host` IP has been restored, as there are some
circular dependencies: The restored IP can only be fully validated after
the IPAM subsystem is ready, but that in turn can only happen if the
`CiliumNode` object has been created. The `UpdateCiliumNodeResource`
function however will only announce the `cilium_host` IP if it has been
restored.

This commit attempts to break that cycle by not overwriting any already
existing `CiliumInternalIP` in the CiliumNode resource.

Overall, this change is rather hacky, in particular it does not address
the fact that other less crucial node information (like the health IP)
also flaps. But since we want to backport this bugfix to older stable
branches too, this change is intentionally kept as minimal as possible.

Example node event (as observed by other nodes) before this change:

```
2023-12-18T12:58:20.070330814Z level=debug msg="Received node update event from custom-resource" node="{\"Name\":\"kind-worker\",\"Cluster\":\"default\",\"IPAddresses\":[{\"Type\":\"InternalIP\",\"IP\":\"172.18.0.4\"},{\"Type\":\"InternalIP\",\"IP\":\"fc00:c111::4\"}],..." subsys=nodemanager
2023-12-18T12:58:20.208082226Z level=debug msg="Received node update event from custom-resource" node="{\"Name\":\"kind-worker\",\"Cluster\":\"default\",\"IPAddresses\":[{\"Type\":\"InternalIP\",\"IP\":\"172.18.0.4\"},{\"Type\":\"InternalIP\",\"IP\":\"fc00:c111::4\"},{\"Type\":\"CiliumInternalIP\",\"IP\":\"10.0.1.245\"}],..." subsys=nodemanager
```

After this change (note the `CiliumInternalIP` present in both events):

```
2023-12-18T15:38:23.695653876Z level=debug msg="Received node update event from custom-resource" node="{\"Name\":\"kind-worker\",\"Cluster\":\"default\",\"IPAddresses\":[{\"Type\":\"CiliumInternalIP\",\"IP\":\"10.0.1.245\"},{\"Type\":\"InternalIP\",\"IP\":\"172.18.0.4\"},{\"Type\":\"InternalIP\",\"IP\":\"fc00:c111::4\"}],..." subsys=nodemanager
2023-12-18T15:38:23.838604573Z level=debug msg="Received node update event from custom-resource" node="{\"Name\":\"kind-worker\",\"Cluster\":\"default\",\"IPAddresses\":[{\"Type\":\"InternalIP\",\"IP\":\"172.18.0.4\"},{\"Type\":\"InternalIP\",\"IP\":\"fc00:c111::4\"},{\"Type\":\"CiliumInternalIP\",\"IP\":\"10.0.1.245\"}],...}" subsys=nodemanager
```

Reported-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Sebastian Wicki <sebastian@isovalent.com>
Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
  • Loading branch information
gandro authored and pchaigno committed Dec 20, 2023
1 parent 2650925 commit f68f463
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 2 deletions.
25 changes: 23 additions & 2 deletions pkg/nodediscovery/nodediscovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/net"

alibabaCloudTypes "github.com/cilium/cilium/pkg/alibabacloud/eni/types"
alibabaCloudMetadata "github.com/cilium/cilium/pkg/alibabacloud/metadata"
Expand Down Expand Up @@ -42,6 +43,7 @@ import (
"github.com/cilium/cilium/pkg/node/types"
nodeTypes "github.com/cilium/cilium/pkg/node/types"
"github.com/cilium/cilium/pkg/option"
"github.com/cilium/cilium/pkg/slices"
"github.com/cilium/cilium/pkg/source"
cnitypes "github.com/cilium/cilium/plugins/cilium-cni/types"
)
Expand Down Expand Up @@ -412,8 +414,6 @@ func (n *NodeDiscovery) mutateNodeResource(nodeResource *ciliumv2.CiliumNode) er
k8sNodeAddresses []nodeTypes.Address
)

nodeResource.Spec.Addresses = []ciliumv2.NodeAddress{}

// If we are unable to fetch the K8s Node resource and the CiliumNode does
// not have an OwnerReference set, then somehow we are running in an
// environment where only the CiliumNode exists. Do not proceed as this is
Expand Down Expand Up @@ -461,6 +461,27 @@ func (n *NodeDiscovery) mutateNodeResource(nodeResource *ciliumv2.CiliumNode) er
localCN := n.localNode.ToCiliumNode()
nodeResource.ObjectMeta.Annotations = localCN.Annotations

// This function can be called before we have restored the CiliumInternalIP.
// In that case, we do not want to remove the old CiliumInternalIP, as this
// would lead to the IP address flapping. Therefore, this code preserves any
// CiliumInternalIP if (and only if) the local node store does not yet
// include the restored CiliumInternalIP.
nodeResource.Spec.Addresses = slices.DeleteFunc(nodeResource.Spec.Addresses, func(address ciliumv2.NodeAddress) bool {
if address.Type == addressing.NodeCiliumInternalIP {
// Only delete a CiliumInternalIP if
// a) its IP family is disabled,
// and/or
// b) the LocalNode store contains an IP address which we can use instead
if v4 := net.ParseIPSloppy(address.IP).To4(); v4 != nil {
return !n.LocalConfig.EnableIPv4 || n.localNode.GetCiliumInternalIP(false) != nil
} else {
return !n.LocalConfig.EnableIPv6 || n.localNode.GetCiliumInternalIP(true) != nil
}
}

return true // delete all other node addresses
})

for _, k8sAddress := range k8sNodeAddresses {
// Do not add CiliumNodeInternalIP from the k8sNodeAddress. The source
// of truth is always the local node. The CiliumInternalIP address is
Expand Down
5 changes: 5 additions & 0 deletions pkg/slices/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

// Package slices contains common utilities to work on slices of any type.
package slices
29 changes: 29 additions & 0 deletions pkg/slices/slices.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package slices

import (
"golang.org/x/exp/slices"
)

// DeleteFunc removes any elements from s for which del returns true,
// returning the modified slice.
// When DeleteFunc removes m elements, it might not modify the elements
// s[len(s)-m:len(s)]. If those elements contain pointers you might consider
// zeroing those elements so that objects they reference can be garbage
// collected.
func DeleteFunc[S ~[]E, E any](s S, del func(E) bool) S {
i := slices.IndexFunc(s, del)
if i == -1 {
return s
}
// Don't start copying elements until we find one to delete.
for j := i + 1; j < len(s); j++ {
if v := s[j]; !del(v) {
s[i] = v
i++
}
}
return s[:i]
}

0 comments on commit f68f463

Please sign in to comment.