Skip to content

Commit

Permalink
health-server: Do not cleanup health checking result on node updates.
Browse files Browse the repository at this point in the history
Whenever node was updated, healtch-checking was removing and re-adding
that node. This caused it to lose information about previously performed
probes, which resulted in `unknown` status for such nodes. This
can happen often especially in ENI mode, where node updates happen each
time new pod is scheduled on the node.

Signed-off-by: Marcel Zieba <marcel.zieba@isovalent.com>
  • Loading branch information
marseel committed Mar 1, 2024
1 parent 4082bc3 commit fa1bccf
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 4 deletions.
20 changes: 18 additions & 2 deletions pkg/health/server/prober.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,30 @@ func (p *prober) RemoveIP(ip string) {
// setNodes will steal references to nodes referenced from 'added', so the
// caller should not modify them after a call to setNodes.
// If a node is updated, it will appear in both maps and will be removed then
// added (potentially with different information).
// added (potentially with different information). We want to do it only if relevant
// health-information changes to preserve previous health-checking results.
func (p *prober) setNodes(added nodeMap, removed nodeMap) {
p.Lock()
defer p.Unlock()

// Check what IPs will be readded
// so we don't remove results that we already have for them.
readdedIPs := map[string]struct{}{}
for _, n := range added {
for elem, primary := range n.Addresses() {
_, addr := resolveIP(&n, elem, primary)
if addr == nil {
continue
}
readdedIPs[elem.IP] = struct{}{}
}
}

for _, n := range removed {
for elem := range n.Addresses() {
p.RemoveIP(elem.IP)
if _, ok := readdedIPs[elem.IP]; !ok {
p.RemoveIP(elem.IP)
}
}
}

Expand Down
18 changes: 17 additions & 1 deletion pkg/health/server/prober_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

check "github.com/cilium/checkmate"

ciliumModels "github.com/cilium/cilium/api/v1/health/models"
"github.com/cilium/cilium/api/v1/models"
"github.com/cilium/cilium/pkg/checker"
)
Expand Down Expand Up @@ -146,7 +147,15 @@ func (s *HealthServerTestSuite) TestProbersetNodes(c *check.C) {
}},
}
c.Assert(sortNodes(nodes), checker.DeepEquals, sortNodes(expected))

// Set result of probing before updating the nodes.
// The result should not be deleted after node update.
if elem, ok := prober.results[ipString(node1.NodeElement.PrimaryAddress.IPV4.IP)]; ok {
elem.Icmp = &ciliumModels.ConnectivityStatus{
Status: "Some status",
}
} else {
c.Errorf("expected to find result element for node's ip %s", node1.NodeElement.PrimaryAddress.IPV4.IP)
}
// Update node 1. Node 2 should remain unaffected.
modifiedNodesOld := nodeMap{
ipString(node1.Name): node1,
Expand All @@ -163,6 +172,13 @@ func (s *HealthServerTestSuite) TestProbersetNodes(c *check.C) {
IP: node1HealthIP,
}}
c.Assert(sortNodes(nodes), checker.DeepEquals, sortNodes(expected))
if elem, ok := prober.results[ipString(node1.NodeElement.PrimaryAddress.IPV4.IP)]; !ok {
c.Errorf("expected to find result element for node's ip %s", node1.NodeElement.PrimaryAddress.IPV4.IP)
} else {
// Check that status was not removed when updating node
c.Assert(elem.Icmp, check.NotNil)
c.Assert(elem.Icmp.Status, checker.Equals, "Some status")
}

// Remove node 1. Again, Node 2 should remain.
removedNodes := nodeMap{
Expand Down
3 changes: 2 additions & 1 deletion pkg/health/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ func (s *Server) runActiveServices() error {
// OnIdle is called every ProbeInterval after sending out all icmp pings.
// There are a few important consideration here:
// (1) ICMP prober doesn't report failed probes
// (2) We can receive the same nodes multiple times in nodesAdded in case of updates
// (2) We can receive the same nodes multiple times,
// updated node is present in both nodesAdded and nodesRemoved
// (3) We need to clean icmp status to not retain stale probe results
// (4) We don't want to report stale nodes in metrics

Expand Down

0 comments on commit fa1bccf

Please sign in to comment.