Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

health-server: Do not cleanup health checking result on node updates. #30917

Merged
merged 1 commit into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 18 additions & 2 deletions pkg/health/server/prober.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,30 @@ func (p *prober) RemoveIP(ip string) {
// setNodes will steal references to nodes referenced from 'added', so the
// caller should not modify them after a call to setNodes.
// If a node is updated, it will appear in both maps and will be removed then
// added (potentially with different information).
// added (potentially with different information). We want to do it only if relevant
// health-information changes to preserve previous health-checking results.
func (p *prober) setNodes(added nodeMap, removed nodeMap) {
p.Lock()
defer p.Unlock()

// Check what IPs will be readded
// so we don't remove results that we already have for them.
readdedIPs := map[string]struct{}{}
for _, n := range added {
for elem, primary := range n.Addresses() {
_, addr := resolveIP(&n, elem, primary)
if addr == nil {
continue
}
readdedIPs[elem.IP] = struct{}{}
}
}

for _, n := range removed {
for elem := range n.Addresses() {
p.RemoveIP(elem.IP)
if _, ok := readdedIPs[elem.IP]; !ok {
p.RemoveIP(elem.IP)
}
}
}

Expand Down
18 changes: 17 additions & 1 deletion pkg/health/server/prober_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

check "github.com/cilium/checkmate"

ciliumModels "github.com/cilium/cilium/api/v1/health/models"
"github.com/cilium/cilium/api/v1/models"
"github.com/cilium/cilium/pkg/checker"
)
Expand Down Expand Up @@ -146,7 +147,15 @@ func (s *HealthServerTestSuite) TestProbersetNodes(c *check.C) {
}},
}
c.Assert(sortNodes(nodes), checker.DeepEquals, sortNodes(expected))

// Set result of probing before updating the nodes.
// The result should not be deleted after node update.
if elem, ok := prober.results[ipString(node1.NodeElement.PrimaryAddress.IPV4.IP)]; ok {
elem.Icmp = &ciliumModels.ConnectivityStatus{
Status: "Some status",
}
} else {
c.Errorf("expected to find result element for node's ip %s", node1.NodeElement.PrimaryAddress.IPV4.IP)
}
// Update node 1. Node 2 should remain unaffected.
modifiedNodesOld := nodeMap{
ipString(node1.Name): node1,
Expand All @@ -163,6 +172,13 @@ func (s *HealthServerTestSuite) TestProbersetNodes(c *check.C) {
IP: node1HealthIP,
}}
c.Assert(sortNodes(nodes), checker.DeepEquals, sortNodes(expected))
if elem, ok := prober.results[ipString(node1.NodeElement.PrimaryAddress.IPV4.IP)]; !ok {
c.Errorf("expected to find result element for node's ip %s", node1.NodeElement.PrimaryAddress.IPV4.IP)
} else {
// Check that status was not removed when updating node
c.Assert(elem.Icmp, check.NotNil)
c.Assert(elem.Icmp.Status, checker.Equals, "Some status")
}

// Remove node 1. Again, Node 2 should remain.
removedNodes := nodeMap{
Expand Down
3 changes: 2 additions & 1 deletion pkg/health/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ func (s *Server) runActiveServices() error {
// OnIdle is called every ProbeInterval after sending out all icmp pings.
// There are a few important consideration here:
// (1) ICMP prober doesn't report failed probes
// (2) We can receive the same nodes multiple times in nodesAdded in case of updates
// (2) We can receive the same nodes multiple times,
// updated node is present in both nodesAdded and nodesRemoved
// (3) We need to clean icmp status to not retain stale probe results
// (4) We don't want to report stale nodes in metrics

Expand Down