Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: add a metric for max observed endpoint ifindex #27953

Merged
merged 4 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions Documentation/observability/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,20 @@ Endpoint
Name Labels Default Description
============================================ ================================================== ========== ========================================================
``endpoint`` Enabled Number of endpoints managed by this agent
``endpoint_max_ifindex`` Disabled Maximum interface index observed for existing endpoints
``endpoint_regenerations_total`` ``outcome`` Enabled Count of all endpoint regenerations that have completed
``endpoint_regeneration_time_stats_seconds`` ``scope`` Enabled Endpoint regeneration time stats
``endpoint_state`` ``state`` Enabled Count of all endpoints
============================================ ================================================== ========== ========================================================

Whether ``endpoint_max_ifindex`` is enabled by default depends on the kernel.
On older kernels (typically versions lower than 5.10), Cilium must store the
interface index of each endpoint in the conntrack map, which reserves only 16
bits for this field. When Cilium runs on such a kernel, this metric is enabled
by default. It can be used to implement an alert that fires when the ifindex
approaches the limit of 65535, which may happen in cases of significant
Endpoint churn.

Services
~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions Documentation/operations/upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ Added Metrics
~~~~~~~~~~~~~

* ``cilium_ipam_capacity``
* ``cilium_endpoint_max_ifindex``. See `#27953 <https://github.com/cilium/cilium/pull/27953>`_ for configuration and usage information.

Changed Metrics
~~~~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions Documentation/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ hugepages
hv
icmp
ie
ifindex
impactful
ingressing
init
Expand Down
10 changes: 10 additions & 0 deletions daemon/cmd/datapath.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
"github.com/cilium/cilium/pkg/maps/tunnel"
"github.com/cilium/cilium/pkg/maps/vtep"
"github.com/cilium/cilium/pkg/maps/worldcidrsmap"
"github.com/cilium/cilium/pkg/metrics"
"github.com/cilium/cilium/pkg/mtu"
"github.com/cilium/cilium/pkg/node"
"github.com/cilium/cilium/pkg/option"
Expand Down Expand Up @@ -298,6 +299,15 @@ func (d *Daemon) syncHostIPs() error {
}
}

// we have a reference to all ifindex values, so we update the related metric
maxIfindex := uint32(0)
for _, endpoint := range existingEndpoints {
if endpoint.IfIndex > maxIfindex {
maxIfindex = endpoint.IfIndex
}
}
metrics.EndpointMaxIfindex.Set(float64(maxIfindex))

if option.Config.EnableVTEP {
err := setupVTEPMapping()
if err != nil {
Expand Down
3 changes: 2 additions & 1 deletion pkg/aws/eni/node_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
ipamTypes "github.com/cilium/cilium/pkg/ipam/types"
v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
"github.com/cilium/cilium/pkg/testutils"
testipam "github.com/cilium/cilium/pkg/testutils/ipam"
)

var (
Expand Down Expand Up @@ -459,7 +460,7 @@ func (e *ENISuite) TestNodeManagerReleaseAddress(c *check.C) {
time.Sleep(1 * time.Second)
node.PopulateIPReleaseStatus(obj)
// Fake acknowledge IPs for release like agent would.
testutils.FakeAcknowledgeReleaseIps(obj)
testipam.FakeAcknowledgeReleaseIps(obj)
node.UpdatedResource(obj)
// Resync one more time to process acknowledgements.
syncTime = instances.Resync(context.TODO())
Expand Down
8 changes: 4 additions & 4 deletions pkg/datapath/linux/probes/managed_neighbors.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ func haveManagedNeighbors() (outer error) {
neigh := netlink.Neigh{
LinkIndex: veth.Index,
IP: net.IPv4(0, 0, 0, 1),
Flags: netlink.NTF_EXT_LEARNED,
FlagsExt: netlink.NTF_EXT_MANAGED,
Flags: NTF_EXT_LEARNED,
FlagsExt: NTF_EXT_MANAGED,
}

if err := netlink.NeighAdd(&neigh); err != nil {
Expand All @@ -103,10 +103,10 @@ func haveManagedNeighbors() (outer error) {
if !n.IP.Equal(neigh.IP) {
continue
}
if n.Flags != netlink.NTF_EXT_LEARNED {
if n.Flags != NTF_EXT_LEARNED {
continue
}
if n.FlagsExt != netlink.NTF_EXT_MANAGED {
if n.FlagsExt != NTF_EXT_MANAGED {
continue
}

Expand Down
12 changes: 12 additions & 0 deletions pkg/datapath/linux/probes/probes_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package probes

import "github.com/vishvananda/netlink"

// Neighbor entry flags, re-exported from the netlink package under the same
// names so that code in this package can use them without the netlink
// qualifier. They are set on netlink.Neigh entries: NTF_EXT_LEARNED in the
// Flags field and NTF_EXT_MANAGED in the FlagsExt field.
const (
NTF_EXT_LEARNED = netlink.NTF_EXT_LEARNED
NTF_EXT_MANAGED = netlink.NTF_EXT_MANAGED
)
12 changes: 12 additions & 0 deletions pkg/datapath/linux/probes/probes_unspecified.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

//go:build !linux

package probes

// Stand-in neighbor flag constants for builds that do not target Linux (this
// file is guarded by the !linux build constraint). The values are dummies:
// they only need to exist so the names resolve on those platforms.
const (
	NTF_EXT_LEARNED = 0
	NTF_EXT_MANAGED = 1
)
5 changes: 3 additions & 2 deletions pkg/ipam/node_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/math"
"github.com/cilium/cilium/pkg/testutils"
testipam "github.com/cilium/cilium/pkg/testutils/ipam"
)

var (
Expand Down Expand Up @@ -493,7 +494,7 @@ func (e *IPAMSuite) TestNodeManagerReleaseAddress(c *check.C) {
time.Sleep(1 * time.Second)
node.PopulateIPReleaseStatus(node.resource)
// Fake acknowledge IPs for release like agent would.
testutils.FakeAcknowledgeReleaseIps(node.resource)
testipam.FakeAcknowledgeReleaseIps(node.resource)
// Resync one more time to process acknowledgements.
node.instanceSync.Trigger()
})
Expand Down Expand Up @@ -557,7 +558,7 @@ func (e *IPAMSuite) TestNodeManagerAbortRelease(c *check.C) {
c.Assert(len(node.resource.Status.IPAM.ReleaseIPs), check.Equals, 1)

// Fake acknowledge IPs for release like agent would.
testutils.FakeAcknowledgeReleaseIps(node.resource)
testipam.FakeAcknowledgeReleaseIps(node.resource)

// Use up one more IP to make excess = 0
mngr.Upsert(updateCiliumNode(node.resource, 3))
Expand Down
23 changes: 23 additions & 0 deletions pkg/metrics/metrics.go
asauber marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/sirupsen/logrus"

"github.com/cilium/cilium/api/v1/models"
"github.com/cilium/cilium/pkg/datapath/linux/probes"
"github.com/cilium/cilium/pkg/metrics/metric"
"github.com/cilium/cilium/pkg/promise"
"github.com/cilium/cilium/pkg/version"
Expand Down Expand Up @@ -258,6 +259,9 @@ var (
// It must be thread-safe.
Endpoint metric.GaugeFunc

// EndpointMaxIfindex is the maximum observed interface index for existing endpoints
EndpointMaxIfindex = NoOpGauge

// EndpointRegenerationTotal is a count of the number of times any endpoint
// has been regenerated and success/fail outcome
EndpointRegenerationTotal = NoOpCounterVec
Expand Down Expand Up @@ -570,6 +574,7 @@ type LegacyMetrics struct {
NodeConnectivityStatus metric.Vec[metric.Gauge]
NodeConnectivityLatency metric.Vec[metric.Gauge]
Endpoint metric.GaugeFunc
EndpointMaxIfindex metric.Gauge
christarazi marked this conversation as resolved.
Show resolved Hide resolved
EndpointRegenerationTotal metric.Vec[metric.Counter]
EndpointStateCount metric.Vec[metric.Gauge]
EndpointRegenerationTimeStats metric.Vec[metric.Observer]
Expand Down Expand Up @@ -1280,6 +1285,23 @@ func NewLegacyMetrics() *LegacyMetrics {
}),
}

ifindexOpts := metric.GaugeOpts{
ConfigName: Namespace + "_endpoint_max_ifindex",
Disabled: true,
Namespace: Namespace,
Name: "endpoint_max_ifindex",
Help: "Maximum interface index observed for existing endpoints",
}
// On kernels which do not provide ifindex via the FIB, Cilium needs
// to store it in the CT map, with a field limit of max(uint16).
// The EndpointMaxIfindex metric can be used to determine if that
// limit is approaching. However, it should only be enabled by
// default if we observe that the FIB is not providing the ifindex.
if probes.HaveFibIfindex() != nil {
ifindexOpts.Disabled = false
}
lm.EndpointMaxIfindex = metric.NewGauge(ifindexOpts)

v := version.GetCiliumVersion()
lm.VersionMetric.WithLabelValues(v.Version, v.Revision, v.Arch)

Expand All @@ -1288,6 +1310,7 @@ func NewLegacyMetrics() *LegacyMetrics {
NodeConnectivityStatus = lm.NodeConnectivityStatus
NodeConnectivityLatency = lm.NodeConnectivityLatency
Endpoint = lm.Endpoint
EndpointMaxIfindex = lm.EndpointMaxIfindex
EndpointRegenerationTotal = lm.EndpointRegenerationTotal
EndpointStateCount = lm.EndpointStateCount
EndpointRegenerationTimeStats = lm.EndpointRegenerationTimeStats
Expand Down
2 changes: 1 addition & 1 deletion pkg/testutils/ipam.go → pkg/testutils/ipam/ipam.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package testutils
package testipam

import (
ipamOption "github.com/cilium/cilium/pkg/ipam/option"
Expand Down