Skip to content

Commit

Permalink
ipam/metrics: add per-node ENI IPAM IP metrics.
Browse files Browse the repository at this point in the history
These are intended to provide precise visibility into the actual available
capacity that individual nodes have for scheduling more CNI Endpoints.

Specifically this addresses problems with existing metrics:

* ipam_ips: does not actually provide the total available for scheduling, but
	rather the number of IPs available at a particular point in time
	(discounting any future allocation).
* available interface / slots metrics: these can indicate whether there is
	capacity, but not specifically how much capacity.

We want to be able to determine exactly how many more CNI Pods can still be
scheduled, as well as what the availability is on individual nodes.

Signed-off-by: Tom Hadlaw <tom.hadlaw@isovalent.com>
  • Loading branch information
tommyp1ckles committed Apr 20, 2023
1 parent f03c7f0 commit 56ab89f
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
47 changes: 46 additions & 1 deletion pkg/ipam/metrics/metrics.go
Expand Up @@ -21,7 +21,12 @@ type prometheusMetrics struct {
AllocateInterfaceOps *prometheus.CounterVec
AllocateIpOps *prometheus.CounterVec
ReleaseIpOps *prometheus.CounterVec
IPsAllocated *prometheus.GaugeVec
AvailableIPs *prometheus.GaugeVec
UsedIPs *prometheus.GaugeVec
NeededIPs *prometheus.GaugeVec
// Deprecated: will be removed in version 1.15.
// Use AvailableIPs, UsedIPs and NeededIPs instead.
IPsAllocated *prometheus.GaugeVec
// Deprecated, will be removed in version 1.14:
// Use InterfaceCandidates and EmptyInterfaceSlots instead
AvailableInterfaces prometheus.Gauge
Expand All @@ -35,13 +40,36 @@ type prometheusMetrics struct {
resync *triggerMetrics
}

// LabelTargetNodeName is the name of the Prometheus label used by the
// per-node IPAM gauges (available_ips, used_ips, needed_ips) to identify the
// node a sample refers to.
const LabelTargetNodeName = "target_node"

// NewPrometheusMetrics returns a new interface metrics implementation backed by
// Prometheus metrics.
func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *prometheusMetrics {
m := &prometheusMetrics{
registry: registry,
}

m.AvailableIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: ipamSubsystem,
Name: "available_ips",
Help: "Total available IPs on Node for IPAM allocation",
}, []string{LabelTargetNodeName})

m.UsedIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: ipamSubsystem,
Name: "used_ips",
Help: "Total used IPs on Node for IPAM allocation",
}, []string{LabelTargetNodeName})

m.NeededIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: ipamSubsystem,
Name: "needed_ips",
Help: "Number of IPs that are needed on the Node to satisfy IPAM allocation requests",
}, []string{LabelTargetNodeName})

m.IPsAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: ipamSubsystem,
Expand Down Expand Up @@ -140,6 +168,10 @@ func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *
m.k8sSync = NewTriggerMetrics(namespace, "k8s_sync")
m.resync = NewTriggerMetrics(namespace, "resync")

registry.MustRegister(m.AvailableIPs)
registry.MustRegister(m.UsedIPs)
registry.MustRegister(m.NeededIPs)

registry.MustRegister(m.IPsAllocated)
registry.MustRegister(m.AllocateIpOps)
registry.MustRegister(m.ReleaseIpOps)
Expand Down Expand Up @@ -219,6 +251,19 @@ func (p *prometheusMetrics) ReleaseAttempt(typ, status, subnetID string, observe
p.Release.WithLabelValues(typ, status, subnetID).Observe(observe)
}

// Per Node metrics.

// SetIPAvailable sets the available_ips gauge for the series labelled with
// the given node name to cap.
func (p *prometheusMetrics) SetIPAvailable(node string, cap int) {
	gauge := p.AvailableIPs.WithLabelValues(node)
	gauge.Set(float64(cap))
}

// SetIPUsed sets the used_ips gauge for the series labelled with the given
// node name to usage.
func (p *prometheusMetrics) SetIPUsed(node string, usage int) {
	gauge := p.UsedIPs.WithLabelValues(node)
	gauge.Set(float64(usage))
}

// SetIPNeeded sets the needed_ips gauge for the series labelled with the
// given node name to needed, i.e. the number of IPs the node still needs to
// satisfy IPAM allocation requests.
func (p *prometheusMetrics) SetIPNeeded(node string, needed int) {
	p.NeededIPs.WithLabelValues(node).Set(float64(needed))
}

type triggerMetrics struct {
total prometheus.Counter
folds prometheus.Gauge
Expand Down
4 changes: 4 additions & 0 deletions pkg/ipam/metrics/mock/mock.go
Expand Up @@ -173,6 +173,10 @@ func (m *mockMetrics) IncResyncCount() {
m.mutex.Unlock()
}

// SetNodeIPCapacity is a no-op; the mock does not record the values passed in.
func (m *mockMetrics) SetNodeIPCapacity(s string, t string, n int) {}

// SetIPAvailable is a no-op. It is provided so that mockMetrics has the
// SetIPAvailable method declared by the MetricsNodeAPI interface.
func (m *mockMetrics) SetIPAvailable(node string, cap int) {}

// SetIPUsed is a no-op. It is provided so that mockMetrics has the SetIPUsed
// method declared by the MetricsNodeAPI interface.
func (m *mockMetrics) SetIPUsed(node string, used int) {}

// SetIPNeeded is a no-op. It is provided so that mockMetrics has the
// SetIPNeeded method declared by the MetricsNodeAPI interface.
func (m *mockMetrics) SetIPNeeded(node string, needed int) {}

// PoolMaintainerTrigger always returns nil: the mock supplies no metrics
// observer for this trigger.
func (m *mockMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
return nil
}
Expand Down
16 changes: 16 additions & 0 deletions pkg/ipam/node_manager.go
Expand Up @@ -138,6 +138,8 @@ type AllocationImplementation interface {

// MetricsAPI represents the metrics being maintained by a NodeManager
type MetricsAPI interface {
MetricsNodeAPI

AllocationAttempt(typ, status, subnetID string, observe float64)
ReleaseAttempt(typ, status, subnetID string, observe float64)
IncInterfaceAllocation(subnetID string)
Expand All @@ -155,6 +157,12 @@ type MetricsAPI interface {
ResyncTrigger() trigger.MetricsObserver
}

// MetricsNodeAPI represents the per-node IP metrics being maintained by a
// NodeManager. Every method takes the name of the node the value applies to.
type MetricsNodeAPI interface {
// SetIPAvailable records the available-IP value cap for node.
SetIPAvailable(node string, cap int)
// SetIPUsed records the number of used IPs for node.
SetIPUsed(node string, used int)
// SetIPNeeded records the number of needed IPs for node.
SetIPNeeded(node string, needed int)
}

// nodeMap is a mapping of node names to ENI nodes
type nodeMap map[string]*Node

Expand Down Expand Up @@ -470,6 +478,9 @@ func (n *NodeManager) resyncNode(ctx context.Context, node *Node, stats *resyncS

stats.mutex.Lock()
stats.totalUsed += nodeStats.UsedIPs
// availableOnNode is the number of available IPs on the node at this
// current moment. It does not take into account the number of IPs that
// can be allocated in the future.
availableOnNode := nodeStats.AvailableIPs - nodeStats.UsedIPs
stats.totalAvailable += availableOnNode
stats.totalNeeded += nodeStats.NeededIPs
Expand All @@ -480,6 +491,11 @@ func (n *NodeManager) resyncNode(ctx context.Context, node *Node, stats *resyncS

stats.nodeCapacity = nodeStats.Capacity

// Set per Node metrics.
n.metricsAPI.SetIPAvailable(node.name, stats.nodeCapacity)
n.metricsAPI.SetIPUsed(node.name, nodeStats.UsedIPs)
n.metricsAPI.SetIPNeeded(node.name, nodeStats.NeededIPs)

if allocationNeeded {
stats.nodesInDeficit++
}
Expand Down

0 comments on commit 56ab89f

Please sign in to comment.