diff --git a/pkg/ipam/metrics/metrics.go b/pkg/ipam/metrics/metrics.go
index 581f3c86312f..44dc90e39210 100644
--- a/pkg/ipam/metrics/metrics.go
+++ b/pkg/ipam/metrics/metrics.go
@@ -21,7 +21,12 @@ type prometheusMetrics struct {
 	AllocateInterfaceOps *prometheus.CounterVec
 	AllocateIpOps        *prometheus.CounterVec
 	ReleaseIpOps         *prometheus.CounterVec
-	IPsAllocated         *prometheus.GaugeVec
+	AvailableIPs         *prometheus.GaugeVec
+	UsedIPs              *prometheus.GaugeVec
+	NeededIPs            *prometheus.GaugeVec
+	// Deprecated, will be removed in version 1.15.
+	// Use AvailableIPs, UsedIPs and NeededIPs instead.
+	IPsAllocated *prometheus.GaugeVec
 	// Deprecated, will be removed in version 1.14:
 	// Use InterfaceCandidates and EmptyInterfaceSlots instead
 	AvailableInterfaces prometheus.Gauge
@@ -35,6 +40,10 @@ type prometheusMetrics struct {
 	resync *triggerMetrics
 }
 
+// LabelTargetNodeName is the label name attached to the per-node IP gauges
+// (AvailableIPs, UsedIPs and NeededIPs).
+const LabelTargetNodeName = "target_node"
+
 // NewPrometheusMetrics returns a new interface metrics implementation backed by
 // Prometheus metrics.
 func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *prometheusMetrics {
@@ -42,6 +51,27 @@ func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *
 		registry: registry,
 	}
 
+	m.AvailableIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: ipamSubsystem,
+		Name:      "available_ips",
+		Help:      "Total available IPs on Node for IPAM allocation",
+	}, []string{LabelTargetNodeName})
+
+	m.UsedIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: ipamSubsystem,
+		Name:      "used_ips",
+		Help:      "Total used IPs on Node for IPAM allocation",
+	}, []string{LabelTargetNodeName})
+
+	m.NeededIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: namespace,
+		Subsystem: ipamSubsystem,
+		Name:      "needed_ips",
+		Help:      "Number of IPs that are needed on the Node to satisfy IPAM allocation requests",
+	}, []string{LabelTargetNodeName})
+
 	m.IPsAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: namespace,
 		Subsystem: ipamSubsystem,
@@ -140,6 +170,10 @@ func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *
 	m.k8sSync = NewTriggerMetrics(namespace, "k8s_sync")
 	m.resync = NewTriggerMetrics(namespace, "resync")
 
+	registry.MustRegister(m.AvailableIPs)
+	registry.MustRegister(m.UsedIPs)
+	registry.MustRegister(m.NeededIPs)
+
 	registry.MustRegister(m.IPsAllocated)
 	registry.MustRegister(m.AllocateIpOps)
 	registry.MustRegister(m.ReleaseIpOps)
@@ -219,6 +253,21 @@ func (p *prometheusMetrics) ReleaseAttempt(typ, status, subnetID string, observe
 	p.Release.WithLabelValues(typ, status, subnetID).Observe(observe)
 }
 
+// SetIPAvailable sets the available_ips gauge for the given node.
+func (p *prometheusMetrics) SetIPAvailable(node string, available int) {
+	p.AvailableIPs.WithLabelValues(node).Set(float64(available))
+}
+
+// SetIPUsed sets the used_ips gauge for the given node.
+func (p *prometheusMetrics) SetIPUsed(node string, used int) {
+	p.UsedIPs.WithLabelValues(node).Set(float64(used))
+}
+
+// SetIPNeeded sets the needed_ips gauge for the given node.
+func (p *prometheusMetrics) SetIPNeeded(node string, needed int) {
+	p.NeededIPs.WithLabelValues(node).Set(float64(needed))
+}
+
 type triggerMetrics struct {
 	total prometheus.Counter
 	folds prometheus.Gauge
diff --git a/pkg/ipam/metrics/mock/mock.go b/pkg/ipam/metrics/mock/mock.go
index 95299e399620..c02586a68bc4 100644
--- a/pkg/ipam/metrics/mock/mock.go
+++ b/pkg/ipam/metrics/mock/mock.go
@@ -173,6 +173,20 @@ func (m *mockMetrics) IncResyncCount() {
 	m.mutex.Unlock()
 }
 
+// SetNodeIPCapacity is a no-op.
+// NOTE(review): no interface visible in this change declares this method; it
+// looks like a leftover from an earlier API shape. Confirm before removing.
+func (m *mockMetrics) SetNodeIPCapacity(s string, t string, n int) {}
+
+// SetIPAvailable is a no-op; it exists so mockMetrics provides the
+// MetricsNodeAPI methods that ipam.MetricsAPI embeds.
+func (m *mockMetrics) SetIPAvailable(node string, available int) {}
+
+// SetIPUsed is a no-op; see SetIPAvailable.
+func (m *mockMetrics) SetIPUsed(node string, used int) {}
+
+// SetIPNeeded is a no-op; see SetIPAvailable.
+func (m *mockMetrics) SetIPNeeded(node string, needed int) {}
+
 func (m *mockMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
 	return nil
 }
diff --git a/pkg/ipam/node_manager.go b/pkg/ipam/node_manager.go
index ba91b6586bd7..84afa03aa4f4 100644
--- a/pkg/ipam/node_manager.go
+++ b/pkg/ipam/node_manager.go
@@ -138,6 +138,8 @@ type AllocationImplementation interface {
 
 // MetricsAPI represents the metrics being maintained by a NodeManager
 type MetricsAPI interface {
+	MetricsNodeAPI
+
 	AllocationAttempt(typ, status, subnetID string, observe float64)
 	ReleaseAttempt(typ, status, subnetID string, observe float64)
 	IncInterfaceAllocation(subnetID string)
@@ -155,6 +157,14 @@ type MetricsAPI interface {
 	ResyncTrigger() trigger.MetricsObserver
 }
 
+// MetricsNodeAPI represents the per-node IP metrics being maintained by a
+// NodeManager. It is embedded in MetricsAPI.
+type MetricsNodeAPI interface {
+	SetIPAvailable(node string, available int)
+	SetIPUsed(node string, used int)
+	SetIPNeeded(node string, needed int)
+}
+
 // nodeMap is a mapping of node names to ENI nodes
 type nodeMap map[string]*Node
 
@@ -470,6 +480,9 @@ func (n *NodeManager) resyncNode(ctx context.Context, node *Node, stats *resyncS
 
 	stats.mutex.Lock()
 	stats.totalUsed += nodeStats.UsedIPs
+	// availableOnNode is the number of available IPs on the node at this
+	// current moment. It does not take into account the number of IPs that
+	// can be allocated in the future.
 	availableOnNode := nodeStats.AvailableIPs - nodeStats.UsedIPs
 	stats.totalAvailable += availableOnNode
 	stats.totalNeeded += nodeStats.NeededIPs
@@ -480,6 +493,11 @@ func (n *NodeManager) resyncNode(ctx context.Context, node *Node, stats *resyncS
 
 	stats.nodeCapacity = nodeStats.Capacity
 
+	// Set per Node metrics.
+	n.metricsAPI.SetIPAvailable(node.name, stats.nodeCapacity)
+	n.metricsAPI.SetIPUsed(node.name, nodeStats.UsedIPs)
+	n.metricsAPI.SetIPNeeded(node.name, nodeStats.NeededIPs)
+
 	if allocationNeeded {
 		stats.nodesInDeficit++
 	}