Skip to content

Commit

Permalink
operator: fix errors/warnings metric.
Browse files Browse the repository at this point in the history
This was broken during the transition of pkg/metrics to integrate with Hive, where the relevant operator metrics were never initialized.
This adds an init func specific to the operator and cleans up the "flush" logic used as a workaround for errors/warnings emitted prior to the agent starting (in the case of the operator).

Addresses: #29525

Signed-off-by: Tom Hadlaw <tom.hadlaw@isovalent.com>
  • Loading branch information
tommyp1ckles authored and julianwiedmann committed Mar 15, 2024
1 parent e929947 commit f61651f
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 11 deletions.
4 changes: 4 additions & 0 deletions operator/metrics/metrics.go
Expand Up @@ -103,5 +103,9 @@ func registerMetricsManager(p params) {
Registry.MustRegister(metric.(prometheus.Collector))
}

metrics.InitOperatorMetrics()
Registry.MustRegister(metrics.ErrorsWarnings)
metrics.FlushLoggingMetrics()

p.Lifecycle.Append(mm)
}
5 changes: 1 addition & 4 deletions pkg/metrics/cell.go
Expand Up @@ -13,9 +13,6 @@ var Cell = cell.Module("metrics", "Metrics",
cell.Invoke(func(_ *Registry) {
// This is a hack to ensure that errors/warnings collected in the pre hive initialization
// phase are emitted as metrics.
if metricsInitialized != nil {
close(metricsInitialized)
metricsInitialized = nil
}
FlushLoggingMetrics()
}),
)
18 changes: 17 additions & 1 deletion pkg/metrics/logging_hook.go
Expand Up @@ -6,14 +6,30 @@ package metrics
import (
"fmt"
"reflect"
"sync"
"sync/atomic"

"github.com/sirupsen/logrus"

"github.com/cilium/cilium/pkg/logging/logfields"
)

var metricsInitialized chan struct{} = make(chan struct{})
var (
metricsInitialized chan struct{} = make(chan struct{})
flushMetrics = sync.Once{}
)

// FlushLoggingMetrics signals that the errors_warnings metric has been
// registered with the Prometheus collector, so that logging hook counts
// accumulated before that point are applied to their respective
// errors_warnings metrics tuple.
//
// The signal is delivered by closing the metricsInitialized channel, which is
// then cleared. The work is guarded by flushMetrics, so only the first call
// has any effect.
func FlushLoggingMetrics() {
	flushMetrics.Do(func() {
		// Nothing to signal if the channel has already been cleared.
		if metricsInitialized == nil {
			return
		}
		close(metricsInitialized)
		metricsInitialized = nil
	})
}

// LoggingHook is a hook for logrus which counts error and warning messages as a
// Prometheus metric.
Expand Down
21 changes: 15 additions & 6 deletions pkg/metrics/metrics.go
Expand Up @@ -981,12 +981,7 @@ func NewLegacyMetrics() *LegacyMetrics {
Help: "Number of services events labeled by action type",
}, []string{LabelAction}),

ErrorsWarnings: metric.NewCounterVec(metric.CounterOpts{
ConfigName: Namespace + "_errors_warnings_total",
Namespace: Namespace,
Name: "errors_warnings_total",
Help: "Number of total errors in cilium-agent instances",
}, []string{"level", "subsystem"}),
ErrorsWarnings: newErrorsWarningsMetric(),

ControllerRuns: metric.NewCounterVec(metric.CounterOpts{
ConfigName: Namespace + "_controllers_runs_total",
Expand Down Expand Up @@ -1413,6 +1408,20 @@ func NewLegacyMetrics() *LegacyMetrics {
return lm
}

// InitOperatorMetrics is used to init legacy metrics necessary during operator init.
//
// It assigns a newly constructed errors/warnings counter vector (built by
// newErrorsWarningsMetric) to the package-level ErrorsWarnings variable.
// NOTE(review): this function only performs the assignment; registering
// ErrorsWarnings with a registry appears to be left to the caller — confirm.
func InitOperatorMetrics() {
ErrorsWarnings = newErrorsWarningsMetric()
}

// newErrorsWarningsMetric builds the "errors_warnings_total" counter vector
// under Namespace, labelled by "level" and "subsystem".
func newErrorsWarningsMetric() metric.Vec[metric.Counter] {
	opts := metric.CounterOpts{
		ConfigName: Namespace + "_errors_warnings_total",
		Namespace:  Namespace,
		Name:       "errors_warnings_total",
		Help:       "Number of total errors in cilium-agent instances",
	}
	labelNames := []string{"level", "subsystem"}

	return metric.NewCounterVec(opts, labelNames)
}

// GaugeWithThreshold is a prometheus gauge that registers itself with
// prometheus if over a threshold value and unregisters when under.
type GaugeWithThreshold struct {
Expand Down

0 comments on commit f61651f

Please sign in to comment.