Skip to content

Commit

Permalink
operator: fix errors/warnings metric.
Browse files Browse the repository at this point in the history
This was broken during the transition of pkg/metrics to integrate with Hive, where the relevant operator metrics were never initialized.
This adds an init func specific to the operator and cleans up the "flush" logic used as a workaround for errors/warnings emitted prior to the agent starting (in the case of the operator).

Addresses: #29525

Signed-off-by: Tom Hadlaw <tom.hadlaw@isovalent.com>
  • Loading branch information
tommyp1ckles committed Mar 12, 2024
1 parent 1907334 commit b8061b4
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 9 deletions.
7 changes: 7 additions & 0 deletions operator/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"sync/atomic"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/viper"
Expand Down Expand Up @@ -113,6 +114,12 @@ var (
EnableGatewayAPI: operatorCfg.EnableGatewayAPI,
}
}),

cell.Invoke(func(r prometheus.Registerer) {
	// Create the operator's errors/warnings counter before it is registered.
	metrics.InitOperatorMetrics()
	// The injected Registerer can be nil (the provider returns nil when
	// metrics are disabled); calling MustRegister on a nil interface would
	// panic, so registration is skipped in that case.
	if r != nil {
		r.MustRegister(metrics.ErrorsWarnings)
	}
	// Always signal that metrics initialization is done, whether or not the
	// counter was registered.
	metrics.FlushLoggingMetrics()
}),
)

// ControlPlane implements the control functions.
Expand Down
2 changes: 1 addition & 1 deletion operator/metrics/cell.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ var Cell = cell.Module(
"Operator Metrics",

cell.Config(defaultConfig),
cell.Invoke(registerMetricsManager),
cell.Provide(registerMetricsManager),
)

// Config contains the configuration for the operator-metrics cell.
Expand Down
5 changes: 3 additions & 2 deletions operator/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ func (mm *metricsManager) Stop(ctx cell.HookContext) error {
return nil
}

func registerMetricsManager(p params) {
func registerMetricsManager(p params) prometheus.Registerer {
if !p.SharedCfg.EnableMetrics {
return
return nil
}

mm := &metricsManager{
Expand Down Expand Up @@ -104,4 +104,5 @@ func registerMetricsManager(p params) {
}

p.Lifecycle.Append(mm)
return Registry
}
9 changes: 9 additions & 0 deletions pkg/metrics/logging_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ import (

// metricsInitialized is closed by FlushLoggingMetrics to signal that metrics
// have been initialized. It is reset to nil once closed.
var metricsInitialized = make(chan struct{})

// FlushLoggingMetrics closes metricsInitialized and then resets it to nil, so
// that any further call is a no-op instead of closing the channel twice.
// The function performs no locking of its own.
func FlushLoggingMetrics() {
	if metricsInitialized == nil {
		// Already flushed.
		return
	}
	close(metricsInitialized)
	metricsInitialized = nil
}

// LoggingHook is a hook for logrus which counts error and warning messages as a
// Prometheus metric.
type LoggingHook struct {
Expand All @@ -23,6 +30,8 @@ type LoggingHook struct {

// NewLoggingHook returns a new instance of LoggingHook for the given Cilium
// component.
// Metrics accumulated prior to initializing daemon metrics are flushed to the
// prometheus collector once metrics initialization has been signalled
// (see FlushLoggingMetrics and pkg/metrics.Cell).
func NewLoggingHook() *LoggingHook {
lh := &LoggingHook{}
go func() {
Expand Down
21 changes: 15 additions & 6 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -981,12 +981,7 @@ func NewLegacyMetrics() *LegacyMetrics {
Help: "Number of services events labeled by action type",
}, []string{LabelAction}),

ErrorsWarnings: metric.NewCounterVec(metric.CounterOpts{
ConfigName: Namespace + "_errors_warnings_total",
Namespace: Namespace,
Name: "errors_warnings_total",
Help: "Number of total errors in cilium-agent instances",
}, []string{"level", "subsystem"}),
ErrorsWarnings: newErrorsWarningsMetric(),

ControllerRuns: metric.NewCounterVec(metric.CounterOpts{
ConfigName: Namespace + "_controllers_runs_total",
Expand Down Expand Up @@ -1413,6 +1408,20 @@ func NewLegacyMetrics() *LegacyMetrics {
return lm
}

// InitOperatorMetrics sets up the legacy metrics needed during operator
// initialization. It assigns a freshly constructed errors/warnings counter
// vector to the package-level ErrorsWarnings metric.
func InitOperatorMetrics() {
	ErrorsWarnings = newErrorsWarningsMetric()
}

// newErrorsWarningsMetric constructs the "errors_warnings_total" counter
// vector under Namespace, labelled by "level" and "subsystem".
func newErrorsWarningsMetric() metric.Vec[metric.Counter] {
	opts := metric.CounterOpts{
		ConfigName: Namespace + "_errors_warnings_total",
		Namespace:  Namespace,
		Name:       "errors_warnings_total",
		Help:       "Number of total errors in cilium-agent instances",
	}
	labelNames := []string{"level", "subsystem"}
	return metric.NewCounterVec(opts, labelNames)
}

// GaugeWithThreshold is a prometheus gauge that registers itself with
// prometheus if over a threshold value and unregisters when under.
type GaugeWithThreshold struct {
Expand Down

0 comments on commit b8061b4

Please sign in to comment.