diff --git a/integration/asserts.go b/integration/asserts.go
index fc4e77c4a94..e595aa1b3c5 100644
--- a/integration/asserts.go
+++ b/integration/asserts.go
@@ -32,7 +32,7 @@ var (
 		Querier:       []string{},
 		QueryFrontend: []string{"cortex_frontend", "cortex_query_frontend"},
 		TableManager:  []string{},
-		AlertManager:  []string{},
+		AlertManager:  []string{"cortex_alertmanager"},
 		Ruler:         []string{},
 	}
diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go
index 6a8c2b6ba0c..66c874b1b8f 100644
--- a/pkg/alertmanager/alertmanager_metrics_test.go
+++ b/pkg/alertmanager/alertmanager_metrics_test.go
@@ -6,6 +6,7 @@ import (

 	"github.com/prometheus/alertmanager/types"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"
 )
@@ -220,48 +221,37 @@ type nflogMetrics struct {

 func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
 	m := &nflogMetrics{}
-	m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+	m.gcDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
 		Name:       "alertmanager_nflog_gc_duration_seconds",
 		Help:       "Duration of the last notification log garbage collection cycle.",
 		Objectives: map[float64]float64{},
 	})
-	m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+	m.snapshotDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
 		Name:       "alertmanager_nflog_snapshot_duration_seconds",
 		Help:       "Duration of the last notification log snapshot.",
 		Objectives: map[float64]float64{},
 	})
-	m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{
+	m.snapshotSize = promauto.With(r).NewGauge(prometheus.GaugeOpts{
 		Name: "alertmanager_nflog_snapshot_size_bytes",
 		Help: "Size of the last notification log snapshot in bytes.",
 	})
-	m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.queriesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_queries_total",
 		Help: "Number of notification log queries were received.",
 	})
-	m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.queryErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_query_errors_total",
 		Help: "Number notification log received queries that failed.",
 	})
-	m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+	m.queryDuration = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
 		Name: "alertmanager_nflog_query_duration_seconds",
 		Help: "Duration of notification log query evaluation.",
 	})
-	m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.propagatedMessagesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_gossip_messages_propagated_total",
 		Help: "Number of received gossip messages that have been further gossiped.",
 	})
-	if r != nil {
-		r.MustRegister(
-			m.gcDuration,
-			m.snapshotDuration,
-			m.snapshotSize,
-			m.queriesTotal,
-			m.queryErrorsTotal,
-			m.queryDuration,
-			m.propagatedMessagesTotal,
-		)
-	}
 	return m
 }
@@ -282,66 +272,52 @@ type silenceMetrics struct {

 func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
 	m := &silenceMetrics{}
-	m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+	m.gcDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
 		Name:       "alertmanager_silences_gc_duration_seconds",
 		Help:       "Duration of the last silence garbage collection cycle.",
 		Objectives: map[float64]float64{},
 	})
-	m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+	m.snapshotDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
 		Name:       "alertmanager_silences_snapshot_duration_seconds",
 		Help:       "Duration of the last silence snapshot.",
 		Objectives: map[float64]float64{},
 	})
-	m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{
+	m.snapshotSize = promauto.With(r).NewGauge(prometheus.GaugeOpts{
 		Name: "alertmanager_silences_snapshot_size_bytes",
 		Help: "Size of the last silence snapshot in bytes.",
 	})
-	m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.queriesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_silences_queries_total",
 		Help: "How many silence queries were received.",
 	})
-	m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.queryErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_silences_query_errors_total",
 		Help: "How many silence received queries did not succeed.",
 	})
-	m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+	m.queryDuration = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
 		Name: "alertmanager_silences_query_duration_seconds",
 		Help: "Duration of silence query evaluation.",
 	})
-	m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
+	m.propagatedMessagesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_silences_gossip_messages_propagated_total",
 		Help: "Number of received gossip messages that have been further gossiped.",
 	})
-	m.silencesActive = prometheus.NewGauge(prometheus.GaugeOpts{
+	m.silencesActive = promauto.With(r).NewGauge(prometheus.GaugeOpts{
 		Name:        "alertmanager_silences",
 		Help:        "How many silences by state.",
 		ConstLabels: prometheus.Labels{"state": string(types.SilenceStateActive)},
 	})
-	m.silencesPending = prometheus.NewGauge(prometheus.GaugeOpts{
+	m.silencesPending = promauto.With(r).NewGauge(prometheus.GaugeOpts{
 		Name:        "alertmanager_silences",
 		Help:        "How many silences by state.",
 		ConstLabels: prometheus.Labels{"state": string(types.SilenceStatePending)},
 	})
-	m.silencesExpired = prometheus.NewGauge(prometheus.GaugeOpts{
+	m.silencesExpired = promauto.With(r).NewGauge(prometheus.GaugeOpts{
 		Name:        "alertmanager_silences",
 		Help:        "How many silences by state.",
 		ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)},
 	})
-	if r != nil {
-		r.MustRegister(
-			m.gcDuration,
-			m.snapshotDuration,
-			m.snapshotSize,
-			m.queriesTotal,
-			m.queryErrorsTotal,
-			m.queryDuration,
-			m.silencesActive,
-			m.silencesPending,
-			m.silencesExpired,
-			m.propagatedMessagesTotal,
-		)
-	}
 	return m
 }
@@ -354,17 +330,17 @@ type notifyMetrics struct {

 func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
 	m := &notifyMetrics{
-		numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
+		numNotifications: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
 			Namespace: "alertmanager",
 			Name:      "notifications_total",
 			Help:      "The total number of attempted notifications.",
 		}, []string{"integration"}),
-		numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
+		numFailedNotifications: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
 			Namespace: "alertmanager",
 			Name:      "notifications_failed_total",
 			Help:      "The total number of failed notifications.",
 		}, []string{"integration"}),
-		notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		notificationLatencySeconds: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager", Name: "notification_latency_seconds", Help: "The latency of notifications in seconds.", @@ -376,7 +352,6 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { m.numFailedNotifications.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) } - r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds) return m } @@ -385,15 +360,12 @@ type markerMetrics struct { } func newMarkerMetrics(r prometheus.Registerer) *markerMetrics { - m := &markerMetrics{ - alerts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + return &markerMetrics{ + alerts: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ Name: "alertmanager_alerts", Help: "How many alerts by state.", }, []string{"state"}), } - - r.MustRegister(m.alerts) - return m } // Copied from github.com/alertmanager/api/metrics/metrics.go @@ -404,19 +376,17 @@ type apiMetrics struct { } func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { - numReceivedAlerts := prometheus.NewCounterVec(prometheus.CounterOpts{ + numReceivedAlerts := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Name: "alertmanager_alerts_received_total", Help: "The total number of received alerts.", ConstLabels: prometheus.Labels{"version": version}, }, []string{"status"}) - numInvalidAlerts := prometheus.NewCounter(prometheus.CounterOpts{ + numInvalidAlerts := promauto.With(r).NewCounter(prometheus.CounterOpts{ Name: "alertmanager_alerts_invalid_total", Help: "The total number of received alerts that were invalid.", ConstLabels: prometheus.Labels{"version": version}, }) - if r != nil { - r.MustRegister(numReceivedAlerts, numInvalidAlerts) - } + return &apiMetrics{ firing: numReceivedAlerts.WithLabelValues("firing"), resolved: numReceivedAlerts.WithLabelValues("resolved"), diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index d370cbc3439..7e578a2b315 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -19,6 +19,7 @@ import ( "github.com/prometheus/alertmanager/cluster" amconfig "github.com/prometheus/alertmanager/config" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/weaveworks/common/user" "github.com/cortexproject/cortex/pkg/alertmanager/alerts" @@ -70,20 +71,10 @@ const ( ) var ( - totalConfigs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: "cortex", - Name: "alertmanager_configs", - Help: "How many configs the multitenant alertmanager knows about.", - }, []string{"status"}) statusTemplate *template.Template ) func init() { - // Ensure the metric values are initialized. 
-	totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
-	totalConfigs.WithLabelValues(configStatusValid).Set(0)
-
-	prometheus.MustRegister(totalConfigs)
 	statusTemplate = template.Must(template.New("statusPage").Funcs(map[string]interface{}{
 		"state": func(enabled bool) string {
 			if enabled {
@@ -133,6 +124,24 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
 	cfg.Store.RegisterFlags(f)
 }

+type multitenantAlertmanagerMetrics struct {
+	totalConfigs *prometheus.GaugeVec
+}
+
+func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
+	m := &multitenantAlertmanagerMetrics{}
+
+	m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "cortex",
+		Name:      "alertmanager_configs",
+		Help:      "How many configs the multitenant alertmanager knows about.",
+	}, []string{"status"})
+	m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
+	m.totalConfigs.WithLabelValues(configStatusValid).Set(0)
+
+	return m
+}
+
 // A MultitenantAlertmanager manages Alertmanager instances for multiple
 // organizations.
 type MultitenantAlertmanager struct {
@@ -153,8 +162,9 @@
 	alertmanagersMtx sync.Mutex
 	alertmanagers    map[string]*Alertmanager

-	logger  log.Logger
-	metrics *alertmanagerMetrics
+	logger              log.Logger
+	alertmanagerMetrics *alertmanagerMetrics
+	multitenantMetrics  *multitenantAlertmanagerMetrics

 	peer *cluster.Peer
 }
@@ -213,18 +223,19 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.L

 func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackConfig []byte, peer *cluster.Peer, store AlertStore, logger log.Logger, registerer prometheus.Registerer) *MultitenantAlertmanager {
 	am := &MultitenantAlertmanager{
-		cfg:            cfg,
-		fallbackConfig: string(fallbackConfig),
-		cfgs:           map[string]alerts.AlertConfigDesc{},
-		alertmanagers:  map[string]*Alertmanager{},
-		metrics:        newAlertmanagerMetrics(),
-		peer:           peer,
-		store:          store,
-		logger:         log.With(logger, "component", "MultiTenantAlertmanager"),
+		cfg:                 cfg,
+		fallbackConfig:      string(fallbackConfig),
+		cfgs:                map[string]alerts.AlertConfigDesc{},
+		alertmanagers:       map[string]*Alertmanager{},
+		alertmanagerMetrics: newAlertmanagerMetrics(),
+		multitenantMetrics:  newMultitenantAlertmanagerMetrics(registerer),
+		peer:                peer,
+		store:               store,
+		logger:              log.With(logger, "component", "MultiTenantAlertmanager"),
 	}

 	if registerer != nil {
-		registerer.MustRegister(am.metrics)
+		registerer.MustRegister(am.alertmanagerMetrics)
 	}

 	am.Service = services.NewTimerService(am.cfg.PollInterval, am.starting, am.iteration, am.stopping)
@@ -320,8 +331,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
 			level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
 		}
 	}
-	totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
-	totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
+	am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
+	am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
 }

 func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) {
@@ -437,7 +448,6 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error {

 func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config) (*Alertmanager, error) {
 	reg := prometheus.NewRegistry()
-	am.metrics.addUserRegistry(userID, reg)
 	newAM, err := New(&Config{
 		UserID:      userID,
 		DataDir:     am.cfg.DataDir,
@@ -455,7 +465,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco
 		return nil, fmt.Errorf("unable to apply initial config for user %v: %v", userID, err)
 	}

-	am.metrics.addUserRegistry(userID, reg)
+	am.alertmanagerMetrics.addUserRegistry(userID, reg)
 	return newAM, nil
 }
diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go
index 158d4a3fc09..4a7d486f13e 100644
--- a/pkg/alertmanager/multitenant_test.go
+++ b/pkg/alertmanager/multitenant_test.go
@@ -3,12 +3,16 @@ package alertmanager

 import (
+	"bytes"
 	"context"
 	"io/ioutil"
 	"os"
 	"testing"

 	"github.com/go-kit/kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"

 	"github.com/cortexproject/cortex/pkg/alertmanager/alerts"
@@ -64,20 +68,27 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.NoError(t, err)
 	defer os.RemoveAll(tempDir)

+	reg := prometheus.NewPedanticRegistry()
 	am := createMultitenantAlertmanager(&MultitenantAlertmanagerConfig{
 		ExternalURL: externalURL,
 		DataDir:     tempDir,
-	}, nil, nil, mockStore, log.NewNopLogger(), nil)
+	}, nil, nil, mockStore, log.NewNopLogger(), reg)

 	// Ensure the configs are synced correctly
 	require.NoError(t, am.updateConfigs())
-	require.Len(t, am.alertmanagers, 2)

 	currentConfig, exists := am.cfgs["user1"]
 	require.True(t, exists)
 	require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

+	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
+		# TYPE cortex_alertmanager_configs gauge
+		cortex_alertmanager_configs{status="valid"} 2
+		cortex_alertmanager_configs{status="invalid"} 0
+	`), "cortex_alertmanager_configs"))
+
 	// Ensure when a 3rd config is added, it is synced correctly
 	mockStore.configs["user3"] = alerts.AlertConfigDesc{
 		User: "user3",
@@ -86,9 +97,15 @@
 	}

 	require.NoError(t, am.updateConfigs())
-	require.Len(t, am.alertmanagers, 3)

+	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
+		# TYPE cortex_alertmanager_configs gauge
+		cortex_alertmanager_configs{status="valid"} 3
+		cortex_alertmanager_configs{status="invalid"} 0
+	`), "cortex_alertmanager_configs"))
+
 	// Ensure the config is updated
 	mockStore.configs["user1"] = alerts.AlertConfigDesc{
 		User: "user1",
@@ -114,6 +131,13 @@
 	require.True(t, exists)
 	require.False(t, userAM.IsActive())

+	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
+		# TYPE cortex_alertmanager_configs gauge
+		cortex_alertmanager_configs{status="valid"} 2
+		cortex_alertmanager_configs{status="invalid"} 0
+	`), "cortex_alertmanager_configs"))
+
 	// Ensure when a 3rd config is re-added, it is synced correctly
 	mockStore.configs["user3"] = alerts.AlertConfigDesc{
 		User: "user3",
@@ -130,4 +154,11 @@
 	userAM, exists = am.alertmanagers["user3"]
 	require.True(t, exists)
 	require.True(t, userAM.IsActive())
+
+	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
+		# TYPE cortex_alertmanager_configs gauge
+		cortex_alertmanager_configs{status="valid"} 3
+		cortex_alertmanager_configs{status="invalid"} 0
+	`), "cortex_alertmanager_configs"))
 }
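For context, a minimal sketch (not part of the patch above) of why the `if r != nil { r.MustRegister(...) }` guards can be dropped: promauto.With accepts a nil Registerer, in which case the returned Factory creates collectors without registering them, while a non-nil Registerer gets the same MustRegister-on-collision behaviour as before. The metric name below is hypothetical, chosen only for illustration.

package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// newGauge is safe to call with a nil registerer: the gauge is still created,
// it is simply not registered anywhere.
func newGauge(r prometheus.Registerer) prometheus.Gauge {
	return promauto.With(r).NewGauge(prometheus.GaugeOpts{
		Name: "example_snapshot_size_bytes", // hypothetical name, for illustration only
		Help: "Size of the last snapshot in bytes.",
	})
}

func main() {
	_ = newGauge(nil)                      // no registration, no panic
	_ = newGauge(prometheus.NewRegistry()) // registered with the provided registry
}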