2 changes: 1 addition & 1 deletion integration/asserts.go
@@ -32,7 +32,7 @@ var (
Querier: []string{},
QueryFrontend: []string{"cortex_frontend", "cortex_query_frontend"},
TableManager: []string{},
AlertManager: []string{},
AlertManager: []string{"cortex_alertmanager"},
Ruler: []string{},
}

82 changes: 26 additions & 56 deletions pkg/alertmanager/alertmanager_metrics_test.go
@@ -6,6 +6,7 @@ import (

"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
)
@@ -220,48 +221,37 @@ type nflogMetrics struct {
func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
m := &nflogMetrics{}

m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
m.gcDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "alertmanager_nflog_gc_duration_seconds",
Help: "Duration of the last notification log garbage collection cycle.",
Objectives: map[float64]float64{},
})
m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{
m.snapshotDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "alertmanager_nflog_snapshot_duration_seconds",
Help: "Duration of the last notification log snapshot.",
Objectives: map[float64]float64{},
})
m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{
m.snapshotSize = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_nflog_snapshot_size_bytes",
Help: "Size of the last notification log snapshot in bytes.",
})
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.queriesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_queries_total",
Help: "Number of notification log queries were received.",
})
m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.queryErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_query_errors_total",
Help: "Number notification log received queries that failed.",
})
m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
m.queryDuration = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
Name: "alertmanager_nflog_query_duration_seconds",
Help: "Duration of notification log query evaluation.",
})
m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.propagatedMessagesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_gossip_messages_propagated_total",
Help: "Number of received gossip messages that have been further gossiped.",
})

if r != nil {
r.MustRegister(
m.gcDuration,
m.snapshotDuration,
m.snapshotSize,
m.queriesTotal,
m.queryErrorsTotal,
m.queryDuration,
m.propagatedMessagesTotal,
)
}
return m
}
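
The pattern repeated throughout this test file is promauto.With(r): the factory both constructs and registers each collector, and a nil Registerer yields a factory that simply skips registration, which is why the explicit MustRegister blocks and nil checks after the constructors are removed. A minimal, self-contained sketch of that behaviour (metric names here are illustrative, not from this PR):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	// Constructed and registered in one step; panics if registration
	// fails, just as an explicit MustRegister would have.
	queries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "example_queries_total",
		Help: "Number of queries received.",
	})
	queries.Inc()

	// A nil Registerer produces a usable but unregistered metric, so no
	// "if r != nil" guard is needed around the constructors.
	orphan := promauto.With(nil).NewCounter(prometheus.CounterOpts{
		Name: "example_orphan_total",
		Help: "Created without being registered anywhere.",
	})
	orphan.Inc()

	fmt.Println(testutil.ToFloat64(queries)) // 1
}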

@@ -282,66 +272,52 @@ type silenceMetrics struct {
func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
m := &silenceMetrics{}

m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
m.gcDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "alertmanager_silences_gc_duration_seconds",
Help: "Duration of the last silence garbage collection cycle.",
Objectives: map[float64]float64{},
})
m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{
m.snapshotDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "alertmanager_silences_snapshot_duration_seconds",
Help: "Duration of the last silence snapshot.",
Objectives: map[float64]float64{},
})
m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{
m.snapshotSize = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_silences_snapshot_size_bytes",
Help: "Size of the last silence snapshot in bytes.",
})
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.queriesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_queries_total",
Help: "How many silence queries were received.",
})
m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.queryErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_query_errors_total",
Help: "How many silence received queries did not succeed.",
})
m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
m.queryDuration = promauto.With(r).NewHistogram(prometheus.HistogramOpts{
Name: "alertmanager_silences_query_duration_seconds",
Help: "Duration of silence query evaluation.",
})
m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{
m.propagatedMessagesTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_gossip_messages_propagated_total",
Help: "Number of received gossip messages that have been further gossiped.",
})
m.silencesActive = prometheus.NewGauge(prometheus.GaugeOpts{
m.silencesActive = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_silences",
Help: "How many silences by state.",
ConstLabels: prometheus.Labels{"state": string(types.SilenceStateActive)},
})
m.silencesPending = prometheus.NewGauge(prometheus.GaugeOpts{
m.silencesPending = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_silences",
Help: "How many silences by state.",
ConstLabels: prometheus.Labels{"state": string(types.SilenceStatePending)},
})
m.silencesExpired = prometheus.NewGauge(prometheus.GaugeOpts{
m.silencesExpired = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_silences",
Help: "How many silences by state.",
ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)},
})

if r != nil {
r.MustRegister(
m.gcDuration,
m.snapshotDuration,
m.snapshotSize,
m.queriesTotal,
m.queryErrorsTotal,
m.queryDuration,
m.silencesActive,
m.silencesPending,
m.silencesExpired,
m.propagatedMessagesTotal,
)
}
return m
}
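
Note that silencesActive, silencesPending and silencesExpired intentionally share the metric name alertmanager_silences and differ only in the value of the "state" const label; the registry accepts this because the name, help text and label dimensions agree, so the result is a single metric family with three series. A small illustrative sketch of that behaviour:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()

	// Three plain gauges, one shared name, distinguished by a const label.
	for _, state := range []string{"active", "pending", "expired"} {
		promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name:        "alertmanager_silences",
			Help:        "How many silences by state.",
			ConstLabels: prometheus.Labels{"state": state},
		})
	}

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	// One family containing three series.
	fmt.Println(mfs[0].GetName(), len(mfs[0].GetMetric()))
}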

@@ -354,17 +330,17 @@ type notifyMetrics struct {

func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
m := &notifyMetrics{
numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
numNotifications: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_total",
Help: "The total number of attempted notifications.",
}, []string{"integration"}),
numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
numFailedNotifications: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
notificationLatencySeconds: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
Help: "The latency of notifications in seconds.",
@@ -376,7 +352,6 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
m.numFailedNotifications.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)
}
r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds)
return m
}
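
The loop over integrations (its opening line is collapsed in the diff above) calls WithLabelValues for every known integration so that each per-integration series is exported with value 0 right away instead of appearing only after the first notification is attempted. A rough sketch with illustrative integration names:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	notifications := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "alertmanager",
		Name:      "notifications_total",
		Help:      "The total number of attempted notifications.",
	}, []string{"integration"})

	// A vec exports nothing for a label value until its child exists, so
	// pre-create the children to surface them as zero from the start.
	for _, integration := range []string{"email", "webhook"} {
		notifications.WithLabelValues(integration)
	}

	// Two series are now collectable under this metric name.
	fmt.Println(testutil.CollectAndCount(notifications, "alertmanager_notifications_total"))
}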

@@ -385,15 +360,12 @@ type markerMetrics struct {
}

func newMarkerMetrics(r prometheus.Registerer) *markerMetrics {
m := &markerMetrics{
alerts: prometheus.NewGaugeVec(prometheus.GaugeOpts{
return &markerMetrics{
alerts: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
}, []string{"state"}),
}

r.MustRegister(m.alerts)
return m
}

// Copied from github.com/alertmanager/api/metrics/metrics.go
@@ -404,19 +376,17 @@ type apiMetrics struct {
}

func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics {
numReceivedAlerts := prometheus.NewCounterVec(prometheus.CounterOpts{
numReceivedAlerts := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_alerts_received_total",
Help: "The total number of received alerts.",
ConstLabels: prometheus.Labels{"version": version},
}, []string{"status"})
numInvalidAlerts := prometheus.NewCounter(prometheus.CounterOpts{
numInvalidAlerts := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_alerts_invalid_total",
Help: "The total number of received alerts that were invalid.",
ConstLabels: prometheus.Labels{"version": version},
})
if r != nil {
r.MustRegister(numReceivedAlerts, numInvalidAlerts)
}

return &apiMetrics{
firing: numReceivedAlerts.WithLabelValues("firing"),
resolved: numReceivedAlerts.WithLabelValues("resolved"),
60 changes: 35 additions & 25 deletions pkg/alertmanager/multitenant.go
@@ -19,6 +19,7 @@ import (
"github.com/prometheus/alertmanager/cluster"
amconfig "github.com/prometheus/alertmanager/config"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/weaveworks/common/user"

"github.com/cortexproject/cortex/pkg/alertmanager/alerts"
@@ -70,20 +71,10 @@ const (
)

var (
totalConfigs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_configs",
Help: "How many configs the multitenant alertmanager knows about.",
}, []string{"status"})
statusTemplate *template.Template
)

func init() {
// Ensure the metric values are initialized.
totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
totalConfigs.WithLabelValues(configStatusValid).Set(0)

prometheus.MustRegister(totalConfigs)
statusTemplate = template.Must(template.New("statusPage").Funcs(map[string]interface{}{
"state": func(enabled bool) string {
if enabled {
@@ -133,6 +124,24 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
cfg.Store.RegisterFlags(f)
}

type multitenantAlertmanagerMetrics struct {
totalConfigs *prometheus.GaugeVec
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
m := &multitenantAlertmanagerMetrics{}

m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_configs",
Help: "How many configs the multitenant alertmanager knows about.",
}, []string{"status"})
m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
m.totalConfigs.WithLabelValues(configStatusValid).Set(0)

return m
}

// A MultitenantAlertmanager manages Alertmanager instances for multiple
// organizations.
type MultitenantAlertmanager struct {
@@ -153,8 +162,9 @@ type MultitenantAlertmanager struct {
alertmanagersMtx sync.Mutex
alertmanagers map[string]*Alertmanager

logger log.Logger
metrics *alertmanagerMetrics
logger log.Logger
alertmanagerMetrics *alertmanagerMetrics
multitenantMetrics *multitenantAlertmanagerMetrics

peer *cluster.Peer
}
@@ -213,18 +223,19 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.L

func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackConfig []byte, peer *cluster.Peer, store AlertStore, logger log.Logger, registerer prometheus.Registerer) *MultitenantAlertmanager {
am := &MultitenantAlertmanager{
cfg: cfg,
fallbackConfig: string(fallbackConfig),
cfgs: map[string]alerts.AlertConfigDesc{},
alertmanagers: map[string]*Alertmanager{},
metrics: newAlertmanagerMetrics(),
peer: peer,
store: store,
logger: log.With(logger, "component", "MultiTenantAlertmanager"),
cfg: cfg,
fallbackConfig: string(fallbackConfig),
cfgs: map[string]alerts.AlertConfigDesc{},
alertmanagers: map[string]*Alertmanager{},
alertmanagerMetrics: newAlertmanagerMetrics(),
multitenantMetrics: newMultitenantAlertmanagerMetrics(registerer),
peer: peer,
store: store,
logger: log.With(logger, "component", "MultiTenantAlertmanager"),
}

if registerer != nil {
registerer.MustRegister(am.metrics)
registerer.MustRegister(am.alertmanagerMetrics)
}

am.Service = services.NewTimerService(am.cfg.PollInterval, am.starting, am.iteration, am.stopping)
@@ -320,8 +331,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
}
}
totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
}

func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) {
@@ -437,7 +448,6 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error {

func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config) (*Alertmanager, error) {
reg := prometheus.NewRegistry()
am.metrics.addUserRegistry(userID, reg)
newAM, err := New(&Config{
UserID: userID,
DataDir: am.cfg.DataDir,
@@ -455,7 +465,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco
return nil, fmt.Errorf("unable to apply initial config for user %v: %v", userID, err)
}

am.metrics.addUserRegistry(userID, reg)
am.alertmanagerMetrics.addUserRegistry(userID, reg)
return newAM, nil
}
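
Each per-tenant Alertmanager keeps its own prometheus.NewRegistry(), which is now handed to alertmanagerMetrics.addUserRegistry only after New succeeds, and the multitenant gauges live on a struct built from the Registerer passed into the constructor rather than being registered on the global default registry from init(). A hypothetical test in the same package (not part of this PR) sketches what that isolation allows:

package alertmanager

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

func TestMultitenantAlertmanagerMetricsInitialState(t *testing.T) {
	// One fresh registry per instance: the gauges are no longer
	// process-global state, so each construction can be asserted on
	// independently.
	for i := 0; i < 2; i++ {
		reg := prometheus.NewRegistry()
		m := newMultitenantAlertmanagerMetrics(reg)

		// Both status series exist immediately and start at zero.
		require.Equal(t, float64(0), testutil.ToFloat64(m.totalConfigs.WithLabelValues(configStatusValid)))
		require.Equal(t, float64(0), testutil.ToFloat64(m.totalConfigs.WithLabelValues(configStatusInvalid)))
	}
}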
