-
Notifications
You must be signed in to change notification settings - Fork 781
/
manager_metrics.go
224 lines (207 loc) · 8.16 KB
/
manager_metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
package ruler
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/cortexproject/cortex/pkg/util"
)
// ManagerMetrics is a collector that gathers the metrics produced by the
// Prometheus rules package in each user's registry and re-exports them as
// Cortex metrics.
type ManagerMetrics struct {
	// regs holds the per-user registries that metrics are aggregated from.
	regs *util.UserRegistries
	// disableRuleGroupLabel, when true, leaves the "rule_group" label off
	// the per-rule-group metrics.
	disableRuleGroupLabel bool

	// Descriptors for rule and rule-group evaluation metrics.
	EvalDuration         *prometheus.Desc
	IterationDuration    *prometheus.Desc
	IterationsMissed     *prometheus.Desc
	IterationsScheduled  *prometheus.Desc
	EvalTotal            *prometheus.Desc
	EvalFailures         *prometheus.Desc
	GroupInterval        *prometheus.Desc
	GroupLastEvalTime    *prometheus.Desc
	GroupLastDuration    *prometheus.Desc
	GroupRules           *prometheus.Desc
	GroupLastEvalSamples *prometheus.Desc

	// Descriptors for alert notification metrics.
	NotificationLatency       *prometheus.Desc
	NotificationErrors        *prometheus.Desc
	NotificationSent          *prometheus.Desc
	NotificationDropped       *prometheus.Desc
	NotificationQueueLength   *prometheus.Desc
	NotificationQueueCapacity *prometheus.Desc
	AlertmanagersDiscovered   *prometheus.Desc
}
// NewManagerMetrics builds a ManagerMetrics backed by a fresh set of user
// registries, with one descriptor per exported metric.
//
// Every descriptor carries the "user" label. Unless disableRuleGroupLabel is
// true, the per-rule-group descriptors additionally carry "rule_group".
func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics {
	// newDesc declares a descriptor with the given variable labels and no
	// constant labels.
	newDesc := func(name, help string, labels ...string) *prometheus.Desc {
		return prometheus.NewDesc(name, help, labels, nil)
	}

	// Labels shared by all per-rule-group metrics.
	groupLabels := []string{"user"}
	if !disableRuleGroupLabel {
		groupLabels = append(groupLabels, "rule_group")
	}

	return &ManagerMetrics{
		regs:                  util.NewUserRegistries(),
		disableRuleGroupLabel: disableRuleGroupLabel,

		// Rule and rule-group evaluation metrics.
		EvalDuration: newDesc(
			"cortex_prometheus_rule_evaluation_duration_seconds",
			"The duration for a rule to execute.",
			"user",
		),
		IterationDuration: newDesc(
			"cortex_prometheus_rule_group_duration_seconds",
			"The duration of rule group evaluations.",
			"user",
		),
		IterationsMissed: newDesc(
			"cortex_prometheus_rule_group_iterations_missed_total",
			"The total number of rule group evaluations missed due to slow rule group evaluation.",
			groupLabels...,
		),
		IterationsScheduled: newDesc(
			"cortex_prometheus_rule_group_iterations_total",
			"The total number of scheduled rule group evaluations, whether executed or missed.",
			groupLabels...,
		),
		EvalTotal: newDesc(
			"cortex_prometheus_rule_evaluations_total",
			"The total number of rule evaluations.",
			groupLabels...,
		),
		EvalFailures: newDesc(
			"cortex_prometheus_rule_evaluation_failures_total",
			"The total number of rule evaluation failures.",
			groupLabels...,
		),
		GroupInterval: newDesc(
			"cortex_prometheus_rule_group_interval_seconds",
			"The interval of a rule group.",
			groupLabels...,
		),
		GroupLastEvalTime: newDesc(
			"cortex_prometheus_rule_group_last_evaluation_timestamp_seconds",
			"The timestamp of the last rule group evaluation in seconds.",
			groupLabels...,
		),
		GroupLastDuration: newDesc(
			"cortex_prometheus_rule_group_last_duration_seconds",
			"The duration of the last rule group evaluation.",
			groupLabels...,
		),
		GroupRules: newDesc(
			"cortex_prometheus_rule_group_rules",
			"The number of rules.",
			groupLabels...,
		),
		// NOTE(review): unlike its siblings, this exported name has no
		// "rule_group" segment, although Collect reads it from
		// "prometheus_rule_group_last_evaluation_samples". Presumably kept
		// for compatibility with existing consumers; confirm before renaming.
		GroupLastEvalSamples: newDesc(
			"cortex_prometheus_last_evaluation_samples",
			"The number of samples returned during the last rule group evaluation.",
			groupLabels...,
		),

		// Notification metrics of the Prometheus ruler.
		NotificationLatency: newDesc(
			"cortex_prometheus_notifications_latency_seconds",
			"Latency quantiles for sending alert notifications.",
			"user",
		),
		NotificationErrors: newDesc(
			"cortex_prometheus_notifications_errors_total",
			"Total number of errors sending alert notifications.",
			"user", "alertmanager",
		),
		NotificationSent: newDesc(
			"cortex_prometheus_notifications_sent_total",
			"Total number of alerts sent.",
			"user", "alertmanager",
		),
		NotificationDropped: newDesc(
			"cortex_prometheus_notifications_dropped_total",
			"Total number of alerts dropped due to errors when sending to Alertmanager.",
			"user",
		),
		NotificationQueueLength: newDesc(
			"cortex_prometheus_notifications_queue_length",
			"The number of alert notifications in the queue.",
			"user",
		),
		NotificationQueueCapacity: newDesc(
			"cortex_prometheus_notifications_queue_capacity",
			"The capacity of the alert notifications queue.",
			"user",
		),
		AlertmanagersDiscovered: newDesc(
			"cortex_prometheus_notifications_alertmanagers_discovered",
			"The number of alertmanagers discovered and active.",
			"user",
		),
	}
}
// AddUserRegistry registers reg as the Prometheus registry for user by
// forwarding both arguments to the underlying user registries. Metrics are
// read from the registered registries when Collect runs.
func (m *ManagerMetrics) AddUserRegistry(user string, reg *prometheus.Registry) {
m.regs.AddUserRegistry(user, reg)
}
// RemoveUserRegistry removes the Prometheus registry associated with user
// from the underlying user registries.
//
// NOTE(review): the second argument is always true; presumably it requests
// that the user's metrics be dropped outright instead of being retained, which
// matches the note in Collect about per-user metrics being removable. Confirm
// against util.UserRegistries.
func (m *ManagerMetrics) RemoveUserRegistry(user string) {
m.regs.RemoveUserRegistry(user, true)
}
// Describe implements the prometheus.Collector interface by sending every
// descriptor held by m to out, in field declaration order.
func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{
		// Rule and rule-group evaluation metrics.
		m.EvalDuration,
		m.IterationDuration,
		m.IterationsMissed,
		m.IterationsScheduled,
		m.EvalTotal,
		m.EvalFailures,
		m.GroupInterval,
		m.GroupLastEvalTime,
		m.GroupLastDuration,
		m.GroupRules,
		m.GroupLastEvalSamples,
		// Notification metrics.
		m.NotificationLatency,
		m.NotificationErrors,
		m.NotificationSent,
		m.NotificationDropped,
		m.NotificationQueueLength,
		m.NotificationQueueCapacity,
		m.AlertmanagersDiscovered,
	}
	for _, d := range descs {
		out <- d
	}
}
// Collect implements the prometheus.Collector interface. It builds the metric
// families of all registered user registries and sends, for each descriptor,
// the per-user aggregation of the corresponding upstream Prometheus metric to
// out.
func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
	// series pairs a Cortex descriptor with the name of the upstream metric
	// it is aggregated from.
	type series struct {
		desc   *prometheus.Desc
		source string
	}

	families := m.regs.BuildMetricFamiliesPerUser()

	// Extra labels preserved on per-rule-group metrics; stays empty when the
	// rule group label is disabled.
	groupLabels := make([]string, 0, 1)
	if !m.disableRuleGroupLabel {
		groupLabels = append(groupLabels, "rule_group")
	}

	// WARNING: every metric emitted below must be "per user". That is what
	// makes it possible to actually *remove* the metrics of a given user (see
	// RemoveUserRegistry). If the same user is re-added later, all of its
	// metrics start again from 0, which is fine.

	// Rule evaluation summaries.
	for _, s := range []series{
		{m.EvalDuration, "prometheus_rule_evaluation_duration_seconds"},
		{m.IterationDuration, "prometheus_rule_group_duration_seconds"},
	} {
		families.SendSumOfSummariesPerUser(out, s.desc, s.source)
	}

	// Rule group counters.
	for _, s := range []series{
		{m.IterationsMissed, "prometheus_rule_group_iterations_missed_total"},
		{m.IterationsScheduled, "prometheus_rule_group_iterations_total"},
		{m.EvalTotal, "prometheus_rule_evaluations_total"},
		{m.EvalFailures, "prometheus_rule_evaluation_failures_total"},
	} {
		families.SendSumOfCountersPerUserWithLabels(out, s.desc, s.source, groupLabels...)
	}

	// Rule group gauges.
	for _, s := range []series{
		{m.GroupInterval, "prometheus_rule_group_interval_seconds"},
		{m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds"},
		{m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds"},
		{m.GroupRules, "prometheus_rule_group_rules"},
		{m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples"},
	} {
		families.SendSumOfGaugesPerUserWithLabels(out, s.desc, s.source, groupLabels...)
	}

	// Notification latency summary.
	families.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds")

	// Notification counters broken down by alertmanager.
	for _, s := range []series{
		{m.NotificationErrors, "prometheus_notifications_errors_total"},
		{m.NotificationSent, "prometheus_notifications_sent_total"},
	} {
		families.SendSumOfCountersPerUserWithLabels(out, s.desc, s.source, "alertmanager")
	}

	// Notification counter without extra labels.
	families.SendSumOfCountersPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total")

	// Notification gauges without extra labels.
	for _, s := range []series{
		{m.NotificationQueueLength, "prometheus_notifications_queue_length"},
		{m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity"},
		{m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered"},
	} {
		families.SendSumOfGaugesPerUser(out, s.desc, s.source)
	}
}