/
setting_unified_alerting.go
408 lines (366 loc) · 18.2 KB
/
setting_unified_alerting.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
package setting
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
"github.com/grafana/grafana-plugin-sdk-go/backend/gtime"
"github.com/prometheus/alertmanager/cluster"
"gopkg.in/ini.v1"
"github.com/grafana/grafana/pkg/util"
)
// Defaults and limits for the unified alerting settings read by Cfg.ReadUnifiedAlertingSettings.
const (
// Defaults for the `ha_listen_address`, `ha_peer_timeout`, `ha_gossip_interval`,
// `ha_push_pull_interval` and `alertmanager_config_poll_interval` keys of the
// `unified_alerting` section, in that order.
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
alertmanagerDefaultPeerTimeout = 15 * time.Second
alertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
alertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
alertmanagerDefaultConfigPollInterval = time.Minute
// Default for the `ha_redis_max_conns` key.
alertmanagerRedisDefaultMaxConns = 5
// To start, the alertmanager needs at least one route defined.
// TODO: we should move this to Grafana settings and define this as the default.
alertmanagerDefaultConfiguration = `{
"alertmanager_config": {
"route": {
"receiver": "grafana-default-email",
"group_by": ["grafana_folder", "alertname"]
},
"receivers": [{
"name": "grafana-default-email",
"grafana_managed_receiver_configs": [{
"uid": "",
"name": "email receiver",
"type": "email",
"isDefault": true,
"settings": {
"addresses": "<example@email.com>"
}
}]
}]
}
}
`
// Default for `evaluation_timeout` (and for the legacy `evaluation_timeout_seconds` fallback).
evaluatorDefaultEvaluationTimeout = 30 * time.Second
// Default for `admin_config_poll_interval`.
schedulerDefaultAdminConfigPollInterval = time.Minute
// Default for `execute_alerts` in both the `unified_alerting` and the legacy `alerting` section.
// The identifier is misspelled ("schedulere"); it is kept because other code refers to it by this name.
schedulereDefaultExecuteAlerts = true
// Default for `max_attempts`.
schedulerDefaultMaxAttempts = 1
// NOTE(review): not referenced by the code visible in this file; presumably the legacy
// minimum interval expressed in seconds — confirm against its users.
schedulerDefaultLegacyMinInterval = 1
// Defaults for the `capture`, `capture_timeout`, `max_concurrent_screenshots` and
// `upload_external_image_storage` keys of the `unified_alerting.screenshots` section.
// screenshotsMaxCaptureTimeout is the upper bound enforced on `capture_timeout`.
screenshotsDefaultCapture = false
screenshotsDefaultCaptureTimeout = 10 * time.Second
screenshotsMaxCaptureTimeout = 30 * time.Second
screenshotsDefaultMaxConcurrent = 5
screenshotsDefaultUploadImageStorage = false
// SchedulerBaseInterval is the base interval of the scheduler. It controls how often the scheduler
// fetches the database for new changes as well as how often it schedules evaluation of a rule.
// Changing this value is discouraged because it could cause existing alert definitions
// with intervals that are not exactly divisible by this number not to be evaluated.
SchedulerBaseInterval = 10 * time.Second
// DefaultRuleEvaluationInterval is the default interval for how long a rule should be evaluated to change state from Pending to Alerting.
DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds
// Default for the `enabled` key of the `unified_alerting.state_history` section.
stateHistoryDefaultEnabled = true
)
// UnifiedAlertingSettings holds the unified alerting configuration assembled by
// Cfg.ReadUnifiedAlertingSettings. Unless noted otherwise, fields are read from the
// `unified_alerting` section of the ini file.
type UnifiedAlertingSettings struct {
// AdminConfigPollInterval is read from `admin_config_poll_interval`.
AdminConfigPollInterval time.Duration
// AlertmanagerConfigPollInterval is read from `alertmanager_config_poll_interval`.
AlertmanagerConfigPollInterval time.Duration
// HAListenAddr is read from `ha_listen_address`.
HAListenAddr string
// HAAdvertiseAddr is read from `ha_advertise_address`.
HAAdvertiseAddr string
// HAPeers holds the comma-separated entries of `ha_peers`, each trimmed of surrounding whitespace.
HAPeers []string
// HAPeerTimeout is read from `ha_peer_timeout`.
HAPeerTimeout time.Duration
// HAGossipInterval is read from `ha_gossip_interval`.
HAGossipInterval time.Duration
// HAPushPullInterval is read from `ha_push_pull_interval`.
HAPushPullInterval time.Duration
// HALabel is read from `ha_label`.
HALabel string
// The HARedis* fields are read from the matching `ha_redis_*` keys:
// `ha_redis_address`, `ha_redis_peer_name`, `ha_redis_prefix`, `ha_redis_username`,
// `ha_redis_password`, `ha_redis_db` and `ha_redis_max_conns`.
HARedisAddr string
HARedisPeerName string
HARedisPrefix string
HARedisUsername string
HARedisPassword string
HARedisDB int
HARedisMaxConns int
// MaxAttempts is read from `max_attempts`.
MaxAttempts int64
// MinInterval is read from `min_interval`, falling back to the legacy `min_interval_seconds`
// of the `alerting` section. It is validated to be a multiple of, and not smaller than, BaseInterval.
MinInterval time.Duration
// EvaluationTimeout is read from `evaluation_timeout`, falling back to the legacy
// `evaluation_timeout_seconds` of the `alerting` section.
EvaluationTimeout time.Duration
// ExecuteAlerts is read from `execute_alerts`, falling back to the legacy key of the same
// name in the `alerting` section.
ExecuteAlerts bool
// DefaultConfiguration is set to the built-in alertmanagerDefaultConfiguration; it is not yet read from the ini file.
DefaultConfiguration string
Enabled *bool // determines whether unified alerting is enabled. If it is nil then user did not define it and therefore its value will be determined during migration. Services should not use it directly.
// DisabledOrgs is the set of organisation IDs parsed from `disabled_orgs`.
DisabledOrgs map[int64]struct{}
// BaseInterval interval of time the scheduler updates the rules and evaluates rules.
// Only for internal use and not user configuration.
// It defaults to SchedulerBaseInterval and is overridden by `scheduler_tick_interval` only
// when the "configurableSchedulerTick" feature toggle is enabled.
BaseInterval time.Duration
// DefaultRuleEvaluationInterval default interval between evaluations of a rule.
// It is raised to MinInterval when MinInterval exceeds the package-level default.
DefaultRuleEvaluationInterval time.Duration
// Screenshots is read from the `unified_alerting.screenshots` section.
Screenshots UnifiedAlertingScreenshotSettings
// ReservedLabels is read from the `unified_alerting.reserved_labels` section.
ReservedLabels UnifiedAlertingReservedLabelSettings
// StateHistory is read from the `unified_alerting.state_history` section.
StateHistory UnifiedAlertingStateHistorySettings
// RemoteAlertmanager is read from the `remote.alertmanager` section.
RemoteAlertmanager RemoteAlertmanagerSettings
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
// It is read from `max_state_save_concurrency` and defaults to 1.
MaxStateSaveConcurrency int
}
// RemoteAlertmanagerSettings contains the configuration needed
// to disable the internal Alertmanager and use an external one instead.
// It is populated from the `remote.alertmanager` section of the ini file.
type RemoteAlertmanagerSettings struct {
// Enable is read from the `enabled` key and defaults to false.
Enable bool
// URL is read from the `url` key.
URL string
// TenantID is read from the `tenant` key.
TenantID string
// Password is read from the `password` key.
Password string
}
// UnifiedAlertingScreenshotSettings holds the values read from the
// `unified_alerting.screenshots` section of the ini file.
type UnifiedAlertingScreenshotSettings struct {
// Capture is read from the `capture` key.
Capture bool
// CaptureTimeout is read from the `capture_timeout` key; values above
// screenshotsMaxCaptureTimeout are rejected when the settings are read.
CaptureTimeout time.Duration
// MaxConcurrentScreenshots is read from the `max_concurrent_screenshots` key.
MaxConcurrentScreenshots int64
// UploadExternalImageStorage is read from the `upload_external_image_storage` key.
UploadExternalImageStorage bool
}
// UnifiedAlertingReservedLabelSettings holds the values read from the
// `unified_alerting.reserved_labels` section of the ini file.
type UnifiedAlertingReservedLabelSettings struct {
// DisabledLabels is the set of label names listed in the `disabled_labels` key.
DisabledLabels map[string]struct{}
}
// UnifiedAlertingStateHistorySettings holds the values read from the
// `unified_alerting.state_history` section of the ini file.
type UnifiedAlertingStateHistorySettings struct {
// Enabled is read from the `enabled` key.
Enabled bool
// Backend is read from the `backend` key and defaults to "annotations".
Backend string
// LokiRemoteURL, LokiReadURL and LokiWriteURL are read from `loki_remote_url`,
// `loki_remote_read_url` and `loki_remote_write_url`.
LokiRemoteURL string
LokiReadURL string
LokiWriteURL string
// LokiTenantID is read from `loki_tenant_id`.
LokiTenantID string
// LokiBasicAuthUsername and LokiBasicAuthPassword are used for basic auth
// if one of them is set.
// They are read from `loki_basic_auth_username` and `loki_basic_auth_password`.
LokiBasicAuthPassword string
LokiBasicAuthUsername string
// MultiPrimary is read from the `primary` key.
MultiPrimary string
// MultiSecondaries holds the comma-separated, whitespace-trimmed entries of the `secondaries` key.
MultiSecondaries []string
// ExternalLabels holds every key/value pair of the
// `unified_alerting.state_history.external_labels` section.
ExternalLabels map[string]string
}
// IsEnabled reports whether unified alerting should be treated as enabled.
// An unset (nil) Enabled counts as enabled; otherwise the pointed-to value decides.
// Callers should use this instead of inspecting Enabled directly.
func (u *UnifiedAlertingSettings) IsEnabled() bool {
	if u.Enabled != nil {
		return *u.Enabled
	}
	return true
}
// IsReservedLabelDisabled reports whether the given reserved label is present in
// UnifiedAlertingReservedLabelSettings.DisabledLabels.
func (u *UnifiedAlertingReservedLabelSettings) IsReservedLabelDisabled(label string) bool {
	_, disabled := u.DisabledLabels[label]
	return disabled
}
// readUnifiedAlertingEnabledSetting resolves whether unified alerting is enabled from the
// `enabled` key of the given section and reconciles the result with the package-level
// legacy AlertingEnabled flag, which it may overwrite.
//
// When `enabled` is absent or empty, unified alerting is switched on. Legacy alerting is
// forced off if the deprecated "ngalert" feature toggle is set or if legacy alerting had
// been configured; if it was never configured, AlertingEnabled stays nil.
//
// When `enabled` has a value it must parse as a boolean. If it does not, legacy alerting is
// disabled and an error is returned. An error is also returned when legacy and unified
// alerting are both enabled. If legacy alerting was never configured it is set to the
// opposite of the unified value. On error the returned pointer is nil.
func (cfg *Cfg) readUnifiedAlertingEnabledSetting(section *ini.Section) (*bool, error) {
	enabledKey := section.Key("enabled")

	// An empty value is handled exactly like a missing key.
	if enabledKey.Value() == "" {
		switch {
		case cfg.IsFeatureToggleEnabled("ngalert"):
			// TODO: Remove in Grafana v10
			cfg.Logger.Warn("ngalert feature flag is deprecated: use unified alerting enabled setting instead")
			// the feature flag takes precedence over the legacy alerting setting
			legacyOff := false
			AlertingEnabled = &legacyOff
		case AlertingEnabled != nil:
			// legacy alerting was configured: unified alerting wins and legacy is switched off
			legacyOff := false
			AlertingEnabled = &legacyOff
		}
		// in every case unified alerting ends up enabled
		unifiedOn := true
		return &unifiedOn, nil
	}

	unifiedOn, parseErr := enabledKey.Bool()
	if parseErr != nil {
		// an unparsable value disables all alerting
		legacyOff := false
		AlertingEnabled = &legacyOff
		return nil, fmt.Errorf("invalid value %s, should be either true or false", enabledKey)
	}

	// Legacy and unified alerting are mutually exclusive.
	legacyOn := AlertingEnabled != nil && *AlertingEnabled
	if legacyOn && unifiedOn {
		return nil, errors.New("legacy and unified alerting cannot both be enabled at the same time, please disable one of them and restart Grafana")
	}

	// Legacy alerting was never configured: make it the opposite of unified alerting.
	if AlertingEnabled == nil {
		legacyDerived := !unifiedOn
		AlertingEnabled = &legacyDerived
	}
	return &unifiedOn, nil
}
// ReadUnifiedAlertingSettings reads the `unified_alerting` section (and its sub-sections) of the
// configuration together with the legacy `alerting` section, and stores the result in cfg.UnifiedAlerting.
//
// For `execute_alerts`, `evaluation_timeout` and `min_interval` the `unified_alerting` value is read
// first. When it equals its default (or cannot be parsed as a duration), the corresponding legacy
// option of the `alerting` section is consulted and a warning is logged if the legacy value is applied.
//
// It returns an error when a setting cannot be parsed or fails validation; in that case
// cfg.UnifiedAlerting is not modified.
func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
	var err error
	uaCfg := UnifiedAlertingSettings{}
	ua := iniFile.Section("unified_alerting")
	uaCfg.Enabled, err = cfg.readUnifiedAlertingEnabledSetting(ua)
	if err != nil {
		return fmt.Errorf("failed to read unified alerting enabled setting: %w", err)
	}

	uaCfg.DisabledOrgs = make(map[int64]struct{})
	orgsStr := valueAsString(ua, "disabled_orgs", "")
	for _, org := range util.SplitString(orgsStr) {
		orgID, err := strconv.ParseInt(org, 10, 64)
		if err != nil {
			return fmt.Errorf("failed to parse setting 'disabled_orgs': %w", err)
		}
		uaCfg.DisabledOrgs[orgID] = struct{}{}
	}

	uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (schedulerDefaultAdminConfigPollInterval).String()))
	if err != nil {
		return fmt.Errorf("failed to parse setting 'admin_config_poll_interval' as duration: %w", err)
	}
	uaCfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (alertmanagerDefaultConfigPollInterval).String()))
	if err != nil {
		return fmt.Errorf("failed to parse setting 'alertmanager_config_poll_interval' as duration: %w", err)
	}
	uaCfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (alertmanagerDefaultPeerTimeout).String()))
	if err != nil {
		return fmt.Errorf("failed to parse setting 'ha_peer_timeout' as duration: %w", err)
	}
	uaCfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (alertmanagerDefaultGossipInterval).String()))
	if err != nil {
		return fmt.Errorf("failed to parse setting 'ha_gossip_interval' as duration: %w", err)
	}
	uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
	if err != nil {
		return fmt.Errorf("failed to parse setting 'ha_push_pull_interval' as duration: %w", err)
	}

	uaCfg.HAListenAddr = ua.Key("ha_listen_address").MustString(alertmanagerDefaultClusterAddr)
	uaCfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("")
	uaCfg.HALabel = ua.Key("ha_label").MustString("")
	uaCfg.HARedisAddr = ua.Key("ha_redis_address").MustString("")
	uaCfg.HARedisPeerName = ua.Key("ha_redis_peer_name").MustString("")
	uaCfg.HARedisPrefix = ua.Key("ha_redis_prefix").MustString("")
	uaCfg.HARedisUsername = ua.Key("ha_redis_username").MustString("")
	uaCfg.HARedisPassword = ua.Key("ha_redis_password").MustString("")
	uaCfg.HARedisDB = ua.Key("ha_redis_db").MustInt(0)
	uaCfg.HARedisMaxConns = ua.Key("ha_redis_max_conns").MustInt(alertmanagerRedisDefaultMaxConns)
	peers := ua.Key("ha_peers").MustString("")
	uaCfg.HAPeers = make([]string, 0)
	if peers != "" {
		for _, peer := range strings.Split(peers, ",") {
			peer = strings.TrimSpace(peer)
			uaCfg.HAPeers = append(uaCfg.HAPeers, peer)
		}
	}

	// TODO load from ini file
	uaCfg.DefaultConfiguration = alertmanagerDefaultConfiguration

	alerting := iniFile.Section("alerting")

	uaExecuteAlerts := ua.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
	if uaExecuteAlerts { // unified option equals the default (true)
		legacyExecuteAlerts := alerting.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
		if !legacyExecuteAlerts {
			cfg.Logger.Warn("falling back to legacy setting of 'execute_alerts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
		}
		uaExecuteAlerts = legacyExecuteAlerts
	}
	uaCfg.ExecuteAlerts = uaExecuteAlerts

	// if the unified alerting options equal the defaults, apply the respective legacy one
	uaEvaluationTimeout, err := gtime.ParseDuration(valueAsString(ua, "evaluation_timeout", evaluatorDefaultEvaluationTimeout.String()))
	if err != nil || uaEvaluationTimeout == evaluatorDefaultEvaluationTimeout { // unified option is invalid duration or equals the default
		legacyEvaluationTimeout := time.Duration(alerting.Key("evaluation_timeout_seconds").MustInt64(int64(evaluatorDefaultEvaluationTimeout.Seconds()))) * time.Second
		if legacyEvaluationTimeout != evaluatorDefaultEvaluationTimeout {
			cfg.Logger.Warn("falling back to legacy setting of 'evaluation_timeout_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
		}
		uaEvaluationTimeout = legacyEvaluationTimeout
	}
	uaCfg.EvaluationTimeout = uaEvaluationTimeout

	uaCfg.MaxAttempts = ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)

	uaCfg.BaseInterval = SchedulerBaseInterval

	// The base interval of the scheduler for evaluating alerts.
	// 1. It is used by the internal scheduler's timer to tick at this interval.
	// 2. It is used to spread evaluations of rules that need to be evaluated at the current tick T. In other words, the evaluation of rules at the tick T will be evenly spread in the interval from T to T+scheduler_tick_interval.
	//    For example, if there are 100 rules that need to be evaluated at tick T, and the base interval is 10s, rules will be evaluated every 100ms.
	// 3. It increases delay between rule updates and state reset.
	// NOTE:
	// 1. All alert rule intervals should be multiples of this interval. Otherwise, the rules will not be evaluated. It is not recommended to set it lower than 10s or to odd numbers. Recommended: 10s, 30s, 1m
	// 2. Increasing the interval slows down how quickly alert rule updates reset the state, and therefore reset notifications. The higher the interval, the slower the propagation of changes.
	baseInterval, err := gtime.ParseDuration(valueAsString(ua, "scheduler_tick_interval", SchedulerBaseInterval.String()))
	if cfg.IsFeatureToggleEnabled("configurableSchedulerTick") { // use literal to avoid cycle imports
		if err != nil {
			return fmt.Errorf("failed to parse setting 'scheduler_tick_interval' as duration: %w", err)
		}
		// A zero tick would make the modulo check on min_interval below panic with a division
		// by zero, and a negative tick would be accepted as a meaningless interval.
		if baseInterval <= 0 {
			return fmt.Errorf("value of setting 'scheduler_tick_interval' should be greater than 0, got %v", baseInterval)
		}
		if baseInterval != SchedulerBaseInterval {
			cfg.Logger.Warn("Scheduler tick interval is changed to non-default", "interval", baseInterval, "default", SchedulerBaseInterval)
		}
		uaCfg.BaseInterval = baseInterval
	} else if baseInterval != SchedulerBaseInterval {
		// Without the feature toggle the configured value (including an unparsable one, which
		// is seen here as 0) is ignored and only reported.
		cfg.Logger.Warn("Scheduler tick interval is changed to non-default but the feature flag is not enabled. Using default.", "interval", baseInterval, "default", SchedulerBaseInterval)
	}

	uaMinInterval, err := gtime.ParseDuration(valueAsString(ua, "min_interval", uaCfg.BaseInterval.String()))
	if err != nil || uaMinInterval == uaCfg.BaseInterval { // unified option is invalid duration or equals the default
		// if the legacy option is invalid, fallback to 10 (unified alerting min interval default)
		legacyMinInterval := time.Duration(alerting.Key("min_interval_seconds").MustInt64(int64(uaCfg.BaseInterval.Seconds()))) * time.Second
		if legacyMinInterval > uaCfg.BaseInterval {
			cfg.Logger.Warn("falling back to legacy setting of 'min_interval_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
			uaMinInterval = legacyMinInterval
		} else {
			// if legacy interval is smaller than the base interval, adjust it to the base interval
			uaMinInterval = uaCfg.BaseInterval
		}
	}

	if uaMinInterval < uaCfg.BaseInterval {
		return fmt.Errorf("value of setting 'min_interval' should be greater than the base interval (%v)", uaCfg.BaseInterval)
	}
	if uaMinInterval%uaCfg.BaseInterval != 0 {
		return fmt.Errorf("value of setting 'min_interval' should be times of base interval (%v)", uaCfg.BaseInterval)
	}
	uaCfg.MinInterval = uaMinInterval

	// The default rule evaluation interval can never be shorter than the minimum interval.
	uaCfg.DefaultRuleEvaluationInterval = DefaultRuleEvaluationInterval
	if uaMinInterval > uaCfg.DefaultRuleEvaluationInterval {
		uaCfg.DefaultRuleEvaluationInterval = uaMinInterval
	}

	remoteAlertmanager := iniFile.Section("remote.alertmanager")
	uaCfgRemoteAM := RemoteAlertmanagerSettings{
		Enable:   remoteAlertmanager.Key("enabled").MustBool(false),
		URL:      remoteAlertmanager.Key("url").MustString(""),
		TenantID: remoteAlertmanager.Key("tenant").MustString(""),
		Password: remoteAlertmanager.Key("password").MustString(""),
	}
	uaCfg.RemoteAlertmanager = uaCfgRemoteAM

	screenshots := iniFile.Section("unified_alerting.screenshots")
	uaCfgScreenshots := uaCfg.Screenshots

	uaCfgScreenshots.Capture = screenshots.Key("capture").MustBool(screenshotsDefaultCapture)

	captureTimeout := screenshots.Key("capture_timeout").MustDuration(screenshotsDefaultCaptureTimeout)
	if captureTimeout > screenshotsMaxCaptureTimeout {
		return fmt.Errorf("value of setting 'capture_timeout' cannot exceed %s", screenshotsMaxCaptureTimeout)
	}
	uaCfgScreenshots.CaptureTimeout = captureTimeout

	uaCfgScreenshots.MaxConcurrentScreenshots = screenshots.Key("max_concurrent_screenshots").MustInt64(screenshotsDefaultMaxConcurrent)
	uaCfgScreenshots.UploadExternalImageStorage = screenshots.Key("upload_external_image_storage").MustBool(screenshotsDefaultUploadImageStorage)
	uaCfg.Screenshots = uaCfgScreenshots

	reservedLabels := iniFile.Section("unified_alerting.reserved_labels")
	uaCfgReservedLabels := UnifiedAlertingReservedLabelSettings{
		DisabledLabels: make(map[string]struct{}),
	}
	for _, label := range util.SplitString(reservedLabels.Key("disabled_labels").MustString("")) {
		uaCfgReservedLabels.DisabledLabels[label] = struct{}{}
	}
	uaCfg.ReservedLabels = uaCfgReservedLabels

	stateHistory := iniFile.Section("unified_alerting.state_history")
	stateHistoryLabels := iniFile.Section("unified_alerting.state_history.external_labels")
	uaCfgStateHistory := UnifiedAlertingStateHistorySettings{
		Enabled:               stateHistory.Key("enabled").MustBool(stateHistoryDefaultEnabled),
		Backend:               stateHistory.Key("backend").MustString("annotations"),
		LokiRemoteURL:         stateHistory.Key("loki_remote_url").MustString(""),
		LokiReadURL:           stateHistory.Key("loki_remote_read_url").MustString(""),
		LokiWriteURL:          stateHistory.Key("loki_remote_write_url").MustString(""),
		LokiTenantID:          stateHistory.Key("loki_tenant_id").MustString(""),
		LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""),
		LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""),
		MultiPrimary:          stateHistory.Key("primary").MustString(""),
		MultiSecondaries:      splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
		ExternalLabels:        stateHistoryLabels.KeysHash(),
	}
	uaCfg.StateHistory = uaCfgStateHistory

	uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1)

	// Publish the settings only after everything was read and validated successfully.
	cfg.UnifiedAlerting = uaCfg
	return nil
}
// GetAlertmanagerDefaultConfiguration returns the built-in default Alertmanager
// configuration (the alertmanagerDefaultConfiguration constant) as a JSON string.
func GetAlertmanagerDefaultConfiguration() string {
return alertmanagerDefaultConfiguration
}
// splitTrim splits s on sep, trims surrounding whitespace from every element and
// drops the elements that are empty after trimming.
//
// The result is never nil. An empty or all-blank input yields an empty slice
// instead of a slice holding a single "" (which is what strings.Split alone
// returns for an empty string), so an unset list setting produces no entries.
func splitTrim(s string, sep string) []string {
	parts := strings.Split(s, sep)
	out := make([]string, 0, len(parts))
	for _, part := range parts {
		if part = strings.TrimSpace(part); part != "" {
			out = append(out, part)
		}
	}
	return out
}