This repository was archived by the owner on Jul 12, 2023. It is now read-only.
File tree Expand file tree Collapse file tree 3 files changed +62
-0
lines changed Expand file tree Collapse file tree 3 files changed +62
-0
lines changed Original file line number Diff line number Diff line change
1
+ # Slow Error Budget Alert
2
+
3
+ This alert fires when 5% of the error budget, as determined by the availability SLO, is consumed in 6 hours.
4
+
5
+ * First check if the site is up
6
+ * Check if https://encv.org/ loads (or appropriate domain for this environment)
7
+ * Check if you can log in
8
+ * If you can, admins aren't affected
9
+ * If you can't log in, everyone is affected
10
+ * Post in chat that you've got the alert
11
+ * Communicate to your team that you are actively looking at this alert to lower confusion.
12
+ * Look at services dashboard
13
+ * Load https://console.cloud.google.com/monitoring/services
14
+ * Look at the Verification Server service and determine its health
15
+ * Look at the e2e-runner service request logs. This service exercises the code-issue and key-upload logic, so its logs may contain more information about what is going on
16
+ * Look for servers with elevated 5xx
17
+ * Look at request logs; you can navigate by hand or use the following query:
18
+
19
+ ```
20
+ resource.type="cloud_run_revision"
21
+ resource.labels.service_name="e2e-runner"
22
+ severity=ERROR
23
+ ```
Original file line number Diff line number Diff line change @@ -11,6 +11,7 @@ production on GCP. All of our responses here are specific to GCP.
11
11
- [ ElevatedLatencyGreaterThan2s] ( alerts/ElevatedLatencyGreaterThan2s.md )
12
12
- [ ElevatedRateLimitedCount] ( alerts/ElevatedRateLimitedCount.md )
13
13
- [ FastErrorBudgetBurn] ( alerts/FastErrorBudgetBurn.md )
14
+ - [ SlowErrorBudgetBurn] ( alerts/SlowErrorBudgetBurn.md )
14
15
- [ HostDown] ( alerts/HostDown.md )
15
16
- [ RealmTokenRemainingCapacityLow] ( alerts/RealmTokenRemainingCapacityLow.md )
16
17
- [ StackdriverExportFailed] ( alerts/StackdriverExportFailed.md )
Original file line number Diff line number Diff line change @@ -281,3 +281,41 @@ resource "google_monitoring_alert_policy" "fast_burn" {
281
281
google_monitoring_slo . availability-slo
282
282
]
283
283
}
284
+
285
# Slow error budget burn alert.
#
# Fires when the burn rate of the availability SLO, measured over a 6 hour
# (21600s) lookback window, exceeds the rate at which 5% of the error budget
# would be consumed within that window. The alert links to the
# SlowErrorBudgetBurn playbook and notifies the email channel.
resource "google_monitoring_alert_policy" "slow_burn" {
  # Create only if using GCLB, which means there's an SLO created.
  count = var.https-forwarding-rule == "" ? 0 : 1

  project      = var.verification-server-project
  display_name = "SlowErrorBudgetBurn"
  combiner     = "OR"
  enabled      = true

  conditions {
    display_name = "5% burn in 6 hour"
    condition_threshold {
      # The second argument is the lookback window; it must match the
      # alerting window used in the threshold computation below.
      filter     = <<-EOT
      select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "21600s")
      EOT
      duration   = "0s"
      comparison = "COMPARISON_GT"
      # burn rate = budget consumed * period / alerting window
      #           = .05 * (7 * 24 * 60) / 360 = 1.4   (both in minutes)
      # NOTE(review): this assumes the SLO uses a 7 day period -- confirm
      # against the availability-slo definition.
      threshold_value = 1.4
      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "${local.playbook_prefix}/SlowErrorBudgetBurn.md"
    mime_type = "text/markdown"
  }

  notification_channels = [
    google_monitoring_notification_channel.email.id,
  ]

  # The burn-rate filter references the SLO by name, so the SLO must exist
  # before this policy is created.
  depends_on = [
    google_monitoring_slo.availability-slo
  ]
}
You can’t perform that action at this time.
0 commit comments