From 078d9f11228cb14ee6a9e3d117c4cd6cecc4be9a Mon Sep 17 00:00:00 2001 From: Yuri Grinshteyn Date: Tue, 8 Dec 2020 10:19:47 -0800 Subject: [PATCH] Added conditions to error budget burn policies --- terraform/alerting/alerts.tf | 40 +++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/terraform/alerting/alerts.tf b/terraform/alerting/alerts.tf index c8aa226eb..d10799e2d 100644 --- a/terraform/alerting/alerts.tf +++ b/terraform/alerting/alerts.tf @@ -236,12 +236,13 @@ resource "google_monitoring_alert_policy" "StackdriverExportFailed" { resource "google_monitoring_alert_policy" "fast_burn" { project = var.verification-server-project display_name = "FastErrorBudgetBurn" - combiner = "OR" + combiner = "AND" enabled = "true" # create only if using GCLB, which means there's an SLO created count = var.https-forwarding-rule == "" ? 0 : 1 + conditions { - display_name = "2% burn in 1 hour" + display_name = "Fast burn over last hour" condition_threshold { filter = <<-EOT select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "3600s") @@ -256,6 +257,22 @@ resource "google_monitoring_alert_policy" "fast_burn" { } } + conditions { + display_name = "Fast burn over last 5 minutes" + condition_threshold { + filter = <<-EOT + select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "300s") + EOT + duration = "0s" + comparison = "COMPARISON_GT" + # burn rate = budget consumed * period / alerting window = .02 * (7 * 24 * 60)/60 = 3.36 (the divisor 60 is the 1-hour window in minutes, not this condition's 300s lookback) + threshold_value = 3.36 + trigger { + count = 1 + } + } + } + documentation { content = "${local.playbook_prefix}/FastErrorBudgetBurn.md" mime_type = "text/markdown" @@ -276,8 +293,9 @@ resource "google_monitoring_alert_policy" "slow_burn" { enabled = "true" # create only if using GCLB, which means there's an SLO created count = 
var.https-forwarding-rule == "" ? 0 : 1 + conditions { - display_name = "5% burn in 6 hour" + display_name = "Slow burn over last 6 hours" condition_threshold { filter = <<-EOT select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "21600s") @@ -292,6 +310,22 @@ resource "google_monitoring_alert_policy" "slow_burn" { } } + conditions { + display_name = "Slow burn over last 30 minutes" + condition_threshold { + filter = <<-EOT + select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "1800s") + EOT + duration = "0s" + comparison = "COMPARISON_GT" + # burn rate = budget consumed * period / alerting window = .05 * (7 * 24 * 60)/360 = 1.4 (the divisor 360 is the 6-hour window in minutes, not this condition's 1800s lookback) + threshold_value = 1.4 + trigger { + count = 1 + } + } + } + documentation { content = "${local.playbook_prefix}/SlowErrorBudgetBurn.md" mime_type = "text/markdown"