diff --git a/docs/playbooks/Fast_Error_Budget_Burn.md b/docs/playbooks/Fast_Error_Budget_Burn.md new file mode 100644 index 000000000..ce6610a1c --- /dev/null +++ b/docs/playbooks/Fast_Error_Budget_Burn.md @@ -0,0 +1,23 @@ +# Fast Error Budget Burn Alert + +This alert fires when 2% of the error budget, as determined by the availability SLO, is consumed in an hour. + +* First check if the site is up + * Check if https://encv.org/ loads (or appropriate domain for this environment) + * Check if you can log in + * If you can, admins aren't affected + * If you can't log in, everyone is affected +* Post in chat that you've got the alert + * Communicate to your team that you are actively looking at this alert to reduce confusion. +* Look at the services dashboard + * Load https://console.cloud.google.com/monitoring/services + * Look at the Verification Server service and determine its health + * Look at the e2e-runner service request logs. This is a service that executes the code issue and key upload logic, so its logs may contain more information about what's going on + * Look for servers with elevated 5xx + * Look at request logs; you can navigate by hand or use the following query: + +``` +resource.type="cloud_run_revision" +resource.labels.service_name="e2e-runner" +severity=ERROR +``` \ No newline at end of file diff --git a/docs/playbooks/index.md b/docs/playbooks/index.md index ae570e8bf..e413e81d9 100644 --- a/docs/playbooks/index.md +++ b/docs/playbooks/index.md @@ -9,6 +9,7 @@ This folder contains documents for playbooks both for responding to alerts and c - [Elevated Rate Limited Count](Elevated_Rate_Limited_Count.md) - [Elevated Latency Greater than 2s](Elevated_Latency_Greater_than_2s.md) - [Realm Token Capacity Utilization Above Threshold](Realm_Token_Capacity_Utilization_Above_Threshold.md) + - [Fast Error Budget Burn](Fast_Error_Budget_Burn.md) ## Common Actions diff --git a/terraform/alerting/verification-server/alerts.tf b/terraform/alerting/verification-server/alerts.tf index 
50c69689a..13d832396 100644
--- a/terraform/alerting/verification-server/alerts.tf
+++ b/terraform/alerting/verification-server/alerts.tf
@@ -327,3 +327,45 @@ EOT
     google_logging_metric.stackdriver_export_error_count
   ]
 }
+
+# Alert policy: fast burn of the availability SLO error budget.
+resource "google_monitoring_alert_policy" "fast_burn" {
+  # Only created when GCLB is in use (forwarding rule set), since that is when an SLO is created.
+  count = var.https-forwarding-rule == "" ? 0 : 1
+
+  project      = var.verification-server-project
+  display_name = "Fast error budget burn"
+  enabled      = "true"
+  combiner     = "OR"
+
+  notification_channels = [google_monitoring_notification_channel.email.id]
+
+  conditions {
+    display_name = "2% burn in 1 hour"
+    condition_threshold {
+      # 3.36 = .02 * (7 * 24 * 60)/60, i.e. budget consumed * period / alerting window
+      threshold_value = 3.36
+      comparison      = "COMPARISON_GT"
+      duration        = "0s"
+      trigger {
+        count = 1
+      }
+      filter = <<-EOT
+      select_slo_burn_rate("projects/${var.verification-server-project}/services/verification-server/serviceLevelObjectives/availability-slo", "3600s")
+      EOT
+    }
+  }
+
+  documentation {
+    mime_type = "text/markdown"
+    content   = <<-EOT
+## $${policy.display_name}
+
+See [the playbook](https://github.com/google/exposure-notifications-verification-server/blob/main/docs/playbooks/Fast_Error_Budget_Burn.md) for information about triaging and mitigating this alert.
+EOT
+  }
+
+  depends_on = [
+    google_monitoring_slo.availability-slo
+  ]
+}