From 61e20633ad6dbebe13aa1f386fb906789787b6e5 Mon Sep 17 00:00:00 2001
From: Scot Wells
Date: Thu, 26 Mar 2026 10:23:51 -0500
Subject: [PATCH 1/2] feat(observability): add VMAlertmanagerConfig with alert inhibition rules (#146)

Add activity-alert-inhibitions VMAlertmanagerConfig to suppress
redundant downstream alerts when root-cause alerts are already firing.
Covers apiserver down, ClickHouse unavailable, NATS stalled, processor
down, SLO burn-rate, and ClickHouse keeper/merge scenarios.

Co-Authored-By: Claude Sonnet 4.6
---
Reviewer notes (this section is ignored by `git am`):

* NOTE(review): the message mentions ClickHouse "merge" scenarios, but no
  rule in this patch references a merge-related alert; only the
  ClickHouseKeeperSessionErrors -> ClickHouseZooKeeperExceptions rule
  touches ClickHouse Keeper. Either add the missing rule or reword.
* NOTE(review): this copy of the series was re-flowed from a flattened
  capture. Leading whitespace inside the hunks was inferred from the
  JSON/YAML structure; regenerate with `git format-patch` before applying.

 .claude/settings.local.json                   | 11 ++-
 .../alerts/alertmanager-config.yaml           | 69 +++++++++++++++++++
 .../observability/kustomization.yaml          |  2 +
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 config/components/observability/alerts/alertmanager-config.yaml

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 73c22add..06086fb4 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -13,7 +13,16 @@
       "Bash(grep:*)",
       "Bash(gh pr checks:*)",
       "Bash(gh api:*)",
-      "Bash(git cherry-pick:*)"
+      "Bash(git cherry-pick:*)",
+      "Bash(find /Users/scotwells/repos/datum-cloud -name \"kustomization.yaml\" -path \"*/config/milo/iam*\" -exec head -10 {} + 2>/dev/null | head -30)",
+      "Bash(find /Users/scotwells/repos/datum-cloud/activity -type f -name *.rules.yaml -o -name *.rules.yml)",
+      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n telemetry-system)",
+      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get svc -n telemetry-system)",
+      "Bash(curl -s 'http://localhost:8082/api/v1/alerts')",
+      "Bash(python3 -m json.tool)",
+      "Bash(curl -s http://localhost:8082/api/v1/alerts)",
+      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n activity-system -o custom-columns='NAME:.metadata.name,RESTARTS:.status.containerStatuses[*].restartCount,LAST_STATE:.status.containerStatuses[*].lastState' 2>&1)",
+      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n activity-system 2>&1)"
     ]
   }
 }
diff --git a/config/components/observability/alerts/alertmanager-config.yaml b/config/components/observability/alerts/alertmanager-config.yaml
new file mode 100644
index 00000000..9220a187
--- /dev/null
+++ b/config/components/observability/alerts/alertmanager-config.yaml
@@ -0,0 +1,69 @@
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMAlertmanagerConfig
+metadata:
+  name: activity-alert-inhibitions
+  namespace: activity-system
+  labels:
+    app.kubernetes.io/part-of: activity
+spec:
+  inhibit_rules:
+    # Severity-based: critical suppresses related warnings for same component
+    - source_matchers:
+        - severity="critical"
+      target_matchers:
+        - severity="warning"
+      equal:
+        - component
+
+    # ActivityAPIServerDown suppresses error rate and latency alerts
+    - source_matchers:
+        - alertname="ActivityAPIServerDown"
+      target_matchers:
+        - alertname=~"ActivityHighErrorRate|ActivityQueryLatencyHigh"
+
+    # ClickHouse unavailable suppresses query latency and pipeline stall
+    - source_matchers:
+        - alertname="ActivityClickHouseUnavailable"
+      target_matchers:
+        - alertname=~"ActivityQueryLatencyHigh|ActivityDataPipelineStalled"
+
+    # All NATS sources stalled suppresses individual consumer alerts
+    - source_matchers:
+        - alertname="VectorAllNATSSourcesStalled"
+      target_matchers:
+        - alertname=~"VectorNATSActivitiesConsumerStalled|VectorNATSEventsConsumerStalled|VectorNATSAuditStalledWithBacklog|VectorNATSAuditSourceStopped"
+
+    # Processor down suppresses generation stalled, error rate, and DLQ alerts
+    - source_matchers:
+        - alertname="ActivityProcessorDown"
+      target_matchers:
+        - alertname=~"ActivityGenerationStalled|ActivityProcessorHighErrorRate|ActivityProcessorNoPolicies|DLQPublishErrors|DLQRetryIneffective|DLQQueueGrowing|DLQSlowLeak"
+
+    # NATS disconnected suppresses generation stalled
+    - source_matchers:
+        - alertname="ActivityProcessorNATSDisconnected"
+      target_matchers:
+        - alertname="ActivityGenerationStalled"
+
+    # Keeper session errors suppress ZooKeeper exceptions
+    - source_matchers:
+        - alertname="ClickHouseKeeperSessionErrors"
+      target_matchers:
+        - alertname="ClickHouseZooKeeperExceptions"
+
+    # Critical pipeline backlog suppresses individual consumer lag alerts
+    - source_matchers:
+        - alertname="ActivityPipelineBacklogCritical"
+      target_matchers:
+        - alertname="NATSConsumerLagHigh"
+
+    # SLO page-burn alerts suppress equivalent threshold alerts
+    - source_matchers:
+        - alertname="ActivitySLOAvailabilityPageBurn"
+      target_matchers:
+        - alertname="ActivityHighErrorRate"
+
+    - source_matchers:
+        - alertname="ActivitySLOAuditQueryPageBurn"
+      target_matchers:
+        - alertname="ActivityQueryLatencyHigh"
diff --git a/config/components/observability/kustomization.yaml b/config/components/observability/kustomization.yaml
index 9bfafc45..6601d094 100644
--- a/config/components/observability/kustomization.yaml
+++ b/config/components/observability/kustomization.yaml
@@ -18,6 +18,8 @@ resources:
   - alerts/vector-alerts.yaml
   # Recording rules (generated from Jsonnet mixin)
   - alerts/generated/activity-recordings.yaml
+  # Alertmanager inhibition rules to suppress redundant alerts
+  - alerts/alertmanager-config.yaml
 
   # PrometheusRule resources for component-specific alerts
   - prometheusrules/clickhouse-alerts.yaml

From 5748c4c792d3e5c9aeaf8b86e60036dbe3c6a0e4 Mon Sep 17 00:00:00 2001
From: Scot Wells
Date: Thu, 26 Mar 2026 10:40:10 -0500
Subject: [PATCH 2/2] Remove settings.local.json from tracking

Also convert the activity-alert-inhibitions manifest from a
VMAlertmanagerConfig (operator.victoriametrics.com/v1beta1) to an
AlertmanagerConfig (monitoring.coreos.com/v1alpha1): inhibit_rules
becomes inhibitRules, the string matchers become sourceMatch/targetMatch
entries with name/value (and matchType "=~" for the regex targets), and
a `default` route and receiver are added.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Reviewer notes (this section is ignored by `git am`):

* NOTE(review): the first inhibit rule keys on `equal: [component]`.
  Alertmanager treats a missing label and an empty label as the same
  value, so a critical alert without a `component` label would inhibit
  every warning alert that also lacks one. Confirm that all alerts these
  rules can see carry a `component` label, or narrow the matchers.
* NOTE(review): presumably the operator scopes an AlertmanagerConfig to
  alerts from its own namespace (activity-system here) -- verify that the
  source and target alerts named below are all labelled with that
  namespace, otherwise the rules will never match.
* NOTE(review): the `default` receiver declares no notification configs;
  confirm that is intended and that another route delivers these alerts.
* NOTE(review): this copy of the series was re-flowed from a flattened
  capture. Leading whitespace inside the hunks was inferred from the
  JSON/YAML structure; regenerate with `git format-patch` before applying.

 .claude/settings.local.json                   |  28 -----
 .../alerts/alertmanager-config.yaml           | 117 +++++++++++-------
 2 files changed, 73 insertions(+), 72 deletions(-)
 delete mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index 06086fb4..00000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(golangci-lint run:*)",
-      "Bash(git checkout:*)",
-      "Bash(go build:*)",
-      "Bash(go test:*)",
-      "Bash(npm run build:*)",
-      "Bash(git add:*)",
-      "Bash(git commit:*)",
-      "Bash(git push:*)",
-      "Bash(task generate:*)",
-      "Bash(grep:*)",
-      "Bash(gh pr checks:*)",
-      "Bash(gh api:*)",
-      "Bash(git cherry-pick:*)",
-      "Bash(find /Users/scotwells/repos/datum-cloud -name \"kustomization.yaml\" -path \"*/config/milo/iam*\" -exec head -10 {} + 2>/dev/null | head -30)",
-      "Bash(find /Users/scotwells/repos/datum-cloud/activity -type f -name *.rules.yaml -o -name *.rules.yml)",
-      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n telemetry-system)",
-      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get svc -n telemetry-system)",
-      "Bash(curl -s 'http://localhost:8082/api/v1/alerts')",
-      "Bash(python3 -m json.tool)",
-      "Bash(curl -s http://localhost:8082/api/v1/alerts)",
-      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n activity-system -o custom-columns='NAME:.metadata.name,RESTARTS:.status.containerStatuses[*].restartCount,LAST_STATE:.status.containerStatuses[*].lastState' 2>&1)",
-      "Bash(KUBECONFIG=~/.kube/gke-staging kubectl get pods -n activity-system 2>&1)"
-    ]
-  }
-}
diff --git a/config/components/observability/alerts/alertmanager-config.yaml b/config/components/observability/alerts/alertmanager-config.yaml
index 9220a187..45791af7 100644
--- a/config/components/observability/alerts/alertmanager-config.yaml
+++ b/config/components/observability/alerts/alertmanager-config.yaml
@@ -1,69 +1,98 @@
-apiVersion: operator.victoriametrics.com/v1beta1
-kind: VMAlertmanagerConfig
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
 metadata:
   name: activity-alert-inhibitions
   namespace: activity-system
   labels:
     app.kubernetes.io/part-of: activity
 spec:
-  inhibit_rules:
+  route:
+    receiver: default
+    continue: true
+  receivers:
+    - name: default
+  inhibitRules:
     # Severity-based: critical suppresses related warnings for same component
-    - source_matchers:
-        - severity="critical"
-      target_matchers:
-        - severity="warning"
+    - sourceMatch:
+        - name: severity
+          value: critical
+      targetMatch:
+        - name: severity
+          value: warning
       equal:
         - component
 
     # ActivityAPIServerDown suppresses error rate and latency alerts
-    - source_matchers:
-        - alertname="ActivityAPIServerDown"
-      target_matchers:
-        - alertname=~"ActivityHighErrorRate|ActivityQueryLatencyHigh"
+    - sourceMatch:
+        - name: alertname
+          value: ActivityAPIServerDown
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityHighErrorRate|ActivityQueryLatencyHigh"
 
     # ClickHouse unavailable suppresses query latency and pipeline stall
-    - source_matchers:
-        - alertname="ActivityClickHouseUnavailable"
-      target_matchers:
-        - alertname=~"ActivityQueryLatencyHigh|ActivityDataPipelineStalled"
+    - sourceMatch:
+        - name: alertname
+          value: ActivityClickHouseUnavailable
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityQueryLatencyHigh|ActivityDataPipelineStalled"
 
     # All NATS sources stalled suppresses individual consumer alerts
-    - source_matchers:
-        - alertname="VectorAllNATSSourcesStalled"
-      target_matchers:
-        - alertname=~"VectorNATSActivitiesConsumerStalled|VectorNATSEventsConsumerStalled|VectorNATSAuditStalledWithBacklog|VectorNATSAuditSourceStopped"
+    - sourceMatch:
+        - name: alertname
+          value: VectorAllNATSSourcesStalled
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "VectorNATSActivitiesConsumerStalled|VectorNATSEventsConsumerStalled|VectorNATSAuditStalledWithBacklog|VectorNATSAuditSourceStopped"
 
     # Processor down suppresses generation stalled, error rate, and DLQ alerts
-    - source_matchers:
-        - alertname="ActivityProcessorDown"
-      target_matchers:
-        - alertname=~"ActivityGenerationStalled|ActivityProcessorHighErrorRate|ActivityProcessorNoPolicies|DLQPublishErrors|DLQRetryIneffective|DLQQueueGrowing|DLQSlowLeak"
+    - sourceMatch:
+        - name: alertname
+          value: ActivityProcessorDown
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityGenerationStalled|ActivityProcessorHighErrorRate|ActivityProcessorNoPolicies|DLQPublishErrors|DLQRetryIneffective|DLQQueueGrowing|DLQSlowLeak"
 
     # NATS disconnected suppresses generation stalled
-    - source_matchers:
-        - alertname="ActivityProcessorNATSDisconnected"
-      target_matchers:
-        - alertname="ActivityGenerationStalled"
+    - sourceMatch:
+        - name: alertname
+          value: ActivityProcessorNATSDisconnected
+      targetMatch:
+        - name: alertname
+          value: ActivityGenerationStalled
 
     # Keeper session errors suppress ZooKeeper exceptions
-    - source_matchers:
-        - alertname="ClickHouseKeeperSessionErrors"
-      target_matchers:
-        - alertname="ClickHouseZooKeeperExceptions"
+    - sourceMatch:
+        - name: alertname
+          value: ClickHouseKeeperSessionErrors
+      targetMatch:
+        - name: alertname
+          value: ClickHouseZooKeeperExceptions
 
-    # Critical pipeline backlog suppresses individual consumer lag alerts
-    - source_matchers:
-        - alertname="ActivityPipelineBacklogCritical"
-      target_matchers:
-        - alertname="NATSConsumerLagHigh"
+    # Critical pipeline backlog suppresses consumer lag warning
+    - sourceMatch:
+        - name: alertname
+          value: ActivityPipelineBacklogCritical
+      targetMatch:
+        - name: alertname
+          value: NATSConsumerLagHigh
 
     # SLO page-burn alerts suppress equivalent threshold alerts
-    - source_matchers:
-        - alertname="ActivitySLOAvailabilityPageBurn"
-      target_matchers:
-        - alertname="ActivityHighErrorRate"
+    - sourceMatch:
+        - name: alertname
+          value: ActivitySLOAvailabilityPageBurn
+      targetMatch:
+        - name: alertname
+          value: ActivityHighErrorRate
 
-    - source_matchers:
-        - alertname="ActivitySLOAuditQueryPageBurn"
-      target_matchers:
-        - alertname="ActivityQueryLatencyHigh"
+    - sourceMatch:
+        - name: alertname
+          value: ActivitySLOAuditQueryPageBurn
+      targetMatch:
+        - name: alertname
+          value: ActivityQueryLatencyHigh