diff --git a/config/components/observability/alerts/alertmanager-config.yaml b/config/components/observability/alerts/alertmanager-config.yaml
new file mode 100644
index 00000000..45791af7
--- /dev/null
+++ b/config/components/observability/alerts/alertmanager-config.yaml
@@ -0,0 +1,112 @@
+# Inhibition rules that mute redundant activity alerts while a root-cause
+# alert is already firing.
+#
+# NOTE(review): under prometheus-operator's default OnNamespace matcher
+# strategy these rules only apply to alerts labelled
+# namespace="activity-system" - confirm the ClickHouse, NATS and Vector
+# alerts referenced below carry that label.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: activity-alert-inhibitions
+  namespace: activity-system
+  labels:
+    app.kubernetes.io/part-of: activity
+spec:
+  route:
+    receiver: default
+    continue: true
+  receivers:
+    # No notification integrations are configured on this receiver.
+    - name: default
+  inhibitRules:
+    # Severity-based: critical suppresses related warnings for same component.
+    # The source must carry a non-empty component label: Alertmanager treats a
+    # label that is missing on both alerts as equal, so without this guard an
+    # unlabelled critical alert would mute every unlabelled warning.
+    - sourceMatch:
+        - name: severity
+          value: critical
+        - name: component
+          matchType: "=~"
+          value: ".+"
+      targetMatch:
+        - name: severity
+          value: warning
+      equal:
+        - component
+
+    # ActivityAPIServerDown suppresses error rate and latency alerts
+    - sourceMatch:
+        - name: alertname
+          value: ActivityAPIServerDown
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityHighErrorRate|ActivityQueryLatencyHigh"
+
+    # ClickHouse unavailable suppresses query latency and pipeline stall
+    - sourceMatch:
+        - name: alertname
+          value: ActivityClickHouseUnavailable
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityQueryLatencyHigh|ActivityDataPipelineStalled"
+
+    # All NATS sources stalled suppresses individual consumer alerts
+    - sourceMatch:
+        - name: alertname
+          value: VectorAllNATSSourcesStalled
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "VectorNATSActivitiesConsumerStalled|VectorNATSEventsConsumerStalled|VectorNATSAuditStalledWithBacklog|VectorNATSAuditSourceStopped"
+
+    # Processor down suppresses generation stalled, error rate, and DLQ alerts
+    - sourceMatch:
+        - name: alertname
+          value: ActivityProcessorDown
+      targetMatch:
+        - name: alertname
+          matchType: "=~"
+          value: "ActivityGenerationStalled|ActivityProcessorHighErrorRate|ActivityProcessorNoPolicies|DLQPublishErrors|DLQRetryIneffective|DLQQueueGrowing|DLQSlowLeak"
+
+    # NATS disconnected suppresses generation stalled
+    - sourceMatch:
+        - name: alertname
+          value: ActivityProcessorNATSDisconnected
+      targetMatch:
+        - name: alertname
+          value: ActivityGenerationStalled
+
+    # Keeper session errors suppress ZooKeeper exceptions
+    - sourceMatch:
+        - name: alertname
+          value: ClickHouseKeeperSessionErrors
+      targetMatch:
+        - name: alertname
+          value: ClickHouseZooKeeperExceptions
+
+    # Critical pipeline backlog suppresses consumer lag warning
+    - sourceMatch:
+        - name: alertname
+          value: ActivityPipelineBacklogCritical
+      targetMatch:
+        - name: alertname
+          value: NATSConsumerLagHigh
+
+    # SLO page-burn alerts suppress equivalent threshold alerts
+    - sourceMatch:
+        - name: alertname
+          value: ActivitySLOAvailabilityPageBurn
+      targetMatch:
+        - name: alertname
+          value: ActivityHighErrorRate
+
+    - sourceMatch:
+        - name: alertname
+          value: ActivitySLOAuditQueryPageBurn
+      targetMatch:
+        - name: alertname
+          value: ActivityQueryLatencyHigh
diff --git a/config/components/observability/kustomization.yaml b/config/components/observability/kustomization.yaml
index d809f7bb..f8f59f9e 100644
--- a/config/components/observability/kustomization.yaml
+++ b/config/components/observability/kustomization.yaml
@@ -20,6 +20,8 @@ resources:
   - alerts/slo-alerts.yaml
   # Recording rules (generated from Jsonnet mixin)
   - alerts/generated/activity-recordings.yaml
+  # Alertmanager inhibition rules to suppress redundant alerts
+  - alerts/alertmanager-config.yaml
   # PrometheusRule resources for component-specific alerts
   - prometheusrules/clickhouse-alerts.yaml