diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index 73c22add..00000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(golangci-lint run:*)",
-      "Bash(git checkout:*)",
-      "Bash(go build:*)",
-      "Bash(go test:*)",
-      "Bash(npm run build:*)",
-      "Bash(git add:*)",
-      "Bash(git commit:*)",
-      "Bash(git push:*)",
-      "Bash(task generate:*)",
-      "Bash(grep:*)",
-      "Bash(gh pr checks:*)",
-      "Bash(gh api:*)",
-      "Bash(git cherry-pick:*)"
-    ]
-  }
-}
diff --git a/config/components/observability/alerts/activity-alerts.yaml b/config/components/observability/alerts/activity-alerts.yaml
index 02d5928f..6e92b99d 100644
--- a/config/components/observability/alerts/activity-alerts.yaml
+++ b/config/components/observability/alerts/activity-alerts.yaml
@@ -76,6 +76,34 @@ spec:
             description: "Cannot connect to ClickHouse ({{ $value }} errors/sec). Audit log data is inaccessible."
             impact: "Complete service degradation - no data can be retrieved"
 
+        # AuditLogQuery timeouts - HTTP 504 rate on auditlogqueries above 0.01/sec for 5m
+        - alert: AuditLogQuery504Errors
+          expr: |
+            rate(apiserver_request_total{job="activity-apiserver",code="504",resource="auditlogqueries"}[5m]) > 0.01
+          for: 5m
+          labels:
+            severity: warning
+            component: activity-apiserver
+            sli: availability
+          annotations:
+            summary: "Audit log queries returning timeout errors"
+            description: "{{ $value }} 504 errors/sec on auditlogqueries. Queries are timing out."
+            impact: "Users experiencing failed audit log queries"
+
+        # ClickHouse row iteration errors - error_type="iteration" rate above 0.5/sec for 5m
+        - alert: ClickHouseQueryIterationErrors
+          expr: |
+            rate(activity_clickhouse_query_errors_total{error_type="iteration"}[5m]) > 0.5
+          for: 5m
+          labels:
+            severity: warning
+            component: activity-apiserver
+            sli: availability
+          annotations:
+            summary: "ClickHouse query row iteration errors"
+            description: "{{ $value }} iteration errors/sec. May indicate client disconnects or ClickHouse issues."
+            impact: "Some queries failing during result streaming"
+
         # =========================================================================
         # Data Pipeline Health - Ensures Fresh Data
         # =========================================================================
@@ -85,7 +113,7 @@ spec:
         # Data Freshness SLI - Critical for audit compliance
         - alert: ActivityDataPipelineStalled
           expr: |
-            rate(vector_events_out_total{component_id="clickhouse"}[5m]) == 0
+            sum(rate(vector_component_sent_events_total{component_type="clickhouse"}[5m])) == 0
           for: 15m
           labels:
             severity: critical
@@ -96,6 +124,20 @@ spec:
             description: "No new audit events are being stored in ClickHouse. Data is becoming stale."
             impact: "Users querying outdated audit data - compliance risk"
 
+        # NATS Consumer Lag - Early warning (more than 5000 pending for 5m) before backlog becomes critical
+        - alert: NATSConsumerLagHigh
+          expr: |
+            nats_jetstream_consumer_num_pending{stream="AUDIT_EVENTS",consumer=~"clickhouse-ingest.*"} > 5000
+          for: 5m
+          labels:
+            severity: warning
+            component: nats
+            sli: data_freshness
+          annotations:
+            summary: "Audit event consumer backlog growing"
+            description: "{{ $value }} messages pending in AUDIT_EVENTS stream. Events not reaching ClickHouse."
+            impact: "Audit log queries may return incomplete results"
+
         # NATS Consumer Lag - Leading indicator of pipeline issues
         - alert: ActivityPipelineBacklogCritical
           expr: |
@@ -110,6 +152,22 @@ spec:
             description: "{{ $value }} audit events pending. Risk of data loss if retention exceeded."
             impact: "Large delay in audit event availability - potential data loss"
 
+        # Vector sink stall - ClickHouse sinks send nothing while NATS sources still receive events
+        - alert: VectorClickHouseWritesStopped
+          expr: |
+            sum(rate(vector_component_sent_events_total{component_type="clickhouse",namespace="activity-system"}[15m])) == 0
+            AND
+            sum(rate(vector_component_received_events_total{component_type="nats",namespace="activity-system"}[15m])) > 0
+          for: 15m
+          labels:
+            severity: warning
+            component: vector-aggregator
+            sli: data_freshness
+          annotations:
+            summary: "Vector receiving events but not writing to ClickHouse"
+            description: "Vector NATS sources are receiving data but ClickHouse sinks have zero output for 15+ minutes."
+            impact: "Audit events not being stored — queries returning stale data"
+
         # Event Exporter Availability
         - alert: EventExporterDown
           expr: up{job="k8s-event-exporter"} == 0
diff --git a/config/components/observability/alerts/dlq-alerts.yaml b/config/components/observability/alerts/dlq-alerts.yaml
index ac5b0fbd..35be49b5 100644
--- a/config/components/observability/alerts/dlq-alerts.yaml
+++ b/config/components/observability/alerts/dlq-alerts.yaml
@@ -61,6 +61,21 @@ spec:
             impact: "Events failing persistently - policy or cluster issue preventing recovery"
             runbook_url: "https://github.com/datum-cloud/activity/blob/main/docs/runbooks/dlq/dlq-high-retry-count.md"
 
+        # DLQ slow-leak - more than 10 events published to the DLQ per 6h, sustained for 1h
+        - alert: DLQSlowLeak
+          expr: |
+            increase(activity_processor_dlq_events_published_total[6h]) > 10
+          for: 1h
+          labels:
+            severity: warning
+            component: activity-processor
+            team: platform-sre
+          annotations:
+            summary: "DLQ receiving events at a slow but steady rate"
+            description: "{{ $value }} events sent to DLQ in the last 6 hours. A policy may be failing for specific event shapes."
+            impact: "Some activities silently not being generated"
+            runbook_url: "https://github.com/datum-cloud/activity/blob/main/docs/runbooks/dlq/dlq-growth.md"
+
         # =========================================================================
         # DLQ Retry Effectiveness
         # =========================================================================