Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions .claude/settings.local.json

This file was deleted.

60 changes: 59 additions & 1 deletion config/components/observability/alerts/activity-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,34 @@ spec:
description: "Cannot connect to ClickHouse ({{ $value }} errors/sec). Audit log data is inaccessible."
impact: "Complete service degradation - no data can be retrieved"

# AuditLogQuery 504 timeout errors
# Fires when the apiserver reports a sustained rate of HTTP 504 responses on
# the auditlogqueries resource above 0.01/sec (~1 timeout per ~100s) for 5m.
# Severity is warning: queries fail for users but the service is still up.
- alert: AuditLogQuery504Errors
expr: |
rate(apiserver_request_total{job="activity-apiserver",code="504",resource="auditlogqueries"}[5m]) > 0.01
for: 5m
labels:
severity: warning
component: activity-apiserver
sli: availability
annotations:
summary: "Audit log queries returning timeout errors"
description: "{{ $value }} 504 errors/sec on auditlogqueries. Queries are timing out."
impact: "Users experiencing failed audit log queries"

# ClickHouse query row iteration errors
# Fires when errors counted with error_type="iteration" exceed 0.5/sec over 5m
# sustained for 5m. As the description notes, these may be benign client
# disconnects mid-stream rather than a ClickHouse fault — hence warning only.
# NOTE(review): assumes error_type="iteration" is emitted only during result
# streaming — confirm against the activity-apiserver metrics code.
- alert: ClickHouseQueryIterationErrors
expr: |
rate(activity_clickhouse_query_errors_total{error_type="iteration"}[5m]) > 0.5
for: 5m
labels:
severity: warning
component: activity-apiserver
sli: availability
annotations:
summary: "ClickHouse query row iteration errors"
description: "{{ $value }} iteration errors/sec. May indicate client disconnects or ClickHouse issues."
impact: "Some queries failing during result streaming"

# =========================================================================
# Data Pipeline Health - Ensures Fresh Data
# =========================================================================
Expand All @@ -85,7 +113,7 @@ spec:
# Data Freshness SLI - Critical for audit compliance
- alert: ActivityDataPipelineStalled
expr: |
rate(vector_events_out_total{component_id="clickhouse"}[5m]) == 0
sum(rate(vector_events_out_total{component_type="clickhouse"}[5m])) == 0
for: 15m
labels:
severity: critical
Expand All @@ -96,6 +124,20 @@ spec:
description: "No new audit events are being stored in ClickHouse. Data is becoming stale."
impact: "Users querying outdated audit data - compliance risk"

# NATS Consumer Lag - Early warning before backlog becomes critical
# Fires when any clickhouse-ingest* consumer on the AUDIT_EVENTS stream has
# more than 5000 pending (undelivered) messages for 5 minutes.
# NOTE(review): presumably this warning threshold sits below the one used by
# ActivityPipelineBacklogCritical — confirm the two thresholds are ordered.
- alert: NATSConsumerLagHigh
expr: |
nats_jetstream_consumer_num_pending{stream="AUDIT_EVENTS",consumer=~"clickhouse-ingest.*"} > 5000
for: 5m
labels:
severity: warning
component: nats
sli: data_freshness
annotations:
summary: "Audit event consumer backlog growing"
description: "{{ $value }} messages pending in AUDIT_EVENTS stream. Events not reaching ClickHouse."
impact: "Audit log queries may return incomplete results"

# NATS Consumer Lag - Critical backlog: pending events risk data loss if retention is exceeded
- alert: ActivityPipelineBacklogCritical
expr: |
Expand All @@ -110,6 +152,22 @@ spec:
description: "{{ $value }} audit events pending. Risk of data loss if retention exceeded."
impact: "Large delay in audit event availability - potential data loss"

# Vector write pipeline stopped while still receiving from NATS
# Fires when, for 15 minutes, Vector's ClickHouse sinks emit zero events while
# its NATS sources are still receiving events — i.e. the pipeline is stuck
# inside Vector rather than upstream being idle. The second condition guards
# against firing during genuinely quiet periods with no inbound traffic.
# FIX: PromQL logical operators are lowercase keywords ("and"/"or"/"unless");
# the previous uppercase "AND" was a PromQL parse error, so this rule could
# never load or evaluate.
- alert: VectorClickHouseWritesStopped
  expr: |
    sum(rate(vector_component_sent_events_total{component_type="clickhouse",namespace="activity-system"}[15m])) == 0
    and
    sum(rate(vector_component_received_events_total{component_type="nats",namespace="activity-system"}[15m])) > 0
  for: 15m
  labels:
    severity: warning
    component: vector-aggregator
    sli: data_freshness
  annotations:
    summary: "Vector receiving events but not writing to ClickHouse"
    description: "Vector NATS sources are receiving data but ClickHouse sinks have zero output for 15+ minutes."
    impact: "Audit events not being stored — queries returning stale data"

# Event Exporter Availability
- alert: EventExporterDown
expr: up{job="k8s-event-exporter"} == 0
Expand Down
15 changes: 15 additions & 0 deletions config/components/observability/alerts/dlq-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ spec:
impact: "Events failing persistently - policy or cluster issue preventing recovery"
runbook_url: "https://github.com/datum-cloud/activity/blob/main/docs/runbooks/dlq/dlq-high-retry-count.md"

# DLQ slow-leak - low-rate but persistent failures
# Fires when more than 10 events landed in the DLQ over the trailing 6 hours,
# AND that condition has held continuously for 1 hour (for: 1h) — so a single
# short burst that ages out of the 6h window will self-resolve, while a slow
# steady trickle keeps the alert pending until it fires.
- alert: DLQSlowLeak
expr: |
increase(activity_processor_dlq_events_published_total[6h]) > 10
for: 1h
labels:
severity: warning
component: activity-processor
team: platform-sre
annotations:
summary: "DLQ receiving events at a slow but steady rate"
description: "{{ $value }} events sent to DLQ in the last 6 hours. A policy may be failing for specific event shapes."
impact: "Some activities silently not being generated"
runbook_url: "https://github.com/datum-cloud/activity/blob/main/docs/runbooks/dlq/dlq-growth.md"

# =========================================================================
# DLQ Retry Effectiveness
# =========================================================================
Expand Down
Loading