Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ clickhouse-data/
# Dashboard build outputs
dashboards/vendor/
dashboards/jsonnetfile.lock.json
config/components/observability/dashboards/generated/
observability/vendor/
observability/jsonnetfile.lock.json

# Directory used for the test-infra repo to manage the test-infra environment.
.test-infra
Expand Down
6 changes: 6 additions & 0 deletions Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ includes:
CLICKHOUSE_USERNAME: "{{.CLICKHOUSE_USERNAME}}"
CLICKHOUSE_PASSWORD: "{{.CLICKHOUSE_PASSWORD}}"
ROOT_DIR: "{{.USER_WORKING_DIR}}"
observability:
taskfile: ./observability/Taskfile.yaml
dir: ./observability
vars:
ROOT_DIR: "{{.USER_WORKING_DIR}}"

# Performance testing with k6
load:
Expand Down Expand Up @@ -351,6 +356,7 @@ tasks:
- generate:openapi
- migrations:generate
- load:generate
- observability:build-mixin
- generate:docs
cmds:
- |
Expand Down
111 changes: 111 additions & 0 deletions config/components/observability/alerts/activity-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# PrometheusRule holding the alerting rules for the Activity service:
# user-facing SLI alerts (availability, success rate, latency, backend health)
# and audit-event pipeline alerts (freshness, backlog).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: activity-alerts
  namespace: activity-system
  labels:
    prometheus: activity
    app.kubernetes.io/part-of: activity
    monitoring: "true"
spec:
  groups:
    # =========================================================================
    # Key SLI Alerts - User-Facing Service Quality
    # =========================================================================
    - name: activity-sli
      interval: 30s
      rules:
        # Service Availability SLI
        # Fires when the scrape target for the API server reports down.
        - alert: ActivityAPIServerDown
          expr: up{job="activity-apiserver"} == 0
          for: 5m
          labels:
            severity: critical
            component: activity-apiserver
            sli: availability
          annotations:
            summary: "Activity is unavailable"
            description: "Activity has been down for more than 5 minutes. Users cannot query audit logs."
            impact: "Complete service outage - no audit log queries possible"

        # Request Success Rate SLI (Error Budget)
        # Ratio of 5xx responses to all responses; threshold is 1%.
        - alert: ActivityHighErrorRate
          expr: |
            sum(rate(apiserver_request_total{job="activity-apiserver",code=~"5.."}[5m]))
            /
            sum(rate(apiserver_request_total{job="activity-apiserver"}[5m]))
            > 0.01
          for: 10m
          labels:
            severity: warning
            component: activity-apiserver
            sli: success_rate
          annotations:
            summary: "High error rate in Activity"
            description: "{{ $value | humanizePercentage }} of requests are failing (target: <1%)"
            impact: "Users experiencing failed audit log queries"

        # Query Latency SLI - Most Critical for User Experience
        # p99 of the ClickHouse query duration histogram, in seconds.
        - alert: ActivityQueryLatencyHigh
          expr: |
            histogram_quantile(0.99,
            sum(rate(activity_clickhouse_query_duration_seconds_bucket{operation="total"}[5m]))
            by (le)
            ) > 10
          for: 10m
          labels:
            severity: warning
            component: activity-apiserver
            sli: latency
          annotations:
            summary: "Audit log queries are slow"
            description: "p99 query latency is {{ $value }}s (target: <10s). Users experiencing slow responses."
            impact: "Degraded user experience - queries taking too long"

        # Data Availability SLI - Backend Health
        # Sustained rate of ClickHouse connection errors.
        - alert: ActivityClickHouseUnavailable
          expr: |
            rate(activity_clickhouse_query_errors_total{error_type="connection"}[5m]) > 0.1
          for: 5m
          labels:
            severity: critical
            component: clickhouse
            sli: availability
          annotations:
            summary: "ClickHouse database is unavailable"
            description: "Cannot connect to ClickHouse ({{ $value }} errors/sec). Audit log data is inaccessible."
            impact: "Complete service degradation - no data can be retrieved"

    # =========================================================================
    # Data Pipeline Health - Ensures Fresh Data
    # =========================================================================
    - name: activity-pipeline
      interval: 30s
      rules:
        # Data Freshness SLI - Critical for audit compliance
        # Uses the same Vector metric family as the activity:vector_writes:5m
        # recording rule so alert and dashboards observe the same series.
        # NOTE(review): a comparison with 0 cannot fire when the series is
        # absent (e.g. Vector itself is down) - confirm that case is covered
        # by another alert.
        - alert: ActivityDataPipelineStalled
          expr: |
            rate(vector_component_sent_events_total{component_id="clickhouse"}[5m]) == 0
          for: 15m
          labels:
            severity: critical
            component: vector-aggregator
            sli: data_freshness
          annotations:
            summary: "Audit event pipeline has stalled"
            description: "No new audit events are being stored in ClickHouse. Data is becoming stale."
            impact: "Users querying outdated audit data - compliance risk"

        # NATS Consumer Lag - Leading indicator of pipeline issues
        # Metric and label names match the activity:nats_consumer_lag
        # recording rule (nats_consumer_num_pending / stream_name).
        - alert: ActivityPipelineBacklogCritical
          expr: |
            nats_consumer_num_pending{stream_name="AUDIT_EVENTS"} > 500000
          for: 10m
          labels:
            severity: critical
            component: nats
            sli: data_freshness
          annotations:
            summary: "Audit event backlog is critical"
            description: "{{ $value }} audit events pending. Risk of data loss if retention exceeded."
            impact: "Large delay in audit event availability - potential data loss"
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Alerting rules for the Activity service (PrometheusRule "activity-alerts").
# NOTE(review): the quoted, alphabetically ordered keys suggest this document
# is rendered by a tool - confirm, and if so apply these corrections in the
# generator input as well so they survive regeneration.
# Alert names, labels and annotations are kept identical to the hand-written
# activity-alerts manifest, which declares the same resource name/namespace.
"apiVersion": "monitoring.coreos.com/v1"
"kind": "PrometheusRule"
"metadata":
  "labels":
    "app.kubernetes.io/part-of": "activity"
    "monitoring": "true"
    "prometheus": "activity"
  "name": "activity-alerts"
  "namespace": "activity-system"
"spec":
  "groups":
  # User-facing SLI alerts.
  - "interval": "30s"
    "name": "activity-sli"
    "rules":
    # Alert name must be a single identifier (was "Activity API ServerDown").
    - "alert": "ActivityAPIServerDown"
      "annotations":
        "description": "Activity has been down for more than 5 minutes. Users cannot query audit logs."
        "impact": "Complete service outage - no audit log queries possible"
        "summary": "Activity is unavailable"
      "expr": "up{job=\"activity-apiserver\"} == 0"
      "for": "5m"
      "labels":
        "component": "activity-apiserver"
        "severity": "critical"
        "sli": "availability"
    - "alert": "ActivityHighErrorRate"
      "annotations":
        "description": "{{ $value | humanizePercentage }} of requests are failing (target: <1%)"
        "impact": "Users experiencing failed audit log queries"
        "summary": "High error rate in Activity"
      "expr": |
        sum(rate(apiserver_request_total{job="activity-apiserver",code=~"5.."}[5m]))
        /
        sum(rate(apiserver_request_total{job="activity-apiserver"}[5m]))
        > 0.01
      "for": "10m"
      "labels":
        "component": "activity-apiserver"
        "severity": "warning"
        "sli": "success_rate"
    - "alert": "ActivityQueryLatencyHigh"
      "annotations":
        "description": "p99 query latency is {{ $value }}s (target: <10s). Users experiencing slow responses."
        "impact": "Degraded user experience - queries taking too long"
        "summary": "Audit log queries are slow"
      "expr": |
        histogram_quantile(0.99,
        sum(rate(activity_clickhouse_query_duration_seconds_bucket{operation="total"}[5m]))
        by (le)
        ) > 10
      "for": "10m"
      "labels":
        # Component label uses the service name like the other API server
        # alerts (was "ActivityQuery").
        "component": "activity-apiserver"
        "severity": "warning"
        "sli": "latency"
    - "alert": "ActivityClickHouseUnavailable"
      "annotations":
        "description": "Cannot connect to ClickHouse ({{ $value }} errors/sec). Audit log data is inaccessible."
        "impact": "Complete service degradation - no data can be retrieved"
        "summary": "ClickHouse database is unavailable"
      # Threshold written as 0.1 rather than the float-formatting artefact
      # 0.10000000000000001.
      "expr": |
        rate(activity_clickhouse_query_errors_total{error_type="connection"}[5m]) > 0.1
      "for": "5m"
      "labels":
        "component": "clickhouse"
        "severity": "critical"
        "sli": "availability"
  # Audit-event pipeline alerts.
  - "interval": "30s"
    "name": "activity-pipeline"
    "rules":
    # Name had a duplicated word ("ActivityDataPipelinePipelineStalled").
    # Metric matches the one used by the activity:vector_writes:5m recording
    # rule.
    - "alert": "ActivityDataPipelineStalled"
      "annotations":
        "description": "No new audit events are being stored in ClickHouse. Data is becoming stale."
        "impact": "Users querying outdated audit data - compliance risk"
        "summary": "Audit event pipeline has stalled"
      "expr": |
        rate(vector_component_sent_events_total{component_id="clickhouse"}[5m]) == 0
      "for": "15m"
      "labels":
        "component": "vector-aggregator"
        "severity": "critical"
        "sli": "data_freshness"
    # Metric and label names match the activity:nats_consumer_lag recording
    # rule.
    - "alert": "ActivityPipelineBacklogCritical"
      "annotations":
        "description": "{{ $value }} audit events pending. Risk of data loss if retention exceeded."
        "impact": "Large delay in audit event availability - potential data loss"
        "summary": "Audit event backlog is critical"
      "expr": |
        nats_consumer_num_pending{stream_name="AUDIT_EVENTS"} > 500000
      "for": "10m"
      "labels":
        "component": "nats"
        "severity": "critical"
        "sli": "data_freshness"
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Recording rules for the Activity service. Every rule lives in a single
# group evaluated every 30s and precomputes a series named "activity:...".
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  name: activity-recordings
  namespace: activity-system
  labels:
    app.kubernetes.io/part-of: activity
    monitoring: "true"
    prometheus: activity
spec:
  groups:
    - name: activity-recordings
      interval: 30s
      rules:
        # --- API server request rates and error ratio ---
        - record: activity:request_rate:5m
          expr: |
            sum(rate(apiserver_request_total{job="activity-apiserver"}[5m]))
            by (verb, resource, code)
        - record: activity:request_rate_total:5m
          expr: |
            sum(rate(apiserver_request_total{job="activity-apiserver"}[5m]))
        - record: activity:error_rate:5m
          expr: |
            sum(rate(apiserver_request_total{job="activity-apiserver",code=~"5.."}[5m]))
            /
            sum(rate(apiserver_request_total{job="activity-apiserver"}[5m]))

        # --- API server request duration quantiles ---
        - record: activity:apiserver_request_duration:p50
          expr: |
            histogram_quantile(0.50,
            sum(rate(apiserver_request_duration_seconds_bucket{job="activity-apiserver"}[5m]))
            by (le)
            )
        - record: activity:apiserver_request_duration:p95
          expr: |
            histogram_quantile(0.95,
            sum(rate(apiserver_request_duration_seconds_bucket{job="activity-apiserver"}[5m]))
            by (le)
            )
        - record: activity:apiserver_request_duration:p99
          expr: |
            histogram_quantile(0.99,
            sum(rate(apiserver_request_duration_seconds_bucket{job="activity-apiserver"}[5m]))
            by (le)
            )

        # --- ClickHouse query duration quantiles (operation="total") ---
        - record: activity:query_duration:p50
          expr: |
            histogram_quantile(0.50,
            sum(rate(activity_clickhouse_query_duration_seconds_bucket{operation="total"}[5m]))
            by (le)
            )
        - record: activity:query_duration:p95
          expr: |
            histogram_quantile(0.95,
            sum(rate(activity_clickhouse_query_duration_seconds_bucket{operation="total"}[5m]))
            by (le)
            )
        - record: activity:query_duration:p99
          expr: |
            histogram_quantile(0.99,
            sum(rate(activity_clickhouse_query_duration_seconds_bucket{operation="total"}[5m]))
            by (le)
            )

        # --- ClickHouse query and error rates ---
        - record: activity:query_rate:5m
          expr: |
            sum(rate(activity_clickhouse_query_total[5m]))
            by (status)
        - record: activity:clickhouse_error_rate:5m
          expr: |
            sum(rate(activity_clickhouse_query_errors_total[5m]))
            by (error_type)

        # --- Vector pipeline: events received, events written, difference ---
        - record: activity:vector_throughput:5m
          expr: |
            sum(rate(vector_component_received_events_total{component_id="nats_consumer",namespace="activity-system"}[5m]))
        - record: activity:vector_writes:5m
          expr: |
            sum(rate(vector_component_sent_events_total{component_id="clickhouse",namespace="activity-system"}[5m]))
        # Received rate minus sent rate.
        - record: activity:pipeline_lag:5m
          expr: |
            sum(rate(vector_component_received_events_total{component_id="nats_consumer",namespace="activity-system"}[5m]))
            -
            sum(rate(vector_component_sent_events_total{component_id="clickhouse",namespace="activity-system"}[5m]))

        # --- NATS stream / consumer ---
        - record: activity:nats_consumer_lag
          expr: |
            nats_consumer_num_pending{stream_name="AUDIT_EVENTS",consumer_name="clickhouse-ingest"}
        - record: activity:nats_message_rate:5m
          expr: |
            rate(nats_stream_total_messages{stream_name="AUDIT_EVENTS"}[5m])

        # --- Per-pod resource utilisation in the activity-system namespace ---
        # CPU usage rate divided by (quota / period), both summed per pod.
        - record: activity:cpu_utilization
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace="activity-system"}[5m]))
            by (pod)
            /
            sum(container_spec_cpu_quota{namespace="activity-system"} / container_spec_cpu_period{namespace="activity-system"})
            by (pod)
        # Working-set bytes divided by the memory limit, both summed per pod.
        - record: activity:memory_utilization
          expr: |
            sum(container_memory_working_set_bytes{namespace="activity-system"})
            by (pod)
            /
            sum(container_spec_memory_limit_bytes{namespace="activity-system"})
            by (pod)
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# GrafanaDashboard resource for the Activity API server dashboard.
kind: GrafanaDashboard
apiVersion: grafana.integreatly.org/v1beta1
metadata:
  labels:
    dashboards: grafana
  name: activity-apiserver-dashboard
spec:
  # The dashboard definition is referenced from a ConfigMap key rather than
  # being inlined in this manifest.
  configMapRef:
    key: activity-apiserver.json
    name: activity-apiserver-dashboard
  # Selects instances carrying the label dashboards=grafana.
  instanceSelector:
    matchLabels:
      dashboards: grafana
  allowCrossNamespaceImport: true
  resyncPeriod: 30s
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# GrafanaDashboard resource for the audit pipeline dashboard.
kind: GrafanaDashboard
apiVersion: grafana.integreatly.org/v1beta1
metadata:
  labels:
    dashboards: grafana
  name: audit-pipeline-dashboard
spec:
  # The dashboard definition is referenced from a ConfigMap key rather than
  # being inlined in this manifest.
  configMapRef:
    key: audit-pipeline.json
    name: audit-pipeline-dashboard
  # Selects instances carrying the label dashboards=grafana.
  instanceSelector:
    matchLabels:
      dashboards: grafana
  allowCrossNamespaceImport: true
  resyncPeriod: 30s
Loading