-
Notifications
You must be signed in to change notification settings - Fork 3.3k
/
prometheus-alerts.yaml
193 lines (193 loc) · 6.64 KB
/
prometheus-alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
---
# Prometheus alerting rules for Loki and the LokiStack gateway.
# NOTE(review): the "[[ .RunbookURL ]]" placeholders used in the runbook_url
# annotations look like template actions with custom delimiters, presumably
# substituted before this file is handed to Prometheus -- confirm.
groups:
# Rule group name.
- name: logging_loki.alerts
# Alert definitions follow, one list item per alert.
rules:
# LokiRequestErrors: per (job, namespace, route), the percentage of requests
# with a 5xx status_code has stayed above 10 for 15 minutes.
# The ratio is built from a "...:irate1m" series, presumably a recording
# rule defined outside this file -- verify it exists wherever this is loaded.
- alert: LokiRequestErrors
  expr: |
    sum(
    job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."}
    ) by (job, namespace, route)
    /
    sum(
    job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
    ) by (job, namespace, route)
    * 100
    > 10
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "At least 10% of requests are responded by 5xx server errors."
    message: |-
      {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
    runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors"
# LokiStackWriteRequestErrors: per (job, namespace), the percentage of
# gateway requests on the "push" handler that carry a 5xx code has stayed
# above 10 for 15 minutes.
# The "...:irate1m" series is presumably a recording rule defined elsewhere.
- alert: LokiStackWriteRequestErrors
  expr: |
    sum(
    code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"}
    ) by (job, namespace)
    /
    sum(
    code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"}
    ) by (job, namespace)
    * 100
    > 10
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors."
    message: |-
      {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
    runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors"
# LokiStackReadRequestErrors: per (job, namespace), the percentage of
# gateway requests on the query/label handlers (see the handler regex) that
# carry a 5xx code has stayed above 10 for 15 minutes.
# The "...:irate1m" series is presumably a recording rule defined elsewhere.
- alert: LokiStackReadRequestErrors
  expr: |
    sum(
    code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"}
    ) by (job, namespace)
    /
    sum(
    code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"}
    ) by (job, namespace)
    * 100
    > 10
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors."
    message: |-
      {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
    runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors"
# LokiRequestPanics: loki_panic_total has grown over the last 10 minutes for
# a (job, namespace). There is no `for:` clause, so the alert fires on the
# first evaluation where the summed increase is positive.
- alert: LokiRequestPanics
  expr: |
    sum(
    increase(
    loki_panic_total[10m]
    )
    ) by (job, namespace)
    > 0
  labels:
    severity: critical
  annotations:
    summary: "A panic was triggered."
    # increase() extrapolates to the edges of the range, so $value is usually
    # a fractional float; format it to two decimals like the other alerts in
    # this file that print $value.
    message: |-
      {{ $labels.job }} is experiencing an increase of {{ printf "%.2f" $value }} panics.
    runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics"
# LokiRequestLatency: the 0.99 quantile of loki_request_duration_seconds,
# computed per (job, namespace, route), has stayed above 1 for 15 minutes.
# Routes whose name matches "tail" (case-insensitive) are excluded.
# NOTE(review): the expression uses irate() over a 1m window together with
# `for: 15m`; irate only looks at the last two samples, so short gaps or
# dips may reset the pending period -- confirm this is intended.
- alert: LokiRequestLatency
  expr: |
    histogram_quantile(0.99,
    sum(
    irate(
    loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m]
    )
    ) by (job, le, namespace, route)
    )
    > 1
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "The 99th percentile is experiencing high latency (higher than 1 second)."
    message: |-
      {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
    runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency"
# LokiTenantRateLimit: per (job, namespace, route), the percentage of
# requests with status_code 429 has stayed above 10 for 15 minutes.
# The "...:irate1m" series is presumably a recording rule defined elsewhere.
- alert: LokiTenantRateLimit
  expr: |
    sum(
    job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"}
    ) by (job, namespace, route)
    /
    sum(
    job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
    ) by (job, namespace, route)
    * 100
    > 10
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "At least 10% of requests are responded with the rate limit error code."
    message: |-
      {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors.
    runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit"
# LokiStorageSlowWrite: the 0.99 quantile of the boltdb-shipper request
# duration histogram for operation="WRITE", per (job, namespace), has stayed
# above 1 (seconds, going by the metric name) for 15 minutes.
# The "...:rate5m" series is presumably a recording rule defined elsewhere.
- alert: LokiStorageSlowWrite
  expr: |
    histogram_quantile(0.99,
    sum(
    job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"}
    ) by (job, le, namespace)
    )
    > 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "The storage path is experiencing slow write response rates."
    message: |-
      The storage path is experiencing slow write response rates.
    runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write"
# LokiStorageSlowRead: the 0.99 quantile of the boltdb-shipper request
# duration histogram for operation="Shipper.Query", per (job, namespace),
# has stayed above 5 (seconds, going by the metric name) for 15 minutes.
# The "...:rate5m" series is presumably a recording rule defined elsewhere.
- alert: LokiStorageSlowRead
  expr: |
    histogram_quantile(0.99,
    sum(
    job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"}
    ) by (job, le, namespace)
    )
    > 5
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "The storage path is experiencing slow read response rates."
    message: |-
      The storage path is experiencing slow read response rates.
    runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read"
# LokiWritePathHighLoad: the sum of loki_ingester_wal_replay_flushing per
# (job, namespace) has stayed above 0 for 15 minutes.
- alert: LokiWritePathHighLoad
  expr: |
    sum(
    loki_ingester_wal_replay_flushing
    ) by (job, namespace)
    > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "The write path is experiencing high load, causing backpressure storage flushing."
    message: |-
      The write path is experiencing high load.
    runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load"
# LokiReadPathHighLoad: the 0.99 quantile of
# loki_logql_querystats_latency_seconds (5m rate), per (job, namespace),
# has stayed above 30 (seconds, going by the metric name) for 15 minutes.
- alert: LokiReadPathHighLoad
  expr: |
    histogram_quantile(0.99,
    sum(
    rate(
    loki_logql_querystats_latency_seconds_bucket[5m]
    )
    ) by (job, le, namespace)
    )
    > 30
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "The read path has high volume of queries, causing longer response times."
    message: |-
      The read path is experiencing high load.
    runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load"
# LokistackSchemaUpgradesRequired: a lokistack_status_condition series with
# reason="StorageNeedsSchemaUpdate" and status="true" has been present for a
# (stack_namespace, stack_name) pair with a positive sum for 1 minute.
- alert: LokistackSchemaUpgradesRequired
  expr: |
    sum (
    lokistack_status_condition{reason="StorageNeedsSchemaUpdate",status="true"}
    ) by (stack_namespace, stack_name)
    > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "One or more of the deployed LokiStacks contains an outdated storage schema configuration."
    # Literal block scalar: the line breaks below are part of the message.
    message: |-
      The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" is using a storage schema
      configuration that does not contain the latest schema version. It is recommended to update the schema
      configuration to update the schema version to the latest version in the future.
    runbook_url: "[[ .RunbookURL ]]#Lokistack-Schema-Upgrades-Required"