-
Notifications
You must be signed in to change notification settings - Fork 2
/
alert.rules.yml
86 lines (76 loc) · 3.52 KB
/
alert.rules.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Alerting rules, all collected in a single rule group named ecs.alert.rules.
groups:
- name: ecs.alert.rules
rules:
# Fires when node_boot_time_seconds changed more than once within the last day.
- alert: NodeRebootsTooOften
  expr: 'changes(node_boot_time_seconds[1d]) > 1'
  labels:
    severity: warning
  annotations:
    summary: 'Node reboots too often'
    description: 'Node reboots too often ({{ $value }} is more than 1 time per 1d)'
# Warns once a selected filesystem has less than 20% of its size available.
# The mountpoint regex selects /, /boot, /data and /volatile, each optionally
# prefixed with /rootfs (the empty alternative also admits a bare /rootfs).
# Prometheus regex matchers are fully anchored, so ^ and $ are redundant
# but harmless.
- alert: NodeFilesystemFreeLow
  expr: >-
    (100 * node_filesystem_avail_bytes{mountpoint=~"^(/rootfs)?(|/|/boot|/data|/volatile)$"}
    / node_filesystem_size_bytes{mountpoint=~"^(/rootfs)?(|/|/boot|/data|/volatile)$"})
    < 20
  labels:
    severity: warning
  annotations:
    summary: 'Node filesystem available space is low (< 20%)'
    description: >-
      Node filesystem {{ $labels.device }} mounted at {{ $labels.mountpoint }}
      has less than 20% available disk space remaining.
# Critical variant of the filesystem check: fires once a selected filesystem
# has less than 15% of its size available.
# The mountpoint regex selects /, /boot, /data and /volatile, each optionally
# prefixed with /rootfs (the empty alternative also admits a bare /rootfs).
- alert: NodeFilesystemFreeLowCritical
  expr: >-
    (100 * node_filesystem_avail_bytes{mountpoint=~"^(/rootfs)?(|/|/boot|/data|/volatile)$"}
    / node_filesystem_size_bytes{mountpoint=~"^(/rootfs)?(|/|/boot|/data|/volatile)$"})
    < 15
  labels:
    severity: critical
  annotations:
    summary: 'Node filesystem available space is critically low (< 15%)'
    description: >-
      Node filesystem {{ $labels.device }} mounted at {{ $labels.mountpoint }}
      has less than 15% available disk space remaining.
# Fires when free plus cached memory stays below 400 for 30 seconds, after
# dividing the byte counts by 1048576 (1024 * 1024).
- alert: NodeMemoryUsageHigh400mb
  expr: '(node_memory_MemFree_bytes + node_memory_Cached_bytes) / 1048576 < 400'
  for: 30s
  labels:
    severity: error
  annotations:
    summary: 'Node memory usage is high (< 400mb free)'
    description: >-
      Node free & cached memory together is less than the threshold (400mb)
      with a value of {{ $value }}.
# Fires when used memory stays above 80% of total for 30 seconds.
# "Used" is total minus (free + cached + buffers).
- alert: NodeMemoryUsageHigh80percent
  expr: >-
    (node_memory_MemTotal_bytes
    - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes))
    / node_memory_MemTotal_bytes * 100 > 80
  for: 30s
  labels:
    severity: warning
  annotations:
    summary: 'Node memory usage is high (> 80% used)'
    description: >-
      Node used memory is more than the threshold (80%)
      with a value of {{ $value }}.
# Fires when the 1-minute load average stays above half of machine_cpu_cores
# for two minutes.
# NOTE(review): node_load1 and machine_cpu_cores look like they come from
# different exporters; the comparison only yields results when both series
# carry identical label sets — confirm against the scrape configuration.
- alert: NodeLoadHigh
  expr: 'node_load1 > machine_cpu_cores / 2'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Node under high load'
    description: 'Node is under high load (load1 > cores /2 for 2 min).'
# Fires when more than 32 hours have passed since backup_last_start_time.
# Both time() and the metric are divided by 3600, so $value is the elapsed
# time in hours; the description reports it as a duration, not a timestamp.
# NOTE(review): assumes backup_last_start_time is a Unix timestamp in
# seconds — confirm against the exporter that publishes it.
- alert: BackupMissed
  expr: (time() / 3600) - (backup_last_start_time / 3600) > 32
  labels:
    severity: error
  annotations:
    description: "Node has not had a successful backup for more than 32h. Last successful backup was {{ $value }} hours ago. Backup should run every 24h."
    summary: "Backup run missed"
# Fires when more than 8 days have passed since update_last_call.
# Both time() and the metric are divided by 86400, so $value is the elapsed
# time in days.
# NOTE(review): assumes update_last_call is a Unix timestamp in seconds —
# confirm against the exporter that publishes it.
- alert: UpdateMissed
  expr: (time() / 86400) - (update_last_call / 86400) > 8
  labels:
    severity: error
  annotations:
    description: "Node has not had a successful update run for more than 8 days. Last update run was {{ $value }} days ago. Update should run once every 7 days."
    summary: "Update run missed"
# Warns when letsencrypt_valid_until is less than 10 days ahead of the
# current time; $value is the remaining validity in days (seconds / 86400).
- alert: LetsencryptCertValidityLow
  expr: '(letsencrypt_valid_until - time() ) / 86400 < 10'
  labels:
    severity: warning
  annotations:
    summary: 'Letsencrypt certificate is less than 10 days valid'
    description: 'Node letsencrypt certificate is less than 10 days ({{ $value }}) valid.'
# Critical variant: fires when letsencrypt_valid_until is less than 3 days
# ahead of the current time; $value is the remaining validity in days.
- alert: LetsencryptCertValidityCritical
  expr: '(letsencrypt_valid_until - time() ) / 86400 < 3'
  labels:
    severity: critical
  annotations:
    summary: 'Letsencrypt certificate is less than 3 days valid'
    description: 'Node letsencrypt certificate is less than 3 days ({{ $value }}) valid.'