Skip to content

Commit

Permalink
Overhaul warning alerts
Browse files Browse the repository at this point in the history
- Alertmanagers will now send out warning alerts
- Adjusted or removed warning alerts
- Added tests for warning alerts
- Added `kube_pod_status_ready` to Prometheus whitelist
  • Loading branch information
wyb1 committed Apr 24, 2019
1 parent eceb113 commit d0dded2
Show file tree
Hide file tree
Showing 26 changed files with 364 additions and 316 deletions.
1 change: 0 additions & 1 deletion charts/seed-bootstrap/templates/alertmanager/_config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ route:
# email only for critical and blocker
- match_re:
visibility: ^(all|operator)$
severity: ^(blocker|critical|info)$
receiver: email-kubernetes-ops


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ route:
# email only for critical and blocker
- match_re:
visibility: ^(all|owner)$
severity: ^(critical|blocker|info)$
receiver: email-kubernetes-ops

inhibit_rules:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# ApiServerUnreachableViaKubernetesService
- series: 'probe_success{job="blackbox-exporter-k8s-service-check"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# CloudControllerManagerDown
- series: 'up{job="cloud-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# CoreDNSDown
- series: 'up{job="coredns"}'
values: '0+0x40'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# ApiServerNotReachable
- series: 'probe_success{instance="api.example.com/healthz", job="blackbox-apiserver"}'
values: '0+0x10'
# KubeApiserverDown
- series: 'up{job="kube-apiserver"}'
values: '0+0x10'
# KubeApiServerTooManyOpenFileDescriptors
- series: 'process_open_fds{job="kube-apiserver", instance="instance"}'
values: '81+0x60'
- series: 'process_max_fds{job="kube-apiserver", instance="instance"}'
values: '100+0x60'
# KubeApiServerLatency
- series: 'apiserver_request_latencies_bucket{verb="POST", le="5000000"}'
values: '100+0x60'
- series: 'apiserver_request_latencies_bucket{verb="POST", le="+Inf"}'
values: '100+0x60'
alert_rule_test:
- eval_time: 5m
alertname: ApiServerNotReachable
Expand Down Expand Up @@ -48,6 +56,7 @@ tests:
type: seed
job: kube-apiserver
instance: instance
visibility: owner
exp_annotations:
description: 'The API server (instance) is using 81% of the available file/socket descriptors.'
summary: 'The API server has too many open file descriptors'
Expand All @@ -61,4 +70,16 @@ tests:
exp_annotations:
description: 'The API server (instance) is using 81% of the available file/socket descriptors.'
summary: 'The API server has too many open file descriptors'
- eval_time: 30m
alertname: KubeApiServerLatency
exp_alerts:
- exp_labels:
service: kube-apiserver
severity: warning
type: seed
visibility: owner
verb: POST
exp_annotations:
description: Kube API server latency for verb POST is high. This could be because the shoot workers and the control plane are in different regions. 99th percentile of request latency is greater than 3 second.
summary: Kubernetes API server latency is high

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeControllerManagerDown
- series: 'up{job="kube-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeEtcdMainDown
- series: 'up{job="kube-etcd3",role="main"}'
values: '0+0x20'
# KubeEtcdEventsDown
- series: 'up{job="kube-etcd3",role="events"}'
values: '0+0x30'
# KubeEtcd3MainNoLeader
- series: 'etcd_server_has_leader{job="kube-etcd3",role="main"}'
values: '0+0x20'
# KubeEtcd3EventsNoLeader
- series: 'etcd_server_has_leader{job="kube-etcd3",role="events"}'
values: '0+0x30'
# KubeEtcd3HighNumberOfFailedProposals
- series: 'etcd_server_proposals_failed_total{job="kube-etcd3", pod="etcd"}'
values: '0+0x15 1+0x15 2+0x15 3+0x15 4+0x15 5+0x15 6+0x15 7+0x15'
alert_rule_test:
- eval_time: 5m
alertname: KubeEtcdMainDown
Expand Down Expand Up @@ -58,4 +65,18 @@ tests:
visibility: operator
exp_annotations:
description: Etcd3 events has no leader. No communication with etcd events possible. New cluster events cannot be collected. Events can only be read.
summary: Etcd3 events has no leader.
summary: Etcd3 events has no leader.
- eval_time: 1h
alertname: KubeEtcd3HighNumberOfFailedProposals
exp_alerts:
- exp_labels:
service: etcd
severity: warning
type: seed
visibility: operator
pod: etcd
job: kube-etcd3
exp_annotations:
description: Etcd3 pod etcd has seen 7 proposal failures
within the last hour.
summary: High number of failed etcd proposals
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,24 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeKubeletNodeDown
- series: 'up{job="kube-kubelet", type="shoot"}'
values: '0+0x60'
# KubeletTooManyOpenFileDescriptorsShoot
- series: 'process_open_fds{kubernetes_io_hostname="hostname", job="kube-kubelet"}'
values: '81+0x60'
- series: 'process_max_fds{kubernetes_io_hostname="hostname", job="kube-kubelet"}'
values: '100+0x60'
# KubeletTooManyOpenFileDescriptorsSeed
- series: 'process_open_fds{kubernetes_io_hostname="hostname", job="kube-kubelet-seed"}'
values: '81+0x60'
- series: 'process_max_fds{kubernetes_io_hostname="hostname", job="kube-kubelet-seed"}'
values: '100+0x60'
# KubePersistentVolumeUsageCritical KubePersistentVolumeFullInFourDays
- series: 'kubelet_volume_stats_available_bytes{job="kube-kubelet", type="seed", persistentvolumeclaim="pvc1"}'
values: '2+0x60'
values: '0+0x240'
- series: 'kubelet_volume_stats_capacity_bytes{job="kube-kubelet", type="seed", persistentvolumeclaim="pvc1"}'
values: '100+0x60'
values: '100+0x240'
alert_rule_test:
- eval_time: 30m
alertname: KubeletTooManyOpenFileDescriptorsShoot
Expand All @@ -41,15 +49,42 @@ tests:
description: 'Shoot-kubelet (hostname) is using 81% of the available file/socket descriptors. Kubelet could be under heavy load.'
summary: 'Shoot-kubelet has too many open file descriptors.'
- eval_time: 30m
alertname: KubeletTooManyOpenFileDescriptorsSeed
exp_alerts:
- exp_labels:
job: kube-kubelet-seed
service: kube-kubelet-seed
severity: critical
visibility: operator
type: seed
kubernetes_io_hostname: hostname
exp_annotations:
description: 'Seed-kubelet (hostname) is using 81% of the available file/socket descriptors. Kubelet could be under heavy load.'
summary: 'Seed-kubelet has too many open file descriptors.'
- eval_time: 1h
alertname: KubePersistentVolumeUsageCritical
exp_alerts:
- exp_labels:
job: kube-kubelet
service: kube-kubelet-seed
severity: critical
type: seed
visibility: operator
persistentvolumeclaim: pvc1
exp_annotations:
description: The PersistentVolume claimed by pvc1 is only 2.00% free.
description: The PersistentVolume claimed by pvc1 is only 0.00% free.
summary: PersistentVolume almost full.
- eval_time: 2h
alertname: KubePersistentVolumeFullInFourDays
exp_alerts:
- exp_labels:
job: kube-kubelet
service: kube-kubelet-seed
severity: warning
type: seed
visibility: operator
persistentvolumeclaim: pvc1
exp_annotations:
description: Based on recent sampling, the PersistentVolume claimed by pvc1 is expected to fill up within four days. Currently 0.00% is available.
summary: PersistentVolume will be full in four days.

Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# promtool unit tests for the alerts defined in ../rules/kube-pods.rules.yaml.
#
# Each input series uses Prometheus' expanding notation 'a+bxn' (start at a,
# add b per step, n further steps). With the 30s interval below, 'x60' spans
# 30 minutes and 'x120' spans 1 hour, matching the eval_time of the alert
# that consumes the series.
rule_files:
  - ../rules/kube-pods.rules.yaml

evaluation_interval: 30s

tests:
  - interval: 30s
    input_series:
      # KubePodPendingShoot
      - series: 'kube_pod_status_phase{phase="Pending", type="shoot", pod="pendingPod"}'
        values: '1+0x120'
      - series: 'kube_pod_labels{label_origin="gardener", pod="pendingPod"}'
        values: '1+0x120'
      # KubePodPendingControlPlane
      - series: 'kube_pod_status_phase{phase="Pending", type="seed", pod="pendingPod"}'
        values: '1+0x60'
      # KubePodNotReadyShoot
      - series: 'kube_pod_status_ready{condition="true", type="shoot", pod="notReadyPod"}'
        values: '0+0x120'
      - series: 'kube_pod_labels{label_origin="gardener", pod="notReadyPod"}'
        values: '1+0x120'
      # KubePodNotReadyControlPlane
      - series: 'kube_pod_status_ready{condition="true", type="seed", pod="cpPodNotReady"}'
        values: '0+0x60'
    alert_rule_test:
      # Shoot pod reported as Pending for the whole hour of input data.
      - eval_time: 1h
        alertname: KubePodPendingShoot
        exp_alerts:
          - exp_labels:
              service: kube-kubelet
              severity: warning
              visibility: owner
              type: shoot
              pod: pendingPod
              phase: Pending
            exp_annotations:
              description: Pod pendingPod is stuck in "Pending" state for more than 1 hour.
              summary: Shoot pod stuck in "Pending" state
      # Control plane (seed) pod reported as Pending for 30 minutes.
      - eval_time: 30m
        alertname: KubePodPendingControlPlane
        exp_alerts:
          - exp_labels:
              service: kube-kubelet
              severity: warning
              visibility: operator
              type: seed
              pod: pendingPod
              phase: Pending
            exp_annotations:
              description: Pod pendingPod is stuck in "Pending" state for more than 30 minutes.
              summary: Control plane pod stuck in "Pending" state
      # Shoot pod whose ready condition stays at 0 for the whole hour.
      - eval_time: 1h
        alertname: KubePodNotReadyShoot
        exp_alerts:
          - exp_labels:
              service: kube-kubelet
              severity: warning
              visibility: owner
              type: shoot
              pod: notReadyPod
              # Quoted: label values are strings; a bare true is a YAML boolean.
              condition: 'true'
            exp_annotations:
              description: Pod notReadyPod is not ready for more than 1 hour.
              summary: Shoot pod is in a not ready state
      # Control plane (seed) pod whose ready condition stays at 0 for 30 minutes.
      - eval_time: 30m
        alertname: KubePodNotReadyControlPlane
        exp_alerts:
          - exp_labels:
              service: kube-kubelet
              severity: warning
              visibility: operator
              type: seed
              pod: cpPodNotReady
              # Quoted: label values are strings; a bare true is a YAML boolean.
              condition: 'true'
            exp_annotations:
              description: Pod cpPodNotReady is not ready for more than 30 minutes.
              summary: Control plane pod is in a not ready state
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeSchedulerDown
- series: 'up{job="kube-scheduler"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeStateMetricsShootDown
- series: 'up{job="kube-state-metrics", type="shoot"}'
values: '0+0x20'
# KubeStateMetricsSeedDown
- series: 'up{job="kube-state-metrics", type="seed"}'
values: '0+0x20'
# NoWorkerNodes
- series: 'kube_node_spec_unschedulable'
values: '2+0x20'
- series: 'kube_node_spec_unschedulable'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# MachineControllerManagerDown
- series: 'up{job="machine-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down

0 comments on commit d0dded2

Please sign in to comment.