Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overhaul warning alerts #954

Merged
merged 1 commit into from
Apr 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion charts/seed-bootstrap/templates/alertmanager/_config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ route:
# email only for critical and blocker
- match_re:
visibility: ^(all|operator)$
severity: ^(blocker|critical|info)$
receiver: email-kubernetes-ops


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ route:
# email only for critical and blocker
- match_re:
visibility: ^(all|owner)$
severity: ^(critical|blocker|info)$
receiver: email-kubernetes-ops

inhibit_rules:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# ApiServerUnreachableViaKubernetesService
- series: 'probe_success{job="blackbox-exporter-k8s-service-check"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# CloudControllerManagerDown
- series: 'up{job="cloud-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# CoreDNSDown
- series: 'up{job="coredns"}'
values: '0+0x40'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# ApiServerNotReachable
- series: 'probe_success{instance="api.example.com/healthz", job="blackbox-apiserver"}'
values: '0+0x10'
# KubeApiserverDown
- series: 'up{job="kube-apiserver"}'
values: '0+0x10'
# KubeApiServerTooManyOpenFileDescriptors
- series: 'process_open_fds{job="kube-apiserver", instance="instance"}'
values: '81+0x60'
- series: 'process_max_fds{job="kube-apiserver", instance="instance"}'
values: '100+0x60'
# KubeApiServerLatency
- series: 'apiserver_request_latencies_bucket{verb="POST", le="5000000"}'
values: '100+0x60'
- series: 'apiserver_request_latencies_bucket{verb="POST", le="+Inf"}'
values: '100+0x60'
alert_rule_test:
- eval_time: 5m
alertname: ApiServerNotReachable
Expand Down Expand Up @@ -48,6 +56,7 @@ tests:
type: seed
job: kube-apiserver
instance: instance
visibility: owner
exp_annotations:
description: 'The API server (instance) is using 81% of the available file/socket descriptors.'
summary: 'The API server has too many open file descriptors'
Expand All @@ -61,4 +70,16 @@ tests:
exp_annotations:
description: 'The API server (instance) is using 81% of the available file/socket descriptors.'
summary: 'The API server has too many open file descriptors'
- eval_time: 30m
alertname: KubeApiServerLatency
exp_alerts:
- exp_labels:
service: kube-apiserver
severity: warning
type: seed
visibility: owner
verb: POST
exp_annotations:
description: Kube API server latency for verb POST is high. This could be because the shoot workers and the control plane are in different regions. 99th percentile of request latency is greater than 3 seconds.
summary: Kubernetes API server latency is high

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeControllerManagerDown
- series: 'up{job="kube-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeEtcdMainDown
- series: 'up{job="kube-etcd3",role="main"}'
values: '0+0x20'
# KubeEtcdEventsDown
- series: 'up{job="kube-etcd3",role="events"}'
values: '0+0x30'
# KubeEtcd3MainNoLeader
- series: 'etcd_server_has_leader{job="kube-etcd3",role="main"}'
values: '0+0x20'
# KubeEtcd3EventsNoLeader
- series: 'etcd_server_has_leader{job="kube-etcd3",role="events"}'
values: '0+0x30'
# KubeEtcd3HighNumberOfFailedProposals
- series: 'etcd_server_proposals_failed_total{job="kube-etcd3", pod="etcd"}'
values: '0+0x15 1+0x15 2+0x15 3+0x15 4+0x15 5+0x15 6+0x15 7+0x15'
alert_rule_test:
- eval_time: 5m
alertname: KubeEtcdMainDown
Expand Down Expand Up @@ -58,4 +65,18 @@ tests:
visibility: operator
exp_annotations:
description: Etcd3 events has no leader. No communication with etcd events possible. New cluster events cannot be collected. Events can only be read.
summary: Etcd3 events has no leader.
summary: Etcd3 events has no leader.
- eval_time: 1h
alertname: KubeEtcd3HighNumberOfFailedProposals
exp_alerts:
- exp_labels:
service: etcd
severity: warning
type: seed
visibility: operator
pod: etcd
job: kube-etcd3
exp_annotations:
description: Etcd3 pod etcd has seen 7 proposal failures
within the last hour.
summary: High number of failed etcd proposals
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,24 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeKubeletNodeDown
- series: 'up{job="kube-kubelet", type="shoot"}'
values: '0+0x60'
# KubeletTooManyOpenFileDescriptorsShoot
- series: 'process_open_fds{kubernetes_io_hostname="hostname", job="kube-kubelet"}'
values: '81+0x60'
- series: 'process_max_fds{kubernetes_io_hostname="hostname", job="kube-kubelet"}'
values: '100+0x60'
# KubeletTooManyOpenFileDescriptorsSeed
- series: 'process_open_fds{kubernetes_io_hostname="hostname", job="kube-kubelet-seed"}'
values: '81+0x60'
- series: 'process_max_fds{kubernetes_io_hostname="hostname", job="kube-kubelet-seed"}'
values: '100+0x60'
# KubePersistentVolumeUsageCritical KubePersistentVolumeFullInFourDays
- series: 'kubelet_volume_stats_available_bytes{job="kube-kubelet", type="seed", persistentvolumeclaim="pvc1"}'
values: '2+0x60'
values: '0+0x240'
- series: 'kubelet_volume_stats_capacity_bytes{job="kube-kubelet", type="seed", persistentvolumeclaim="pvc1"}'
values: '100+0x60'
values: '100+0x240'
alert_rule_test:
- eval_time: 30m
alertname: KubeletTooManyOpenFileDescriptorsShoot
Expand All @@ -41,15 +49,42 @@ tests:
description: 'Shoot-kubelet (hostname) is using 81% of the available file/socket descriptors. Kubelet could be under heavy load.'
summary: 'Shoot-kubelet has too many open file descriptors.'
- eval_time: 30m
alertname: KubeletTooManyOpenFileDescriptorsSeed
exp_alerts:
- exp_labels:
job: kube-kubelet-seed
service: kube-kubelet-seed
severity: critical
visibility: operator
type: seed
kubernetes_io_hostname: hostname
exp_annotations:
description: 'Seed-kubelet (hostname) is using 81% of the available file/socket descriptors. Kubelet could be under heavy load.'
summary: 'Seed-kubelet has too many open file descriptors.'
- eval_time: 1h
alertname: KubePersistentVolumeUsageCritical
exp_alerts:
- exp_labels:
job: kube-kubelet
service: kube-kubelet-seed
severity: critical
type: seed
visibility: operator
persistentvolumeclaim: pvc1
exp_annotations:
description: The PersistentVolume claimed by pvc1 is only 2.00% free.
description: The PersistentVolume claimed by pvc1 is only 0.00% free.
summary: PersistentVolume almost full.
- eval_time: 2h
alertname: KubePersistentVolumeFullInFourDays
exp_alerts:
- exp_labels:
job: kube-kubelet
service: kube-kubelet-seed
severity: warning
type: seed
visibility: operator
persistentvolumeclaim: pvc1
exp_annotations:
description: Based on recent sampling, the PersistentVolume claimed by pvc1 is expected to fill up within four days. Currently 0.00% is available.
summary: PersistentVolume will be full in four days.

Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
rule_files:
- ../rules/kube-pods.rules.yaml

evaluation_interval: 30s

tests:
- interval: 30s
input_series:
# KubePodPendingShoot
- series: 'kube_pod_status_phase{phase="Pending", type="shoot", pod="pendingPod"}'
values: '1+0x120'
- series: 'kube_pod_labels{label_origin="gardener", pod="pendingPod"}'
values: '1+0x120'
# KubePodPendingControlPlane
- series: 'kube_pod_status_phase{phase="Pending", type="seed", pod="pendingPod"}'
values: '1+0x60'
# KubePodNotReadyShoot
- series: 'kube_pod_status_ready{condition="true", type="shoot", pod="notReadyPod"}'
values: '0+0x120'
- series: 'kube_pod_labels{label_origin="gardener", pod="notReadyPod"}'
values: '1+0x120'
# KubePodNotReadyControlPlane
- series: 'kube_pod_status_ready{condition="true", type="seed", pod="cpPodNotReady"}'
values: '0+0x60'
alert_rule_test:
- eval_time: 1h
alertname: KubePodPendingShoot
exp_alerts:
- exp_labels:
service: kube-kubelet
severity: warning
visibility: owner
type: shoot
pod: pendingPod
phase: Pending
exp_annotations:
description: Pod pendingPod is stuck in "Pending" state for more than 1 hour.
summary: Shoot pod stuck in "Pending" state
- eval_time: 30m
alertname: KubePodPendingControlPlane
exp_alerts:
- exp_labels:
service: kube-kubelet
severity: warning
visibility: operator
type: seed
pod: pendingPod
phase: Pending
exp_annotations:
description: Pod pendingPod is stuck in "Pending" state for more than 30 minutes.
summary: Control plane pod stuck in "Pending" state
- eval_time: 1h
alertname: KubePodNotReadyShoot
exp_alerts:
- exp_labels:
service: kube-kubelet
severity: warning
visibility: owner
type: shoot
pod: notReadyPod
condition: true
exp_annotations:
description: Pod notReadyPod is not ready for more than 1 hour.
summary: Shoot pod is in a not ready state
- eval_time: 30m
alertname: KubePodNotReadyControlPlane
exp_alerts:
- exp_labels:
service: kube-kubelet
severity: warning
visibility: operator
type: seed
pod: cpPodNotReady
condition: true
exp_annotations:
description: Pod cpPodNotReady is not ready for more than 30 minutes.
summary: Control plane pod is in a not ready state
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeSchedulerDown
- series: 'up{job="kube-scheduler"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# KubeStateMetricsShootDown
- series: 'up{job="kube-state-metrics", type="shoot"}'
values: '0+0x20'
# KubeStateMetricsSeedDown
- series: 'up{job="kube-state-metrics", type="seed"}'
values: '0+0x20'
# NoWorkerNodes
- series: 'kube_node_spec_unschedulable'
values: '2+0x20'
- series: 'kube_node_spec_unschedulable'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ evaluation_interval: 30s
tests:
- interval: 30s
input_series:
# MachineControllerManagerDown
- series: 'up{job="machine-controller-manager"}'
values: '0+0x30'
alert_rule_test:
Expand Down
Loading