grafana · pracucci · Jun 8, 2021 · Jun 7, 2021 · Jun 8, 2021
@@ -5,6 +5,7 @@
 * [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. #311
 * [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315
 * [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316
+* [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. #321
 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
 

@@ -198,10 +198,13 @@
         {
           alert: 'CortexIngesterRestarts',
           expr: |||
-            changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) > 1
+            changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2
           |||,
           labels: {
-            severity: 'critical',
+            // This alert is on a cause, not a symptom. A couple of ingester restarts may be suspicious but
+            // not necessarily an issue (e.g. they may happen because of the K8S node autoscaler), so we're
+            // keeping the alert as a warning to serve as a signal in case of an outage.
+            severity: 'warning',
           },
           annotations: {
             message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.',