From 2381b4bb8ac1d9f4f263d7b72196cc6cbcd0b237 Mon Sep 17 00:00:00 2001
From: Marco Pracucci <marco@pracucci.com>
Date: Wed, 23 Jun 2021 10:27:29 +0200
Subject: [PATCH 1/2] Add playbook for CortexRequestErrors and config option to
 exclude specific routes

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 CHANGELOG.md                         |  1 +
 cortex-mixin/alerts/alerts.libsonnet | 21 ++++++++++++++++-----
 cortex-mixin/config.libsonnet        |  3 +++
 cortex-mixin/docs/playbooks.md       | 17 ++++++++++++-----
 4 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 47a64a96..7e5af29c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@
 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
 * [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
 * [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. #331
+* [ENHANCEMENT] cortex-mixin: Added `alert_excluded_routes` config to exclude specific routes from alerts. #326
 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335
diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet
index 582aba4b..11640f7a 100644
--- a/cortex-mixin/alerts/alerts.libsonnet
+++ b/cortex-mixin/alerts/alerts.libsonnet
@@ -21,11 +21,14 @@
           // Note if alert_aggregation_labels is "job", this will repeat the label. But
           // prometheus seems to tolerate that.
           expr: |||
-            100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
+            100 * sum by (%(group_by)s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m]))
               /
-            sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
+            sum by (%(group_by)s, job, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m]))
               > 1
-          ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
+          ||| % {
+            group_by: $._config.alert_aggregation_labels,
+            excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes),
+          },
           'for': '15m',
           labels: {
             severity: 'critical',
@@ -39,10 +42,18 @@
         {
           alert: 'CortexRequestLatency',
           expr: |||
-            %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
+            %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"%(excluded_routes)s"}
                >
             %(cortex_p99_latency_threshold_seconds)s
-          ||| % $._config,
+          ||| % $._config {
+            excluded_routes: std.join('|', [
+              'metrics',
+              '/frontend.Frontend/Process',
+              'ready',
+              '/schedulerpb.SchedulerForFrontend/FrontendLoop',
+              '/schedulerpb.SchedulerForQuerier/QuerierLoop',
+            ] + $._config.alert_excluded_routes),
+          },
           'for': '15m',
           labels: {
             severity: 'warning',
diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet
index dacd06ea..917ffd51 100644
--- a/cortex-mixin/config.libsonnet
+++ b/cortex-mixin/config.libsonnet
@@ -64,5 +64,8 @@
       writes: true,
       reads: true,
     },
+
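+    // Additional routes to exclude from per-route alerts (e.g. CortexRequestErrors, CortexRequestLatency). Entries are joined with '|' into a `route!~` regex matcher.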
+    alert_excluded_routes: [],
   },
 }
diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md
index 9114f564..4373bad8 100644
--- a/cortex-mixin/docs/playbooks.md
+++ b/cortex-mixin/docs/playbooks.md
@@ -109,7 +109,18 @@ Right now most of the execution time will be spent in PromQL's innerEval. NB tha
 
 ### CortexRequestErrors
 
-_TODO: this playbook has not been written yet._
+This alert fires when more than 1% of the requests to a specific route have been failing with a 5xx status code for at least 15 minutes.
+
+This alert typically acts as a last resort to detect issues / outages. SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one.
+
+How to **investigate**:
+- Check for which route the alert fired
+  - Write path: open the `Cortex / Writes` dashboard
+  - Read path: open the `Cortex / Reads` dashboard
+- Looking at the dashboard you should see in which Cortex service the error originates
+  - The panels in the dashboard are vertically sorted by the network path (e.g. on the write path: cortex-gw -> distributor -> ingester)
+- If the failing service is going OOM (`OOMKilled`): scale up or increase the memory
+- If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there
 
 ### CortexTransferFailed
 This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n <namespace> describe pod <pod>`
@@ -355,10 +366,6 @@ WAL corruptions are only detected at startups, so at this point the WAL/Checkpoi
   2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss.
   3. Equal or more than the replication factor: Then there is definitely some data loss.
 
-### CortexRequestErrors
-
-_TODO: this playbook has not been written yet._
-
 ### CortexTableSyncFailure
 
 _This alert applies to Cortex chunks storage only._

From 967ab576f210c3ce96e14f1431f7cd8dcf4c52e5 Mon Sep 17 00:00:00 2001
From: Marco Pracucci <marco@pracucci.com>
Date: Wed, 23 Jun 2021 13:58:38 +0200
Subject: [PATCH 2/2] Fixed PR number in CHANGELOG

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e5af29c..b446c176 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@
 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
 * [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
 * [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. #331
-* [ENHANCEMENT] cortex-mixin: Added `alert_excluded_routes` config to exclude specific routes from alerts. #326
+* [ENHANCEMENT] cortex-mixin: Added `alert_excluded_routes` config to exclude specific routes from alerts. #338
 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335