From 1e56681df85727749920a5856696f698d12b70be Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 12:20:19 +0200 Subject: [PATCH 1/2] Added playbook for CortexAllocatingTooMuchMemory Signed-off-by: Marco Pracucci --- cortex-mixin/alerts/alerts.libsonnet | 6 +++--- cortex-mixin/docs/playbooks.md | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 11640f7a..befc83bf 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -479,7 +479,7 @@ }, annotations: { message: ||| - High QPS for ingesters, add more ingesters. + Ingesters in {{ $labels.namespace }} have an high samples/sec rate. |||, }, }, @@ -498,7 +498,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. + Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. |||, }, }, @@ -517,7 +517,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. + Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. |||, }, }, diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 825fa1c6..21f143fd 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -451,7 +451,25 @@ How to **fix**: ### CortexAllocatingTooMuchMemory -_TODO: this playbook has not been written yet._ +This alert fires when an ingester memory utilization is getting closer to the limit. 
+ +How it **works**: +- Cortex ingesters are a stateful service +- Having 2+ ingesters `OOMKilled` may cause a cluster outage +- Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) +- Ingester memory short spikes are primarily influenced by queries +- A pod gets `OOMKilled` once it's working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) + +How to **fix**: +- Check if the issue occurs only for a few ingesters. If so: + - Restart affected ingesters 1 by 1 (proceed with the next one once the previous pod has restarted and it's Ready) + ``` + kubectl -n <namespace> delete pod ingester-XXX + ``` + - Restarting an ingester typically reduces the memory allocated by mmap-ed files. Such memory could be reallocated again, but may let you gain more time while working on a longer term solution +- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: + - Scale up ingesters + - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) ### CortexGossipMembersMismatch From f8b162b91e2d18daf074e8c472fd5f9268738164 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 13:34:13 +0200 Subject: [PATCH 2/2] Address review feedback Signed-off-by: Marco Pracucci --- cortex-mixin/alerts/alerts.libsonnet | 2 +- cortex-mixin/docs/playbooks.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index befc83bf..71655505 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -479,7 +479,7 @@ }, annotations: { message: ||| - Ingesters in {{ $labels.namespace }} have an high samples/sec rate. 
+ Ingesters in {{ $labels.namespace }} ingest too many samples per second. |||, }, }, diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 21f143fd..dc505852 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -457,8 +457,8 @@ How it **works**: - Cortex ingesters are a stateful service - Having 2+ ingesters `OOMKilled` may cause a cluster outage - Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) -- Ingester memory short spikes are primarily influenced by queries -- A pod gets `OOMKilled` once it's working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) +- Ingester memory short spikes are primarily influenced by queries and TSDB head compaction into new blocks (occurring every 2h) +- A pod gets `OOMKilled` once its working set memory reaches the configured limit, so it's important to prevent ingester memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) How to **fix**: - Check if the issue occurs only for a few ingesters. If so: @@ -466,7 +466,7 @@ How to **fix**: ``` kubectl -n <namespace> delete pod ingester-XXX ``` - - Restarting an ingester typically reduces the memory allocated by mmap-ed files. Such memory could be reallocated again, but may let you gain more time while working on a longer term solution + - Restarting an ingester typically reduces the memory allocated by mmap-ed files. After the restart, the ingester may allocate this memory again over time, but restarting may give you more time while working on a longer-term solution - Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). 
If so: - Scale up ingesters - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h)