From c4df6555f491962b3396c69cde1f611104fc6058 Mon Sep 17 00:00:00 2001
From: Joshua Hesketh
Date: Tue, 13 Dec 2022 19:39:54 +1100
Subject: [PATCH] alerts: Fix MimirIngesterHasNotShippedBlocks for other
 deployment modes (#3627)

* alerts: Fix MimirIngesterHasNotShippedBlocks for other deployment modes

This fixes the job regex for monolithic and read-write deployment modes
for MimirIngesterHasNotShippedBlocks and MimirIngesterHasNotShippedBlocksSinceStart

* Update CHANGELOG

* Add new metric for tracking last shipped block time

* Update IngesterHasNotShippedBlocks to use new metric

Use the new mimir_ingester_shipper_bucket_last_successful_upload_time metric
which only counts for ingester shipper rather than any uploaded blocks.

* Apply suggestions from code review

Co-authored-by: Marco Pracucci

* Update CHANGELOG

* Update alert to use new metric

* Fix gauge type

* Update helm test

* Group code a bit cleaner

* Add test for new metric

* Apply suggestions from code review

Co-authored-by: Marco Pracucci
---
 CHANGELOG.md                                    |  2 ++
 .../templates/metamonitoring/mixin-alerts.yaml  |  6 +++---
 .../mimir-mixin-compiled-baremetal/alerts.yaml  |  6 +++---
 operations/mimir-mixin-compiled/alerts.yaml     |  6 +++---
 operations/mimir-mixin/alerts/blocks.libsonnet  | 18 +++++++++++++-----
 pkg/ingester/shipper.go                         | 15 +++++++++++----
 pkg/ingester/shipper_test.go                    |  8 ++++++++
 7 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b99a10279a..aae25593a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Grafana Mimir
 
 * [FEATURE] Store-gateway: streaming of series. The store-gateway can now stream results back to the querier instead of buffering them. This is expected to greatly reduce peak memory consumption while keeping latency the same. You can enable this feature by setting `-blocks-storage.bucket-store.batch-series-size` to a value in the high thousands (5000-10000). This is still an experimental feature and is subject to a changing API and instability. #3540 #3546 #3587 #3620 #3645 #3355
+* [ENHANCEMENT] Added new metric `thanos_shipper_last_successful_upload_time`: Unix timestamp (in seconds) of the last successful TSDB block uploaded to the bucket. #3627
 * [ENHANCEMENT] Ruler: Added `-ruler.alertmanager-client.tls-enabled` configuration for alertmanager client. #3432 #3597
 * [ENHANCEMENT] Activity tracker logs now have `component=activity-tracker` label. #3556
 * [ENHANCEMENT] Distributor: remove labels with empty values #2439
@@ -18,6 +19,7 @@
 ### Mixin
 
 * [ENHANCEMENT] Alerts: Added `MimirIngesterInstanceHasNoTenants` alert that fires when an ingester replica is not receiving write requests for any tenant. #3681
+* [BUGFIX] Alerts: Fixed `MimirIngesterHasNotShippedBlocks` and `MimirIngesterHasNotShippedBlocksSinceStart` alerts for when Mimir is deployed in read-write or monolithic modes and updated them to use new `thanos_shipper_last_successful_upload_time` metric. #3627
 * [BUGFIX] Alerts: Fixed `MemoryMapAreasTooHigh` alert when Mimir is deployed in read-write mode. #3626
 * [BUGFIX] Alerts: Fixed `MimirCompactorSkippedBlocksWithOutOfOrderChunks` matching on non-existent label. #3628
diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index 2dc26c65fd..1b8607eb33 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -596,9 +596,9 @@ spec:
             }} has not shipped any block in the last 4 hours.
           runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
         expr: |
-          (min by(cluster, namespace, pod) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+          (min by(cluster, namespace, pod) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
           and
-          (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+          (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) > 0)
           and
           # Only if the ingester has ingested samples over the last 4h.
           (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -617,7 +617,7 @@ spec:
             }} has not shipped any block in the last 4 hours.
           runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
         expr: |
-          (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+          (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) == 0)
           and
           (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
         for: 4h
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index b4e9d600fd..1eb30c4d6b 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -584,9 +584,9 @@ groups:
         }} has not shipped any block in the last 4 hours.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
     expr: |
-      (min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+      (min by(cluster, namespace, instance) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
       and
-      (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+      (max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) > 0)
       and
       # Only if the ingester has ingested samples over the last 4h.
       (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -605,7 +605,7 @@ groups:
         }} has not shipped any block in the last 4 hours.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
     expr: |
-      (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+      (max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) == 0)
       and
       (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
     for: 4h
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index 9a69ebb582..21e180aaab 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -584,9 +584,9 @@ groups:
         }} has not shipped any block in the last 4 hours.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
     expr: |
-      (min by(cluster, namespace, pod) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
+      (min by(cluster, namespace, pod) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
       and
-      (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
+      (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) > 0)
       and
       # Only if the ingester has ingested samples over the last 4h.
       (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -605,7 +605,7 @@ groups:
         }} has not shipped any block in the last 4 hours.
       runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
     expr: |
-      (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
+      (max by(cluster, namespace, pod) (thanos_shipper_last_successful_upload_time) == 0)
       and
       (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
     for: 4h
diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet
index 9c075e987e..45401a65b9 100644
--- a/operations/mimir-mixin/alerts/blocks.libsonnet
+++ b/operations/mimir-mixin/alerts/blocks.libsonnet
@@ -9,9 +9,9 @@
           alert: $.alertName('IngesterHasNotShippedBlocks'),
           'for': '15m',
           expr: |||
-            (min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - thanos_objstore_bucket_last_successful_upload_time{%(per_job_label)s=~".+/ingester.*"}) > 60 * 60 * 4)
+            (min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
             and
-            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{%(per_job_label)s=~".+/ingester.*"}) > 0)
+            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_shipper_last_successful_upload_time) > 0)
             and
             # Only if the ingester has ingested samples over the last 4h.
             (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -21,7 +21,11 @@
             # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
             # samples, while the a block shipping is expected within the next 4h.
             (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
-          ||| % $._config,
+          ||| % {
+            alert_aggregation_labels: $._config.alert_aggregation_labels,
+            per_instance_label: $._config.per_instance_label,
+            alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix,
+          },
           labels: {
             severity: 'critical',
           },
@@ -35,10 +39,14 @@
           alert: $.alertName('IngesterHasNotShippedBlocksSinceStart'),
           'for': '4h',
           expr: |||
-            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{%(per_job_label)s=~".+/ingester.*"}) == 0)
+            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_shipper_last_successful_upload_time) == 0)
             and
             (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
-          ||| % $._config,
+          ||| % {
+            alert_aggregation_labels: $._config.alert_aggregation_labels,
+            per_instance_label: $._config.per_instance_label,
+            alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix,
+          },
           labels: {
             severity: 'critical',
           },
diff --git a/pkg/ingester/shipper.go b/pkg/ingester/shipper.go
index da5796f6ff..340555b716 100644
--- a/pkg/ingester/shipper.go
+++ b/pkg/ingester/shipper.go
@@ -29,10 +29,11 @@ import (
 )
 
 type metrics struct {
-	dirSyncs        prometheus.Counter
-	dirSyncFailures prometheus.Counter
-	uploads         prometheus.Counter
-	uploadFailures  prometheus.Counter
+	dirSyncs                 prometheus.Counter
+	dirSyncFailures          prometheus.Counter
+	uploads                  prometheus.Counter
+	uploadFailures           prometheus.Counter
+	lastSuccessfulUploadTime prometheus.Gauge
 }
 
 func newMetrics(reg prometheus.Registerer) *metrics {
@@ -54,6 +55,11 @@ func newMetrics(reg prometheus.Registerer) *metrics {
 		Name: "thanos_shipper_upload_failures_total",
 		Help: "Total number of block upload failures",
 	})
+	m.lastSuccessfulUploadTime = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
+		Name: "thanos_shipper_last_successful_upload_time",
+		Help: "Unix timestamp (in seconds) of the last successful TSDB block uploaded to the bucket.",
+	})
+
 	return &m
 }
@@ -166,6 +172,7 @@ func (s *Shipper) Sync(ctx context.Context) (uploaded int, err error) {
 		meta.Uploaded = append(meta.Uploaded, m.ULID)
 		uploaded++
 		s.metrics.uploads.Inc()
+		s.metrics.lastSuccessfulUploadTime.SetToCurrentTime()
 	}
 	if err := writeShipperMetaFile(s.logger, s.dir, meta); err != nil {
 		level.Warn(s.logger).Log("msg", "updating meta file failed", "err", err)
diff --git a/pkg/ingester/shipper_test.go b/pkg/ingester/shipper_test.go
index 7fb3fc2949..25a727602d 100644
--- a/pkg/ingester/shipper_test.go
+++ b/pkg/ingester/shipper_test.go
@@ -14,10 +14,12 @@ import (
 	"path/filepath"
 	"sort"
 	"testing"
+	"time"
 
 	"github.com/go-kit/log"
 	"github.com/grafana/dskit/concurrency"
 	"github.com/oklog/ulid"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/prometheus/prometheus/tsdb"
 	"github.com/stretchr/testify/require"
 	"github.com/thanos-io/objstore"
@@ -59,6 +61,9 @@ func TestShipper(t *testing.T) {
 	id1 := ulid.MustNew(1, nil)
 
 	t.Run("sync first block", func(t *testing.T) {
+		// No blocks have been uploaded yet.
+		require.Equal(t, float64(0), testutil.ToFloat64(s.metrics.lastSuccessfulUploadTime))
+
 		createBlock(t, blocksDir, id1, metadata.Meta{
 			BlockMeta: tsdb.BlockMeta{
 				ULID: id1,
@@ -77,6 +82,9 @@
 		require.NoError(t, err)
 		require.Equal(t, 1, uploaded)
 
+		// Verify that the lastSuccessfulUploadTime was updated to within the last 2 seconds.
+		require.WithinDuration(t, time.Now(), time.UnixMilli(int64(testutil.ToFloat64(s.metrics.lastSuccessfulUploadTime)*1000)), 2*time.Second)
+
 		// Verify that shipper has created a file for itself.
 		shipped, err := readShippedBlocks(blocksDir)
 		require.NoError(t, err)
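
Note: the following sketch is not part of the patch. It isolates the client_golang pattern the shipper change relies on: register a gauge that holds a Unix timestamp, call SetToCurrentTime() after each successful upload, and read it back with testutil.ToFloat64 the way the new test does. The uploadTracker type and the example_last_successful_upload_time metric name are hypothetical, introduced only for this illustration.

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// uploadTracker mirrors the shipper metrics pattern from the patch: a gauge
// holding the Unix timestamp (in seconds) of the last successful upload.
type uploadTracker struct {
	lastSuccessfulUploadTime prometheus.Gauge
}

func newUploadTracker(reg prometheus.Registerer) *uploadTracker {
	return &uploadTracker{
		lastSuccessfulUploadTime: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			// Hypothetical metric name, used only for this illustration.
			Name: "example_last_successful_upload_time",
			Help: "Unix timestamp (in seconds) of the last successful upload.",
		}),
	}
}

// markUploaded records the current time, analogous to what Sync does after
// each successfully uploaded block in the patched shipper.
func (u *uploadTracker) markUploaded() {
	u.lastSuccessfulUploadTime.SetToCurrentTime()
}

func main() {
	reg := prometheus.NewRegistry()
	tracker := newUploadTracker(reg)

	// Before any upload the gauge reads zero, which is the condition the
	// IngesterHasNotShippedBlocksSinceStart expression checks for.
	fmt.Println("before upload:", testutil.ToFloat64(tracker.lastSuccessfulUploadTime))

	tracker.markUploaded()

	// After an upload the gauge holds the upload time as Unix seconds.
	ts := testutil.ToFloat64(tracker.lastSuccessfulUploadTime)
	fmt.Println("last successful upload:", time.Unix(int64(ts), 0))
}
```

As the commit message notes, a shipper-owned gauge is used because thanos_objstore_bucket_last_successful_upload_time counts any block uploaded through the bucket client, so the alerts can watch uploads performed by the ingester shipper specifically without relying on a job-name regex.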