diff --git a/.circleci/config.yml b/.circleci/config.yml
index 71e41d63..41cd2ad2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -11,9 +11,12 @@ workflows:
jobs:
lint:
docker:
- - image: grafana/cortex-jsonnet-build-image:e19ece2
+ - image: grafana/cortex-jsonnet-build-image:3527936
steps:
- checkout
+ - run:
+ name: "Check white noise"
+ command: make check-white-noise
- run:
name: "Lint mixin"
command: make lint-mixin
@@ -23,7 +26,7 @@ jobs:
build:
docker:
- - image: grafana/cortex-jsonnet-build-image:e19ece2
+ - image: grafana/cortex-jsonnet-build-image:3527936
steps:
- checkout
- run: make build-mixin
@@ -32,7 +35,7 @@ jobs:
test-readme:
docker:
- - image: grafana/cortex-jsonnet-build-image:e19ece2
+ - image: grafana/cortex-jsonnet-build-image:3527936
steps:
- checkout
- run: make test-readme
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fdb86cf8..3eebd056 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
## master / unreleased
-* [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328
+* [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328
* [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. #311
* [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315
* [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316
diff --git a/Makefile b/Makefile
index 266f128d..d0ca2f52 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,11 @@
JSONNET_FMT := jsonnetfmt
+# Support gsed/gfind on OSX (installed via brew), falling back to sed/find. On Linux
+# systems gsed/gfind won't be installed, so will use sed/find as expected.
+SED ?= $(shell which gsed 2>/dev/null || which sed)
+FIND ?= $(shell which gfind 2>/dev/null || which find)
+
lint: lint-mixin lint-playbooks
lint-mixin: lint-mixin-with-mixtool lint-mixin-with-jsonnetfmt
@@ -50,3 +55,10 @@ test-readme:
cp -r ../cortex ./vendor/ && \
cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet && \
PAGER=cat tk show environments/default
+
+clean-white-noise: # Strip trailing spaces/tabs in place from every *.md and *.libsonnet file under the repo.
+	@$(FIND) . -type f -regextype posix-extended -regex '.*\.(md|libsonnet)' -print | \
+		SED_BIN="$(SED)" xargs ./scripts/cleanup-white-noise.sh
+
+check-white-noise: clean-white-noise # Fail if the cleanup above left the working tree different from the index.
+	@git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false)
diff --git a/build-image/Dockerfile b/build-image/Dockerfile
index 7e753e86..5b0f50aa 100644
--- a/build-image/Dockerfile
+++ b/build-image/Dockerfile
@@ -30,7 +30,7 @@ FROM golang:1.15-alpine AS mixtool-builder
RUN GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5
FROM alpine:3.13
-RUN apk add --no-cache git make libgcc libstdc++ zip
+RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed
COPY --from=jsonnet-builder /usr/bin/jsonnetfmt /usr/bin
COPY --from=jsonnet-builder /usr/bin/jsonnet /usr/bin
COPY --from=jb-builder /usr/bin/jb /usr/bin
diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet
index a93ffe05..582aba4b 100644
--- a/cortex-mixin/alerts/alerts.libsonnet
+++ b/cortex-mixin/alerts/alerts.libsonnet
@@ -599,7 +599,7 @@
container_memory_working_set_bytes{container="etcd"}
/
container_spec_memory_limit_bytes{container="etcd"}
- ) > 0.65
+ ) > 0.65
|||,
'for': '15m',
labels: {
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index c965b265..c54ae659 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -383,8 +383,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
name="%(cacheName)s"
}[$__rate_interval]
)
- )
- /
+ )
+ /
sum(
rate(
thanos_cache_memcached_requests_total{
@@ -405,20 +405,20 @@ local utils = import 'mixin-utils/utils.libsonnet';
ignoring(%s) group_right() (
label_replace(
count by(
- %s,
- %s,
+ %s,
+ %s,
device
- )
+ )
(
container_fs_writes_bytes_total{
%s,
container="%s",
device!~".*sda.*"
}
- ),
- "device",
- "$1",
- "device",
+ ),
+ "device",
+ "$1",
+ "device",
"/dev/(.*)"
) * 0
)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 965e0e76..9bc9b7d6 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -16,15 +16,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the time range of the query).
For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.
-
- The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
- the query results cache, the metadata cache, the chunks cache, and the index cache.
+ The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
+ the query results cache, the metadata cache, the chunks cache, and the index cache.
- These panels will show “no data” if the caches are not deployed.
+ These panels will show “no data” if the caches are not deployed.
- Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage. + Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage.
|||), ) @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; route=~"(prometheus|api_prom)_api_v1_query" }[$__rate_interval] ) - ) + + ) + sum( rate( cortex_prometheus_rule_evaluations_total{ @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Instant queries per second', ||| Rate of instant queries per second being made to the system. - Includes both queries made to the /prometheus API as + Includes both queries made to the /prometheus API as well as queries from the ruler. ||| ), @@ -83,8 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Range queries per second', ||| - Rate of range queries per second being made to - Cortex via the /prometheus API. + Rate of range queries per second being made to + Cortex via the /prometheus API. ||| ), ) @@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; The query scheduler is an optional service that moves the internal queue from the query-frontend into a separate component. - If this service is not deployed, + If this service is not deployed, these panels will show "No data." ||| @@ -286,8 +286,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; %s }[$__rate_interval] ) - ) - / + ) + / sum by(item_type) ( rate( thanos_store_index_cache_requests_total{ @@ -307,7 +307,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Hit Ratio', ||| Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an - in-memory blocks index cache. + in-memory blocks index cache. ||| ), ) diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index 8a77be1c..e99faee4 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -11,16 +11,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.textPanel('', |||
This dashboard shows various health metrics for the Cortex write path.
- It is broken into sections for each service on the write path,
+ It is broken into sections for each service on the write path,
and organized by the order in which the write request flows.
Incoming metrics data travels from the gateway → distributor → ingester.
For each service, there are 3 panels showing
- (1) requests per second to that service,
- (2) average, median, and p99 latency of requests to that service, and
+ (1) requests per second to that service,
+ (2) average, median, and p99 latency of requests to that service, and
(3) p99 latency of requests to each instance of that service.
-
It also includes metrics for the key-value (KV) stores used to manage the high-availability tracker and the ingesters. @@ -216,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Uploaded blocks / sec', ||| - The rate of blocks being uploaded from the ingesters + The rate of blocks being uploaded from the ingesters to object storage. ||| ), @@ -227,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Upload latency', ||| - The average, median (50th percentile), and 99th percentile time + The average, median (50th percentile), and 99th percentile time the ingesters take to upload blocks to object storage. ||| ), @@ -247,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each active time series; these blocks get periodically compacted (by default, every 2h). - This panel shows the rate of compaction operations across all TSDBs on all ingesters. + This panel shows the rate of compaction operations across all TSDBs on all ingesters. ||| ), ) @@ -275,7 +275,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations per second', ||| - The WAL is truncated each time a new TSDB block is written. This panel measures the rate of + The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. ||| ), @@ -289,7 +289,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Checkpoints created per second', ||| - Checkpoints are created as part of the WAL truncation process. + Checkpoints are created as part of the WAL truncation process. This metric measures the rate of checkpoint creation. 
||| ), @@ -301,7 +301,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| - Average time taken to perform a full WAL truncation, + Average time taken to perform a full WAL truncation, including the time taken for the checkpointing to complete. ||| ), diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 2dc2e26b..292d932a 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -16,7 +16,7 @@ In events you're looking for things like: ``` 57m Normal NodeControllerEviction Pod Marking for deletion Pod ingester-01 from Node cloud-provider-node-01 37m Normal SuccessfulDelete ReplicaSet (combined from similar events): Deleted pod: ingester-01 -32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady +32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady 28m Normal DeletingAllPods Node Node cloud-provider-node-01 event: Deleting all Pods from Node cloud-provider-node-01. ``` @@ -313,7 +313,7 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ### CortexBucketIndexNotUpdated -This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. +This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. 
How to **investigate**: - Ensure the compactor is successfully running @@ -557,7 +557,7 @@ metadata: spec: accessModes: - ReadWriteOnce - capacity: + capacity: storage: 150Gi gcePersistentDisk: fsType: ext4 diff --git a/cortex-mixin/groups.libsonnet b/cortex-mixin/groups.libsonnet index 63076672..6d33ea36 100644 --- a/cortex-mixin/groups.libsonnet +++ b/cortex-mixin/groups.libsonnet @@ -29,7 +29,7 @@ if alert_aggregation_labels_override != null then std.trace( ||| - Deprecated: _config.alert_aggregation_labels + Deprecated: _config.alert_aggregation_labels This field has been explicitly overridden to "%s". Instead, express the override in terms of _config.cluster_labels. E.g., cluster_labels: %s will automatically convert to "%s". diff --git a/scripts/cleanup-white-noise.sh b/scripts/cleanup-white-noise.sh new file mode 100755 index 00000000..ac13874b --- /dev/null +++ b/scripts/cleanup-white-noise.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SED_BIN=${SED_BIN:-sed} + +${SED_BIN} -i 's/[ \t]*$//' "$@"