From cde7a7ed410b6d9856a0e557b3212416ebac197f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 18 Feb 2019 10:57:31 +0000 Subject: [PATCH 01/18] Bump github.com/pkg/errors from 0.8.0 to 0.8.1 (#793) Bumps [github.com/pkg/errors](https://github.com/pkg/errors) from 0.8.0 to 0.8.1. - [Release notes](https://github.com/pkg/errors/releases) - [Commits](https://github.com/pkg/errors/compare/v0.8.0...v0.8.1) Signed-off-by: dependabot[bot] --- Gopkg.lock | 6 +++--- Gopkg.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 5fc954c460..038ad0e0ab 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -433,12 +433,12 @@ version = "v1.0.2" [[projects]] - digest = "1:7365acd48986e205ccb8652cc746f09c8b7876030d53710ea6ef7d0bd0dcd7ca" + digest = "1:1d7e1867c49a6dd9856598ef7c3123604ea3daabf5b83f303ff457bcbc410b1d" name = "github.com/pkg/errors" packages = ["."] pruneopts = "" - revision = "645ef00459ed84a119197bfb8d8205042c6df63d" - version = "v0.8.0" + revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" + version = "v0.8.1" [[projects]] digest = "1:37e79889eaa743256a4923e15fb6338ad14cfe413985a075322a13744fa3602b" diff --git a/Gopkg.toml b/Gopkg.toml index c68717f3b5..57422d4180 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -26,7 +26,7 @@ ignored = ["github.com/improbable-eng/thanos/benchmark/*"] [[constraint]] name = "github.com/pkg/errors" - version = "0.8.0" + version = "0.8.1" [[constraint]] name = "github.com/prometheus/client_golang" From 0ceacc933e0a92d0f4f6b16433ebcdebcb89dba1 Mon Sep 17 00:00:00 2001 From: Dominic Green Date: Mon, 18 Feb 2019 10:58:15 +0000 Subject: [PATCH 02/18] ran make (#846) * ran make * fmt fmt fmt --- cmd/thanos/downsample.go | 2 +- pkg/pool/pool_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/thanos/downsample.go b/cmd/thanos/downsample.go index fed8946bd4..728dbdbf4a 100644 --- a/cmd/thanos/downsample.go +++ b/cmd/thanos/downsample.go @@ -3,7 +3,6 @@ package main import ( "context" "encoding/json" - "github.com/improbable-eng/thanos/pkg/component" "os" "path" "path/filepath" @@ -14,6 +13,7 @@ import ( "github.com/improbable-eng/thanos/pkg/block" "github.com/improbable-eng/thanos/pkg/block/metadata" "github.com/improbable-eng/thanos/pkg/compact/downsample" + "github.com/improbable-eng/thanos/pkg/component" "github.com/improbable-eng/thanos/pkg/objstore" "github.com/improbable-eng/thanos/pkg/objstore/client" "github.com/improbable-eng/thanos/pkg/runutil" diff --git a/pkg/pool/pool_test.go b/pkg/pool/pool_test.go index 0af9341af2..ae26e6536a 100644 --- a/pkg/pool/pool_test.go +++ b/pkg/pool/pool_test.go @@ -14,7 +14,7 @@ func TestBytesPool(t *testing.T) { allocs := uint64(0) wrapped := chunkPool.new chunkPool.new = func(sz int) []byte { - allocs ++ + allocs++ return wrapped(sz) } testutil.Equals(t, []int{10, 20, 40, 80}, chunkPool.sizes) From 46ae2cd8bf6922d641cfbb9690a4e4b73aeafdfa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 18 Feb 2019 11:02:02 +0000 Subject: [PATCH 03/18] Bump github.com/grpc-ecosystem/go-grpc-prometheus (#790) Bumps [github.com/grpc-ecosystem/go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus) from `93bf462` to `68e3a13`. 
- [Release notes](https://github.com/grpc-ecosystem/go-grpc-prometheus/releases) - [Commits](https://github.com/grpc-ecosystem/go-grpc-prometheus/compare/93bf4626fba73b751b0f3cdf2649be4ce0c420cd...68e3a13e41175110a447e67246a59873d6aca902) Signed-off-by: dependabot[bot] --- Gopkg.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 038ad0e0ab..05d4d04915 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -236,11 +236,11 @@ [[projects]] branch = "master" - digest = "1:9d155c1c6b496391d43b4f4d17ca0d8d533b2d28e9d7ae3757682d25d30ccfa2" + digest = "1:8d90bab8af001cc03bb2aac86126cdb76fe0f86b6edd328d3de84bc429ab6ca7" name = "github.com/grpc-ecosystem/go-grpc-prometheus" packages = ["."] pruneopts = "" - revision = "93bf4626fba73b751b0f3cdf2649be4ce0c420cd" + revision = "68e3a13e41175110a447e67246a59873d6aca902" [[projects]] digest = "1:8e3bd93036b4a925fe2250d3e4f38f21cadb8ef623561cd80c3c50c114b13201" From f56e2f9d81582ed9daac61a78dbda84fd46148e8 Mon Sep 17 00:00:00 2001 From: Dominic Green Date: Mon, 18 Feb 2019 14:35:13 +0000 Subject: [PATCH 04/18] updating version and changelog for v0.3.1 (#853) --- CHANGELOG.md | 7 +++++++ VERSION | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffdc6fa58b..4a49487847 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,13 @@ We use *breaking* word for marking changes that are not backward compatible (rel ## Unreleased +## [v0.3.1](https://github.com/improbable-eng/thanos/releases/tag/v0.3.0) - 2019.02.18 + +### Fixed +- [#829](https://github.com/improbable-eng/thanos/issues/829) Store Gateway crashing due to `slice bounds out of range`. +- [#834](https://github.com/improbable-eng/thanos/issues/834) fixed matcher regression for `<>` `!=`. + + ## [v0.3.0](https://github.com/improbable-eng/thanos/releases/tag/v0.3.0) - 2019.02.08 ### Added diff --git a/VERSION b/VERSION index aca6b71554..9e11b32fca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.0-master +0.3.1 From e1ef24d9599cdc297a2de99b4137801d0320afe9 Mon Sep 17 00:00:00 2001 From: Dominic Green Date: Tue, 19 Feb 2019 03:18:03 +0000 Subject: [PATCH 05/18] V0.3.1 master (#855) * updating version * remove new line --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 9e11b32fca..0cb66a557e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.1 +0.3.1-master From 0c730c1a1b52d8ded40fb58139f8282f761e20c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Thu, 21 Feb 2019 05:35:54 -0500 Subject: [PATCH 06/18] store: Fixed intersect matching when one matcher filters all series. (#862) Fixes https://github.com/improbable-eng/thanos/issues/833 Signed-off-by: Bartek Plotka --- CHANGELOG.md | 5 ++++- pkg/store/bucket.go | 15 +++++---------- pkg/store/bucket_e2e_test.go | 13 +++++++++++++ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a49487847..ec71a45830 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,14 @@ We use *breaking* word for marking changes that are not backward compatible (rel ## Unreleased +### Fixed +- [#833](https://github.com/improbable-eng/thanos/issues/833) Store Gateway matcher regression for intersecting with empty posting. + ## [v0.3.1](https://github.com/improbable-eng/thanos/releases/tag/v0.3.0) - 2019.02.18 ### Fixed - [#829](https://github.com/improbable-eng/thanos/issues/829) Store Gateway crashing due to `slice bounds out of range`. 
-- [#834](https://github.com/improbable-eng/thanos/issues/834) fixed matcher regression for `<>` `!=`. +- [#834](https://github.com/improbable-eng/thanos/issues/834) Store Gateway matcher regression for `<>` `!=`. ## [v0.3.0](https://github.com/improbable-eng/thanos/releases/tag/v0.3.0) - 2019.02.08 diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go index 91fb304eb8..775210e3a1 100644 --- a/pkg/store/bucket.go +++ b/pkg/store/bucket.go @@ -1182,13 +1182,8 @@ func (r *bucketIndexReader) ExpandedPostings(ms []labels.Matcher) ([]uint64, err // NOTE: Derived from tsdb.PostingsForMatchers. for _, m := range ms { - matchingGroup := toPostingGroup(r.LabelValues, m) - if matchingGroup == nil { - continue - } - // Each group is separate to tell later what postings are intersecting with what. - postingGroups = append(postingGroups, matchingGroup) + postingGroups = append(postingGroups, toPostingGroup(r.LabelValues, m)) } if len(postingGroups) == 0 { @@ -1240,6 +1235,10 @@ func (p *postingGroup) Fill(i int, posting index.Postings) { } func (p *postingGroup) Postings() index.Postings { + if len(p.keys) == 0 { + return index.EmptyPostings() + } + return p.aggregate(p.postings) } @@ -1289,10 +1288,6 @@ func toPostingGroup(lvalsFn func(name string) []string, m labels.Matcher) *posti } } - if len(matchingLabels) == 0 { - return nil - } - return newPostingGroup(matchingLabels, merge) } diff --git a/pkg/store/bucket_e2e_test.go b/pkg/store/bucket_e2e_test.go index d900cea9b0..8133e9bc86 100644 --- a/pkg/store/bucket_e2e_test.go +++ b/pkg/store/bucket_e2e_test.go @@ -130,6 +130,7 @@ func testBucketStore_e2e(t testing.TB, ctx context.Context, s *storeSuite) { testutil.Ok(t, err) testutil.Equals(t, []string{"1", "2"}, vals.Values) + // TODO(bwplotka): Add those test cases to TSDB querier_test.go as well, there are no tests for matching. for i, tcase := range []struct { req *storepb.SeriesRequest expected [][]storepb.Label @@ -293,6 +294,18 @@ func testBucketStore_e2e(t testing.TB, ctx context.Context, s *storeSuite) { {{Name: "a", Value: "2"}, {Name: "c", Value: "2"}, {Name: "ext2", Value: "value2"}}, }, }, + // Regression https://github.com/improbable-eng/thanos/issues/833. + // Problem: Matcher that was selecting NO series, was ignored instead of passed as emptyPosting to Intersect. + { + req: &storepb.SeriesRequest{ + Matchers: []storepb.LabelMatcher{ + {Type: storepb.LabelMatcher_EQ, Name: "a", Value: "1"}, + {Type: storepb.LabelMatcher_RE, Name: "non_existing", Value: "something"}, + }, + MinTime: mint, + MaxTime: maxt, + }, + }, } { t.Log("Run ", i) From e01a5955bf46705012a12a321b688413957a7796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Thu, 21 Feb 2019 23:41:35 -0500 Subject: [PATCH 07/18] Replace old manifests with new ones presented in demo. (#809) * Removed old kube manifests with updated k8s tutorial based on demo. Signed-off-by: Bartek Plotka * Updated kubernetes-demo manifests to use improbable/thanos:v0.3.0 docker image instead of the rc. (#827) Signed-off-by: David Calvert * Fixed links. Signed-off-by: Bartek Plotka * Improved docs (#840) * Updated kubernetes-demo manifests to use improbable/thanos:v0.3.0 docker image instead of the rc. Signed-off-by: David Calvert * renamed thanos-gateway to thanos-store-gateway and fixed headless servicename in order to resolve dns. 
Signed-off-by: David Calvert * Updated docs/getting_started.md path for manifests/thanos-store-gateway.yaml Signed-off-by: David Calvert --- docs/getting_started.md | 18 +- kube/README.md | 79 -- kube/envs.sh | 12 - kube/manifests/prometheus-gcs.yaml | 199 ---- kube/manifests/prometheus.yaml | 225 ----- kube/manifests/thanos-query.yaml | 60 -- kube/manifests/thanos-store.yaml | 55 - kube/run-local.sh | 63 -- tutorials/kubernetes-demo/.gitignore | 2 + tutorials/kubernetes-demo/README.md | 3 + .../kubernetes-demo/apply-pv-gen-metrics.sh | 17 + .../blockgen/container_mem_metrics_eu1.json | 20 + .../blockgen/container_mem_metrics_us1.json | 20 + tutorials/kubernetes-demo/blockgen/main.go | 304 ++++++ .../kubernetes-demo/blockgen/main_test.go | 37 + tutorials/kubernetes-demo/cluster-down.sh | 9 + tutorials/kubernetes-demo/cluster-up.sh | 20 + .../manifests/alertmanager.yaml | 122 +++ .../grafana-datasources-querier.yaml | 20 + .../manifests/grafana-datasources.yaml | 29 + .../kubernetes-demo/manifests/grafana.yaml | 379 +++++++ .../manifests/kube-state-metrics.yaml | 169 ++++ .../kubernetes-demo/manifests/minio.yaml | 46 + .../manifests/prometheus-ha-sidecar-lts.yaml | 316 ++++++ .../manifests/prometheus-ha-sidecar.yaml | 300 ++++++ .../manifests/prometheus-ha.yaml | 223 ++++ .../manifests/prometheus-pv-0.yaml | 15 + .../manifests/prometheus-pv-1.yaml | 14 + .../manifests/prometheus-rules.yaml | 949 ++++++++++++++++++ .../kubernetes-demo/manifests/prometheus.yaml | 206 ++++ .../manifests/thanos-compactor.yaml | 44 + .../manifests/thanos-querier-no-us1.yaml | 67 ++ .../manifests/thanos-querier.yaml | 70 ++ .../manifests/thanos-ruler.yaml | 108 ++ .../manifests/thanos-store-gateway.yaml | 65 ++ tutorials/kubernetes-demo/setup.sh | 47 + .../kubernetes-demo/slides/globalview-ha.svg | 1 + .../kubernetes-demo/slides/initial-setup.svg | 1 + .../slides/unlimited-retention.svg | 1 + 39 files changed, 3633 insertions(+), 702 deletions(-) delete mode 100644 kube/README.md delete mode 100644 kube/envs.sh delete mode 100644 kube/manifests/prometheus-gcs.yaml delete mode 100644 kube/manifests/prometheus.yaml delete mode 100644 kube/manifests/thanos-query.yaml delete mode 100644 kube/manifests/thanos-store.yaml delete mode 100755 kube/run-local.sh create mode 100644 tutorials/kubernetes-demo/.gitignore create mode 100644 tutorials/kubernetes-demo/README.md create mode 100755 tutorials/kubernetes-demo/apply-pv-gen-metrics.sh create mode 100644 tutorials/kubernetes-demo/blockgen/container_mem_metrics_eu1.json create mode 100644 tutorials/kubernetes-demo/blockgen/container_mem_metrics_us1.json create mode 100644 tutorials/kubernetes-demo/blockgen/main.go create mode 100644 tutorials/kubernetes-demo/blockgen/main_test.go create mode 100755 tutorials/kubernetes-demo/cluster-down.sh create mode 100755 tutorials/kubernetes-demo/cluster-up.sh create mode 100644 tutorials/kubernetes-demo/manifests/alertmanager.yaml create mode 100644 tutorials/kubernetes-demo/manifests/grafana-datasources-querier.yaml create mode 100644 tutorials/kubernetes-demo/manifests/grafana-datasources.yaml create mode 100644 tutorials/kubernetes-demo/manifests/grafana.yaml create mode 100644 tutorials/kubernetes-demo/manifests/kube-state-metrics.yaml create mode 100644 tutorials/kubernetes-demo/manifests/minio.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml create mode 100644 
tutorials/kubernetes-demo/manifests/prometheus-ha.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus-pv-0.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus-pv-1.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus-rules.yaml create mode 100644 tutorials/kubernetes-demo/manifests/prometheus.yaml create mode 100644 tutorials/kubernetes-demo/manifests/thanos-compactor.yaml create mode 100644 tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml create mode 100644 tutorials/kubernetes-demo/manifests/thanos-querier.yaml create mode 100644 tutorials/kubernetes-demo/manifests/thanos-ruler.yaml create mode 100644 tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml create mode 100755 tutorials/kubernetes-demo/setup.sh create mode 100644 tutorials/kubernetes-demo/slides/globalview-ha.svg create mode 100644 tutorials/kubernetes-demo/slides/initial-setup.svg create mode 100644 tutorials/kubernetes-demo/slides/unlimited-retention.svg diff --git a/docs/getting_started.md b/docs/getting_started.md index a42eacd21c..57423f56bc 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -59,8 +59,8 @@ Rolling this out has little to zero impact on the running Prometheus instance. I If you are not interested in backing up any data, the `--objstore.config-file` flag can simply be omitted. -* _[Example Kubernetes manifest](../kube/manifests/prometheus.yaml)_ -* _[Example Kubernetes manifest with GCS upload](../kube/manifests/prometheus-gcs.yaml)_ +* _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml)_ +* _[Example Kubernetes manifest with Minio upload](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml)_ * _[Details & Config for other object stores](./storage.md)_ ### [Store API](/pkg/store/storepb/rpc.proto#L19) @@ -78,8 +78,8 @@ thanos sidecar \ --grpc-address 0.0.0.0:19090 # GRPC endpoint for StoreAPI ``` -* _[Example Kubernetes manifest](../kube/manifests/prometheus.yaml)_ -* _[Example Kubernetes manifest with GCS upload](../kube/manifests/prometheus-gcs.yaml)_ +* _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml)_ +* _[Example Kubernetes manifest with GCS upload](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml)_ ### External Labels @@ -142,7 +142,7 @@ thanos query \ Go to the configured HTTP address, and you should now be able to query across all Prometheus instances and receive de-duplicated data. -* _[Example Kubernetes manifest](../kube/manifests/thanos-query.yaml)_ +* _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/thanos-querier.yaml)_ ## Communication Between Components @@ -218,8 +218,8 @@ When to use gossip vs store flags? Configuration of initial peers is flexible and the argument can be repeated for Thanos to try different approaches. Additional flags for cluster configuration exist but are typically not needed. Check the `--help` output for further information. 
-* _[Example Kubernetes manifest](../kube/manifests/prometheus.yaml)_ -* _[Example Kubernetes manifest with GCS upload](../kube/manifests/prometheus-gcs.yaml)_ +* _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml)_ +* _[Example Kubernetes manifest with GCS upload](../tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml)_ ## [Store Gateway](components/store.md) @@ -237,7 +237,7 @@ thanos store \ The store gateway occupies small amounts of disk space for caching basic information about data in the object storage. This will rarely exceed more than a few gigabytes and is used to improve restart times. It is not useful but not required to preserve it across restarts. -* _[Example Kubernetes manifest](../kube/manifests/thanos-store.yaml)_ +* _[Example Kubernetes manifest](../tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml)_ ## [Compactor](components/compact.md) @@ -263,7 +263,7 @@ which does rule and alert evaluation on top of given Thanos Querier. # All-in-one example -You can find one-box example with minikube [here](../kube/README.md). +You can find kubernetes manifests [here](../tutorials/kubernetes-demo/manifests). # Dashboards diff --git a/kube/README.md b/kube/README.md deleted file mode 100644 index d1406f2469..0000000000 --- a/kube/README.md +++ /dev/null @@ -1,79 +0,0 @@ -# PromLTS kubernetes test setup. - -This directory contains example, runnable scripts and k8s resource definitions for Thanos. - -## Local mini-kube - -To run minikube with Prometheus: - -`bash ./kube/run-local.sh -i -d none` for linux or `bash ./kube/run-local.sh -i -d ` with some vm driver for MacOS (e.g virtualbox). -What it does: - - run minikube - - setup kubectl and local custom kube/config - -To use cluster from your terminal do: -`source ./kube/envs.sh` - -From now on you can use `kubectl` as well as `minikube` command, including `minikube stop` to stop the whole cluster. - -## Example setup - -This section covers are required k8s manifest to start example setup that will include: -- Thanos headless service for discovery purposes. -- Prometheus + Thanos sidecar. -- Thanos query node - -This setup will have GCS upload disabled, but will show how we can proxy requests from Prometheus. - -This example can be easily extended to show the HA Prometheus use case. - -To run example setup: -1. `source ./kube/envs.sh` -2. `kubectl apply -f kube/manifests/prometheus.yaml` -3. `kubectl apply -f kube/manifests/thanos-query.yaml` - -You will be now able to reach Prometheus on http://prometheus.default.svc.cluster.local:9090/graph -And Thanos Query UI on http://thanos-query.default.svc.cluster.local:9090/graph - -If you cannot access these items from browser ensure that you have `10.0.0.10` address in your resolv.conf. -Alternatively you can look for service address using `kubectl get svc` and go to proper IP address. - -Thanos Query UI should show exactly the same data as Prometheus. - -To tear down example setup: -1. `source ./kube/envs.sh` -2. `kubectl delete -f kube/manifests/prometheus.yaml` -3. `kubectl delete -f kube/manifests/thanos-query.yaml` - -## Long term storage setup - -This example is running setup that is supposed to upload blocks to GCS for long term storage. This setup includes: -- Thanos headless service for discovery purposes. -- Prometheus + Thanos sidecar with GCS shipper configured -- Thanos query node -- Thanos store gateway. - -To run example setup: -1. Create GCS bucket in your GCP project. 
Either name it "thanos-test" or put its name into - * manifest/prometheus-gcs.yaml inside `"--gcs.bucket` flag. - * manifest/thanos-store.yaml inside `"--gcs.bucket` flag. -2. Create service account that have permission to this bucket -3. Download JSON credentials for service account and run: `kubectl create secret generic gcs-credentials --from-file=` -4. `source ./kube/envs.sh` -5. `kubectl apply -f kube/manifests/prometheus-gcs.yaml` -6. `kubectl apply -f kube/manifests/thanos-query.yaml` -7. `kubectl apply -f kube/manifests/thanos-store.yaml` - -You will be now able to reach Prometheus on http://prometheus-gcs.default.svc.cluster.local:9090/graph -And Thanos Query UI on http://thanos-query.default.svc.cluster.local:9090/graph - -Thanos Query UI should show exactly the same data as Prometheus, but also older data if it's running longer that 24h. - -After 3h (default `storage.tsdb.{min,max}-block-duration` flag value) sidecar should upload first block to GCS. -You can make that quicker by changing prometheus `storage.tsdb.{min,max}-block-duration` to smaller value (e.g 20m) - -To tear down example setup: -1. `source ./kube/envs.sh` -2. `kubectl delete -f kube/manifests/prometheus-gcs.yaml` -3. `kubectl delete -f kube/manifests/thanos-query.yaml` -4. `kubectl delete -f kube/manifests/thanos-store.yaml` \ No newline at end of file diff --git a/kube/envs.sh b/kube/envs.sh deleted file mode 100644 index a8c8d15d5f..0000000000 --- a/kube/envs.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -DIR="$( dirname $(pwd)/$0)" - -export MINIKUBE_WANTUPDATENOTIFICATION=false -export MINIKUBE_WANTKUBECTLDOWNLOADMSG=false -export MINIKUBE_WANTREPORTERRORPROMPT=false -export MINIKUBE_HOME=${DIR} -export CHANGE_MINIKUBE_NONE_USER=true - -export PATH=${DIR}/bin:${PATH} -export KUBECONFIG=${DIR}/.kube/config diff --git a/kube/manifests/prometheus-gcs.yaml b/kube/manifests/prometheus-gcs.yaml deleted file mode 100644 index c12910ca9a..0000000000 --- a/kube/manifests/prometheus-gcs.yaml +++ /dev/null @@ -1,199 +0,0 @@ -apiVersion: apps/v1beta1 -kind: StatefulSet -metadata: - name: prometheus-gcs - labels: - app: prometheus-gcs - thanos-peer: "true" -spec: - serviceName: "prometheus" - replicas: 2 - selector: - matchLabels: - app: prometheus-gcs - thanos-peer: "true" - template: - metadata: - labels: - app: prometheus-gcs - thanos-peer: "true" - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "10902" - spec: -## Commented out because Minikube has only one node, should be commented in for any production setup -# affinity: -# podAntiAffinity: -# requiredDuringSchedulingIgnoredDuringExecution: -# - labelSelector: -# matchExpressions: -# - key: app -# operator: In -# values: -# - prometheus -# topologyKey: kubernetes.io/hostname - containers: - - name: prometheus - image: quay.io/prometheus/prometheus:v2.0.0 - args: - - "--storage.tsdb.retention=24h" - - "--config.file=/etc/prometheus-shared/prometheus.yml" - - "--storage.tsdb.path=/var/prometheus" - - "--storage.tsdb.min-block-duration=2h" - - "--storage.tsdb.max-block-duration=2h" - - "--web.enable-lifecycle" - ports: - - name: prom-http - containerPort: 9090 - volumeMounts: - - name: config-shared - mountPath: /etc/prometheus-shared - - name: data - mountPath: /var/prometheus - - name: thanos-sidecar - # Always use explicit image tags (release or master--sha) instead of ambigous `latest` or `master`. 
- image: improbable/thanos:v0.2.1 - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /creds/gcs-credentials.json - args: - - "sidecar" - - "--log.level=debug" - - "--tsdb.path=/var/prometheus" - - "--prometheus.url=http://127.0.0.1:9090" - - "--cluster.peers=thanos-peers.default.svc.cluster.local:10900" - # NOTE: This is required to be added in GCS prior startup of this. - - "--objstore.config=" - - "--reloader.config-file=/etc/prometheus/prometheus.yml.tmpl" - - "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml" - ports: - - name: sidecar-http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 - volumeMounts: - - name: data - mountPath: /var/prometheus - - name: gcs-credentials - mountPath: /creds/ - - name: config-shared - mountPath: /etc/prometheus-shared - - name: config - mountPath: /etc/prometheus - volumes: - - name: config - configMap: - name: prometheus-config-gcs - - name: config-shared - emptyDir: {} - - name: data - emptyDir: {} - - name: gcs-credentials - secret: - defaultMode: 420 - # NOTE: gcs-credentials secret with single file gcs-credentials.json (GCS service account) is required. - secretName: gcs-credentials ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-config-gcs -data: - prometheus.yml.tmpl: |- - global: - external_labels: - monitor: prometheus - replica: '$(HOSTNAME)' - - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - "127.0.0.1:9090" - - - job_name: kubelets - kubernetes_sd_configs: - - role: node - - - job_name: kube_pods - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: ${1}:${2} - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_pod_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - # Scrapes the endpoint lists for the main Prometheus endpoints - - job_name: kube_endpoints - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_label_app] - regex: prometheus - - action: replace - source_labels: [__meta_kubernetes_service_label_app] - target_label: job - - action: replace - target_label: prometheus - source_labels: [__meta_kubernetes_service_label_prometheus] ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app: prometheus-gcs - name: prometheus-gcs -spec: - externalTrafficPolicy: Cluster - ports: - - port: 9090 - protocol: TCP - targetPort: prom-http - name: http-prometheus - - port: 10902 - protocol: TCP - targetPort: sidecar-http - name: http-sidecar-metrics - selector: - app: prometheus-gcs - sessionAffinity: None - type: NodePort -status: - loadBalancer: {} - ---- -apiVersion: v1 -kind: Service -metadata: - name: thanos-peers -spec: - type: ClusterIP - clusterIP: None - ports: - - name: cluster - port: 10900 - targetPort: cluster - selector: - # Useful endpoint for gathering all thanos components for common gossip cluster. 
- thanos-peer: "true" diff --git a/kube/manifests/prometheus.yaml b/kube/manifests/prometheus.yaml deleted file mode 100644 index 5ae892574d..0000000000 --- a/kube/manifests/prometheus.yaml +++ /dev/null @@ -1,225 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus - namespace: default ---- -apiVersion: apps/v1beta1 -kind: StatefulSet -metadata: - name: prometheus - labels: - app: prometheus - thanos-peer: "true" -spec: - serviceName: "prometheus" - replicas: 2 - selector: - matchLabels: - app: prometheus - thanos-peer: "true" - template: - metadata: - labels: - app: prometheus - thanos-peer: "true" - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "10902" - spec: - serviceAccountName: prometheus -## Commented out because Minikube has only one node, should be commented in for any production setup -# affinity: -# podAntiAffinity: -# requiredDuringSchedulingIgnoredDuringExecution: -# - labelSelector: -# matchExpressions: -# - key: app -# operator: In -# values: -# - prometheus -# topologyKey: kubernetes.io/hostname - containers: - - name: prometheus - image: quay.io/prometheus/prometheus:v2.0.0 - args: - - "--storage.tsdb.retention=24h" - - "--config.file=/etc/prometheus-shared/prometheus.yml" - - "--storage.tsdb.path=/var/prometheus" - - "--storage.tsdb.min-block-duration=2h" - - "--storage.tsdb.max-block-duration=2h" - - "--web.enable-lifecycle" - ports: - - name: prom-http - containerPort: 9090 - volumeMounts: - - name: config-shared - mountPath: /etc/prometheus-shared - - name: data - mountPath: /var/prometheus - - name: thanos-sidecar - # Always use explicit image tags (release or master--sha) instead of ambigous `latest` or `master`. 
- image: improbable/thanos:v0.2.1 - args: - - "sidecar" - - "--log.level=debug" - - "--tsdb.path=/var/prometheus" - - "--prometheus.url=http://127.0.0.1:9090" - - "--cluster.peers=thanos-peers.default.svc.cluster.local:10900" - - "--reloader.config-file=/etc/prometheus/prometheus.yml.tmpl" - - "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml" - ports: - - name: sidecar-http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 - volumeMounts: - - name: data - mountPath: /var/prometheus - - name: config-shared - mountPath: /etc/prometheus-shared - - name: config - mountPath: /etc/prometheus - volumes: - - name: config - configMap: - name: prometheus-config - - name: config-shared - emptyDir: {} - - name: data - emptyDir: {} ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-config -data: - prometheus.yml.tmpl: |- - global: - external_labels: - monitor: prometheus - replica: '$(HOSTNAME)' - - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - "127.0.0.1:9090" - - - job_name: kubelets - kubernetes_sd_configs: - - role: node - - - job_name: kube_pods - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: ${1}:${2} - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_pod_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - # Scrapes the endpoint lists for the main Prometheus endpoints - - job_name: kube_endpoints - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_label_app] - regex: prometheus - - action: replace - source_labels: [__meta_kubernetes_service_label_app] - target_label: job - - action: replace - target_label: prometheus - source_labels: [__meta_kubernetes_service_label_prometheus] ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app: prometheus - name: prometheus -spec: - externalTrafficPolicy: Cluster - ports: - - port: 9090 - protocol: TCP - targetPort: prom-http - name: http-prometheus - - port: 10902 - protocol: TCP - targetPort: sidecar-http - name: http-sidecar-metrics - selector: - app: prometheus - sessionAffinity: None - type: NodePort -status: - loadBalancer: {} - ---- -apiVersion: v1 -kind: Service -metadata: - name: thanos-peers -spec: - type: ClusterIP - clusterIP: None - ports: - - name: cluster - port: 10900 - targetPort: cluster - selector: - # Useful endpoint for gathering all thanos components for common gossip cluster. 
- thanos-peer: "true" diff --git a/kube/manifests/thanos-query.yaml b/kube/manifests/thanos-query.yaml deleted file mode 100644 index e013627e56..0000000000 --- a/kube/manifests/thanos-query.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: thanos-query - labels: - app: thanos-query - thanos-peer: "true" -spec: - replicas: 2 - selector: - matchLabels: - app: thanos-query - thanos-peer: "true" - template: - metadata: - labels: - app: thanos-query - thanos-peer: "true" - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "10902" - spec: - containers: - - name: thanos-query - # Always use explicit image tags (release or master--sha) instead of ambigous `latest` or `master`. - image: improbable/thanos:v0.2.1 - args: - - "query" - - "--log.level=debug" - - "--cluster.peers=thanos-peers.default.svc.cluster.local:10900" - - "--query.replica-label=replica" - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 - livenessProbe: - httpGet: - path: /-/healthy - port: http ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app: thanos-query - name: thanos-query -spec: - externalTrafficPolicy: Cluster - ports: - - port: 9090 - protocol: TCP - targetPort: http - name: http-query - selector: - app: thanos-query - sessionAffinity: None - type: NodePort diff --git a/kube/manifests/thanos-store.yaml b/kube/manifests/thanos-store.yaml deleted file mode 100644 index 723cbb3c1e..0000000000 --- a/kube/manifests/thanos-store.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: thanos-store -spec: - serviceName: "thanos-store" - replicas: 1 - selector: - matchLabels: - app: thanos - thanos-peer: "true" - template: - metadata: - labels: - app: thanos - thanos-peer: "true" - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "10902" - spec: - containers: - - name: thanos-store - # Always use explicit image tags (release or master--sha) instead of ambigous `latest` or `master`. - image: improbable/thanos:v0.2.1 - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /creds/gcs-credentials.json - args: - - "store" - - "--log.level=debug" - - "--data-dir=/var/thanos/store" - - "--cluster.peers=thanos-peers.default.svc.cluster.local:10900" - # NOTE: This is required to be added in GCS prior startup of this. - - "--objstore.config=" - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 - volumeMounts: - - mountPath: /creds/ - name: gcs-credentials - readOnly: true - - name: data - mountPath: /var/thanos/store - volumes: - - name: data - emptyDir: {} - - name: gcs-credentials - secret: - defaultMode: 420 - # NOTE: gcs-credentials secret with single file gcs-credentials.json is required. 
- secretName: gcs-credentials diff --git a/kube/run-local.sh b/kube/run-local.sh deleted file mode 100755 index fe3f1a32f8..0000000000 --- a/kube/run-local.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env bash - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -source ${DIR}/envs.sh - -usage() { echo "Usage: $0 [-d ] (specify vm-driver, by default none - works only on linux) [-i] (install required binaries)" 1>&2; exit 1; } - -install() { - mkdir -p ${DIR}/bin - pushd ${DIR}/bin - echo "Downloading kubectl 1.9 locally" - curl -Lo kubectl https://storage.googleapis.com/kubernetes-release/release/v1.9.0/bin/linux/amd64/kubectl && chmod +x kubectl - - echo "Downloading minikube" - curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 && chmod +x minikube - popd -} - -DRIVER="none" -while getopts ":d:i" o; do - case "${o}" in - d) - DRIVER=${OPTARG} - ;; - i) - install - ;; - *) - usage - exit 1 - ;; - esac -done -shift $((OPTIND-1)) - -echo "Starting local k8s cluster with config inside ${KUBECONFIG}. To use it, you need to do -'source kube/envs.sh' to set up needed environment variables. You can stop the local k8s cluster using: - minikube stop. Also make sure you have 'namespace 10.0.0.10' inside the /etc/resolv.conf to have kube-dns entries - accessible on your host." - -mkdir -p .kube || true -touch .kube/config - -if minikube status | grep -E "(Stopped)|minikube: $"; then - sudo -E ${DIR}/bin/minikube start --vm-driver=${DRIVER} --kubernetes-version=v1.9.0 -fi - -# This for loop waits until kubectl can access the api server that Minikube has created. -for i in {1..150}; do # timeout for 5 minutes - kubectl get po &> /dev/null - if [ $? -ne 1 ]; then - break - fi - sleep 2 -done - -# Making sure in best-effort way that k8s generates fresh certs. -kubectl delete secret $(kubectl get secret | grep default-token | cut -d " " -f 1) 2>/dev/null || true -kubectl delete secret -n kube-public $(kubectl get secret -n kube-public | grep default-token | cut -d " " -f 1) 2>/dev/null || true -kubectl delete secret -n kube-system $(kubectl get secret -n kube-system | grep default-token | cut -d " " -f 1) 2>/dev/null || true - -echo "Cluster is running. See README.md for example deployments you can apply." diff --git a/tutorials/kubernetes-demo/.gitignore b/tutorials/kubernetes-demo/.gitignore new file mode 100644 index 0000000000..ed5eee1a37 --- /dev/null +++ b/tutorials/kubernetes-demo/.gitignore @@ -0,0 +1,2 @@ +blockgen/out/* +.demo-last-step \ No newline at end of file diff --git a/tutorials/kubernetes-demo/README.md b/tutorials/kubernetes-demo/README.md new file mode 100644 index 0000000000..3755030d9d --- /dev/null +++ b/tutorials/kubernetes-demo/README.md @@ -0,0 +1,3 @@ +# Thanos - Transforming Prometheus to a Global Scale in a Seven Simple Steps + +TODO: Describe full demo. diff --git a/tutorials/kubernetes-demo/apply-pv-gen-metrics.sh b/tutorials/kubernetes-demo/apply-pv-gen-metrics.sh new file mode 100755 index 0000000000..362f2cf461 --- /dev/null +++ b/tutorials/kubernetes-demo/apply-pv-gen-metrics.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +cluster=$1 +replica=$2 +retention=$3 + +# Add volume and generated metrics inside it. 
+kubectl apply --context=${cluster} -f manifests/prometheus-pv-${replica}.yaml + +rm -rf -- /tmp/prom-out +mkdir /tmp/prom-out +go run ./blockgen/main.go --input=./blockgen/container_mem_metrics_${cluster}.json --output-dir=/tmp/prom-out --retention=${retention} +chmod -R 775 /tmp/prom-out +# Fun with permissions because Prometheus process is run a "noone" in a pod... ): +minikube -p ${cluster} ssh "sudo rm -rf /data/pv-prometheus-${replica} && sudo mkdir /data/pv-prometheus-${replica} && sudo chmod -R 777 /data/pv-prometheus-${replica}" +scp -r -i $(minikube -p ${cluster} ssh-key) /tmp/prom-out/* docker@$(minikube -p ${cluster} ip):/data/pv-prometheus-${replica}/ +minikube -p ${cluster} ssh "sudo chmod -R 777 /data/pv-prometheus-${replica}" \ No newline at end of file diff --git a/tutorials/kubernetes-demo/blockgen/container_mem_metrics_eu1.json b/tutorials/kubernetes-demo/blockgen/container_mem_metrics_eu1.json new file mode 100644 index 0000000000..a99bc78951 --- /dev/null +++ b/tutorials/kubernetes-demo/blockgen/container_mem_metrics_eu1.json @@ -0,0 +1,20 @@ +[ { + "type": "gauge", + "changeInterval": "1h", + "jitter": 30000000, + "max": 200000000, + "min": 10000000, + "result": {"resultType":"vector","result":[{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"default","pod":"alertmanager-0"},"value":[1548809388.643,"483328"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"default","pod":"grafana-554c585fc6-2p7gg"},"value":[1548809388.643,"491520"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[1548809388.643,"462848"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"503808"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"434176"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"454656"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"491520"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"430080"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"1257472"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4
k"},"value":[1548809388.643,"487424"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"442368"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"466944"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"397312"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"438272"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"addon-resizer","image":"sha256:b57c00a12f6cf8acf10de9c5e2c5adacbf355b181dd76f4d65bcfd3a936ea289","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"7045120"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"alertmanager","image":"sha256:23744b2d645c0574015adfba4a90283b79251aee3169dbe67f335d8465a8a63f","instance":"minikube","job":"kubelet","namespace":"default","pod":"alertmanager-0"},"value":[1548809388.643,"11284480"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"configmap-reload","image":"sha256:b70d7dba98e65dd440be03a3b8f28bbdd05ca0d55f6ab16f90292cd22cb961c5","instance":"minikube","job":"kubelet","namespace":"default","pod":"alertmanager-0"},"value":[1548809388.643,"1327104"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"coredns","image":"sha256:f59dcacceff45b5474d1385cd5f500d0c019ed9ca50ed5b814ac0c5fcec8699e","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"33697792"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"coredns","image":"sha256:f59dcacceff45b5474d1385cd5f500d0c019ed9ca50ed5b814ac0c5fcec8699e","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"18620416"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"etcd","image":"sha256:3cab8e1b9802cbe23a2703c2750ac4baa90b049b65e2a9e0a83e9e2c29f0724f","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"132898816"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"grafana","image":"sha256:920eb69ade2a293782a87bc56f0b68aadb9fd0989b96f9bc88d88981aea380a1","instance":"minikube","job":"kubelet","namespace":"default","pod":"grafana-554c585fc6-2p7gg"},"value":[1548809388.643,"38207488"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-addon-manager","image":"sha256:9c16409588eb19394b90703bdb5bcfb7c08fe75308a5db30b95ca8f6bd6bdc85","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"42958848"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-apiserver","image":"sha256:177db4b8e93a6a74ab19435edf17111d3ad18a8a4efef728712ea067ea8047
c1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"374333440"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-controller-manager","image":"sha256:b9027a78d94c15a4aba54d45476c6f295c0db8f9dcb6fca34c8beff67d90a374","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"112422912"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-proxy","image":"sha256:01cfa56edcfc350d36cea9c2fc857949b36bc69bf69df6901e0fd9be3c826617","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4k"},"value":[1548809388.643,"17539072"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-scheduler","image":"sha256:3193be46e0b3e215877b122052c0c7d3ef0902cf1dd6efaf3db95f37cf697002","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"41316352"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-state-metrics","image":"sha256:91599517197a204c99cd2c7e2175c25e18d82f9b53fc9d86f7d9976a3a6c6521","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"8130560"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kubernetes-dashboard","image":"sha256:f9aed6605b814b69e92dece6a50ed1e4e730144eb1cc971389dde9cb3820d124","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"12353536"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"prometheus","image":"sha256:5517f7057e7295a89a67c3c4869d60e019526d3d3ac0e45ae2e48c949b5c3f78","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[1548809388.643,"215818240"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"storage-provisioner","image":"sha256:4689081edb103a9e8174bf23a255bfbe0b2d9ed82edc907abab6989d1c60f02c","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"17342464"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet"},"value":[1548809388.643,"2690859008"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"default","pod":"alertmanager-0"},"value":[1548809388.643,"13094912"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"default","pod":"grafana-554c585fc6-2p7gg"},"value":[1548809388.643,"38690816"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[1548809388.643,"216293376"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"34201600"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"19054592"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"133349376"]},{"metric":{"__name__":"container_memory_usage_bytes","
instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"43450368"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"374763520"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"113680384"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4k"},"value":[1548809388.643,"18026496"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"41758720"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"15642624"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"30040064"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"65912832"]}]} +},{ + "type": "gauge", + "jitter": 0, + "max": 200000000, + "min": 100000000, + "result": {"resultType":"vector","result":[{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"addon-resizer","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"alertmanager","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"alertmanager-0"},"value":[1548809388.643,"209715200"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"configmap-reload","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"alertmanager-0"},"value":[1548809388.643,"10485760"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"73400320"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"73400320"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"grafana","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"grafana-554c585fc6-2p7gg"},"value":[1548809388.643,"209715200"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"kube-addon-manager","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-addon-manager-minikube"}
,"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"eu1","container":"kube-state-metrics","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"106954752"]}]} +}, { + "type": "gauge", + "jitter": 0, + "max": 200000000, + "min": 100000000, + "result": {"resultType":"vector","result":[{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"addon-resizer","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"alertmanager","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"alertmanager-0"},"value":[1548809388.643,"209715200"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"configmap-reload","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"alertmanager-0"},"value":[1548809388.643,"10485760"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"178257920"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"178257920"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"grafana","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"default","node":"minikube","pod":"grafana-554c585fc6-2p7gg"},"value":[1548809388.643,"209715200"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"eu1","container":"kube-state-metrics","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"106954752"]}]} +}] \ No newline at end of file diff --git a/tutorials/kubernetes-demo/blockgen/container_mem_metrics_us1.json b/tutorials/kubernetes-demo/blockgen/container_mem_metrics_us1.json new file mode 100644 index 0000000000..86c4cccb8c --- /dev/null +++ b/tutorials/kubernetes-demo/blockgen/container_mem_metrics_us1.json @@ -0,0 +1,20 @@ +[ { + "type": "gauge", + "changeInterval": "1h", + "jitter": 30000000, + "max": 200000000, + "min": 10000000, + "result": {"resultType":"vector","result": 
[{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[1548809388.643,"462848"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"503808"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"434176"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"454656"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"491520"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"430080"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"1257472"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4k"},"value":[1548809388.643,"487424"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"442368"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"466944"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"397312"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"POD","image":"k8s.gcr.io/pause:3.1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"438272"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"addon-resizer","image":"sha256:b57c00a12f6cf8acf10de9c5e2c5adacbf355b181dd76f4d65bcfd3a936ea289","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"7045120"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"coredns","image":"sha256:f59dcacceff45b5474d1385cd5f500d0c019ed9ca50ed5b814ac0c5fcec8699e","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"33697792"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"
coredns","image":"sha256:f59dcacceff45b5474d1385cd5f500d0c019ed9ca50ed5b814ac0c5fcec8699e","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"18620416"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"etcd","image":"sha256:3cab8e1b9802cbe23a2703c2750ac4baa90b049b65e2a9e0a83e9e2c29f0724f","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"132898816"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-addon-manager","image":"sha256:9c16409588eb19394b90703bdb5bcfb7c08fe75308a5db30b95ca8f6bd6bdc85","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"42958848"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-apiserver","image":"sha256:177db4b8e93a6a74ab19435edf17111d3ad18a8a4efef728712ea067ea8047c1","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"374333440"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-controller-manager","image":"sha256:b9027a78d94c15a4aba54d45476c6f295c0db8f9dcb6fca34c8beff67d90a374","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"112422912"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-proxy","image":"sha256:01cfa56edcfc350d36cea9c2fc857949b36bc69bf69df6901e0fd9be3c826617","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4k"},"value":[1548809388.643,"17539072"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-scheduler","image":"sha256:3193be46e0b3e215877b122052c0c7d3ef0902cf1dd6efaf3db95f37cf697002","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"41316352"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kube-state-metrics","image":"sha256:91599517197a204c99cd2c7e2175c25e18d82f9b53fc9d86f7d9976a3a6c6521","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"8130560"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"kubernetes-dashboard","image":"sha256:f9aed6605b814b69e92dece6a50ed1e4e730144eb1cc971389dde9cb3820d124","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"12353536"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"prometheus","image":"sha256:5517f7057e7295a89a67c3c4869d60e019526d3d3ac0e45ae2e48c949b5c3f78","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[1548809388.643,"215818240"]},{"metric":{"__name__":"container_memory_usage_bytes","container_name":"storage-provisioner","image":"sha256:4689081edb103a9e8174bf23a255bfbe0b2d9ed82edc907abab6989d1c60f02c","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"17342464"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet"},"value":[1548809388.643,"2690859008"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"default","pod":"prometheus-0"},"value":[154880938
8.643,"216293376"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"34201600"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"19054592"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"etcd-minikube"},"value":[1548809388.643,"133349376"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"43450368"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-apiserver-minikube"},"value":[1548809388.643,"374763520"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-controller-manager-minikube"},"value":[1548809388.643,"113680384"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-proxy-6mn4k"},"value":[1548809388.643,"18026496"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-scheduler-minikube"},"value":[1548809388.643,"41758720"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"15642624"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"kubernetes-dashboard-ccc79bfc9-ckbkr"},"value":[1548809388.643,"30040064"]},{"metric":{"__name__":"container_memory_usage_bytes","instance":"minikube","job":"kubelet","namespace":"kube-system","pod":"storage-provisioner"},"value":[1548809388.643,"65912832"]}]} +},{ + "type": "gauge", + "jitter": 0, + "max": 200000000, + "min": 100000000, + "result": 
{"resultType":"vector","result":[{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"us1","container":"addon-resizer","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"us1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"73400320"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"us1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"73400320"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"us1","container":"kube-addon-manager","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-addon-manager-minikube"},"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_requests_memory_bytes","cluster":"us1","container":"kube-state-metrics","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"106954752"]}]} +}, { + "type": "gauge", + "jitter": 0, + "max": 200000000, + "min": 100000000, + "result": {"resultType":"vector","result":[{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"us1","container":"addon-resizer","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"52428800"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"us1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-262fj"},"value":[1548809388.643,"178257920"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"us1","container":"coredns","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"coredns-86c58d9df4-xv6ts"},"value":[1548809388.643,"178257920"]},{"metric":{"__name__":"kube_pod_container_resource_limits_memory_bytes","cluster":"us1","container":"kube-state-metrics","instance":"172.17.0.9:8080","job":"kube-state-metrics","namespace":"kube-system","node":"minikube","pod":"kube-state-metrics-68f6cc566c-vp566"},"value":[1548809388.643,"106954752"]}]} +}] \ No newline at end of file diff --git a/tutorials/kubernetes-demo/blockgen/main.go b/tutorials/kubernetes-demo/blockgen/main.go new file mode 100644 index 0000000000..50429b432f --- /dev/null +++ b/tutorials/kubernetes-demo/blockgen/main.go @@ -0,0 +1,304 @@ +package main + +import ( + "encoding/json" + "io/ioutil" + "math/rand" + "os" + "path/filepath" + "strings" + "time" + + "github.com/prometheus/prometheus/promql" + + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/pkg/timestamp" + "github.com/prometheus/tsdb/labels" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/prometheus/tsdb" + "gopkg.in/alecthomas/kingpin.v2" +) + +// Allow for more realistic output. 
+type series struct {
+	Type           string // gauge, counter (if counter we treat below as rate aim)
+	Jitter         float64
+	ChangeInterval string
+	Max            float64
+	Min            float64
+	Result         queryData
+}
+
+type queryData struct {
+	ResultType model.ValueType `json:"resultType"`
+	Result     model.Vector    `json:"result"`
+}
+
+func main() {
+	app := kingpin.New(filepath.Base(os.Args[0]), "Generates artificial metrics from min time to given max time in compacted TSDB format (including head WAL).")
+	app.HelpFlag.Short('h')
+	input := app.Flag("input", "Input file for series config.").Required().String()
+	outputDir := app.Flag("output-dir", "Output directory for generated TSDB data.").Required().String()
+	scrapeInterval := app.Flag("scrape-interval", "Interval to generate samples with.").Default("15s").Duration()
+
+	retention := app.Flag("retention", "Defines the max time in relation to current time for generated samples.").Required().Duration()
+
+	logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr))
+	_, err := app.Parse(os.Args[1:])
+	if err != nil {
+		level.Error(logger).Log("err", err)
+		os.Exit(1)
+	}
+
+	f, err := ioutil.ReadFile(*input)
+	if err != nil {
+		level.Error(logger).Log("err", err, "file", input)
+		os.Exit(1)
+	}
+
+	var s []series
+	if err := json.Unmarshal(f, &s); err != nil {
+		level.Error(logger).Log("err", err)
+		os.Exit(1)
+	}
+
+	// Same code as Prometheus for compaction levels and max block.
+	rngs := tsdb.ExponentialBlockRanges(int64(time.Duration(2*time.Hour).Seconds()*1000), 10, 3)
+	maxBlockDuration := *retention / 10
+	for i, v := range rngs {
+		if v > int64(maxBlockDuration.Seconds()*1000) {
+			rngs = rngs[:i]
+			break
+		}
+	}
+
+	if len(rngs) == 0 {
+		rngs = append(rngs, int64(time.Duration(2*time.Hour).Seconds()*1000))
+	}
+
+	if err := os.RemoveAll(*outputDir); err != nil {
+		level.Error(logger).Log("msg", "remove output dir", "err", err)
+		os.Exit(1)
+	}
+
+	db, err := tsdb.Open(*outputDir, nil, nil, &tsdb.Options{
+		BlockRanges:       rngs,
+		RetentionDuration: uint64(retention.Seconds() * 1000),
+		NoLockfile:        true,
+	})
+	if err != nil {
+		level.Error(logger).Log("err", err)
+		os.Exit(1)
+	}
+
+	// Of course there will be a small gap between minTime and time.Now once we finish.
+	// We are fine with this.
+	n := time.Now()
+	maxTime := timestamp.FromTime(n)
+	minTime := timestamp.FromTime(n.Add(-*retention))
+
+	generators := make(map[string]gen)
+	for _, in := range s {
+		for _, r := range in.Result.Result {
+			lset := labels.New()
+			for n, v := range r.Metric {
+				lset = append(lset, labels.Label{Name: string(n), Value: string(v)})
+			}
+			//level.Debug(logger).Log("msg", "scheduled generation of series", "lset", lset)
+
+			var chInterval time.Duration
+			if in.ChangeInterval != "" {
+				chInterval, err = time.ParseDuration(in.ChangeInterval)
+				if err != nil {
+					level.Error(logger).Log("err", err)
+					os.Exit(1)
+				}
+			}
+
+			switch strings.ToLower(in.Type) {
+			case "counter":
+				// Does not work well (: Too naive.
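+				// For counters, min/max/jitter describe the rate we aim for over
+				// rateInterval (see counterGen below), not raw sample values.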
+				generators[lset.String()] = &counterGen{
+					interval:       *scrapeInterval,
+					maxTime:        maxTime,
+					minTime:        minTime,
+					lset:           lset,
+					min:            in.Min,
+					max:            in.Max,
+					jitter:         in.Jitter,
+					rateInterval:   5 * time.Minute,
+					changeInterval: chInterval,
+				}
+			case "gauge":
+				generators[lset.String()] = &gaugeGen{
+					interval:       *scrapeInterval,
+					maxTime:        maxTime,
+					minTime:        minTime,
+					lset:           lset,
+					min:            in.Min,
+					max:            in.Max,
+					jitter:         in.Jitter,
+					changeInterval: chInterval,
+				}
+			default:
+				level.Error(logger).Log("msg", "unknown metric type", "type", in.Type)
+				os.Exit(1)
+			}
+		}
+	}
+
+	a := db.Appender()
+	for _, generator := range generators {
+		for generator.Next() {
+			// Cache reference and use AddFast if we are too slow.
+			if _, err := a.Add(generator.Lset(), generator.Ts(), generator.Value()); err != nil {
+				level.Error(logger).Log("msg", "add", "err", err)
+				os.Exit(1)
+			}
+		}
+	}
+
+	if err := a.Commit(); err != nil {
+		level.Error(logger).Log("msg", "commit", "err", err)
+		os.Exit(1)
+	}
+
+	// Don't wait for compact, it will be compacted by Prometheus anyway.
+
+	if err := db.Close(); err != nil {
+		level.Error(logger).Log("msg", "close", "err", err)
+		os.Exit(1)
+	}
+
+	level.Info(logger).Log("msg", "generated artificial metrics", "series", len(generators))
+}
+
+type gaugeGen struct {
+	changeInterval   time.Duration
+	interval         time.Duration
+	maxTime, minTime int64
+
+	lset             labels.Labels
+	min, max, jitter float64
+
+	v       float64
+	mod     float64
+	init    bool
+	elapsed int64
+}
+
+func (g *gaugeGen) Lset() labels.Labels {
+	return g.lset
+}
+
+func (g *gaugeGen) Next() bool {
+	if g.minTime > g.maxTime {
+		return false
+	}
+	defer func() {
+		g.minTime += int64(g.interval.Seconds() * 1000)
+		g.elapsed += int64(g.interval.Seconds() * 1000)
+	}()
+
+	if !g.init {
+		g.v = g.min + rand.Float64()*((g.max-g.min)+1)
+		g.init = true
+	}
+
+	// Technically only mod changes.
+	if g.jitter > 0 && g.elapsed >= int64(g.changeInterval.Seconds()*1000) {
+		g.mod = (rand.Float64() - 0.5) * g.jitter
+		g.elapsed = 0
+	}
+
+	return true
+}
+
+func (g *gaugeGen) Ts() int64      { return g.minTime }
+func (g *gaugeGen) Value() float64 { return g.v + g.mod }
+
+type counterGen struct {
+	maxTime, minTime int64
+
+	lset             labels.Labels
+	min, max, jitter float64
+	interval         time.Duration
+	changeInterval   time.Duration
+	rateInterval     time.Duration
+
+	v    float64
+	init bool
+	buff []promql.Point
+
+	lastVal float64
+	elapsed int64
+}
+
+func (g *counterGen) Lset() labels.Labels {
+	return g.lset
+}
+
+func (g *counterGen) Next() bool {
+	defer func() { g.elapsed += int64(g.interval.Seconds() * 1000) }()
+
+	if g.init && len(g.buff) == 0 {
+		return false
+	}
+
+	if len(g.buff) > 0 {
+		// Pop front.
+		g.buff = g.buff[1:]
+
+		if len(g.buff) > 0 {
+			return true
+		}
+	}
+
+	if !g.init {
+		g.v = g.min + rand.Float64()*((g.max-g.min)+1)
+		g.init = true
+	}
+
+	var mod float64
+	if g.jitter > 0 && g.elapsed >= int64(g.changeInterval.Seconds()*1000) {
+		mod = (rand.Float64() - 0.5) * g.jitter
+
+		if mod > g.v {
+			mod = g.v
+		}
+
+		g.elapsed = 0
+	}
+
+	// Distribute goalV into multiple rateInterval/interval increments.
+	comps := make([]float64, int64(g.rateInterval/g.interval))
+	var sum float64
+	for i := range comps {
+		comps[i] = rand.Float64()
+		sum += comps[i]
+	}
+
+	// That's the goal for our rate.
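+	// Buffer one sample per scrape interval; each gets a random share of that goal
+	// so the increase is spread across the rateInterval window.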
+	x := g.v + mod/sum
+	for g.minTime <= g.maxTime && len(comps) > 0 {
+		g.lastVal += x * comps[0]
+		comps = comps[1:]
+
+		g.minTime += int64(g.interval.Seconds() * 1000)
+		g.buff = append(g.buff, promql.Point{T: g.minTime, V: g.lastVal})
+	}
+
+	return len(g.buff) > 0
+}
+
+func (g *counterGen) Ts() int64      { return g.buff[0].T }
+func (g *counterGen) Value() float64 { return g.buff[0].V }
+
+type gen interface {
+	Lset() labels.Labels
+	Next() bool
+	Ts() int64
+	Value() float64
+}
diff --git a/tutorials/kubernetes-demo/blockgen/main_test.go b/tutorials/kubernetes-demo/blockgen/main_test.go
new file mode 100644
index 0000000000..fc79021fd8
--- /dev/null
+++ b/tutorials/kubernetes-demo/blockgen/main_test.go
@@ -0,0 +1,37 @@
+package main
+
+import (
+	"testing"
+	"time"
+
+	"github.com/improbable-eng/thanos/pkg/testutil"
+)
+
+func TestCounterGen(t *testing.T) {
+	g := &counterGen{
+		minTime:      100,
+		maxTime:      int64((24 * time.Hour).Seconds()) * 1000,
+		interval:     15 * time.Second,
+		rateInterval: 5 * time.Minute,
+		min:          100,
+		max:          400,
+		jitter:       300,
+	}
+
+	lastV := float64(0)
+	lastT := int64(0)
+
+	init := false
+	samples := int64(0)
+	for g.Next() {
+		samples++
+		if init {
+			testutil.Assert(t, lastV <= g.Value(), "")
+			testutil.Assert(t, lastT <= g.Ts(), "")
+		}
+		init = true
+		lastV = g.Value()
+		lastT = g.Ts()
+	}
+	testutil.Equals(t, int64((24 * time.Hour)/(15 * time.Second)), samples)
+}
diff --git a/tutorials/kubernetes-demo/cluster-down.sh b/tutorials/kubernetes-demo/cluster-down.sh
new file mode 100755
index 0000000000..d63a5d5861
--- /dev/null
+++ b/tutorials/kubernetes-demo/cluster-down.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -e
+
+minikube -p us1 stop
+minikube -p us1 delete
+
+minikube -p eu1 stop
+minikube -p eu1 delete
diff --git a/tutorials/kubernetes-demo/cluster-up.sh b/tutorials/kubernetes-demo/cluster-up.sh
new file mode 100755
index 0000000000..c323578a08
--- /dev/null
+++ b/tutorials/kubernetes-demo/cluster-up.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+set -e
+
+minikube start --cache-images --vm-driver=kvm2 -p us1 --kubernetes-version="v1.13.2" \
+  --memory=8192 --cpus=4 \
+  --extra-config=kubelet.authentication-token-webhook=true \
+  --extra-config=kubelet.authorization-mode=Webhook \
+  --extra-config=scheduler.address=0.0.0.0 \
+  --extra-config=controller-manager.address=0.0.0.0
+
+minikube start --cache-images --vm-driver=kvm2 -p eu1 --kubernetes-version="v1.13.2" \
+  --memory=8192 --cpus=4 \
+  --extra-config=kubelet.authentication-token-webhook=true \
+  --extra-config=kubelet.authorization-mode=Webhook \
+  --extra-config=scheduler.address=0.0.0.0 \
+  --extra-config=controller-manager.address=0.0.0.0
+
+ssh-keyscan $(minikube -p eu1 ip) >> ~/.ssh/known_hosts
+ssh-keyscan $(minikube -p us1 ip) >> ~/.ssh/known_hosts
\ No newline at end of file
diff --git a/tutorials/kubernetes-demo/manifests/alertmanager.yaml b/tutorials/kubernetes-demo/manifests/alertmanager.yaml
new file mode 100644
index 0000000000..f281f0f7e7
--- /dev/null
+++ b/tutorials/kubernetes-demo/manifests/alertmanager.yaml
@@ -0,0 +1,122 @@
+kind: PersistentVolume
+apiVersion: v1
+metadata:
+  name: pv-alertmanager-0
+  labels:
+    type: local
+spec:
+  storageClassName: alert-manual
+  capacity:
+    storage: 5Gi
+  accessModes:
+    - ReadWriteOnce
+  hostPath:
+    path: "/data/pv-alertmanager-0"
+---
+apiVersion: apps/v1beta1
+kind: StatefulSet
+metadata:
+  name: alertmanager
+  labels:
+    app: alertmanager
+spec:
+  serviceName: "alertmanager"
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+
template: + metadata: + labels: + app: alertmanager + spec: + containers: + - name: alertmanager + image: "prom/alertmanager:v0.14.0" + args: + - --config.file=/etc/config/alertmanager.yml + - --storage.path=/data + - --web.external-url=/ + ports: + - containerPort: 9093 + name: http + readinessProbe: + httpGet: + path: "/#/status" + port: 9093 + initialDelaySeconds: 30 + timeoutSeconds: 30 + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: alertmanager + mountPath: "/data" + subPath: "" + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 200m + memory: 200Mi + - name: configmap-reload + image: "jimmidyson/configmap-reload:v0.1" + args: + - --volume-dir=/etc/config + - --webhook-url=http://localhost:9093/-/reload + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + resources: + limits: + cpu: 10m + memory: 10Mi + requests: + cpu: 10m + memory: 10Mi + volumes: + - name: config-volume + configMap: + name: alertmanager + volumeClaimTemplates: + - metadata: + labels: + app: alertmanager + name: alertmanager + spec: + storageClassName: alert-manual + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9093 + selector: + app: alertmanager + type: NodePort +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: alertmanager +data: + alertmanager.yml: | + global: null + receivers: + - name: default-receiver + route: + group_interval: 5m + group_wait: 10s + receiver: default-receiver + repeat_interval: 3h \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/grafana-datasources-querier.yaml b/tutorials/kubernetes-demo/manifests/grafana-datasources-querier.yaml new file mode 100644 index 0000000000..8c5c56f2c1 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/grafana-datasources-querier.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +data: + prometheus.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access": "proxy", + "editable": false, + "name": "all", + "orgId": 1, + "type": "prometheus", + "url": "http://thanos-querier.default.svc:9090", + "version": 1 + }, + ] + } +kind: ConfigMap +metadata: + name: grafana-datasources \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/grafana-datasources.yaml b/tutorials/kubernetes-demo/manifests/grafana-datasources.yaml new file mode 100644 index 0000000000..3a738df7d5 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/grafana-datasources.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +data: + prometheus.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access": "proxy", + "editable": false, + "name": "eu1", + "orgId": 1, + "type": "prometheus", + "url": "http://prometheus.default.svc:9090", + "version": 1 + }, + { + "access": "proxy", + "editable": false, + "name": "us1", + "orgId": 1, + "type": "prometheus", + "url": "%%PROM_US1_URL%%", + "version": 1 + } + ] + } +kind: ConfigMap +metadata: + name: grafana-datasources \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/grafana.yaml b/tutorials/kubernetes-demo/manifests/grafana.yaml new file mode 100644 index 0000000000..b9d59b5f0b --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/grafana.yaml @@ -0,0 +1,379 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: grafana + name: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + 
labels: + app: grafana + spec: + containers: + - image: grafana/grafana:5.2.4 + name: grafana + ports: + - containerPort: 3000 + name: http + readinessProbe: + httpGet: + path: /api/health + port: http + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 200m + memory: 200Mi + volumeMounts: + - mountPath: /var/lib/grafana + name: grafana-storage + - mountPath: /etc/grafana/provisioning/datasources + readOnly: false + name: grafana-datasources + - mountPath: /etc/grafana/provisioning/dashboards + name: grafana-dashboards + - mountPath: /grafana-dashboard-definitions/0/pods-memory + name: grafana-pods-memory + env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: "Admin" + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: grafana + volumes: + - emptyDir: {} + name: grafana-storage + - name: grafana-datasources + configMap: + name: grafana-datasources + - configMap: + name: grafana-dashboards + name: grafana-dashboards + - configMap: + name: grafana-pods-memory + name: grafana-pods-memory +--- +apiVersion: v1 +data: + dashboards.yaml: |- + { + "apiVersion": 1, + "providers": [ + { + "folder": "", + "name": "0", + "options": { + "path": "/grafana-dashboard-definitions/0" + }, + "orgId": 1, + "type": "file" + } + ] + } +kind: ConfigMap +metadata: + name: grafana-dashboards +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: grafana + name: grafana +spec: + ports: + - name: http + port: 3000 + targetPort: http + selector: + app: grafana + type: NodePort +--- +apiVersion: v1 +items: + - apiVersion: v1 + data: + pod.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1548898867144, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name, cluster) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container_name=~\"$container\", container_name!=\"POD\", cluster=~\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current: {{ container_name }} {{ cluster }}", + "refId": "A" + }, + { + "expr": "sum by(container, cluster) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", cluster=~\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }} {{ cluster }}", + "refId": "B" + }, + { + "expr": "sum by(container, cluster) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", 
namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", cluster=~\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }} {{ cluster }}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "eu1", + "value": "eu1" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "options": [], + "query": "label_values(container_memory_usage_bytes, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "text": "default", + "value": "default" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(container_memory_usage_bytes, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "text": "prometheus-0", + "value": "prometheus-0" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(container_memory_usage_bytes{namespace=~\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(container_memory_usage_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"POD\"}, container_name)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1d", + "to": "now-1h" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Pods Memory", + "uid": "pods_memory", + "version": 1202 + } + kind: ConfigMap + metadata: + name: grafana-pods-memory +kind: ConfigMapList diff --git a/tutorials/kubernetes-demo/manifests/kube-state-metrics.yaml b/tutorials/kubernetes-demo/manifests/kube-state-metrics.yaml new file mode 100644 index 0000000000..6685433af8 --- 
/dev/null +++ b/tutorials/kubernetes-demo/manifests/kube-state-metrics.yaml @@ -0,0 +1,169 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: kube-system +spec: + selector: + matchLabels: + k8s-app: kube-state-metrics + replicas: 1 + template: + metadata: + labels: + k8s-app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: quay.io/coreos/kube-state-metrics:v1.5.0 + ports: + - name: http + containerPort: 8080 + - name: telemetry + containerPort: 8081 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + - name: addon-resizer + image: k8s.gcr.io/addon-resizer:1.8.3 + resources: + limits: + cpu: 150m + memory: 50Mi + requests: + cpu: 150m + memory: 50Mi + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: + - /pod_nanny + - --container=kube-state-metrics + - --cpu=100m + - --extra-cpu=1m + - --memory=100Mi + - --extra-memory=2Mi + - --threshold=5 + - --deployment=kube-state-metrics +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: + - apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] + - apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: kube-state-metrics + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + namespace: kube-system + name: kube-state-metrics-resizer +rules: + - apiGroups: [""] + resources: + - pods + verbs: ["get"] + - apiGroups: ["extensions"] + resources: + - deployments + resourceNames: ["kube-state-metrics"] + verbs: ["get", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: 
kube-state-metrics + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/minio.yaml b/tutorials/kubernetes-demo/manifests/minio.yaml new file mode 100644 index 0000000000..40a17ed52d --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/minio.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: minio + name: minio +spec: + ports: + - port: 9000 + protocol: TCP + targetPort: 9000 + selector: + statefulset.kubernetes.io/pod-name: minio-0 + type: NodePort +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: minio + labels: + app: minio +spec: + serviceName: "minio" + replicas: 1 + selector: + matchLabels: + app: minio + template: + metadata: + labels: + app: minio + spec: + # Yolo - no pv. + containers: + - name: minio + image: minio/minio:RELEASE.2019-01-31T00-31-19Z + args: + - server + - /data + env: + - name: MINIO_ACCESS_KEY + value: "smth" + - name: MINIO_SECRET_KEY + value: "Need8Chars" + ports: + - containerPort: 9000 \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml new file mode 100644 index 0000000000..51382ee068 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml @@ -0,0 +1,316 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +# We want to be able to access each replica. 
+apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus-1 +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-1 + type: NodePort +--- +# minikube limitation: +# https://github.com/kubernetes/minikube/issues/3351#issuecomment-459898556 +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: sidecar +spec: + ports: + - port: 10901 + protocol: TCP + targetPort: grpc + name: grpc + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: sidecar-1 +spec: + ports: + - port: 10901 + protocol: TCP + targetPort: grpc + name: grpc + selector: + statefulset.kubernetes.io/pod-name: prometheus-1 + type: NodePort +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: prometheus + labels: + app: prometheus +spec: + serviceName: "prometheus" + replicas: 2 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + # We will use this label to put all StoreAPis + # under the same headless service for + # SRV lookup: thanos-store-api.default.svc + thanos-store-api: "true" + spec: + securityContext: + runAsUser: 1000 + fsGroup: 2000 + runAsNonRoot: true + serviceAccountName: prometheus + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v2.6.1 + args: + - --config.file=/etc/prometheus-shared/prometheus.yaml + - --storage.tsdb.path=/var/prometheus + - --web.enable-lifecycle + # TODO: Make retention shorter once all old blocks will be uploaded (!) + - --storage.tsdb.retention=2w + # Disable compaction. + - --storage.tsdb.min-block-duration=2h + - --storage.tsdb.max-block-duration=2h + - --web.enable-admin-api + ports: + - name: http-prometheus + containerPort: 9090 + volumeMounts: + - name: config-shared + mountPath: /etc/prometheus-shared + - name: rules + mountPath: /etc/prometheus/rules + - name: prometheus + mountPath: /var/prometheus + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - sidecar + - --log.level=debug + - --tsdb.path=/var/prometheus + - --prometheus.url=http://localhost:9090 + - --cluster.disable + - --reloader.config-file=/etc/prometheus/prometheus.yaml.tmpl + - --reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yaml + # Enable block uploading. + - | + --objstore.config=type: S3 + config: + bucket: demo-bucket + access_key: smth + secret_key: Need8Chars + endpoint: %%S3_ENDPOINT%% + insecure: true + # New flag for migrating old blocks. 
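+        # (As noted above, this is meant to migrate the blocks Prometheus has
+        # already written and compacted locally into the object storage bucket.)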
+ - --shipper.upload-compacted-once + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - name: http-sidecar + containerPort: 10902 + - name: grpc + containerPort: 10901 + volumeMounts: + - name: prometheus + mountPath: /var/prometheus + - name: config-shared + mountPath: /etc/prometheus-shared + - name: config + mountPath: /etc/prometheus + volumes: + - name: config + configMap: + name: prometheus + - name: rules + configMap: + name: prometheus-rules + - name: config-shared + emptyDir: {} + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - metadata: + labels: + app: prometheus + name: prometheus + spec: + storageClassName: prom-manual + accessModes: + - ReadWriteOnce + resources: + requests: + # Normally, probably 15x more (: + storage: 4Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus +data: + prometheus.yaml.tmpl: |- + # Inspired by https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml + global: + scrape_interval: 15s + scrape_timeout: 10s + external_labels: + cluster: %%CLUSTER%% + # Each Prometheus has to have unique labels. + replica: $(POD_NAME) + + alerting: + # We want our alerts to be deduplicated + # from different replicas. + alert_relabel_configs: + - regex: replica + action: labeldrop + + alertmanagers: + - static_configs: + - targets: + - %%ALERTMANAGER_URL%% + + rule_files: + - /etc/prometheus/rules/*rules.yaml + + scrape_configs: + - job_name: kube-apiserver + scheme: https + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + regex: default;kubernetes;https + action: keep + + - job_name: kubelet + scheme: https + kubernetes_sd_configs: + - role: node + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: id + action: labeldrop + - regex: name + action: labeldrop + - regex: pod_name + action: labeldrop + - target_label: cluster + replacement: %%CLUSTER%% + + - job_name: kube-pods + honor_labels: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: ^(http|http-.+|metrics)$ + action: keep + - source_labels: [__meta_kubernetes_pod_label_k8s_app] + target_label: job + - source_labels: [__meta_kubernetes_pod_label_app] + regex: ^(.+)$ + target_label: job + - source_labels: [job, __meta_kubernetes_pod_container_port_name] + regex: ^(.*);http-(.+)$ + target_label: job + - source_labels: [__meta_kubernetes_pod_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - target_label: cluster + replacement: %%CLUSTER%% + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: pod_name + action: labeldrop +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus 
+--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus + namespace: default +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml new file mode 100644 index 0000000000..de776f4107 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml @@ -0,0 +1,300 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +# We want to be able to access each replica. +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus-1 +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-1 + type: NodePort +--- +# minikube limitation: +# https://github.com/kubernetes/minikube/issues/3351#issuecomment-459898556 +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: sidecar +spec: + ports: + - port: 10901 + protocol: TCP + targetPort: grpc + name: grpc + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: sidecar-1 +spec: + ports: + - port: 10901 + protocol: TCP + targetPort: grpc + name: grpc + selector: + statefulset.kubernetes.io/pod-name: prometheus-1 + type: NodePort +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: prometheus + labels: + app: prometheus +spec: + serviceName: "prometheus" + replicas: 2 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + # We will use this label to put all StoreAPis + # under the same headless service for + # SRV lookup: thanos-store-api.default.svc + thanos-store-api: "true" + spec: + securityContext: + runAsUser: 1000 + fsGroup: 2000 + runAsNonRoot: true + serviceAccountName: prometheus + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v2.6.1 + args: + - --config.file=/etc/prometheus-shared/prometheus.yaml + - --storage.tsdb.path=/var/prometheus + - --web.enable-lifecycle + - --storage.tsdb.retention=2w + ports: + - name: http-prometheus + containerPort: 9090 + volumeMounts: + - name: config-shared + mountPath: /etc/prometheus-shared + - name: rules + mountPath: /etc/prometheus/rules + - name: prometheus + mountPath: /var/prometheus + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - sidecar + - --log.level=debug + - --tsdb.path=/var/prometheus + - --prometheus.url=http://localhost:9090 + - --cluster.disable + - --reloader.config-file=/etc/prometheus/prometheus.yaml.tmpl + - --reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + 
ports: + - name: http-sidecar + containerPort: 10902 + - name: grpc + containerPort: 10901 + volumeMounts: + - name: prometheus + mountPath: /var/prometheus + - name: config-shared + mountPath: /etc/prometheus-shared + - name: config + mountPath: /etc/prometheus + volumes: + - name: config + configMap: + name: prometheus + - name: rules + configMap: + name: prometheus-rules + - name: config-shared + emptyDir: {} + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - metadata: + labels: + app: prometheus + name: prometheus + spec: + storageClassName: prom-manual + accessModes: + - ReadWriteOnce + resources: + requests: + # Normally, probably 15x more (: + storage: 4Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus +data: + prometheus.yaml.tmpl: |- + # Inspired by https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml + global: + scrape_interval: 15s + scrape_timeout: 10s + external_labels: + cluster: %%CLUSTER%% + # Each Prometheus has to have unique labels. + replica: $(POD_NAME) + + alerting: + # We want our alerts to be deduplicated + # from different replicas. + alert_relabel_configs: + - regex: replica + action: labeldrop + + alertmanagers: + - static_configs: + - targets: + - %%ALERTMANAGER_URL%% + + rule_files: + - /etc/prometheus/rules/*rules.yaml + + scrape_configs: + - job_name: kube-apiserver + scheme: https + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + regex: default;kubernetes;https + action: keep + + - job_name: kubelet + scheme: https + kubernetes_sd_configs: + - role: node + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: id + action: labeldrop + - regex: name + action: labeldrop + - regex: pod_name + action: labeldrop + - target_label: cluster + replacement: %%CLUSTER%% + + - job_name: kube-pods + honor_labels: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: ^(http|http-.+|metrics)$ + action: keep + - source_labels: [__meta_kubernetes_pod_label_k8s_app] + target_label: job + - source_labels: [__meta_kubernetes_pod_label_app] + regex: ^(.+)$ + target_label: job + - source_labels: [job, __meta_kubernetes_pod_container_port_name] + regex: ^(.*);http-(.+)$ + target_label: job + - source_labels: [__meta_kubernetes_pod_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - target_label: cluster + replacement: %%CLUSTER%% + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: pod_name + action: labeldrop +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus + namespace: 
default +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha.yaml new file mode 100644 index 0000000000..5e621ab031 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha.yaml @@ -0,0 +1,223 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +# We want to be able to access each replica. +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus-1 +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-1 + type: NodePort +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: prometheus + labels: + app: prometheus +spec: + serviceName: "prometheus" + replicas: 2 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + securityContext: + runAsUser: 1000 + fsGroup: 2000 + runAsNonRoot: true + serviceAccountName: prometheus + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v2.6.1 + args: + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/var/prometheus + - --web.enable-lifecycle + - --storage.tsdb.retention=2w + ports: + - name: http-prometheus + containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: prometheus + mountPath: /var/prometheus + volumes: + - name: config + configMap: + name: prometheus + - name: rules + configMap: + name: prometheus-rules + volumeClaimTemplates: + - metadata: + labels: + app: prometheus + name: prometheus + spec: + storageClassName: prom-manual + accessModes: + - ReadWriteOnce + resources: + requests: + # Normally, probably 15x more (: + storage: 4Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus +data: + prometheus.yaml: |- + # Inspired by https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml + global: + scrape_interval: 15s + scrape_timeout: 10s + external_labels: + cluster: %%CLUSTER%% + + alerting: + alertmanagers: + - static_configs: + - targets: + - %%ALERTMANAGER_URL%% + + rule_files: + - /etc/prometheus/rules/*rules.yaml + + scrape_configs: + - job_name: kube-apiserver + scheme: https + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + regex: default;kubernetes;https + action: keep + + - job_name: kubelet + scheme: https + kubernetes_sd_configs: + - role: node + 
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: id + action: labeldrop + - regex: name + action: labeldrop + - regex: pod_name + action: labeldrop + - target_label: cluster + replacement: %%CLUSTER%% + + - job_name: kube-pods + honor_labels: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: ^(http|http-.+|metrics)$ + action: keep + - source_labels: [__meta_kubernetes_pod_label_k8s_app] + target_label: job + - source_labels: [__meta_kubernetes_pod_label_app] + regex: ^(.+)$ + target_label: job + - source_labels: [job, __meta_kubernetes_pod_container_port_name] + regex: ^(.*);http-(.+)$ + target_label: job + - source_labels: [__meta_kubernetes_pod_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - target_label: cluster + replacement: %%CLUSTER%% + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: pod_name + action: labeldrop +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus + namespace: default +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default diff --git a/tutorials/kubernetes-demo/manifests/prometheus-pv-0.yaml b/tutorials/kubernetes-demo/manifests/prometheus-pv-0.yaml new file mode 100644 index 0000000000..368bcb8835 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-pv-0.yaml @@ -0,0 +1,15 @@ +# Prepare 2 volumes for replicas. It is required only for demo purposes as we artifically generate metrics. 
+kind: PersistentVolume +apiVersion: v1 +metadata: + name: pv-prometheus-0 + labels: + type: local +spec: + storageClassName: prom-manual + capacity: + storage: 5Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "/data/pv-prometheus-0" \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/prometheus-pv-1.yaml b/tutorials/kubernetes-demo/manifests/prometheus-pv-1.yaml new file mode 100644 index 0000000000..69fce10529 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-pv-1.yaml @@ -0,0 +1,14 @@ +kind: PersistentVolume +apiVersion: v1 +metadata: + name: pv-prometheus-1 + labels: + type: local +spec: + storageClassName: prom-manual + capacity: + storage: 5Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "/data/pv-prometheus-1" \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/prometheus-rules.yaml b/tutorials/kubernetes-demo/manifests/prometheus-rules.yaml new file mode 100644 index 0000000000..2545ad47a1 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus-rules.yaml @@ -0,0 +1,949 @@ +# Copied from awesome kube-prometheus project (but moved to native configmap): +# https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +# For demo purposes simplification those are adjusted. +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-rules +data: + k8s.rules.yaml: |- + groups: + - name: k8s + rules: + - expr: | + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, pod, container_name) ( + rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) + ) + record: namespace_pod_container_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod, namespace) + * on (namespace, pod) group_left(label_name) + kube_pod_labels{job="kube-state-metrics"} + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + kube_pod_labels{job="kube-state-metrics"} + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + kube_pod_labels{job="kube-state-metrics"} + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + kube-scheduler.rules.yaml: |- + groups: + - name: kube-scheduler + rules: + - expr: | + histogram_quantile(0.99, 
sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + kube-apiserver.rules.yaml: |- + groups: + - name: kube-apiserver + rules: + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + node.rules.yaml: |- + groups: + - name: node + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: | + max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (node) (sum by (node, cpu) ( + 
node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilisation:avg1m + - expr: | + 1 - avg by (node) ( + rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilisation:avg1m + - expr: | + sum(node_load1{job="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: | + sum by (node) ( + node_load1{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: | + 1 - + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: ':node_memory_utilisation:' + - expr: | + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum + - expr: | + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum + - expr: | + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: | + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:node_memory_utilisation:ratio + - expr: | + 1e3 * sum( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: | + 1 - + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilisation:' + - expr: | + 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilisation_2:' + - expr: | + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: | + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) + record: :node_disk_utilisation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) + * on (namespace, pod) group_left(node) + 
node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilisation:avg_irate + - expr: | + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) + record: :node_disk_saturation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: | + max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: | + max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' + - expr: | + sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_utilisation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilisation:sum_irate + - expr: | + sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_saturation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_total:' + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_free:' + kube-prometheus-node-recording.rules.yaml: |- + groups: + - name: kube-prometheus-node-recording + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY + (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) + BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT + 
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) + BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) + BY (instance, cpu)) + record: cluster:node_cpu:ratio + kubernetes-absent.rules.yaml: |- + groups: + - name: kubernetes-absent + rules: + - alert: AlertmanagerDown + annotations: + message: Alertmanager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown + expr: | + absent(up{job="alertmanager-main"} == 1) and up{job="prometheus", cluster="eu1"} > -1 + for: 15m + labels: + severity: critical + - alert: CoreDNSDown + annotations: + message: CoreDNS has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown + expr: | + absent(up{job="kube-dns"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown + expr: | + absent(up{job="kube-apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsDown + annotations: + message: KubeStateMetrics has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown + expr: | + absent(up{job="kube-state-metrics"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown + expr: | + absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown + expr: | + absent(up{job="prometheus"} == 1) + for: 15m + labels: + severity: critical + kubernetes-apps.rules.yaml: |- + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than an hour. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready + expr: | + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has + not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not + matched the expected number of replicas for longer than an hour. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch + expr: | + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + for: 1h + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch + expr: | + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout + expr: | + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace + }}/{{ $labels.daemonset }} are scheduled and ready. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck + expr: | + kube_daemonset_status_number_ready{job="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more + than 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning + expr: | + time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than one hour to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + expr: | + kube_job_status_failed{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + kubernetes-resources.rules.yaml: |- + groups: + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Pods and cannot + tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: | + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Pods and cannot + tolerate node failure. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: | + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal_bytes) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value + }}% of its {{ $labels.resource }} quota. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + expr: | + 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 90 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace + }} for container {{ $labels.container_name }} in pod {{ $labels.pod + }}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\", + }[5m])) by (container_name, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) + by (container_name, pod, namespace)\n > 25 \n" + for: 15m + labels: + severity: warning + kubernetes-storage.rules.yaml: |- + groups: + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value + }}% free. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is expected to fill up within four + days. Currently {{ printf "%0.2f" $value }}% is available. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays + expr: | + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + ) < 15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{ $labels.persistentvolume }} has status {{ + $labels.phase }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + kubernetes-system.rules.yaml: |- + groups: + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{ $labels.node }} has been unready for more than an hour.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{ $value }} different versions of Kubernetes components + running. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch + expr: | + count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }} errors / second. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: | + sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close + to the limit of 110. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods + expr: | + kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 7 days. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 24 hours. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + alertmanager.rules.yaml: |- + groups: + - name: alertmanager + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` + are out of sync. + expr: | + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerFailedReload + annotations: + message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ expr: | + alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 + for: 10m + labels: + severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. + expr: | + alertmanager_cluster_members{job="alertmanager-main"} + != on (service) GROUP_LEFT() + count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) + for: 5m + labels: + severity: critical + general.rules.yaml: |- + groups: + - name: general + rules: + - alert: TargetDown + annotations: + message: '{{ $value }}% of the {{ $labels.job }} targets are down.' + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + - alert: AlertPropagationTest + annotations: + message: This is an AlertPropagationTest meant to ensure that the entire alerting + pipeline is functional. + expr: vector(1) + labels: + severity: none + kube-prometheus-node-alerting.rules.yaml: |- + groups: + - name: kube-prometheus-node-alerting + rules: + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 24 hours. + expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) + for: 30m + labels: + severity: warning + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 2 hours. + expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) + for: 10m + labels: + severity: critical + prometheus.rules.yaml: |- + groups: + - name: prometheus + rules: + - alert: PrometheusConfigReloadFailed + annotations: + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Prometheus' configuration failed + expr: | + prometheus_config_last_reload_successful{job="prometheus"} == 0 + for: 10m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + summary: Prometheus' alert notification queue is running full + expr: | + predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"} + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03 + for: 10m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: 
Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers + expr: | + prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBWALCorruptions + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' + summary: Prometheus write-ahead log is corrupted + expr: | + tsdb_wal_corruptions_total{job="prometheus"} > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting + samples. + summary: Prometheus isn't ingesting samples + expr: | + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusTargetScrapesDuplicate + annotations: + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected + due to duplicate timestamps but different values' + summary: Prometheus has many samples rejected + expr: | + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 + for: 10m + labels: + severity: warning diff --git a/tutorials/kubernetes-demo/manifests/prometheus.yaml b/tutorials/kubernetes-demo/manifests/prometheus.yaml new file mode 100644 index 0000000000..7638ec896a --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/prometheus.yaml @@ -0,0 +1,206 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + name: prometheus +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http-prometheus + name: http-prometheus + selector: + statefulset.kubernetes.io/pod-name: prometheus-0 + type: NodePort +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: prometheus + labels: + app: prometheus +spec: + serviceName: "prometheus" + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + securityContext: + runAsUser: 1000 + fsGroup: 2000 + runAsNonRoot: true + serviceAccountName: prometheus + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v2.6.1 + args: + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/var/prometheus + - --web.enable-lifecycle + - --storage.tsdb.retention=2w + ports: + - name: http-prometheus + containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: prometheus + mountPath: /var/prometheus + volumes: + - name: config + configMap: + name: prometheus + - name: rules + configMap: + name: prometheus-rules + volumeClaimTemplates: + - 
metadata: + labels: + app: prometheus + name: prometheus + spec: + storageClassName: prom-manual + accessModes: + - ReadWriteOnce + resources: + requests: + # Normally, probably 15x more (: + storage: 4Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus +data: + prometheus.yaml: |- + # Inspired by https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml + global: + scrape_interval: 15s + scrape_timeout: 10s + external_labels: + cluster: %%CLUSTER%% + + alerting: + alertmanagers: + - static_configs: + - targets: + - %%ALERTMANAGER_URL%% + + rule_files: + - /etc/prometheus/rules/*rules.yaml + + scrape_configs: + - job_name: kube-apiserver + scheme: https + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + regex: default;kubernetes;https + action: keep + + - job_name: kubelet + scheme: https + kubernetes_sd_configs: + - role: node + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + relabel_configs: + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: id + action: labeldrop + - regex: name + action: labeldrop + - regex: pod_name + action: labeldrop + - target_label: cluster + replacement: %%CLUSTER%% + + - job_name: kube-pods + honor_labels: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: ^(http|http-.+|metrics)$ + action: keep + - source_labels: [__meta_kubernetes_pod_label_k8s_app] + target_label: job + - source_labels: [__meta_kubernetes_pod_label_app] + regex: ^(.+)$ + target_label: job + - source_labels: [job, __meta_kubernetes_pod_container_port_name] + regex: ^(.*);http-(.+)$ + target_label: job + - source_labels: [__meta_kubernetes_pod_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - target_label: cluster + replacement: %%CLUSTER%% + metric_relabel_configs: + - source_labels: [pod_name] + regex: ^(.+)$ + target_label: pod + - regex: pod_name + action: labeldrop +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus + namespace: default +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default diff --git a/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml b/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml new file mode 100644 index 
0000000000..dba722a14b --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: thanos-compactor + labels: + app: thanos-compactor +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-compactor + serviceName: thanos-compactor + template: + metadata: + labels: + app: thanos-compactor + spec: + containers: + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - compact + - --log.level=debug + - --data-dir=/data + - | + --objstore.config=type: S3 + config: + bucket: demo-bucket + access_key: smth + secret_key: Need8Chars + endpoint: %%S3_ENDPOINT%% + insecure: true + - --sync-delay=30m + - --wait + ports: + - name: http + containerPort: 10902 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi diff --git a/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml b/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml new file mode 100644 index 0000000000..a2a70db48a --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml @@ -0,0 +1,67 @@ +# This allow us to do +# SRV lookup: thanos-store-api.default.svc +apiVersion: v1 +kind: Service +metadata: + name: thanos-store-gateway +spec: + type: ClusterIP + clusterIP: None + ports: + - name: grpc + port: 10901 + targetPort: grpc + selector: + thanos-store-api: "true" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-querier + labels: + app: thanos-querier +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-querier + template: + metadata: + labels: + app: thanos-querier + spec: + containers: + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - query + - --log.level=debug + - --query.replica-label=replica + - --cluster.disable + # Discover local store APIs using DNS SRV. + - --store=dnssrv+thanos-store-gateway.default.svc + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + livenessProbe: + httpGet: + path: /-/healthy + port: http +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: thanos-querier + name: thanos-querier +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http + name: http + selector: + app: thanos-querier + type: NodePort diff --git a/tutorials/kubernetes-demo/manifests/thanos-querier.yaml b/tutorials/kubernetes-demo/manifests/thanos-querier.yaml new file mode 100644 index 0000000000..05491760c3 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/thanos-querier.yaml @@ -0,0 +1,70 @@ +# This allow us to do +# SRV lookup: thanos-store-api.default.svc +apiVersion: v1 +kind: Service +metadata: + name: thanos-store-gateway +spec: + type: ClusterIP + clusterIP: None + ports: + - name: grpc + port: 10901 + targetPort: grpc + selector: + thanos-store-api: "true" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-querier + labels: + app: thanos-querier +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-querier + template: + metadata: + labels: + app: thanos-querier + spec: + containers: + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - query + - --log.level=debug + - --query.replica-label=replica + - --cluster.disable + # Discover local store APIs using DNS SRV. + - --store=dnssrv+thanos-store-gateway.default.svc + # Get remote store APIs by IP:Port. 
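+        # The %%SIDECAR_US1_0_URL%% / %%SIDECAR_US1_1_URL%% placeholders are expected to be
+        # replaced with the externally reachable gRPC IP:port of each us1 Prometheus sidecar
+        # before applying the manifest, presumably in the same sed-based way the demo scripts
+        # fill in the other %%...%% placeholders. A sketch (not the actual demo command):
+        #   cat manifests/thanos-querier.yaml \
+        #     | sed "s#%%SIDECAR_US1_0_URL%%#<ip:port>#g" \
+        #     | sed "s#%%SIDECAR_US1_1_URL%%#<ip:port>#g" \
+        #     | kubectl --context=<cluster> apply -f -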
+ - --store=%%SIDECAR_US1_0_URL%% + - --store=%%SIDECAR_US1_1_URL%% + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + livenessProbe: + httpGet: + path: /-/healthy + port: http +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: thanos-querier + name: thanos-querier +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http + name: http + selector: + app: thanos-querier + type: NodePort diff --git a/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml b/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml new file mode 100644 index 0000000000..f189c84271 --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml @@ -0,0 +1,108 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: thanos-ruler-rules +data: + alert_down_services.rules.yaml: | + groups: + - name: metamonitoring + rules: + - alert: PrometheusReplicaDown + annotations: + message: Prometheus replica in cluster {{$labels.cluster}} has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown + expr: | + label_replace( + absent(sum(up{job="prometheus", cluster="eu1", instance=~".*:9090"}) by (job, cluster) == 2), + "cluster", "eu1", "","" + ) + or + label_replace( + absent(sum(up{job="prometheus", cluster="us1", instance=~".*:9090"}) by (job, cluster) == 2), + "cluster", "us1", "","" + ) + for: 15s # for demo purposes + labels: + severity: critical +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + labels: + app: thanos-ruler + name: thanos-ruler +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-ruler + serviceName: thanos-ruler + template: + metadata: + labels: + app: thanos-ruler + thanos-store-api: "true" + spec: + containers: + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - rule + - --log.level=debug + - --data-dir=/data + - --eval-interval=15s + - --cluster.disable + - --rule-file=/etc/thanos-ruler/*.rules.yaml + - --alertmanagers.url=http://%%ALERTMANAGER_URL%% + - --query=thanos-querier.default.svc:9090 + - | + --objstore.config=type: S3 + config: + bucket: demo-bucket + access_key: smth + secret_key: Need8Chars + endpoint: %%S3_ENDPOINT%% + insecure: true + # We don't want to override underlying metric's cluster label. 
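+        # The flags below attach external labels to everything this ruler produces:
+        # `ruler_cluster` is used instead of `cluster` so that series evaluated through the
+        # querier keep their original cluster label, and `replica` (taken from the downward-API
+        # POD_NAME env var defined below) is the label the querier deduplicates on via
+        # --query.replica-label=replica.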
+ - --label=ruler_cluster="%%CLUSTER%%" + - --label=replica="$(POD_NAME)" + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + resources: + limits: + cpu: 500m + memory: 500Mi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/thanos-ruler + name: config + volumes: + - configMap: + name: thanos-ruler-rules + name: config +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: thanos-ruler + name: thanos-ruler +spec: + ports: + - port: 9090 + protocol: TCP + targetPort: http + name: http + selector: + statefulset.kubernetes.io/pod-name: thanos-ruler-0 + type: NodePort \ No newline at end of file diff --git a/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml b/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml new file mode 100644 index 0000000000..2bf4b91bfa --- /dev/null +++ b/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml @@ -0,0 +1,65 @@ +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + name: thanos-store-gateway + labels: + app: thanos-store-gateway +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-store-gateway + serviceName: thanos-store-gateway + template: + metadata: + labels: + app: thanos-store-gateway + thanos-store-api: "true" + spec: + containers: + - name: thanos + image: improbable/thanos:v0.3.0 + args: + - store + - --log.level=debug + - --data-dir=/data + - --cluster.disable + - --index-cache-size=500MB + - --chunk-pool-size=500MB + - | + --objstore.config=type: S3 + config: + bucket: demo-bucket + access_key: smth + secret_key: Need8Chars + endpoint: %%S3_ENDPOINT%% + insecure: true + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: thanos-store-gateway + name: thanos-store-gateway +spec: + ports: + - port: 10901 + protocol: TCP + targetPort: grpc + name: grpc + selector: + app: thanos-store-gateway + type: NodePort diff --git a/tutorials/kubernetes-demo/setup.sh b/tutorials/kubernetes-demo/setup.sh new file mode 100755 index 0000000000..53896a99a1 --- /dev/null +++ b/tutorials/kubernetes-demo/setup.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +set -e + +MINIKUBE_RESTART=${1:true} + +# Prepare setup with: +# +# eu1: +# * Grafana +# * Alertmanager +# * 1 replica Prometheus with 2w metrics data. +# +# us1: +# * 1 replica Prometheus with 2w metrics data. 
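+#
+# Usage sketch (assumption: run from tutorials/kubernetes-demo/):
+#   ./setup.sh          # tears the eu1/us1 minikube clusters down and recreates them first
+#   ./setup.sh false    # skips the cluster recreation below and only (re)applies the manifests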
+ +if ${MINIKUBE_RESTART}; then + ./cluster-down.sh + ./cluster-up.sh +fi + +kubectl --context=eu1 apply -f manifests/alertmanager.yaml + +sleep 2s + +ALERTMANAGER_URL=$(minikube -p eu1 service alertmanager --format="{{.IP}}:{{.Port}}") +if [[ -z "${ALERTMANAGER_URL}" ]]; then + echo "minikube returns empty result for ALERTMANAGER_URL" + exit 1 +fi + +./apply-pv-gen-metrics.sh eu1 0 336h +kubectl --context=eu1 apply -f manifests/prometheus-rules.yaml +cat manifests/prometheus.yaml | sed "s#%%ALERTMANAGER_URL%%#${ALERTMANAGER_URL}#g" | sed "s#%%CLUSTER%%#eu1#g" | kubectl --context=eu1 apply -f - +kubectl --context=eu1 apply -f manifests/kube-state-metrics.yaml + +./apply-pv-gen-metrics.sh us1 0 336h +kubectl --context=us1 apply -f manifests/prometheus-rules.yaml +cat manifests/prometheus.yaml | sed "s#%%ALERTMANAGER_URL%%#${ALERTMANAGER_URL}#g" | sed "s#%%CLUSTER%%#us1#g" | kubectl --context=us1 apply -f - +kubectl --context=us1 apply -f manifests/kube-state-metrics.yaml + +sleep 1s + +PROM_US1_URL=$(minikube -p us1 service prometheus --url) +echo "PROM_US1_URL=${PROM_US1_URL}" +sed "s#%%PROM_US1_URL%%#${PROM_US1_URL}#g" manifests/grafana-datasources.yaml | kubectl --context=eu1 apply -f - +kubectl apply --context=eu1 -f manifests/grafana.yaml diff --git a/tutorials/kubernetes-demo/slides/globalview-ha.svg b/tutorials/kubernetes-demo/slides/globalview-ha.svg new file mode 100644 index 0000000000..885f60211c --- /dev/null +++ b/tutorials/kubernetes-demo/slides/globalview-ha.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tutorials/kubernetes-demo/slides/initial-setup.svg b/tutorials/kubernetes-demo/slides/initial-setup.svg new file mode 100644 index 0000000000..c3897c0ff6 --- /dev/null +++ b/tutorials/kubernetes-demo/slides/initial-setup.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tutorials/kubernetes-demo/slides/unlimited-retention.svg b/tutorials/kubernetes-demo/slides/unlimited-retention.svg new file mode 100644 index 0000000000..b13b2a6c46 --- /dev/null +++ b/tutorials/kubernetes-demo/slides/unlimited-retention.svg @@ -0,0 +1 @@ + \ No newline at end of file From a14cc0d9c3a1baa72562c9b7b4f73c29278cdb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Fri, 22 Feb 2019 04:43:12 -0500 Subject: [PATCH 08/18] k8s tutorial: Moved to proper Thanos version. 
(#863) Signed-off-by: Bartek Plotka --- .../kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml | 2 +- tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml | 2 +- tutorials/kubernetes-demo/manifests/thanos-compactor.yaml | 2 +- tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml | 2 +- tutorials/kubernetes-demo/manifests/thanos-querier.yaml | 2 +- tutorials/kubernetes-demo/manifests/thanos-ruler.yaml | 2 +- tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml | 3 ++- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml index 51382ee068..02d37d2dc5 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml @@ -115,7 +115,7 @@ spec: - name: prometheus mountPath: /var/prometheus - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - sidecar - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml index de776f4107..9e30120ade 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml @@ -110,7 +110,7 @@ spec: - name: prometheus mountPath: /var/prometheus - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - sidecar - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml b/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml index dba722a14b..f3174b00a6 100644 --- a/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml +++ b/tutorials/kubernetes-demo/manifests/thanos-compactor.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - compact - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml b/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml index a2a70db48a..e3d6f4d5e3 100644 --- a/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml +++ b/tutorials/kubernetes-demo/manifests/thanos-querier-no-us1.yaml @@ -32,7 +32,7 @@ spec: spec: containers: - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - query - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/thanos-querier.yaml b/tutorials/kubernetes-demo/manifests/thanos-querier.yaml index 05491760c3..5cbb3d9742 100644 --- a/tutorials/kubernetes-demo/manifests/thanos-querier.yaml +++ b/tutorials/kubernetes-demo/manifests/thanos-querier.yaml @@ -32,7 +32,7 @@ spec: spec: containers: - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - query - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml b/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml index f189c84271..8f90c02294 100644 --- a/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml +++ b/tutorials/kubernetes-demo/manifests/thanos-ruler.yaml @@ -45,7 +45,7 @@ spec: spec: containers: - name: thanos - image: improbable/thanos:v0.3.0 + image: improbable/thanos:v0.3.1 args: - rule - --log.level=debug diff --git a/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml b/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml index 
2bf4b91bfa..c080cf58c0 100644 --- a/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml +++ b/tutorials/kubernetes-demo/manifests/thanos-store-gateway.yaml @@ -18,7 +18,8 @@ spec: spec: containers: - name: thanos - image: improbable/thanos:v0.3.0 + # TODO(bwplotka): Move to v0.3.2 once available. + image: improbable/thanos:master-2019-02-21-0c730c1 args: - store - --log.level=debug From 0bb65902227b0f8ce0cc46044babb17ee6c5afd4 Mon Sep 17 00:00:00 2001 From: Sylvain Rabot Date: Fri, 1 Mar 2019 20:45:48 +0100 Subject: [PATCH 09/18] Add a multi-stage Dockerfile (#871) Signed-off-by: Sylvain Rabot --- Dockerfile.multi-stage | 17 +++++++++++++++++ Makefile | 6 ++++++ 2 files changed, 23 insertions(+) create mode 100644 Dockerfile.multi-stage diff --git a/Dockerfile.multi-stage b/Dockerfile.multi-stage new file mode 100644 index 0000000000..cae0339cdf --- /dev/null +++ b/Dockerfile.multi-stage @@ -0,0 +1,17 @@ +FROM golang:1.11-alpine3.9 as builder + +ADD . $GOPATH/src/github.com/improbable-eng/thanos +WORKDIR $GOPATH/src/github.com/improbable-eng/thanos + +RUN apk update && apk upgrade && apk add --no-cache alpine-sdk + +RUN git update-index --refresh; make + +# ----------------------------------------------------------------------------- + +FROM quay.io/prometheus/busybox:latest +LABEL maintainer="The Thanos Authors" + +COPY --from=builder /go/src/github.com/improbable-eng/thanos/thanos /bin/thanos + +ENTRYPOINT [ "/bin/thanos" ] diff --git a/Makefile b/Makefile index 14101796a0..9274ebe0b2 100644 --- a/Makefile +++ b/Makefile @@ -99,6 +99,12 @@ docker: build @echo ">> building docker image '${DOCKER_IMAGE_NAME}'" @docker build -t "${DOCKER_IMAGE_NAME}" . +#docker-multi-stage builds docker image using multi-stage. +.PHONY: docker-multi-stage +docker-multi-stage: + @echo ">> building docker image '${DOCKER_IMAGE_NAME}' with Dockerfile.multi-stage" + @docker build -f Dockerfile.multi-stage -t "${DOCKER_IMAGE_NAME}" . + # docker-push pushes docker image build under `${DOCKER_IMAGE_NAME}` to improbable/"$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .PHONY: docker-push docker-push: From d6bf24fcd27a3a9f0f03cb67fa752c43f3d30b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Mon, 4 Mar 2019 09:54:06 +0000 Subject: [PATCH 10/18] sidecar: Moved shipper flag check to run.group. (#867) Otherwise we can end up being in starvation. Reloader is trying to create config for Prometheus and shipper fails before that because Prometheus is not up. 
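To make the ordering concrete, here is a minimal, self-contained sketch of the pattern this
patch moves to (illustration only, not the actual Thanos code: `checkPrometheusFlags` is a
made-up stand-in for the promclient flags check, and the shipper construction is elided).
The flags check is retried inside the shipper's run.group actor, so the reloader actor can
bring Prometheus up concurrently instead of the whole process failing up front.

    // Illustration of the ordering only; helper names are made up.
    package main

    import (
        "context"
        "fmt"
        "time"

        "github.com/oklog/run"
    )

    // checkPrometheusFlags stands in for the real flags check against Prometheus;
    // it fails while Prometheus is not up yet.
    func checkPrometheusFlags(ctx context.Context) error { return nil }

    func main() {
        ctx, cancel := context.WithCancel(context.Background())
        var g run.Group

        // Actor 1 (elided): reloader that writes the Prometheus config and reloads it.

        // Actor 2: shipper. The flags check happens inside the actor and is retried,
        // so it no longer has to succeed before the reloader actor ever gets to run.
        g.Add(func() error {
            for {
                if err := checkPrometheusFlags(ctx); err == nil {
                    break
                }
                select {
                case <-ctx.Done():
                    return ctx.Err()
                case <-time.After(1 * time.Second): // retry until Prometheus answers
                }
            }
            // ... construct the shipper here and run the periodic Sync loop ...
            return nil
        }, func(error) { cancel() })

        if err := g.Run(); err != nil {
            fmt.Println("err:", err)
        }
    }
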
Signed-off-by: Bartek Plotka --- cmd/thanos/sidecar.go | 20 ++++++++++---------- pkg/shipper/shipper.go | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 8f77790f31..7334cbd4ec 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -256,19 +256,19 @@ func runSidecar( } ctx, cancel := context.WithCancel(context.Background()) - var s *shipper.Shipper - if uploadCompacted { - s, err = shipper.NewWithCompacted(ctx, logger, reg, dataDir, bkt, m.Labels, metadata.SidecarSource, promURL) - if err != nil { - return errors.Wrap(err, "create shipper") - } - } else { - s = shipper.New(logger, reg, dataDir, bkt, m.Labels, metadata.SidecarSource) - } - g.Add(func() error { defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") + var s *shipper.Shipper + if uploadCompacted { + s, err = shipper.NewWithCompacted(ctx, logger, reg, dataDir, bkt, m.Labels, metadata.SidecarSource, m.promURL) + if err != nil { + return errors.Wrap(err, "create shipper") + } + } else { + s = shipper.New(logger, reg, dataDir, bkt, m.Labels, metadata.SidecarSource) + } + return runutil.Repeat(30*time.Second, ctx.Done(), func() error { if uploaded, err := s.Sync(ctx); err != nil { level.Warn(logger).Log("err", err, "uploaded", uploaded) diff --git a/pkg/shipper/shipper.go b/pkg/shipper/shipper.go index 5a2c687be6..a5208d84f1 100644 --- a/pkg/shipper/shipper.go +++ b/pkg/shipper/shipper.go @@ -136,9 +136,18 @@ func NewWithCompacted( lbls = func() labels.Labels { return nil } } - flags, err := promclient.ConfiguredFlags(ctx, logger, prometheusURL) - if err != nil { - return nil, errors.Wrap(err, "configured flags; failed to check if compaction is disabled") + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + var flags promclient.Flags + if err := runutil.Retry(1*time.Second, ctx.Done(), func() (err error) { + flags, err = promclient.ConfiguredFlags(ctx, logger, prometheusURL) + if err != nil { + return errors.Wrap(err, "configured flags; failed to check if compaction is disabled") + } + return nil + }); err != nil { + return nil, err } if flags.TSDBMinTime != model.Duration(2*time.Hour) || flags.TSDBMaxTime != model.Duration(2*time.Hour) { From ca759be067855f432a1ff30957d60d258b7a153d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20P=C5=82otka?= Date: Mon, 4 Mar 2019 13:26:06 +0000 Subject: [PATCH 11/18] Update README.md with docker pull number. 
(#877) * Update README.md * Update README.md * Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 871400b878..209d83b83e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Go Report Card](https://goreportcard.com/badge/github.com/improbable-eng/thanos)](https://goreportcard.com/report/github.com/improbable-eng/thanos) [![GoDoc](https://godoc.org/github.com/improbable-eng/thanos?status.svg)](https://godoc.org/github.com/improbable-eng/thanos) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://join.slack.com/t/improbable-eng/shared_invite/enQtMzQ1ODcyMzQ5MjM4LWY5ZWZmNGM2ODc5MmViNmQ3ZTA3ZTY3NzQwOTBlMTkzZmIxZTIxODk0OWU3YjZhNWVlNDU3MDlkZGViZjhkMjc) +[![Docker Pulls](https://img.shields.io/docker/pulls/improbable/thanos.svg?maxAge=604800)](https://hub.docker.com/r/improbable/thanos/) ## Overview From 55aaf765823819805422884ef4ef85f4bae04b57 Mon Sep 17 00:00:00 2001 From: David Tsur Date: Mon, 4 Mar 2019 16:19:17 +0200 Subject: [PATCH 12/18] Rule component: Adding new API end point for rules and alerts in (#851) * Adding new API end point for rules and alerts * reusing methods in query api * Adding unitest for rule API (similar to prometheus rule/alert API end point unitest) * Adding SetCORS * Removing comments * Related to issue #850 --- benchmark/cmd/thanosbench/resources.go | 5 +- cmd/thanos/rule.go | 5 + pkg/query/api/v1.go | 124 ++++++------- pkg/query/api/v1_test.go | 12 +- pkg/rule/api/v1.go | 194 ++++++++++++++++++++ pkg/rule/api/v1_test.go | 240 +++++++++++++++++++++++++ 6 files changed, 510 insertions(+), 70 deletions(-) create mode 100644 pkg/rule/api/v1.go create mode 100644 pkg/rule/api/v1_test.go diff --git a/benchmark/cmd/thanosbench/resources.go b/benchmark/cmd/thanosbench/resources.go index 063c4e6607..f9da1c7ac0 100644 --- a/benchmark/cmd/thanosbench/resources.go +++ b/benchmark/cmd/thanosbench/resources.go @@ -12,6 +12,7 @@ import ( prom "github.com/prometheus/prometheus/config" "gopkg.in/yaml.v2" appsv1 "k8s.io/api/apps/v1" + "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -207,7 +208,7 @@ func createPrometheus(opts *opts, name string, bucket string) *appsv1.StatefulSe Name: name, Namespace: promNamespace, Labels: map[string]string{ - "app": name, + "app": name, "thanos-gossip-member": "true", }, } @@ -370,7 +371,7 @@ func createThanosQuery(opts *opts) (*v1.Service, *v1.Pod) { Name: "thanos-query", Namespace: thanosNamespace, Labels: map[string]string{ - "app": "thanos-query", + "app": "thanos-query", "thanos-gossip-member": "true", }, } diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 612e1cf93e..a3847105b4 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -10,6 +10,7 @@ import ( "net/url" "os" "os/signal" + "path" "path/filepath" "sort" "strconv" @@ -29,6 +30,7 @@ import ( "github.com/improbable-eng/thanos/pkg/extprom" "github.com/improbable-eng/thanos/pkg/objstore/client" "github.com/improbable-eng/thanos/pkg/promclient" + "github.com/improbable-eng/thanos/pkg/rule/api" "github.com/improbable-eng/thanos/pkg/runutil" "github.com/improbable-eng/thanos/pkg/shipper" "github.com/improbable-eng/thanos/pkg/store" @@ -565,6 +567,9 @@ func runRule( ui.NewRuleUI(logger, mgr, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix)) + api := v1.NewAPI(logger, mgr) + api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger) + mux 
:= http.NewServeMux() registerMetrics(mux, reg) registerProfile(mux) diff --git a/pkg/query/api/v1.go b/pkg/query/api/v1.go index cf693dd2fb..61c1603b57 100644 --- a/pkg/query/api/v1.go +++ b/pkg/query/api/v1.go @@ -50,15 +50,15 @@ const ( statusError = "error" ) -type errorType string +type ErrorType string const ( - errorNone errorType = "" + errorNone ErrorType = "" errorTimeout = "timeout" errorCanceled = "canceled" errorExec = "execution" errorBadData = "bad_data" - errorInternal = "internal" + ErrorInternal = "internal" ) var corsHeaders = map[string]string{ @@ -68,31 +68,31 @@ var corsHeaders = map[string]string{ "Access-Control-Expose-Headers": "Date", } -type apiError struct { - typ errorType - err error +type ApiError struct { + Typ ErrorType + Err error } -func (e *apiError) Error() string { - return fmt.Sprintf("%s: %s", e.typ, e.err) +func (e *ApiError) Error() string { + return fmt.Sprintf("%s: %s", e.Typ, e.Err) } type response struct { Status status `json:"status"` Data interface{} `json:"data,omitempty"` - ErrorType errorType `json:"errorType,omitempty"` + ErrorType ErrorType `json:"ErrorType,omitempty"` Error string `json:"error,omitempty"` Warnings []string `json:"warnings,omitempty"` } // Enables cross-site script calls. -func setCORS(w http.ResponseWriter) { +func SetCORS(w http.ResponseWriter) { for h, v := range corsHeaders { w.Header().Set(h, v) } } -type apiFunc func(r *http.Request) (interface{}, []error, *apiError) +type ApiFunc func(r *http.Request) (interface{}, []error, *ApiError) // API can register a set of endpoints in a router and handle // them using the provided storage and query engine. @@ -151,13 +151,13 @@ func NewAPI( // Register the API's endpoints in the given router. func (api *API) Register(r *route.Router, tracer opentracing.Tracer, logger log.Logger) { - instr := func(name string, f apiFunc) http.HandlerFunc { + instr := func(name string, f ApiFunc) http.HandlerFunc { hf := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - setCORS(w) + SetCORS(w) if data, warnings, err := f(r); err != nil { - respondError(w, err, data) + RespondError(w, err, data) } else if data != nil { - respond(w, data, warnings) + Respond(w, data, warnings) } else { w.WriteHeader(http.StatusNoContent) } @@ -183,7 +183,7 @@ type queryData struct { Warnings []error `json:"warnings,omitempty"` } -func (api *API) parseEnableDedupParam(r *http.Request) (enableDeduplication bool, _ *apiError) { +func (api *API) parseEnableDedupParam(r *http.Request) (enableDeduplication bool, _ *ApiError) { const dedupParam = "dedup" enableDeduplication = true @@ -191,13 +191,13 @@ func (api *API) parseEnableDedupParam(r *http.Request) (enableDeduplication bool var err error enableDeduplication, err = strconv.ParseBool(val) if err != nil { - return false, &apiError{errorBadData, errors.Wrapf(err, "'%s' parameter", dedupParam)} + return false, &ApiError{errorBadData, errors.Wrapf(err, "'%s' parameter", dedupParam)} } } return enableDeduplication, nil } -func (api *API) parseDownsamplingParam(r *http.Request, step time.Duration) (maxSourceResolution time.Duration, _ *apiError) { +func (api *API) parseDownsamplingParam(r *http.Request, step time.Duration) (maxSourceResolution time.Duration, _ *ApiError) { const maxSourceResolutionParam = "max_source_resolution" maxSourceResolution = 0 * time.Second @@ -209,18 +209,18 @@ func (api *API) parseDownsamplingParam(r *http.Request, step time.Duration) (max var err error maxSourceResolution, err = parseDuration(val) if err != nil { - return 
0, &apiError{errorBadData, errors.Wrapf(err, "'%s' parameter", maxSourceResolutionParam)} + return 0, &ApiError{errorBadData, errors.Wrapf(err, "'%s' parameter", maxSourceResolutionParam)} } } if maxSourceResolution < 0 { - return 0, &apiError{errorBadData, errors.Errorf("negative '%s' is not accepted. Try a positive integer", maxSourceResolutionParam)} + return 0, &ApiError{errorBadData, errors.Errorf("negative '%s' is not accepted. Try a positive integer", maxSourceResolutionParam)} } return maxSourceResolution, nil } -func (api *API) parsePartialResponseParam(r *http.Request) (enablePartialResponse bool, _ *apiError) { +func (api *API) parsePartialResponseParam(r *http.Request) (enablePartialResponse bool, _ *ApiError) { const partialResponseParam = "partial_response" enablePartialResponse = api.enablePartialResponse @@ -228,23 +228,23 @@ func (api *API) parsePartialResponseParam(r *http.Request) (enablePartialRespons var err error enablePartialResponse, err = strconv.ParseBool(val) if err != nil { - return false, &apiError{errorBadData, errors.Wrapf(err, "'%s' parameter", partialResponseParam)} + return false, &ApiError{errorBadData, errors.Wrapf(err, "'%s' parameter", partialResponseParam)} } } return enablePartialResponse, nil } -func (api *API) options(r *http.Request) (interface{}, []error, *apiError) { +func (api *API) options(r *http.Request) (interface{}, []error, *ApiError) { return nil, nil, nil } -func (api *API) query(r *http.Request) (interface{}, []error, *apiError) { +func (api *API) query(r *http.Request) (interface{}, []error, *ApiError) { var ts time.Time if t := r.FormValue("time"); t != "" { var err error ts, err = parseTime(t) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } } else { ts = api.now() @@ -255,7 +255,7 @@ func (api *API) query(r *http.Request) (interface{}, []error, *apiError) { var cancel context.CancelFunc timeout, err := parseDuration(to) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } ctx, cancel = context.WithTimeout(ctx, timeout) @@ -289,20 +289,20 @@ func (api *API) query(r *http.Request) (interface{}, []error, *apiError) { begin := api.now() qry, err := api.queryEngine.NewInstantQuery(api.queryableCreate(enableDedup, 0, enablePartialResponse, warningReporter), r.FormValue("query"), ts) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { case promql.ErrQueryCanceled: - return nil, nil, &apiError{errorCanceled, res.Err} + return nil, nil, &ApiError{errorCanceled, res.Err} case promql.ErrQueryTimeout: - return nil, nil, &apiError{errorTimeout, res.Err} + return nil, nil, &ApiError{errorTimeout, res.Err} case promql.ErrStorage: - return nil, nil, &apiError{errorInternal, res.Err} + return nil, nil, &ApiError{ErrorInternal, res.Err} } - return nil, nil, &apiError{errorExec, res.Err} + return nil, nil, &ApiError{errorExec, res.Err} } api.instantQueryDuration.Observe(time.Since(begin).Seconds()) @@ -312,35 +312,35 @@ func (api *API) query(r *http.Request) (interface{}, []error, *apiError) { }, warnings, nil } -func (api *API) queryRange(r *http.Request) (interface{}, []error, *apiError) { +func (api *API) queryRange(r *http.Request) (interface{}, []error, *ApiError) { start, err := parseTime(r.FormValue("start")) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, 
&ApiError{errorBadData, err} } end, err := parseTime(r.FormValue("end")) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } if end.Before(start) { err := errors.New("end timestamp must not be before start time") - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } step, err := parseDuration(r.FormValue("step")) if err != nil { - return nil, nil, &apiError{errorBadData, errors.Wrap(err, "param step")} + return nil, nil, &ApiError{errorBadData, errors.Wrap(err, "param step")} } if step <= 0 { err := errors.New("zero or negative query resolution step widths are not accepted. Try a positive integer") - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } // For safety, limit the number of returned points per timeseries. // This is sufficient for 60s resolution for a week or 1h resolution for a year. if end.Sub(start)/step > 11000 { err := errors.Errorf("exceeded maximum resolution of 11,000 points per timeseries. Try decreasing the query resolution (?step=XX)") - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } ctx := r.Context() @@ -348,7 +348,7 @@ func (api *API) queryRange(r *http.Request) (interface{}, []error, *apiError) { var cancel context.CancelFunc timeout, err := parseDuration(to) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } ctx, cancel = context.WithTimeout(ctx, timeout) @@ -393,18 +393,18 @@ func (api *API) queryRange(r *http.Request) (interface{}, []error, *apiError) { step, ) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { case promql.ErrQueryCanceled: - return nil, nil, &apiError{errorCanceled, res.Err} + return nil, nil, &ApiError{errorCanceled, res.Err} case promql.ErrQueryTimeout: - return nil, nil, &apiError{errorTimeout, res.Err} + return nil, nil, &ApiError{errorTimeout, res.Err} } - return nil, nil, &apiError{errorExec, res.Err} + return nil, nil, &ApiError{errorExec, res.Err} } api.rangeQueryDuration.Observe(time.Since(begin).Seconds()) @@ -414,12 +414,12 @@ func (api *API) queryRange(r *http.Request) (interface{}, []error, *apiError) { }, warnings, nil } -func (api *API) labelValues(r *http.Request) (interface{}, []error, *apiError) { +func (api *API) labelValues(r *http.Request) (interface{}, []error, *ApiError) { ctx := r.Context() name := route.Param(ctx, "name") if !model.LabelNameRE.MatchString(name) { - return nil, nil, &apiError{errorBadData, fmt.Errorf("invalid label name: %q", name)} + return nil, nil, &ApiError{errorBadData, fmt.Errorf("invalid label name: %q", name)} } enablePartialResponse, apiErr := api.parsePartialResponseParam(r) @@ -439,7 +439,7 @@ func (api *API) labelValues(r *http.Request) (interface{}, []error, *apiError) { q, err := api.queryableCreate(true, 0, enablePartialResponse, warningReporter).Querier(ctx, math.MinInt64, math.MaxInt64) if err != nil { - return nil, nil, &apiError{errorExec, err} + return nil, nil, &ApiError{errorExec, err} } defer runutil.CloseWithLogOnErr(api.logger, q, "queryable labelValues") @@ -447,7 +447,7 @@ func (api *API) labelValues(r *http.Request) (interface{}, []error, *apiError) { vals, err := q.LabelValues(name) if err != nil { - return nil, nil, &apiError{errorExec, err} + return nil, nil, &ApiError{errorExec, err} } 
return vals, warnings, nil @@ -458,13 +458,13 @@ var ( maxTime = time.Unix(math.MaxInt64/1000-62135596801, 999999999) ) -func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { +func (api *API) series(r *http.Request) (interface{}, []error, *ApiError) { if err := r.ParseForm(); err != nil { - return nil, nil, &apiError{errorInternal, errors.Wrap(err, "parse form")} + return nil, nil, &ApiError{ErrorInternal, errors.Wrap(err, "parse form")} } if len(r.Form["match[]"]) == 0 { - return nil, nil, &apiError{errorBadData, fmt.Errorf("no match[] parameter provided")} + return nil, nil, &ApiError{errorBadData, fmt.Errorf("no match[] parameter provided")} } var start time.Time @@ -472,7 +472,7 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { var err error start, err = parseTime(t) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } } else { start = minTime @@ -483,7 +483,7 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { var err error end, err = parseTime(t) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } } else { end = maxTime @@ -493,7 +493,7 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { for _, s := range r.Form["match[]"] { matchers, err := promql.ParseMetricSelector(s) if err != nil { - return nil, nil, &apiError{errorBadData, err} + return nil, nil, &ApiError{errorBadData, err} } matcherSets = append(matcherSets, matchers) } @@ -521,7 +521,7 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { // TODO(bwplotka): Support downsampling? q, err := api.queryableCreate(enableDedup, 0, enablePartialResponse, warningReporter).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { - return nil, nil, &apiError{errorExec, err} + return nil, nil, &ApiError{errorExec, err} } defer runutil.CloseWithLogOnErr(api.logger, q, "queryable series") @@ -529,7 +529,7 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { for _, mset := range matcherSets { s, _, err := q.Select(&storage.SelectParams{}, mset...) 
if err != nil { - return nil, nil, &apiError{errorExec, err} + return nil, nil, &ApiError{errorExec, err} } sets = append(sets, s) } @@ -541,13 +541,13 @@ func (api *API) series(r *http.Request) (interface{}, []error, *apiError) { metrics = append(metrics, set.At().Labels()) } if set.Err() != nil { - return nil, nil, &apiError{errorExec, set.Err()} + return nil, nil, &ApiError{errorExec, set.Err()} } return metrics, warnings, nil } -func respond(w http.ResponseWriter, data interface{}, warnings []error) { +func Respond(w http.ResponseWriter, data interface{}, warnings []error) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) @@ -561,18 +561,18 @@ func respond(w http.ResponseWriter, data interface{}, warnings []error) { _ = json.NewEncoder(w).Encode(resp) } -func respondError(w http.ResponseWriter, apiErr *apiError, data interface{}) { +func RespondError(w http.ResponseWriter, apiErr *ApiError, data interface{}) { w.Header().Set("Content-Type", "application/json") var code int - switch apiErr.typ { + switch apiErr.Typ { case errorBadData: code = http.StatusBadRequest case errorExec: code = 422 case errorCanceled, errorTimeout: code = http.StatusServiceUnavailable - case errorInternal: + case ErrorInternal: code = http.StatusInternalServerError default: code = http.StatusInternalServerError @@ -581,8 +581,8 @@ func respondError(w http.ResponseWriter, apiErr *apiError, data interface{}) { _ = json.NewEncoder(w).Encode(&response{ Status: statusError, - ErrorType: apiErr.typ, - Error: apiErr.err.Error(), + ErrorType: apiErr.Typ, + Error: apiErr.Err.Error(), Data: data, }) } diff --git a/pkg/query/api/v1_test.go b/pkg/query/api/v1_test.go index bcf17907b0..ca5dee2013 100644 --- a/pkg/query/api/v1_test.go +++ b/pkg/query/api/v1_test.go @@ -77,11 +77,11 @@ func TestEndpoints(t *testing.T) { start := time.Unix(0, 0) var tests = []struct { - endpoint apiFunc + endpoint ApiFunc params map[string]string query url.Values response interface{} - errType errorType + errType ErrorType }{ { endpoint: api.query, @@ -425,8 +425,8 @@ func TestEndpoints(t *testing.T) { if test.errType == errorNone { t.Fatalf("Unexpected error: %s", apiErr) } - if test.errType != apiErr.typ { - t.Fatalf("Expected error of type %q but got type %q", test.errType, apiErr.typ) + if test.errType != apiErr.Typ { + t.Fatalf("Expected error of type %q but got type %q", test.errType, apiErr.Typ) } return } @@ -446,7 +446,7 @@ func TestEndpoints(t *testing.T) { func TestRespondSuccess(t *testing.T) { s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - respond(w, "test", nil) + Respond(w, "test", nil) })) defer s.Close() @@ -483,7 +483,7 @@ func TestRespondSuccess(t *testing.T) { func TestRespondError(t *testing.T) { s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - respondError(w, &apiError{errorTimeout, errors.New("message")}, "test") + RespondError(w, &ApiError{errorTimeout, errors.New("message")}, "test") })) defer s.Close() diff --git a/pkg/rule/api/v1.go b/pkg/rule/api/v1.go new file mode 100644 index 0000000000..be2dc5abf4 --- /dev/null +++ b/pkg/rule/api/v1.go @@ -0,0 +1,194 @@ +package v1 + +import ( + "fmt" + "net/http" + "time" + + "github.com/NYTimes/gziphandler" + qapi "github.com/improbable-eng/thanos/pkg/query/api" + "github.com/improbable-eng/thanos/pkg/tracing" + "github.com/prometheus/client_golang/prometheus" + + "github.com/go-kit/kit/log" + "github.com/opentracing/opentracing-go" + 
"github.com/prometheus/common/route" + "github.com/prometheus/prometheus/pkg/labels" + "github.com/prometheus/prometheus/rules" +) + +type API struct { + logger log.Logger + now func() time.Time + rulesRetriever rulesRetriever +} + +func NewAPI( + logger log.Logger, + rr rulesRetriever, +) *API { + return &API{ + logger: logger, + now: time.Now, + rulesRetriever: rr, + } +} + +func (api *API) Register(r *route.Router, tracer opentracing.Tracer, logger log.Logger) { + instr := func(name string, f qapi.ApiFunc) http.HandlerFunc { + hf := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + qapi.SetCORS(w) + if data, warnings, err := f(r); err != nil { + qapi.RespondError(w, err, data) + } else if data != nil { + qapi.Respond(w, data, warnings) + } else { + w.WriteHeader(http.StatusNoContent) + } + }) + return prometheus.InstrumentHandler(name, tracing.HTTPMiddleware(tracer, name, logger, gziphandler.GzipHandler(hf))) + } + + r.Get("/alerts", instr("alerts", api.alerts)) + r.Get("/rules", instr("rules", api.rules)) + +} + +type rulesRetriever interface { + RuleGroups() []*rules.Group + AlertingRules() []*rules.AlertingRule +} + +func (api *API) rules(r *http.Request) (interface{}, []error, *qapi.ApiError) { + ruleGroups := api.rulesRetriever.RuleGroups() + res := &RuleDiscovery{RuleGroups: make([]*RuleGroup, len(ruleGroups))} + for i, grp := range ruleGroups { + apiRuleGroup := &RuleGroup{ + Name: grp.Name(), + File: grp.File(), + Interval: grp.Interval().Seconds(), + Rules: []rule{}, + } + + for _, r := range grp.Rules() { + var enrichedRule rule + + lastError := "" + if r.LastError() != nil { + lastError = r.LastError().Error() + } + + switch rule := r.(type) { + case *rules.AlertingRule: + enrichedRule = alertingRule{ + Name: rule.Name(), + Query: rule.Query().String(), + Duration: rule.Duration().Seconds(), + Labels: rule.Labels(), + Annotations: rule.Annotations(), + Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()), + Health: rule.Health(), + LastError: lastError, + Type: "alerting", + } + case *rules.RecordingRule: + enrichedRule = recordingRule{ + Name: rule.Name(), + Query: rule.Query().String(), + Labels: rule.Labels(), + Health: rule.Health(), + LastError: lastError, + Type: "recording", + } + default: + err := fmt.Errorf("failed to assert type of rule '%v'", rule.Name()) + return nil, nil, &qapi.ApiError{qapi.ErrorInternal, err} + } + + apiRuleGroup.Rules = append(apiRuleGroup.Rules, enrichedRule) + } + res.RuleGroups[i] = apiRuleGroup + } + return res, nil, nil +} + +func (api *API) alerts(r *http.Request) (interface{}, []error, *qapi.ApiError) { + alertingRules := api.rulesRetriever.AlertingRules() + alerts := []*Alert{} + + for _, alertingRule := range alertingRules { + alerts = append( + alerts, + rulesAlertsToAPIAlerts(alertingRule.ActiveAlerts())..., + ) + } + + res := &AlertDiscovery{Alerts: alerts} + + return res, nil, nil +} + +type AlertDiscovery struct { + Alerts []*Alert `json:"alerts"` +} + +type Alert struct { + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + State string `json:"state"` + ActiveAt *time.Time `json:"activeAt,omitempty"` + Value float64 `json:"value"` +} + +func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert { + apiAlerts := make([]*Alert, len(rulesAlerts)) + for i, ruleAlert := range rulesAlerts { + apiAlerts[i] = &Alert{ + Labels: ruleAlert.Labels, + Annotations: ruleAlert.Annotations, + State: ruleAlert.State.String(), + ActiveAt: &ruleAlert.ActiveAt, + Value: ruleAlert.Value, + } + 
} + + return apiAlerts +} + +type RuleDiscovery struct { + RuleGroups []*RuleGroup `json:"groups"` +} + +type RuleGroup struct { + Name string `json:"name"` + File string `json:"file"` + // In order to preserve rule ordering, while exposing type (alerting or recording) + // specific properties, both alerting and recording rules are exposed in the + // same array. + Rules []rule `json:"rules"` + Interval float64 `json:"interval"` +} + +type rule interface{} + +type alertingRule struct { + Name string `json:"name"` + Query string `json:"query"` + Duration float64 `json:"duration"` + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + Alerts []*Alert `json:"alerts"` + Health rules.RuleHealth `json:"health"` + LastError string `json:"lastError,omitempty"` + Type string `json:"type"` +} + +type recordingRule struct { + Name string `json:"name"` + Query string `json:"query"` + Labels labels.Labels `json:"labels,omitempty"` + Health rules.RuleHealth `json:"health"` + LastError string `json:"lastError,omitempty"` + // Type of a recordingRule is always "recording". + Type string `json:"type"` +} diff --git a/pkg/rule/api/v1_test.go b/pkg/rule/api/v1_test.go new file mode 100644 index 0000000000..af4034bd8d --- /dev/null +++ b/pkg/rule/api/v1_test.go @@ -0,0 +1,240 @@ +package v1 + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "reflect" + "testing" + "time" + + "github.com/go-kit/kit/log" + qapi "github.com/improbable-eng/thanos/pkg/query/api" + "github.com/prometheus/common/route" + "github.com/prometheus/prometheus/pkg/labels" + "github.com/prometheus/prometheus/promql" + "github.com/prometheus/prometheus/rules" + "github.com/prometheus/prometheus/util/testutil" +) + +type rulesRetrieverMock struct { + testing *testing.T +} + +func (m rulesRetrieverMock) RuleGroups() []*rules.Group { + var ar rulesRetrieverMock + arules := ar.AlertingRules() + storage := testutil.NewStorage(m.testing) + //defer storage.Close() + + engineOpts := promql.EngineOpts{ + Logger: nil, + Reg: nil, + MaxConcurrent: 10, + MaxSamples: 10, + Timeout: 100 * time.Second, + } + + engine := promql.NewEngine(engineOpts) + opts := &rules.ManagerOptions{ + QueryFunc: rules.EngineQueryFunc(engine, storage), + Appendable: storage, + Context: context.Background(), + Logger: log.NewNopLogger(), + } + + var r []rules.Rule + + for _, alertrule := range arules { + r = append(r, alertrule) + } + + recordingExpr, err := promql.ParseExpr(`vector(1)`) + if err != nil { + m.testing.Fatalf("unable to parse alert expression: %s", err) + } + recordingRule := rules.NewRecordingRule("recording-rule-1", recordingExpr, labels.Labels{}) + r = append(r, recordingRule) + + group := rules.NewGroup("grp", "/path/to/file", time.Second, r, false, opts) + return []*rules.Group{group} +} + +func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { + expr1, err := promql.ParseExpr(`absent(test_metric3) != 1`) + if err != nil { + m.testing.Fatalf("unable to parse alert expression: %s", err) + } + expr2, err := promql.ParseExpr(`up == 1`) + if err != nil { + m.testing.Fatalf("Unable to parse alert expression: %s", err) + } + + rule1 := rules.NewAlertingRule( + "test_metric3", + expr1, + time.Second, + labels.Labels{}, + labels.Labels{}, + true, + log.NewNopLogger(), + ) + rule2 := rules.NewAlertingRule( + "test_metric4", + expr2, + time.Second, + labels.Labels{}, + labels.Labels{}, + true, + log.NewNopLogger(), + ) + var r []*rules.AlertingRule + r = append(r, rule1) + r = append(r, rule2) 
+ return r +} + +func TestEndpoints(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 1m + test_metric1{foo="bar"} 0+100x100 + test_metric1{foo="boo"} 1+0x100 + test_metric2{foo="boo"} 1+0x100 + `) + if err != nil { + t.Fatal(err) + } + defer suite.Close() + + if err := suite.Run(); err != nil { + t.Fatal(err) + } + + var algr rulesRetrieverMock + algr.testing = t + algr.AlertingRules() + algr.RuleGroups() + + t.Run("local", func(t *testing.T) { + var algr rulesRetrieverMock + algr.testing = t + algr.AlertingRules() + algr.RuleGroups() + api := NewAPI(nil, algr) + testEndpoints(t, api) + }) +} + +func testEndpoints(t *testing.T, api *API) { + + type test struct { + endpoint qapi.ApiFunc + params map[string]string + query url.Values + response interface{} + errType qapi.ErrorType + } + var tests = []test{ + { + endpoint: api.rules, + response: &RuleDiscovery{ + RuleGroups: []*RuleGroup{ + { + Name: "grp", + File: "/path/to/file", + Interval: 1, + Rules: []rule{ + alertingRule{ + Name: "test_metric3", + Query: "absent(test_metric3) != 1", + Duration: 1, + Labels: labels.Labels{}, + Annotations: labels.Labels{}, + Alerts: []*Alert{}, + Health: "unknown", + Type: "alerting", + }, + alertingRule{ + Name: "test_metric4", + Query: "up == 1", + Duration: 1, + Labels: labels.Labels{}, + Annotations: labels.Labels{}, + Alerts: []*Alert{}, + Health: "unknown", + Type: "alerting", + }, + recordingRule{ + Name: "recording-rule-1", + Query: "vector(1)", + Labels: labels.Labels{}, + Health: "unknown", + Type: "recording", + }, + }, + }, + }, + }, + }, + } + + methods := func(f qapi.ApiFunc) []string { + return []string{http.MethodGet} + } + + request := func(m string, q url.Values) (*http.Request, error) { + return http.NewRequest(m, fmt.Sprintf("http://example.com?%s", q.Encode()), nil) + } + for i, test := range tests { + for _, method := range methods(test.endpoint) { + // Build a context with the correct request params. 
+ ctx := context.Background() + for p, v := range test.params { + ctx = route.WithParam(ctx, p, v) + } + t.Logf("run %d\t%s\t%q", i, method, test.query.Encode()) + + req, err := request(method, test.query) + if err != nil { + t.Fatal(err) + } + endpoint, errors, apiError := test.endpoint(req.WithContext(ctx)) + + if errors != nil { + t.Fatalf("Unexpected errors: %s", errors) + return + } + assertAPIError(t, apiError) + assertAPIResponse(t, endpoint, test.response) + } + } +} + +func assertAPIError(t *testing.T, got *qapi.ApiError) { + if got != nil { + t.Fatalf("Unexpected error: %s", got) + return + } +} + +func assertAPIResponse(t *testing.T, got interface{}, exp interface{}) { + if !reflect.DeepEqual(exp, got) { + respJSON, err := json.Marshal(got) + if err != nil { + t.Fatalf("failed to marshal response as JSON: %v", err.Error()) + } + + expectedRespJSON, err := json.Marshal(exp) + if err != nil { + t.Fatalf("failed to marshal expected response as JSON: %v", err.Error()) + } + + t.Fatalf( + "Response does not match, expected:\n%+v\ngot:\n%+v", + string(expectedRespJSON), + string(respJSON), + ) + } +} From b3d7d15b1f3140c3046b859554a56cdc77983c12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Mon, 4 Mar 2019 16:39:26 +0200 Subject: [PATCH 13/18] store/cache: fix broken metric and current index cache size handling (#873) * store/cache: do not forget to increase c.current on adding new items * store/cache: properly adjust c.curSize * store/cache: prevent uint64 overflow by switching operands Adding uint64(len(b)) to c.curSize might potentially overflow uint64 if the numbers are big enough and then we might not remove enough items from the LRU to satisfy the request. On the other hand, switching the operands avoids this problem because we check before if uint64(len(b)) is bigger than c.maxSize so subtracting uint64(len(b)) will *never* overflow because we know that it is less or equal to c.maxSize. * store/cache: revert ensureFits() changes c.curSize is lowered in onEvict. * store/cache: add smoke tests Add smoke tests for the index cache which check if we set curSize properly, and if removal works. 
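To make the overflow argument concrete, here is a standalone sketch (the sizes are hypothetical, chosen only to trigger the wrap-around; this is not code from the patch):

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical sizes near the top of the uint64 range.
	curSize := uint64(math.MaxUint64 - 10)
	maxSize := uint64(math.MaxUint64 - 5)
	b := make([]byte, 100) // item that clearly does not fit

	// Old form: the sum wraps around to a tiny number, so the check
	// wrongly reports that the item fits and nothing is evicted.
	fmt.Println(curSize+uint64(len(b)) > maxSize) // false

	// New form: len(b) <= maxSize was already verified, so the subtraction
	// cannot underflow and the comparison stays correct.
	fmt.Println(curSize > maxSize-uint64(len(b))) // true
}
```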
--- pkg/store/cache.go | 6 +++++- pkg/store/cache_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pkg/store/cache.go b/pkg/store/cache.go index 5acd2c104c..58e720790b 100644 --- a/pkg/store/cache.go +++ b/pkg/store/cache.go @@ -127,7 +127,7 @@ func (c *indexCache) ensureFits(b []byte) bool { if uint64(len(b)) > c.maxSize { return false } - for c.curSize+uint64(len(b)) > c.maxSize { + for c.curSize > c.maxSize-uint64(len(b)) { c.lru.RemoveOldest() } return true @@ -151,6 +151,8 @@ func (c *indexCache) setPostings(b ulid.ULID, l labels.Label, v []byte) { c.lru.Add(cacheItem{b, cacheKeyPostings(l)}, cv) c.currentSize.WithLabelValues(cacheTypePostings).Add(float64(len(v))) + c.current.WithLabelValues(cacheTypePostings).Inc() + c.curSize += uint64(len(v)) } func (c *indexCache) postings(b ulid.ULID, l labels.Label) ([]byte, bool) { @@ -185,6 +187,8 @@ func (c *indexCache) setSeries(b ulid.ULID, id uint64, v []byte) { c.lru.Add(cacheItem{b, cacheKeySeries(id)}, cv) c.currentSize.WithLabelValues(cacheTypeSeries).Add(float64(len(v))) + c.current.WithLabelValues(cacheTypeSeries).Inc() + c.curSize += uint64(len(v)) } func (c *indexCache) series(b ulid.ULID, id uint64) ([]byte, bool) { diff --git a/pkg/store/cache_test.go b/pkg/store/cache_test.go index 2846513a0f..0c3d241668 100644 --- a/pkg/store/cache_test.go +++ b/pkg/store/cache_test.go @@ -3,9 +3,13 @@ package store import ( "testing" + "time" + "github.com/fortytw2/leaktest" "github.com/improbable-eng/thanos/pkg/testutil" + "github.com/oklog/ulid" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/tsdb/labels" ) // TestIndexCacheEdge tests the index cache edge cases. @@ -20,3 +24,38 @@ func TestIndexCacheEdge(t *testing.T) { fits = cache.ensureFits([]byte{42}) testutil.Equals(t, fits, true) } + +// TestIndexCacheSmoke runs the smoke tests for the index cache. 
+func TestIndexCacheSmoke(t *testing.T) { + defer leaktest.CheckTimeout(t, 10*time.Second)() + + metrics := prometheus.NewRegistry() + cache, err := newIndexCache(metrics, 20) + testutil.Ok(t, err) + + blid := ulid.ULID([16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) + labels := labels.Label{Name: "test", Value: "123"} + + cache.setPostings(blid, labels, []byte{42}) + + p, ok := cache.postings(blid, labels) + testutil.Equals(t, ok, true) + testutil.Equals(t, p, []byte{42}) + testutil.Equals(t, cache.curSize, uint64(1)) + + cache.setSeries(blid, 1234, []byte{42, 42}) + + s, ok := cache.series(blid, 1234) + testutil.Equals(t, ok, true) + testutil.Equals(t, s, []byte{42, 42}) + testutil.Equals(t, cache.curSize, uint64(3)) + + cache.lru.RemoveOldest() + testutil.Equals(t, cache.curSize, uint64(2)) + + cache.lru.RemoveOldest() + testutil.Equals(t, cache.curSize, uint64(0)) + + cache.lru.RemoveOldest() + testutil.Equals(t, cache.curSize, uint64(0)) +} From 4b7320c0e45e3f48a437bd19294f569785bafb02 Mon Sep 17 00:00:00 2001 From: Ludwik <33523060+improbable-ludwik@users.noreply.github.com> Date: Mon, 4 Mar 2019 14:48:05 +0000 Subject: [PATCH 14/18] release updates (#881) --- CHANGELOG.md | 9 ++++++++- VERSION | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec71a45830..3f8a52ddd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,10 +11,17 @@ We use *breaking* word for marking changes that are not backward compatible (rel ## Unreleased +## [v0.3.2](https://github.com/improbable-eng/thanos/releases/tag/v0.3.2) - 2019.03.04 + +### Added +- [#851](https://github.com/improbable-eng/thanos/pull/851) New read API endpoint for api/v1/rules and api/v1/alerts. +- [#873](https://github.com/improbable-eng/thanos/pull/873) Store: fix set index cache LRU. + ### Fixed - [#833](https://github.com/improbable-eng/thanos/issues/833) Store Gateway matcher regression for intersecting with empty posting. +- [#867](https://github.com/improbable-eng/thanos/pull/867) Fixed race condition in sidecare between reloader and shipper. -## [v0.3.1](https://github.com/improbable-eng/thanos/releases/tag/v0.3.0) - 2019.02.18 +## [v0.3.1](https://github.com/improbable-eng/thanos/releases/tag/v0.3.1) - 2019.02.18 ### Fixed - [#829](https://github.com/improbable-eng/thanos/issues/829) Store Gateway crashing due to `slice bounds out of range`. diff --git a/VERSION b/VERSION index 0cb66a557e..9fc80f937f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.1-master +0.3.2 \ No newline at end of file From b4ee2fa0d8c5833c67c190aec2eac1f1a46c7774 Mon Sep 17 00:00:00 2001 From: Ludwik <33523060+improbable-ludwik@users.noreply.github.com> Date: Mon, 4 Mar 2019 17:38:06 +0000 Subject: [PATCH 15/18] master version switch (#883) --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 9fc80f937f..1e4337edbf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.2 \ No newline at end of file +0.3.2-master \ No newline at end of file From 74a4b3b9c203edbbf22b664c371e5d32b1cb8d6c Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Tue, 5 Mar 2019 17:20:54 +0100 Subject: [PATCH 16/18] Include example Prometheus command line (#884) Make it more clear what the Prometheus command line flags should be for sidecar use. 
--- docs/components/sidecar.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/components/sidecar.md b/docs/components/sidecar.md index 948449a8bc..f5a8d47232 100644 --- a/docs/components/sidecar.md +++ b/docs/components/sidecar.md @@ -13,7 +13,14 @@ Prometheus servers connected to the Thanos cluster via the sidecar are subject t The retention is recommended to not be lower than three times the block duration. This achieves resilience in the face of connectivity issues to the object storage since all local data will remain available within the Thanos cluster. If connectivity gets restored the backlog of blocks gets uploaded to the object storage. +```console +$ prometheus \ + --storage.tsdb.max-block-duration=2h \ + --storage.tsdb.min-block-duration=2h \ + --web.enable-lifecycle ``` + +```console $ thanos sidecar \ --tsdb.path "/path/to/prometheus/data/dir" \ --prometheus.url "http://localhost:9090" \ From 45c8a5f3e10de7b8c9bf6cbbfb01bc695f9be30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Tue, 5 Mar 2019 18:23:50 +0200 Subject: [PATCH 17/18] compact: relax index label checking (#848) It is possible that Prometheus will produce time series with duplicate labels. E.g.: a{b="2",b="2"} 1 Where such metric is pushed to Prometheus either wittingly or unwittingly, the compact process will stall because the check will not pass. For example: {"caller":"main.go:181","err":"error executing compaction: compaction failed: compaction: gather index issues for block /data/compact/0@{monitor="monitor",replica="repl"}/01D34EDQMSQ29RHAC47XGKHGC7: out-of-order label set {**name**="foo",exported_job="vv",host="172_16_226_56",host="172_16_226_56",region="lt",subtask_index="5",task_attempt_id="32e4b047bb768583ff57c709be3b1046",task_attempt_num="8",task_id="688c028a219ff3372f3eecb0ee5811f9",task_name="Source:_foo",tenant="abc",tier="cooltier",tm_id="53b2ed987b08f427dec4ee1465df91fa"} for series 2594231","level":"error","msg":"running command failed","ts":"2019-02-11T13:30:33.901722306Z"} This commit fixes the mentioned issue. Amend the e2e test to test the case when a series has identical labels. 
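A small standalone sketch of the relaxed comparison (illustrative only; the helper below is hypothetical and not part of the patch): with `<=` a duplicated label name is flagged as out of order, while with `<` only genuinely unsorted names are rejected.

```go
package main

import "fmt"

// outOfOrder reports whether the label names are not sorted.
// strict mirrors the old `<=` check, non-strict the new `<` check.
func outOfOrder(names []string, strict bool) bool {
	for i := 1; i < len(names); i++ {
		if strict && names[i] <= names[i-1] {
			return true
		}
		if !strict && names[i] < names[i-1] {
			return true
		}
	}
	return false
}

func main() {
	dup := []string{"__name__", "b", "b"} // duplicate name, as Prometheus may produce
	bad := []string{"__name__", "z", "a"} // genuinely out of order

	fmt.Println(outOfOrder(dup, true), outOfOrder(dup, false)) // true false
	fmt.Println(outOfOrder(bad, true), outOfOrder(bad, false)) // true true
}
```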
--- pkg/block/index.go | 2 +- pkg/compact/compact_e2e_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/block/index.go b/pkg/block/index.go index eb0a3689c0..232bf40d37 100644 --- a/pkg/block/index.go +++ b/pkg/block/index.go @@ -347,7 +347,7 @@ func GatherIndexIssueStats(logger log.Logger, fn string, minTime int64, maxTime } l0 := lset[0] for _, l := range lset[1:] { - if l.Name <= l0.Name { + if l.Name < l0.Name { return stats, errors.Errorf("out-of-order label set %s for series %d", lset, id) } l0 = l diff --git a/pkg/compact/compact_e2e_test.go b/pkg/compact/compact_e2e_test.go index 11213dc047..dc6c7c1fd3 100644 --- a/pkg/compact/compact_e2e_test.go +++ b/pkg/compact/compact_e2e_test.go @@ -180,7 +180,7 @@ func TestGroup_Compact_e2e(t *testing.T) { extLset := labels.Labels{{Name: "e1", Value: "1"}} b1, err := testutil.CreateBlock(prepareDir, []labels.Labels{ {{Name: "a", Value: "1"}}, - {{Name: "a", Value: "2"}}, + {{Name: "a", Value: "2"}, {Name: "a", Value: "2"}}, {{Name: "a", Value: "3"}}, {{Name: "a", Value: "4"}}, }, 100, 0, 1000, extLset, 124) From 181c8ce6ee935699bd385a0865245f595d55fcea Mon Sep 17 00:00:00 2001 From: Martin Dickson Date: Tue, 5 Mar 2019 17:41:53 +0000 Subject: [PATCH 18/18] compact: add concurrency to meta sync (#887) * add concurrency to meta sync * fix test * update docs * address cr * use sentinel error to handle ignoring fresh blocks --- cmd/thanos/compact.go | 7 +- docs/components/compact.md | 3 + pkg/compact/compact.go | 136 ++++++++++++++++++++++---------- pkg/compact/compact_e2e_test.go | 4 +- 4 files changed, 105 insertions(+), 45 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 89a0c2672f..dd462d6b51 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -92,6 +92,9 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application, name stri maxCompactionLevel := cmd.Flag("debug.max-compaction-level", fmt.Sprintf("Maximum compaction level, default is %d: %s", compactions.maxLevel(), compactions.String())). Hidden().Default(strconv.Itoa(compactions.maxLevel())).Int() + blockSyncConcurrency := cmd.Flag("block-sync-concurrency", "Number of goroutines to use when syncing block metadata from object storage."). + Default("20").Int() + m[name] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { return runCompact(g, logger, reg, *httpAddr, @@ -108,6 +111,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application, name stri name, *disableDownsampling, *maxCompactionLevel, + *blockSyncConcurrency, ) } } @@ -126,6 +130,7 @@ func runCompact( component string, disableDownsampling bool, maxCompactionLevel int, + blockSyncConcurrency int, ) error { halted := prometheus.NewGauge(prometheus.GaugeOpts{ Name: "thanos_compactor_halted", @@ -157,7 +162,7 @@ func runCompact( } }() - sy, err := compact.NewSyncer(logger, reg, bkt, syncDelay) + sy, err := compact.NewSyncer(logger, reg, bkt, syncDelay, blockSyncConcurrency) if err != nil { return errors.Wrap(err, "create syncer") } diff --git a/docs/components/compact.md b/docs/components/compact.md index c6b27cabe8..2eb6a99824 100644 --- a/docs/components/compact.md +++ b/docs/components/compact.md @@ -67,5 +67,8 @@ Flags: in bucket. 0d - disables this retention -w, --wait Do not exit after all compactions have been processed and wait for new work. + --block-sync-concurrency=20 + Number of goroutines to use when syncing block + metadata from object storage. 
``` diff --git a/pkg/compact/compact.go b/pkg/compact/compact.go index 29302f2a1c..5eea403d01 100644 --- a/pkg/compact/compact.go +++ b/pkg/compact/compact.go @@ -33,16 +33,20 @@ const ( ResolutionLevel1h = ResolutionLevel(downsample.ResLevel2) ) +var blockTooFreshSentinelError = errors.New("Block too fresh") + // Syncer syncronizes block metas from a bucket into a local directory. // It sorts them into compaction groups based on equal label sets. type Syncer struct { - logger log.Logger - reg prometheus.Registerer - bkt objstore.Bucket - syncDelay time.Duration - mtx sync.Mutex - blocks map[ulid.ULID]*metadata.Meta - metrics *syncerMetrics + logger log.Logger + reg prometheus.Registerer + bkt objstore.Bucket + syncDelay time.Duration + mtx sync.Mutex + blocks map[ulid.ULID]*metadata.Meta + blocksMtx sync.Mutex + blockSyncConcurrency int + metrics *syncerMetrics } type syncerMetrics struct { @@ -124,17 +128,18 @@ func newSyncerMetrics(reg prometheus.Registerer) *syncerMetrics { // NewSyncer returns a new Syncer for the given Bucket and directory. // Blocks must be at least as old as the sync delay for being considered. -func NewSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, syncDelay time.Duration) (*Syncer, error) { +func NewSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, syncDelay time.Duration, blockSyncConcurrency int) (*Syncer, error) { if logger == nil { logger = log.NewNopLogger() } return &Syncer{ - logger: logger, - reg: reg, - syncDelay: syncDelay, - blocks: map[ulid.ULID]*metadata.Meta{}, - bkt: bkt, - metrics: newSyncerMetrics(reg), + logger: logger, + reg: reg, + syncDelay: syncDelay, + blocks: map[ulid.ULID]*metadata.Meta{}, + bkt: bkt, + metrics: newSyncerMetrics(reg), + blockSyncConcurrency: blockSyncConcurrency, }, nil } @@ -157,6 +162,44 @@ func (c *Syncer) SyncMetas(ctx context.Context) error { } func (c *Syncer) syncMetas(ctx context.Context) error { + var wg sync.WaitGroup + defer wg.Wait() + + metaIDsChan := make(chan ulid.ULID) + errChan := make(chan error, c.blockSyncConcurrency) + + workCtx, cancel := context.WithCancel(ctx) + defer cancel() + for i := 0; i < c.blockSyncConcurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + + for id := range metaIDsChan { + // Check if we already have this block cached locally. + c.blocksMtx.Lock() + _, seen := c.blocks[id] + c.blocksMtx.Unlock() + if seen { + continue + } + + meta, err := c.downloadMeta(workCtx, id) + if err == blockTooFreshSentinelError { + continue + } + if err != nil { + errChan <- err + return + } + + c.blocksMtx.Lock() + c.blocks[id] = meta + c.blocksMtx.Unlock() + } + }() + } + // Read back all block metas so we can detect deleted blocks. remote := map[ulid.ULID]struct{}{} @@ -168,42 +211,25 @@ func (c *Syncer) syncMetas(ctx context.Context) error { remote[id] = struct{}{} - // Check if we already have this block cached locally. - if _, ok := c.blocks[id]; ok { - return nil - } - - level.Debug(c.logger).Log("msg", "download meta", "block", id) - - meta, err := block.DownloadMeta(ctx, c.logger, c.bkt, id) - if err != nil { - return errors.Wrapf(err, "downloading meta.json for %s", id) + select { + case <-ctx.Done(): + case metaIDsChan <- id: } - // ULIDs contain a millisecond timestamp. We do not consider blocks that have been created too recently to - // avoid races when a block is only partially uploaded. 
This relates to all blocks, excluding: - // - repair created blocks - // - compactor created blocks - // NOTE: It is not safe to miss "old" block (even that it is newly created) in sync step. Compactor needs to aware of ALL old blocks. - // TODO(bplotka): https://github.com/improbable-eng/thanos/issues/377 - if ulid.Now()-id.Time() < uint64(c.syncDelay/time.Millisecond) && - meta.Thanos.Source != metadata.BucketRepairSource && - meta.Thanos.Source != metadata.CompactorSource && - meta.Thanos.Source != metadata.CompactorRepairSource { - - level.Debug(c.logger).Log("msg", "block is too fresh for now", "block", id) - return nil - } - - remote[id] = struct{}{} - c.blocks[id] = &meta - return nil }) + close(metaIDsChan) if err != nil { return retry(errors.Wrap(err, "retrieve bucket block metas")) } + wg.Wait() + close(errChan) + + if err := <-errChan; err != nil { + return retry(err) + } + // Delete all local block dirs that no longer exist in the bucket. for id := range c.blocks { if _, ok := remote[id]; !ok { @@ -214,6 +240,32 @@ func (c *Syncer) syncMetas(ctx context.Context) error { return nil } +func (c *Syncer) downloadMeta(ctx context.Context, id ulid.ULID) (*metadata.Meta, error) { + level.Debug(c.logger).Log("msg", "download meta", "block", id) + + meta, err := block.DownloadMeta(ctx, c.logger, c.bkt, id) + if err != nil { + return nil, errors.Wrapf(err, "downloading meta.json for %s", id) + } + + // ULIDs contain a millisecond timestamp. We do not consider blocks that have been created too recently to + // avoid races when a block is only partially uploaded. This relates to all blocks, excluding: + // - repair created blocks + // - compactor created blocks + // NOTE: It is not safe to miss "old" block (even that it is newly created) in sync step. Compactor needs to aware of ALL old blocks. + // TODO(bplotka): https://github.com/improbable-eng/thanos/issues/377 + if ulid.Now()-id.Time() < uint64(c.syncDelay/time.Millisecond) && + meta.Thanos.Source != metadata.BucketRepairSource && + meta.Thanos.Source != metadata.CompactorSource && + meta.Thanos.Source != metadata.CompactorRepairSource { + + level.Debug(c.logger).Log("msg", "block is too fresh for now", "block", id) + return nil, blockTooFreshSentinelError + } + + return &meta, nil +} + // GroupKey returns a unique identifier for the group the block belongs to. It considers // the downsampling resolution and the block's labels. func GroupKey(meta metadata.Meta) string { diff --git a/pkg/compact/compact_e2e_test.go b/pkg/compact/compact_e2e_test.go index dc6c7c1fd3..a2073ad633 100644 --- a/pkg/compact/compact_e2e_test.go +++ b/pkg/compact/compact_e2e_test.go @@ -32,7 +32,7 @@ func TestSyncer_SyncMetas_e2e(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) defer cancel() - sy, err := NewSyncer(nil, nil, bkt, 0) + sy, err := NewSyncer(nil, nil, bkt, 0, 1) testutil.Ok(t, err) // Generate 15 blocks. Initially the first 10 are synced into memory and only the last @@ -134,7 +134,7 @@ func TestSyncer_GarbageCollect_e2e(t *testing.T) { } // Do one initial synchronization with the bucket. - sy, err := NewSyncer(nil, nil, bkt, 0) + sy, err := NewSyncer(nil, nil, bkt, 0, 1) testutil.Ok(t, err) testutil.Ok(t, sy.SyncMetas(ctx))