diff --git a/README.md b/README.md index 334cd30..a1fe618 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Logs will be scraped from all pods in the Kubernetes cluster. ```bash helm repo add coder-observability https://helm.coder.com/observability -helm upgrade --install coder-observability coder-observability/coder-observability --version 0.4.2 --namespace coder-observability --create-namespace +helm upgrade --install coder-observability coder-observability/coder-observability --version 0.4.3 --namespace coder-observability --create-namespace ``` ## Requirements @@ -288,6 +288,7 @@ If you switch from classic to native histograms, dashboards may need to account | https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ~0.37.0 | | https://grafana.github.io/helm-charts | loki | ~v6.7.3 | | https://grafana.github.io/helm-charts | pyroscope | ~v1.14.1 | +| https://grafana.github.io/helm-charts | tempo | ~v1.23.0 | | https://prometheus-community.github.io/helm-charts | prometheus | ~v25.24.1 | Each subchart can be disabled by setting the `enabled` field to `false`. 
@@ -420,16 +421,24 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | grafana.datasources."datasources.yaml".datasources[2].type | string | `"loki"` | | | grafana.datasources."datasources.yaml".datasources[2].uid | string | `"loki"` | | | grafana.datasources."datasources.yaml".datasources[2].url | string | `"http://loki-gateway.{{ .Release.Namespace }}.{{ $.Values.global.zone }}"` | | +| grafana.datasources."datasources.yaml".datasources[3].access | string | `"proxy"` | | | grafana.datasources."datasources.yaml".datasources[3].editable | bool | `false` | | | grafana.datasources."datasources.yaml".datasources[3].isDefault | bool | `false` | | -| grafana.datasources."datasources.yaml".datasources[3].jsonData.sslmode | string | `"{{ .Values.global.postgres.sslmode }}"` | | -| grafana.datasources."datasources.yaml".datasources[3].name | string | `"postgres"` | | -| grafana.datasources."datasources.yaml".datasources[3].secureJsonData.password | string | `"{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}"` | | +| grafana.datasources."datasources.yaml".datasources[3].name | string | `"traces"` | | | grafana.datasources."datasources.yaml".datasources[3].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | | -| grafana.datasources."datasources.yaml".datasources[3].type | string | `"postgres"` | | -| grafana.datasources."datasources.yaml".datasources[3].uid | string | `"postgres"` | | -| grafana.datasources."datasources.yaml".datasources[3].url | string | `"{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}"` | | -| grafana.datasources."datasources.yaml".datasources[3].user | string | `"{{ .Values.global.postgres.username }}"` | | +| grafana.datasources."datasources.yaml".datasources[3].type | string | `"tempo"` | | +| grafana.datasources."datasources.yaml".datasources[3].uid | string | `"tempo"` | | +| 
grafana.datasources."datasources.yaml".datasources[3].url | string | `"http://tempo.{{ .Release.Namespace }}.{{ $.Values.global.zone }}:3200"` | | +| grafana.datasources."datasources.yaml".datasources[4].editable | bool | `false` | | +| grafana.datasources."datasources.yaml".datasources[4].isDefault | bool | `false` | | +| grafana.datasources."datasources.yaml".datasources[4].jsonData.sslmode | string | `"{{ .Values.global.postgres.sslmode }}"` | | +| grafana.datasources."datasources.yaml".datasources[4].name | string | `"postgres"` | | +| grafana.datasources."datasources.yaml".datasources[4].secureJsonData.password | string | `"{{ if .Values.global.postgres.password }}{{ .Values.global.postgres.password }}{{ else }}$PGPASSWORD{{ end }}"` | | +| grafana.datasources."datasources.yaml".datasources[4].timeout | string | `"{{ add $.Values.global.dashboards.queryTimeout 5 }}"` | | +| grafana.datasources."datasources.yaml".datasources[4].type | string | `"postgres"` | | +| grafana.datasources."datasources.yaml".datasources[4].uid | string | `"postgres"` | | +| grafana.datasources."datasources.yaml".datasources[4].url | string | `"{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}"` | | +| grafana.datasources."datasources.yaml".datasources[4].user | string | `"{{ .Values.global.postgres.username }}"` | | | grafana.deploymentStrategy.type | string | `"Recreate"` | | | grafana.enabled | bool | `true` | | | grafana.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION | bool | `true` | | @@ -576,4 +585,13 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | runbookViewer.image | string | `"dannyben/madness"` | | | sqlExporter.enabled | bool | `true` | | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | +| tempo.enabled | bool | `false` | | +| tempo.fullnameOverride | string | `"tempo"` | | +| tempo.nameOverride | string | `"tempo"` | | +| tempo.persistence.enabled | bool | `true` | | +| 
tempo.persistence.size | string | `"10Gi"` | | +| tempo.podAnnotations."prometheus.io/scrape" | string | `"true"` | | +| tempo.replicas | int | `1` | | +| tempo.tempo.reportingEnabled | bool | `false` | | +| tempo.tempo.retention | string | `"336h"` | | diff --git a/TESTING_KIND.md b/TESTING_KIND.md index 6a3e1f3..e1c9786 100644 --- a/TESTING_KIND.md +++ b/TESTING_KIND.md @@ -10,7 +10,7 @@ To test the observability chart locally without a kubernetes cluster, you can use [`kind` (Kubernetes in Docker)](https://kind.sigs.k8s.io/). This allows you to create a local Kubernetes cluster that can be used for testing purposes. ```bash -kind create cluster --name observability +kind create cluster --name observability --image kindest/node:v1.24.0 # To clean everything up, you can delete the cluster with: # kind delete cluster --name observability ``` @@ -46,6 +46,11 @@ coder: - name: CODER_ACCESS_URL # Keep this an empty string to get a public `try` url value: "" + # Enable tracing in observability chart (tempo.enabled=true) + - name: CODER_TRACE_ENABLE + value: "true" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://tempo.coder-observability.svc.cluster.local:4317" ``` ### Verify `coder` installation @@ -87,7 +92,20 @@ Install the local observability chart using Helm into its own namespace. kubectl config set-context --current --namespace=coder-observability # This will install the local observability chart into the `coder-observability` namespace -helm install --namespace coder-observability --create-namespace observe . +cd coder-observability +helm upgrade --install coder-observability . 
--namespace coder-observability --create-namespace +``` + +You may need to remove the control-plane taint to get pods to schedule if your node reports a taint like this: + +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints +NAME TAINTS +observability-control-plane [map[effect:NoSchedule key:node-role.kubernetes.io/control-plane]] +``` + +```bash +kubectl taint nodes observability-control-plane node-role.kubernetes.io/control-plane:NoSchedule- ``` ## To update diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock index 706f95e..4f6e35c 100644 --- a/coder-observability/Chart.lock +++ b/coder-observability/Chart.lock @@ -11,8 +11,11 @@ dependencies: - name: loki repository: https://grafana.github.io/helm-charts version: 6.7.4 +- name: tempo + repository: https://grafana.github.io/helm-charts + version: 1.23.3 - name: grafana-agent repository: https://grafana.github.io/helm-charts version: 0.37.0 -digest: sha256:38b7d46261c4d39a103fbf61eac9da26a997024221ab81078ea5b34fc2b83c68 -generated: "2025-08-27T14:16:57.521541846Z" +digest: sha256:537af466b8fca7d1ef78c62cfeae304e8e76e25b80e52f317da3ac3f411ba1f6 +generated: "2025-10-13T14:43:48.263512398+02:00" diff --git a/coder-observability/Chart.yaml b/coder-observability/Chart.yaml index 2e958b8..c464d17 100644 --- a/coder-observability/Chart.yaml +++ b/coder-observability/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: coder-observability description: Gain insights into your Coder deployment type: application -version: 0.4.2 +version: 0.4.3 dependencies: - name: pyroscope condition: pyroscope.enabled @@ -20,6 +20,10 @@ dependencies: condition: loki.enabled repository: https://grafana.github.io/helm-charts version: '~v6.7.3' + - name: tempo + condition: tempo.enabled + repository: https://grafana.github.io/helm-charts + version: '~v1.23.0' - name: grafana-agent alias: grafana-agent condition: grafana-agent.enabled diff --git a/coder-observability/templates/_collector-config.tpl 
b/coder-observability/templates/_collector-config.tpl index a0b7737..55700a3 100644 --- a/coder-observability/templates/_collector-config.tpl +++ b/coder-observability/templates/_collector-config.tpl @@ -377,6 +377,9 @@ otelcol.receiver.otlp "otlp_receiver" { output { metrics = [otelcol.processor.batch.default.input] logs = [otelcol.processor.batch.default.input] +{{- if .Values.tempo.enabled }} + traces = [otelcol.processor.batch.default.input] +{{- end }} } } otelcol.exporter.prometheus "to_prometheus" { @@ -391,11 +394,24 @@ otelcol.exporter.loki "to_loki" { ] } {{- end }} +{{- if .Values.tempo.enabled }} +otelcol.exporter.otlp "to_tempo" { + client { + endpoint = "http://tempo.{{ .Release.Namespace }}.{{ .Values.global.zone }}:4317" + tls { + insecure = true + } + } +} +{{- end }} otelcol.processor.batch "default" { output { metrics = [otelcol.exporter.prometheus.to_prometheus.input] {{- if .Values.loki.enabled }} logs = [otelcol.exporter.loki.to_loki.input] +{{- end }} +{{- if .Values.tempo.enabled }} + traces = [otelcol.exporter.otlp.to_tempo.input] {{- end }} } } diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index 8f44674..7982aed 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -385,6 +385,15 @@ grafana: # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}' uid: loki + - name: traces + type: tempo + url: http://tempo.{{ .Release.Namespace }}.{{ $.Values.global.zone }}:3200 + access: proxy + isDefault: false + editable: false + # add 5s on global timeout to distinguish between Grafana timeout & datasource timeout + timeout: '{{ add $.Values.global.dashboards.queryTimeout 5 }}' + uid: tempo - name: postgres type: postgres url: '{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}' @@ -664,3 +673,20 @@ loki: mountPath: /var/loki-ruler-wal extraArgs: - -log.level=debug + 
+tempo: + enabled: false + nameOverride: tempo + fullnameOverride: tempo + + tempo: + retention: 336h # 14 days + reportingEnabled: false + + persistence: + enabled: true + size: 10Gi + + replicas: 1 + podAnnotations: + prometheus.io/scrape: "true" diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 5a67083..23e39a9 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -229,6 +229,14 @@ data: type: loki uid: loki url: http://loki-gateway.coder-observability.svc + - access: proxy + editable: false + isDefault: false + name: traces + timeout: '905' + type: tempo + uid: tempo + url: http://tempo.coder-observability.svc:3200 - editable: false isDefault: false jsonData: @@ -11686,7 +11694,7 @@ spec: app.kubernetes.io/name: grafana app.kubernetes.io/instance: coder-observability annotations: - checksum/config: 8143dd78a48b90e972a84c3078e9812a2536a4efff7ca13f404626d5eae08ab2 + checksum/config: 20c285f755dee545f371c9b074076736f3f793c8901c979afa0e244702d80f10 checksum/dashboards-json-config: 010b57348b6dd1f09007330c03d22a0570022534712646511cad39a9e3cb4bb7 checksum/sc-dashboard-provider-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b kubectl.kubernetes.io/default-container: grafana