diff --git a/README.gotmpl b/README.gotmpl
index 411d638..90e3a76 100644
--- a/README.gotmpl
+++ b/README.gotmpl
@@ -215,6 +215,73 @@ grafana:
path: "/"
```
+### Prometheus
+
+To access Prometheus, run:
+
+```bash
+kubectl -n coder-observability port-forward svc/prometheus 9090:80
+```
+
+Then open http://localhost:9090/graph in your web browser.
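+
+To quickly confirm the port-forward is working before opening the UI, you can hit Prometheus's standard readiness endpoint:
+
+```bash
+# Exits non-zero if Prometheus is not ready; prints a short readiness message otherwise.
+curl -fsS http://localhost:9090/-/ready
+```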
+
+#### Native Histograms
+
+Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries and instead provides higher-resolution, adaptive buckets (see the [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details).
+
+Unlike classic histograms, which are sent in plain text, **native histograms require the protobuf protocol**.
+Because this chart's Prometheus receives metrics via remote write, enabling the feature on the Prometheus server alone is not enough: the Grafana Agent must also scrape and remote-write using protobuf.
+Native histograms are **disabled by default**; when you enable them globally, the Helm chart updates the Grafana Agent configuration accordingly.
+
+To enable native histograms, define this in your `values.yaml`:
+
+```yaml
+global:
+ telemetry:
+ metrics:
+ nativeHistograms: true
+
+prometheus:
+ server:
+ extraFlags:
+ - web.enable-lifecycle
+ - enable-feature=remote-write-receiver
+ - enable-feature=native-histograms
+```
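+
+For example, rolling the change out with Helm might look like the following; the release name and chart reference are assumptions, so substitute the ones used by your installation:
+
+```bash
+# Hypothetical release name and chart reference; adjust to your deployment.
+helm upgrade coder-observability coder/coder-observability \
+  --namespace coder-observability \
+  --values values.yaml
+```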
+
+After updating the values, you may need to restart the Grafana Agent so that it picks up the new configuration:
+
+```bash
+kubectl -n coder-observability rollout restart daemonset/grafana-agent
+```
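+
+To verify that the restart completed, wait on the rollout:
+
+```bash
+# Blocks until all grafana-agent pods have been replaced and are ready.
+kubectl -n coder-observability rollout status daemonset/grafana-agent
+```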
+
+⚠️ **Important**: Classic and native histograms cannot be aggregated together.
+If you switch from classic to native histograms, dashboard queries may need to account for the transition. See the [Prometheus migration guidelines](https://prometheus.io/docs/specs/native_histograms/#migration-considerations) for details.
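+
+To illustrate what the transition means for queries (a sketch using the `coderd_workspace_creation_duration_seconds` metric referenced below): classic histograms compute quantiles from the per-bucket `_bucket` series, while native histograms are passed to `histogram_quantile` directly:
+
+```bash
+# Classic histogram: aggregate the _bucket series by "le" before taking the quantile.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(coderd_workspace_creation_duration_seconds_bucket[5m])))'
+
+# Native histogram: rate the histogram series and pass it in directly; no "le" label exists.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=histogram_quantile(0.95, sum(rate(coderd_workspace_creation_duration_seconds[5m])))'
+```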
+
+##### Validate Prometheus Native Histograms
+
+1) Check Prometheus flags:
+
+ Open http://localhost:9090/flags and confirm that `--enable-feature` includes `native-histograms`.
+
+2) Inspect histogram metrics:
+
+ * Classic histograms expose metrics with suffixes: `_bucket`, `_sum`, and `_count`.
+ * Native histograms are exposed directly under the metric name.
+ * Example: query `coderd_workspace_creation_duration_seconds` in http://localhost:9090/graph.
+
+3) Check Grafana Agent (if remote write is enabled):
+
+ To confirm, run:
+ ```bash
+ kubectl -n coder-observability port-forward svc/grafana-agent 3030:80
+ ```
+ Then open http://localhost:3030:
+   * scrape configurations (e.g. `prometheus.scrape.cadvisor`) should have `enable_protobuf_negotiation: true`
+   * the remote write configuration defined in `prometheus.remote_write.default` should have `send_native_histograms: true`
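+
+As a final end-to-end check (a sketch that relies on the Prometheus HTTP API's response shape), query the metric and confirm that samples come back under a `histogram` field rather than the scalar `value` field used by classic series:
+
+```bash
+# Counts native-histogram samples in the response; 0 means only classic float samples.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=coderd_workspace_creation_duration_seconds' \
+  | grep -c '"histogram"'
+```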
+
## Subcharts
{{ template "chart.requirementsTable" . }}
diff --git a/README.md b/README.md
index 9a77d7d..1ab492a 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,73 @@ grafana:
path: "/"
```
+### Prometheus
+
+To access Prometheus, run:
+
+```bash
+kubectl -n coder-observability port-forward svc/prometheus 9090:80
+```
+
+Then open http://localhost:9090/graph in your web browser.
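+
+To quickly confirm the port-forward is working before opening the UI, you can hit Prometheus's standard readiness endpoint:
+
+```bash
+# Exits non-zero if Prometheus is not ready; prints a short readiness message otherwise.
+curl -fsS http://localhost:9090/-/ready
+```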
+
+#### Native Histograms
+
+Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries and instead provides higher-resolution, adaptive buckets (see the [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details).
+
+Unlike classic histograms, which are sent in plain text, **native histograms require the protobuf protocol**.
+Because this chart's Prometheus receives metrics via remote write, enabling the feature on the Prometheus server alone is not enough: the Grafana Agent must also scrape and remote-write using protobuf.
+Native histograms are **disabled by default**; when you enable them globally, the Helm chart updates the Grafana Agent configuration accordingly.
+
+To enable native histograms, define this in your `values.yaml`:
+
+```yaml
+global:
+ telemetry:
+ metrics:
+ nativeHistograms: true
+
+prometheus:
+ server:
+ extraFlags:
+ - web.enable-lifecycle
+ - enable-feature=remote-write-receiver
+ - enable-feature=native-histograms
+```
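+
+For example, rolling the change out with Helm might look like the following; the release name and chart reference are assumptions, so substitute the ones used by your installation:
+
+```bash
+# Hypothetical release name and chart reference; adjust to your deployment.
+helm upgrade coder-observability coder/coder-observability \
+  --namespace coder-observability \
+  --values values.yaml
+```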
+
+After updating the values, you may need to restart the Grafana Agent so that it picks up the new configuration:
+
+```bash
+kubectl -n coder-observability rollout restart daemonset/grafana-agent
+```
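+
+To verify that the restart completed, wait on the rollout:
+
+```bash
+# Blocks until all grafana-agent pods have been replaced and are ready.
+kubectl -n coder-observability rollout status daemonset/grafana-agent
+```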
+
+⚠️ **Important**: Classic and native histograms cannot be aggregated together.
+If you switch from classic to native histograms, dashboard queries may need to account for the transition. See the [Prometheus migration guidelines](https://prometheus.io/docs/specs/native_histograms/#migration-considerations) for details.
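+
+To illustrate what the transition means for queries (a sketch using the `coderd_workspace_creation_duration_seconds` metric referenced below): classic histograms compute quantiles from the per-bucket `_bucket` series, while native histograms are passed to `histogram_quantile` directly:
+
+```bash
+# Classic histogram: aggregate the _bucket series by "le" before taking the quantile.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(coderd_workspace_creation_duration_seconds_bucket[5m])))'
+
+# Native histogram: rate the histogram series and pass it in directly; no "le" label exists.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=histogram_quantile(0.95, sum(rate(coderd_workspace_creation_duration_seconds[5m])))'
+```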
+
+##### Validate Prometheus Native Histograms
+
+1) Check Prometheus flags:
+
+ Open http://localhost:9090/flags and confirm that `--enable-feature` includes `native-histograms`.
+
+2) Inspect histogram metrics:
+
+ * Classic histograms expose metrics with suffixes: `_bucket`, `_sum`, and `_count`.
+ * Native histograms are exposed directly under the metric name.
+ * Example: query `coderd_workspace_creation_duration_seconds` in http://localhost:9090/graph.
+
+3) Check Grafana Agent (if remote write is enabled):
+
+ To confirm, run:
+ ```bash
+ kubectl -n coder-observability port-forward svc/grafana-agent 3030:80
+ ```
+ Then open http://localhost:3030:
+   * scrape configurations (e.g. `prometheus.scrape.cadvisor`) should have `enable_protobuf_negotiation: true`
+   * the remote write configuration defined in `prometheus.remote_write.default` should have `send_native_histograms: true`
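+
+As a final end-to-end check (a sketch that relies on the Prometheus HTTP API's response shape), query the metric and confirm that samples come back under a `histogram` field rather than the scalar `value` field used by classic series:
+
+```bash
+# Counts native-histogram samples in the response; 0 means only classic float samples.
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=coderd_workspace_creation_duration_seconds' \
+  | grep -c '"histogram"'
+```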
+
## Subcharts
| Repository | Name | Version |
@@ -261,8 +328,9 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main
| global.externalZone | string | `"svc.cluster.local"` | |
| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":null,"username":"coder","volumeMounts":[],"volumes":[]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts |
| global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres |
-| global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"},"profiling":{"delta_profiling_duration":"30s","scrape_interval":"60s","scrape_timeout":"70s"}}` | control telemetry collection |
-| global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection |
+| global.telemetry | object | `{"metrics":{"nativeHistograms":false,"scrape_interval":"15s","scrape_timeout":"12s"},"profiling":{"delta_profiling_duration":"30s","scrape_interval":"60s","scrape_timeout":"70s"}}` | control telemetry collection |
+| global.telemetry.metrics | object | `{"nativeHistograms":false,"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection |
+| global.telemetry.metrics.nativeHistograms | bool | `false` | enable Prometheus native histograms or default to classic histograms |
| global.telemetry.metrics.scrape_interval | string | `"15s"` | how often the collector will scrape discovered pods |
| global.telemetry.metrics.scrape_timeout | string | `"12s"` | how long a request will be allowed to wait before being canceled |
| global.telemetry.profiling.delta_profiling_duration | string | `"30s"` | duration of each pprof profiling capture, must be less than scrape_interval |
diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock
index e72a227..706f95e 100644
--- a/coder-observability/Chart.lock
+++ b/coder-observability/Chart.lock
@@ -1,7 +1,7 @@
dependencies:
- name: pyroscope
repository: https://grafana.github.io/helm-charts
- version: 1.14.1
+ version: 1.14.2
- name: grafana
repository: https://grafana.github.io/helm-charts
version: 7.3.12
@@ -14,5 +14,5 @@ dependencies:
- name: grafana-agent
repository: https://grafana.github.io/helm-charts
version: 0.37.0
-digest: sha256:5a5f27f74bbf34848da9c1bab508d3b33fda19789016c2eda9608dcd6373921d
-generated: "2025-08-04T13:28:59.433447595-05:00"
+digest: sha256:38b7d46261c4d39a103fbf61eac9da26a997024221ab81078ea5b34fc2b83c68
+generated: "2025-08-27T14:16:57.521541846Z"
diff --git a/coder-observability/templates/_collector-config.tpl b/coder-observability/templates/_collector-config.tpl
index 6a07a89..512e47c 100644
--- a/coder-observability/templates/_collector-config.tpl
+++ b/coder-observability/templates/_collector-config.tpl
@@ -230,6 +230,7 @@ prometheus.scrape "pods" {
scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
+ enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
}
// These are metric_relabel_configs while discovery.relabel are relabel_configs.
@@ -301,6 +302,7 @@ prometheus.scrape "cadvisor" {
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
+ enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
}
prometheus.relabel "cadvisor" {
@@ -346,6 +348,7 @@ prometheus.relabel "cadvisor" {
prometheus.remote_write "default" {
endpoint {
+ send_native_histograms = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
url ="http://{{ include "prometheus.server.fullname" .Subcharts.prometheus }}.{{ .Release.Namespace }}.{{ .Values.global.zone }}/api/v1/write"
// drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned
@@ -396,6 +399,7 @@ prometheus.scrape "coder_metrics" {
forward_to = [prometheus.remote_write.default.receiver]
scrape_interval = "{{ .scrapeInterval }}"
+ enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
}
{{- end }}
{{- end }}
diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml
index 548195f..507b05a 100644
--- a/coder-observability/values.yaml
+++ b/coder-observability/values.yaml
@@ -113,6 +113,8 @@ global:
scrape_interval: 15s
# global.telemetry.metrics.scrape_timeout -- how long a request will be allowed to wait before being canceled
scrape_timeout: 12s
+ # global.telemetry.metrics.nativeHistograms -- enable Prometheus native histograms or default to classic histograms
+ nativeHistograms: false
profiling:
# global.telemetry.profiling.scrape_interval -- how often the collector will scrape pprof endpoints
scrape_interval: 60s
diff --git a/compiled/resources.yaml b/compiled/resources.yaml
index 44b35b9..a73e2c5 100644
--- a/compiled/resources.yaml
+++ b/compiled/resources.yaml
@@ -776,7 +776,7 @@ metadata:
name: collector-config
namespace: coder-observability
data:
- config.river: "\n// Discover k8s nodes\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Discover k8s pods\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n selectors {\n role = \"pod\"\n }\n}\n\ndiscovery.relabel \"pod_logs\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_uid\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n action = \"replace\"\n replacement = \"/var/log/pods/*$1/*.log\"\n target_label = \"__path__\"\n }\n rule {\n action = \"replace\"\n source_labels = [\"__meta_kubernetes_pod_container_id\"]\n regex = \"^(\\\\w+):\\\\/\\\\/.+$\"\n replacement = \"$1\"\n target_label = \"tmp_container_runtime\"\n }\n}\n\ndiscovery.relabel \"pod_metrics\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = 
\"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // drop ports that do not expose Prometheus metrics, but might otherwise be exposed by a container which *also*\n // exposes an HTTP port which exposes metrics\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n regex = \"grpc|http-(memberlist|console)\"\n action = \"drop\"\n }\n // adapted from the Prometheus helm chart\n // https://github.com/prometheus-community/helm-charts/blob/862870fc3c847e32479b509e511584d5283126a3/charts/prometheus/values.yaml#L1070\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n regex = \"(.+)\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\ndiscovery.relabel \"pod_pprof\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // The relabeling allows the actual pod scrape endpoint to be configured via the\n // following annotations:\n //\n // * 
`pyroscope.io/scrape`: Only scrape pods that have a value of `true`.\n // * `pyroscope.io/application-name`: Name of the application being profiled.\n // * `pyroscope.io/scheme`: If the metrics endpoint is secured then you will need\n // to set this to `https` & most likely set the `tls_config` of the scrape config.\n // * `pyroscope.io/port`: Scrape the pod on the indicated port.\n //\n // Kubernetes labels will be added as Pyroscope labels on metrics via the\n // `labelmap` relabeling action.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_application_name\"]\n action = \"replace\"\n target_label = \"__name__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\nlocal.file_match \"pod_logs\" {\n path_targets = discovery.relabel.pod_logs.output\n}\n\nloki.source.file \"pod_logs\" {\n targets = local.file_match.pod_logs.targets\n forward_to = [loki.process.pod_logs.receiver]\n}\n\nloki.process \"pod_logs\" {\n stage.match {\n selector = \"{tmp_container_runtime=\\\"containerd\\\"}\"\n // the cri processing stage extracts the following k/v pairs: log, stream, time, flags\n stage.cri {}\n // Set the extract flags and stream values as labels\n stage.labels {\n values = {\n flags = \"\",\n stream = \"\",\n }\n }\n }\n\n // if the label tmp_container_runtime from above is docker parse using docker\n stage.match {\n selector = \"{tmp_container_runtime=\\\"docker\\\"}\"\n // the docker processing stage extracts the following k/v pairs: log, stream, time\n stage.docker {}\n\n // Set the extract stream value as a label\n stage.labels {\n values = {\n stream = \"\",\n }\n }\n }\n\n // drop the temporary container runtime label as it is no longer needed\n stage.label_drop {\n values = [\"tmp_container_runtime\"]\n }\n\n // parse Coder logs and extract level & logger for efficient filtering\n stage.match {\n selector = \"{pod=~\\\"coder.*\\\"}\" // TODO: make configurable\n\n stage.multiline {\n firstline = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\"\n max_wait_time = \"10s\"\n }\n\n stage.regex {\n expression = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\\\\s\\\\[(?P\\\\w+)\\\\]\\\\s\\\\s(?P[^:]+):\\\\s(?P.+)\"\n }\n\n stage.timestamp {\n source = \"ts\"\n format = \"2006-01-02 15:04:05.000\"\n action_on_failure = \"fudge\" // rather have inaccurate time than drop the log line\n }\n\n stage.labels {\n values = {\n level = \"\",\n logger = \"\",\n }\n }\n }\n\n forward_to = [loki.write.loki.receiver]\n}\n\nloki.write \"loki\" {\n endpoint {\n url = \"http://loki-gateway.coder-observability.svc/loki/api/v1/push\"\n }\n}\n\n\n\nprometheus.scrape \"pods\" {\n targets = discovery.relabel.pod_metrics.output\n forward_to = 
[prometheus.relabel.pods.receiver]\n\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n}\n\n// These are metric_relabel_configs while discovery.relabel are relabel_configs.\n// See https://github.com/grafana/agent/blob/main/internal/converter/internal/prometheusconvert/prometheusconvert.go#L95-L106\nprometheus.relabel \"pods\" {\n forward_to = [prometheus.remote_write.default.receiver]\n\n // Drop kube-state-metrics' labels which clash with ours\n rule {\n source_labels = [\"__name__\", \"container\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"container\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"pod\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"pod\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"namespace\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"namespace\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_container\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"container\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_pod\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"pod\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_namespace\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"namespace\"\n replacement = \"$1\"\n }\n rule {\n regex = \"^(exported_.*|image_.*|container_id|id|uid)$\"\n action = \"labeldrop\"\n }\n}\n\ndiscovery.relabel \"cadvisor\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n replacement = \"/metrics/cadvisor\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"cadvisor\" {\n targets = discovery.relabel.cadvisor.output\n forward_to = [ prometheus.relabel.cadvisor.receiver ]\n scheme = \"https\"\n tls_config {\n insecure_skip_verify = true\n }\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n}\n\nprometheus.relabel \"cadvisor\" {\n forward_to = [ prometheus.remote_write.default.receiver ]\n\n // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"container\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*)@\"\n action = \"drop\"\n }\n // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"image\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@\"\n action = \"drop\"\n }\n // Drop irrelevant series\n rule {\n source_labels = [\"container\"]\n regex = \"^POD$\"\n action = \"drop\"\n }\n // Drop unnecessary labels\n rule {\n source_labels = [\"id\"]\n target_label = \"id\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"job\"]\n target_label = \"job\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"name\"]\n target_label = \"name\"\n replacement = \"\"\n }\n}\n\nprometheus.remote_write \"default\" {\n endpoint {\n url =\"http://prometheus.coder-observability.svc/api/v1/write\"\n\n // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned\n // NOTE: \"__address__\" is mapped to \"instance\", so will contain :\n write_relabel_config {\n regex = \"instance\"\n action = \"labeldrop\"\n }\n }\n}"
+ config.river: "\n// Discover k8s nodes\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Discover k8s pods\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n selectors {\n role = \"pod\"\n }\n}\n\ndiscovery.relabel \"pod_logs\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_uid\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n action = \"replace\"\n replacement = \"/var/log/pods/*$1/*.log\"\n target_label = \"__path__\"\n }\n rule {\n action = \"replace\"\n source_labels = [\"__meta_kubernetes_pod_container_id\"]\n regex = \"^(\\\\w+):\\\\/\\\\/.+$\"\n replacement = \"$1\"\n target_label = \"tmp_container_runtime\"\n }\n}\n\ndiscovery.relabel \"pod_metrics\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = 
\"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // drop ports that do not expose Prometheus metrics, but might otherwise be exposed by a container which *also*\n // exposes an HTTP port which exposes metrics\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n regex = \"grpc|http-(memberlist|console)\"\n action = \"drop\"\n }\n // adapted from the Prometheus helm chart\n // https://github.com/prometheus-community/helm-charts/blob/862870fc3c847e32479b509e511584d5283126a3/charts/prometheus/values.yaml#L1070\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n regex = \"(.+)\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\ndiscovery.relabel \"pod_pprof\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // The relabeling allows the actual pod scrape endpoint to be configured via the\n // following annotations:\n //\n // * 
`pyroscope.io/scrape`: Only scrape pods that have a value of `true`.\n // * `pyroscope.io/application-name`: Name of the application being profiled.\n // * `pyroscope.io/scheme`: If the metrics endpoint is secured then you will need\n // to set this to `https` & most likely set the `tls_config` of the scrape config.\n // * `pyroscope.io/port`: Scrape the pod on the indicated port.\n //\n // Kubernetes labels will be added as Pyroscope labels on metrics via the\n // `labelmap` relabeling action.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_application_name\"]\n action = \"replace\"\n target_label = \"__name__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\nlocal.file_match \"pod_logs\" {\n path_targets = discovery.relabel.pod_logs.output\n}\n\nloki.source.file \"pod_logs\" {\n targets = local.file_match.pod_logs.targets\n forward_to = [loki.process.pod_logs.receiver]\n}\n\nloki.process \"pod_logs\" {\n stage.match {\n selector = \"{tmp_container_runtime=\\\"containerd\\\"}\"\n // the cri processing stage extracts the following k/v pairs: log, stream, time, flags\n stage.cri {}\n // Set the extract flags and stream values as labels\n stage.labels {\n values = {\n flags = \"\",\n stream = \"\",\n }\n }\n }\n\n // if the label tmp_container_runtime from above is docker parse using docker\n stage.match {\n selector = \"{tmp_container_runtime=\\\"docker\\\"}\"\n // the docker processing stage extracts the following k/v pairs: log, stream, time\n stage.docker {}\n\n // Set the extract stream value as a label\n stage.labels {\n values = {\n stream = \"\",\n }\n }\n }\n\n // drop the temporary container runtime label as it is no longer needed\n stage.label_drop {\n values = [\"tmp_container_runtime\"]\n }\n\n // parse Coder logs and extract level & logger for efficient filtering\n stage.match {\n selector = \"{pod=~\\\"coder.*\\\"}\" // TODO: make configurable\n\n stage.multiline {\n firstline = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\"\n max_wait_time = \"10s\"\n }\n\n stage.regex {\n expression = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\\\\s\\\\[(?P\\\\w+)\\\\]\\\\s\\\\s(?P[^:]+):\\\\s(?P.+)\"\n }\n\n stage.timestamp {\n source = \"ts\"\n format = \"2006-01-02 15:04:05.000\"\n action_on_failure = \"fudge\" // rather have inaccurate time than drop the log line\n }\n\n stage.labels {\n values = {\n level = \"\",\n logger = \"\",\n }\n }\n }\n\n forward_to = [loki.write.loki.receiver]\n}\n\nloki.write \"loki\" {\n endpoint {\n url = \"http://loki-gateway.coder-observability.svc/loki/api/v1/push\"\n }\n}\n\n\n\nprometheus.scrape \"pods\" {\n targets = discovery.relabel.pod_metrics.output\n forward_to = 
[prometheus.relabel.pods.receiver]\n\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n enable_protobuf_negotiation = false\n}\n\n// These are metric_relabel_configs while discovery.relabel are relabel_configs.\n// See https://github.com/grafana/agent/blob/main/internal/converter/internal/prometheusconvert/prometheusconvert.go#L95-L106\nprometheus.relabel \"pods\" {\n forward_to = [prometheus.remote_write.default.receiver]\n\n // Drop kube-state-metrics' labels which clash with ours\n rule {\n source_labels = [\"__name__\", \"container\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"container\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"pod\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"pod\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"namespace\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"namespace\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_container\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"container\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_pod\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"pod\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_namespace\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"namespace\"\n replacement = \"$1\"\n }\n rule {\n regex = \"^(exported_.*|image_.*|container_id|id|uid)$\"\n action = \"labeldrop\"\n }\n}\n\ndiscovery.relabel \"cadvisor\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n replacement = \"/metrics/cadvisor\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"cadvisor\" {\n targets = discovery.relabel.cadvisor.output\n forward_to = [ prometheus.relabel.cadvisor.receiver ]\n scheme = \"https\"\n tls_config {\n insecure_skip_verify = true\n }\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n enable_protobuf_negotiation = false\n}\n\nprometheus.relabel \"cadvisor\" {\n forward_to = [ prometheus.remote_write.default.receiver ]\n\n // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"container\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*)@\"\n action = \"drop\"\n }\n // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"image\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@\"\n action = \"drop\"\n }\n // Drop irrelevant series\n rule {\n source_labels = [\"container\"]\n regex = \"^POD$\"\n action = \"drop\"\n }\n // Drop unnecessary labels\n rule {\n source_labels = [\"id\"]\n target_label = \"id\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"job\"]\n target_label = \"job\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"name\"]\n target_label = \"name\"\n replacement = \"\"\n }\n}\n\nprometheus.remote_write \"default\" {\n endpoint {\n send_native_histograms = false\n url =\"http://prometheus.coder-observability.svc/api/v1/write\"\n\n // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned\n // NOTE: \"__address__\" is mapped to \"instance\", so will contain :\n write_relabel_config {\n regex 
= \"instance\"\n action = \"labeldrop\"\n }\n }\n}"
---
# Source: coder-observability/templates/configmap-prometheus-alerts.yaml
apiVersion: v1