diff --git a/README.md b/README.md index 334cd30..f92db00 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Logs will be scraped from all pods in the Kubernetes cluster. ```bash helm repo add coder-observability https://helm.coder.com/observability -helm upgrade --install coder-observability coder-observability/coder-observability --version 0.4.2 --namespace coder-observability --create-namespace +helm upgrade --install coder-observability coder-observability/coder-observability --version 0.4.3 --namespace coder-observability --create-namespace ``` ## Requirements @@ -310,6 +310,8 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | Key | Type | Default | Description | |-----|------|---------|-------------| +| global.alerts.enabled | bool | `true` | enable or disable alerting | +| global.alerts.kind | string | `"configmap"` | the container resource kind in which alerts should be created valid values are "prometheusrule" or "configmap" | | global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"UnprovisionedPrebuiltWorkspaces":{"delay":"10m","enabled":true,"thresholds":{"warn":1}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | | 
global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. ensure this uses backticks for quotes! | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | @@ -318,7 +320,8 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | global.coder.provisionerdSelector | string | `"pod=~`coder-provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. https://coder.com/docs/v2/latest/admin/provisioners TODO: rename container label in provisioner helm chart to be "provisioner" not "coder" ensure this uses backticks for quotes! | | global.coder.scrapeMetrics | string | `nil` | use this to scrape metrics from a standalone (set of) coder deployment(s) if using kubernetes, rather add an annotation "prometheus.io/scrape=true" and coder will get automatically scraped; set this value to null and configure coderdSelector to target your coder pods | | global.coder.workspacesSelector | string | `"namespace=`coder-workspaces`"` | the namespace into which any external provisioners have been deployed. 
| -| global.dashboards | object | `{"queryTimeout":900,"refresh":"30s","timerange":"12h"}` | settings for bundled dashboards | +| global.dashboards.configmapLabels | string | `nil` | labels to apply to configmaps created for dashboards | +| global.dashboards.enabled | bool | `true` | enable or disable the creation of configmaps for dashboards | | global.dashboards.queryTimeout | int | `900` | how long until a query in Grafana will timeout after | | global.dashboards.refresh | string | `"30s"` | how often dashboards should refresh | | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | @@ -357,7 +360,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | grafana."grafana.ini"."auth.anonymous".org_name | string | `"Main Org."` | | | grafana."grafana.ini"."auth.anonymous".org_role | string | `"Admin"` | | | grafana."grafana.ini".analytics.reporting_enabled | bool | `false` | | -| grafana."grafana.ini".dashboards.default_home_dashboard_path | string | `"/var/lib/grafana/dashboards/coder/0/status.json"` | | +| grafana."grafana.ini".dashboards.default_home_dashboard_path | string | `"/var/lib/grafana/dashboards/coder/0/coder-status.json"` | | | grafana."grafana.ini".dataproxy.timeout | string | `"{{ $.Values.global.dashboards.queryTimeout }}"` | | | grafana."grafana.ini".feature_toggles.autoMigrateOldPanels | bool | `true` | | | grafana."grafana.ini".users.allow_sign_up | bool | `false` | | @@ -433,29 +436,29 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | grafana.deploymentStrategy.type | string | `"Recreate"` | | | grafana.enabled | bool | `true` | | | grafana.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION | bool | `true` | | -| grafana.extraConfigmapMounts[0].configMap | string | `"dashboards-status"` | | +| grafana.extraConfigmapMounts[0].configMap | string | `"coder-dashboard-status"` | | | grafana.extraConfigmapMounts[0].mountPath | string | 
`"/var/lib/grafana/dashboards/coder/0"` | | -| grafana.extraConfigmapMounts[0].name | string | `"dashboards-status"` | | +| grafana.extraConfigmapMounts[0].name | string | `"coder-dashboard-status"` | | | grafana.extraConfigmapMounts[0].readOnly | bool | `false` | | -| grafana.extraConfigmapMounts[1].configMap | string | `"dashboards-coderd"` | | +| grafana.extraConfigmapMounts[1].configMap | string | `"coder-dashboard-coderd"` | | | grafana.extraConfigmapMounts[1].mountPath | string | `"/var/lib/grafana/dashboards/coder/1"` | | -| grafana.extraConfigmapMounts[1].name | string | `"dashboards-coderd"` | | +| grafana.extraConfigmapMounts[1].name | string | `"coder-dashboard-coderd"` | | | grafana.extraConfigmapMounts[1].readOnly | bool | `false` | | -| grafana.extraConfigmapMounts[2].configMap | string | `"dashboards-provisionerd"` | | +| grafana.extraConfigmapMounts[2].configMap | string | `"coder-dashboard-provisionerd"` | | | grafana.extraConfigmapMounts[2].mountPath | string | `"/var/lib/grafana/dashboards/coder/2"` | | -| grafana.extraConfigmapMounts[2].name | string | `"dashboards-provisionerd"` | | +| grafana.extraConfigmapMounts[2].name | string | `"coder-dashboard-provisionerd"` | | | grafana.extraConfigmapMounts[2].readOnly | bool | `false` | | -| grafana.extraConfigmapMounts[3].configMap | string | `"dashboards-workspaces"` | | +| grafana.extraConfigmapMounts[3].configMap | string | `"coder-dashboard-workspaces"` | | | grafana.extraConfigmapMounts[3].mountPath | string | `"/var/lib/grafana/dashboards/coder/3"` | | -| grafana.extraConfigmapMounts[3].name | string | `"dashboards-workspaces"` | | +| grafana.extraConfigmapMounts[3].name | string | `"coder-dashboard-workspaces"` | | | grafana.extraConfigmapMounts[3].readOnly | bool | `false` | | -| grafana.extraConfigmapMounts[4].configMap | string | `"dashboards-workspace-detail"` | | +| grafana.extraConfigmapMounts[4].configMap | string | `"coder-dashboard-workspace-detail"` | | | 
grafana.extraConfigmapMounts[4].mountPath | string | `"/var/lib/grafana/dashboards/coder/4"` | | -| grafana.extraConfigmapMounts[4].name | string | `"dashboards-workspace-detail"` | | +| grafana.extraConfigmapMounts[4].name | string | `"coder-dashboard-workspace-detail"` | | | grafana.extraConfigmapMounts[4].readOnly | bool | `false` | | -| grafana.extraConfigmapMounts[5].configMap | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].configMap | string | `"coder-dashboard-prebuilds"` | | | grafana.extraConfigmapMounts[5].mountPath | string | `"/var/lib/grafana/dashboards/coder/5"` | | -| grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].name | string | `"coder-dashboard-prebuilds"` | | | grafana.extraConfigmapMounts[5].readOnly | bool | `false` | | | grafana.fullnameOverride | string | `"grafana"` | | | grafana.image.tag | string | `"10.4.19"` | | @@ -533,9 +536,10 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | prometheus.configmapReload.prometheus.containerPort | int | `9091` | | | prometheus.configmapReload.prometheus.extraArgs.log-level | string | `"all"` | | | prometheus.configmapReload.prometheus.extraArgs.watch-interval | string | `"15s"` | | -| prometheus.configmapReload.prometheus.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | | +| prometheus.configmapReload.prometheus.extraConfigmapMounts[0].configMap | string | `"coder-metrics-alerts"` | | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].name | string | `"alerts"` | | +| prometheus.configmapReload.prometheus.extraConfigmapMounts[0].optional | bool | `true` | | | prometheus.configmapReload.prometheus.extraConfigmapMounts[0].readonly | bool | `true` | | | prometheus.enabled | bool | `true` | | | prometheus.kube-state-metrics.enabled | bool | 
`true` | | @@ -546,9 +550,10 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | prometheus.prometheus-node-exporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | | | prometheus.prometheus-pushgateway.enabled | bool | `false` | | | prometheus.server.extraArgs."log.level" | string | `"debug"` | | -| prometheus.server.extraConfigmapMounts[0].configMap | string | `"metrics-alerts"` | | +| prometheus.server.extraConfigmapMounts[0].configMap | string | `"coder-metrics-alerts"` | | | prometheus.server.extraConfigmapMounts[0].mountPath | string | `"/etc/config/alerts"` | | | prometheus.server.extraConfigmapMounts[0].name | string | `"alerts"` | | +| prometheus.server.extraConfigmapMounts[0].optional | bool | `true` | | | prometheus.server.extraConfigmapMounts[0].readonly | bool | `true` | | | prometheus.server.extraFlags[0] | string | `"web.enable-lifecycle"` | | | prometheus.server.extraFlags[1] | string | `"enable-feature=remote-write-receiver"` | | @@ -573,6 +578,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | pyroscope.pyroscope.replicaCount | int | `1` | | | pyroscope.pyroscope.service.port | int | `4040` | | | pyroscope.pyroscope.service.type | string | `"ClusterIP"` | | +| runbookViewer.enabled | bool | `true` | enable or disable the runbook viewer | | runbookViewer.image | string | `"dannyben/madness"` | | | sqlExporter.enabled | bool | `true` | | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | diff --git a/coder-observability/Chart.yaml b/coder-observability/Chart.yaml index 2e958b8..0f9da8d 100644 --- a/coder-observability/Chart.yaml +++ b/coder-observability/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: coder-observability description: Gain insights into your Coder deployment type: application -version: 0.4.2 +version: 0.4.3 dependencies: - name: pyroscope condition: pyroscope.enabled diff --git a/coder-observability/templates/_helpers.tpl 
b/coder-observability/templates/_helpers.tpl index 0d8578d..b98f876 100644 --- a/coder-observability/templates/_helpers.tpl +++ b/coder-observability/templates/_helpers.tpl @@ -99,9 +99,13 @@ envFrom: {{/* Build a runbook URL */}} {{- define "runbook-url" -}} {{ $outer := . }} +{{- if .Values.runbookViewer.enabled -}} {{- with .Values.global -}} {{- .externalScheme }}://runbook-viewer.{{ $outer.Release.Namespace }}.{{ .externalZone }}/{{- $outer.service }}#{{- $outer.alert | lower }} {{- end }} +{{- else -}} +https://github.com/coder/observability/blob/main/coder-observability/runbooks/{{- $outer.service }}.md#{{- $outer.alert | lower }} +{{- end }} {{- end }} {{- define "coderd-selector" -}} {{- printf "%s, namespace=`%s`" .Values.global.coder.coderdSelector .Values.global.coder.controlPlaneNamespace -}} {{- end }} @@ -121,4 +125,4 @@ envFrom: {{- define "grafana-agent-job" -}} {{- printf "%s/%s/%s" .Release.Namespace (index .Values "grafana-agent").fullnameOverride "grafana-agent" -}} {{- end }} {{- define "dashboard-range" -}} {{ .Values.global.dashboards.timerange }} {{- end }} -{{- define "dashboard-refresh" -}} {{ .Values.global.dashboards.refresh }} {{- end }} \ No newline at end of file +{{- define "dashboard-refresh" -}} {{ .Values.global.dashboards.refresh }} {{- end }} diff --git a/coder-observability/templates/_prometheus-alerts.tpl b/coder-observability/templates/_prometheus-alerts.tpl new file mode 100644 index 0000000..7f0ce39 --- /dev/null +++ b/coder-observability/templates/_prometheus-alerts.tpl @@ -0,0 +1,253 @@ +{{/* Prometheus rule groups for coderd alerts: one rule per severity/threshold pair of every enabled group. Expects the chart root context. */}} +{{- define "coderd-prometheus-alerts" -}} + {{- $service := dict "service" "coderd" -}} + {{- with .Values.global.coder.alerts.coderd }} + {{- with .groups.CPU }} + {{- $group := . 
}} + {{- if .enabled }} + - name: CPU Usage + rules: + {{ $alert := "CoderdCPUUsage" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.Memory }} + {{- $group := . }} + {{- if .enabled }} + - name: Memory Usage + rules: + {{ $alert := "CoderdMemoryUsage" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.Restarts }} + {{- $group := . 
}} + {{- if .enabled }} + - name: Pod Restarts + rules: + {{ $alert := "CoderdRestarts" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.Replicas }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Replicas + rules: + {{ $alert := "CoderdReplicas" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: sum(up{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: Number of alive coderd replicas is below the threshold = {{ $threshold -}}. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.WorkspaceBuildFailures }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Workspace Build Failures + rules: + {{ $alert := "CoderdWorkspaceBuildFailures" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: sum(increase(coderd_workspace_builds_total{ {{- include "coderd-selector" $ -}} , status="failed" }[{{- $group.period -}}])) > {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: Workspace builds have failed multiple times in the last {{ $group.period -}}, which may indicate a broken Coder template. 
+ labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.IneligiblePrebuilds }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Ineligible Prebuilds + rules: + {{ $alert := "CoderdIneligiblePrebuilds" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0 + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. + This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.UnprovisionedPrebuiltWorkspaces }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Unprovisioned Prebuilt Workspaces + rules: + {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0 + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. 
+ labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- end }} +{{- end }} + +{{/* Prometheus rule groups for provisionerd alerts. NOTE(review): runbook links are built with service "coderd", as in the previous inline template - confirm this is intended. */}} +{{- define "provisionerd-prometheus-alerts" -}} + {{- $service := dict "service" "coderd" -}} + {{- with .Values.global.coder.alerts.provisionerd }} + {{- with .groups.Replicas }} + {{- $group := . }} + {{- if .enabled }} + - name: Provisionerd Replicas + rules: + {{ $alert := "ProvisionerdReplicas" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: sum(coderd_provisionerd_num_daemons{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: Number of alive provisionerd replicas is below the threshold = {{ $threshold -}}. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- end }} +{{- end }} + +{{/* Prometheus rule groups for enterprise alerts (licence seat usage). Expects the chart root context. */}} +{{- define "enterprise-prometheus-alerts" -}} + {{- $service := dict "service" "enterprise" -}} + + {{- with .Values.global.coder.alerts.enterprise }} + {{- with .groups.Licences }} + {{- $group := . }} + {{- if .enabled }} + - name: Licences + rules: + {{ $alert := "CoderLicenseSeats" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}' + for: {{ $group.delay }} + annotations: + summary: Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity. 
+ labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} + +{{/* Prometheus rule groups for postgres alerts (Notifications, Liveness, Connections), read from .Values.global.postgres.alerts.groups. Expects the chart root context. */}} +{{- define "postgres-prometheus-alerts" -}} + {{- $service := dict "service" "postgres" -}} + {{- with .Values.global.postgres }} + {{- with .alerts.groups.Notifications }} + {{- $group := . -}} + {{- if .enabled }} + - name: Notifications + rules: + {{ $alert := "PostgresNotificationQueueFillingUp" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: The postgres instance {{ `{{ $labels.instance }}` }} has a notification that is filling up, which may impact application performance. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end -}} + {{- end -}} + {{- with .alerts.groups.Basic }} + {{ $group := . -}} + {{- if .enabled }} + - name: Liveness + rules: + {{ $alert := "PostgresDown" }} + - alert: {{ $alert }} + expr: pg_up == 0 + for: {{ $group.delay }} + annotations: + summary: The postgres instance {{ `{{ $labels.instance }}` }} is down! + labels: + severity: critical + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{ end }} + {{- with .alerts.groups.Connections }} + {{ $group := . 
-}} + {{- if .enabled }} + - name: Connections + rules: + {{ $alert := "PostgresConnectionsRunningLow" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * {{ $threshold }}) + for: {{ $group.delay }} + annotations: + summary: The postgres instance {{ `{{ $labels.instance }}` }} is running low on connections which may impact application performance. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end -}} + {{- end -}} + {{ end }} +{{- end }} diff --git a/coder-observability/templates/configmap-collector.yaml b/coder-observability/templates/configmap-collector.yaml index 919b089..d5135ae 100644 --- a/coder-observability/templates/configmap-collector.yaml +++ b/coder-observability/templates/configmap-collector.yaml @@ -1,3 +1,4 @@ +{{- if (index .Values "grafana-agent").enabled }} --- kind: ConfigMap apiVersion: v1 @@ -5,4 +6,5 @@ metadata: name: {{ (index .Values "grafana-agent").agent.configMap.name }} namespace: {{ .Release.Namespace }} data: - config.river: |- {{- include "collector-config" . | trim | nindent 4 }} \ No newline at end of file + config.river: |- {{- include "collector-config" . 
| trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index bf9bcc4..a84c0cc 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -1,256 +1,21 @@ +{{/* Alert rule files as a ConfigMap; rendered only when alerts are enabled and global.alerts.kind is "configmap". The name must match the configMap referenced by the prometheus extraConfigmapMounts (coder-metrics-alerts); those mounts are optional, so a mismatch would not fail the pod. */}} +{{- if and .Values.global.alerts.enabled (eq .Values.global.alerts.kind "configmap") -}} apiVersion: v1 kind: ConfigMap metadata: - name: metrics-alerts + name: coder-metrics-alerts namespace: {{ .Release.Namespace }} data: - {{- $service := dict "service" "coderd" -}} - - {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}} coderd.yaml: |- groups: - {{- with .groups.CPU }} - {{- $group := . }} - {{- if .enabled }} - - name: CPU Usage - rules: - {{ $alert := "CoderdCPUUsage" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.Memory }} - {{- $group := . 
}} - {{- if .enabled }} - - name: Memory Usage - rules: - {{ $alert := "CoderdMemoryUsage" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.Restarts }} - {{- $group := . }} - {{- if .enabled }} - - name: Pod Restarts - rules: - {{ $alert := "CoderdRestarts" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.Replicas }} - {{- $group := . }} - {{- if .enabled }} - - name: Coderd Replicas - rules: - {{ $alert := "CoderdReplicas" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: sum(up{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: Number of alive coderd replicas is below the threshold = {{ $threshold -}}. 
- labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.WorkspaceBuildFailures }} - {{- $group := . }} - {{- if .enabled }} - - name: Coderd Workspace Build Failures - rules: - {{ $alert := "CoderdWorkspaceBuildFailures" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: sum(increase(coderd_workspace_builds_total{ {{- include "coderd-selector" $ -}} , status="failed" }[{{- $group.period -}}])) > {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: Workspace builds have failed multiple times in the last {{ $group.period -}}, which may indicate a broken Coder template. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.IneligiblePrebuilds }} - {{- $group := . }} - {{- if .enabled }} - - name: Coderd Ineligible Prebuilds - rules: - {{ $alert := "CoderdIneligiblePrebuilds" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0 - for: {{ $group.delay }} - annotations: - summary: > - {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. - This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- with .groups.UnprovisionedPrebuiltWorkspaces }} - {{- $group := . 
}} - {{- if .enabled }} - - name: Coderd Unprovisioned Prebuilt Workspaces - rules: - {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0 - for: {{ $group.delay }} - annotations: - summary: > - {{ `{{ $value }}` }} prebuilt workspace(s) not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- end }} {{/* end-section */}} - - - {{- with .Values.global.coder.alerts.provisionerd }} {{/* start-section */}} + {{- include "coderd-prometheus-alerts" . }} provisionerd.yaml: |- groups: - {{- with .groups.Replicas }} - {{- $group := . }} - {{- if .enabled }} - - name: Provisionerd Replicas - rules: - {{ $alert := "ProvisionerdReplicas" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: sum(coderd_provisionerd_num_daemons{ {{- include "coderd-selector" $ -}} }) < {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: Number of alive provisionerd replicas is below the threshold = {{ $threshold -}}. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - - {{- end }} {{/* end-section */}} - - - {{- $service = dict "service" "enterprise" -}} - - {{- with .Values.global.coder.alerts.enterprise }} {{/* start-section */}} + {{- include "provisionerd-prometheus-alerts" . }} enterprise.yaml: |- groups: - {{- with .groups.Licences }} - {{- $group := . 
}} - {{- if .enabled }} - - name: Licences - rules: - {{ $alert := "CoderLicenseSeats" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}' - for: {{ $group.delay }} - annotations: - summary: Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end }} - {{- end }} - {{- end }} {{/* end-section */}} - - {{- $service = dict "service" "postgres" -}} - {{- with .Values.global.postgres }} + {{- include "enterprise-prometheus-alerts" . }} postgres.yaml: |- groups: - {{- with .alerts.groups.Notifications }} - {{- $group := . -}} - {{- if .enabled }} - - name: Notifications - rules: - {{ $alert := "PostgresNotificationQueueFillingUp" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }} - for: {{ $group.delay }} - annotations: - summary: The postgres instance {{ `{{ $labels.instance }}` }} has a notification that is filling up, which may impact application performance. - labels: - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end -}} - {{- end -}} - {{- with .alerts.groups.Basic }} - {{ $group := . -}} - {{- if .enabled }} - - name: Liveness - rules: - {{ $alert := "PostgresDown" }} - - alert: {{ $alert }} - expr: pg_up == 0 - for: {{ $group.delay }} - annotations: - summary: The postgres instance {{ `{{ $labels.instance }}` }} is down! - labels: - severity: critical - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{ end }} - {{- with .alerts.groups.Connections }} - {{ $group := . 
-}} - {{- if .enabled }} - - name: Connections - rules: - {{ $alert := "PostgresConnectionsRunningLow" }} - {{- range $severity, $threshold := .thresholds }} - - alert: {{ $alert }} - expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * {{ $threshold }}) - for: {{ $group.delay }} - labels: - summary: The postgres instance {{ `{{ $labels.instance }}` }} is running low on connections which may impact application performance. - severity: {{ $severity }} - runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} - {{- end }} - {{- end -}} - {{- end -}} - {{ end }} + {{- include "postgres-prometheus-alerts" . }} +{{- end }} diff --git a/coder-observability/templates/configmap-runbooks.yaml b/coder-observability/templates/configmap-runbooks.yaml index 80eb085..677fddd 100644 --- a/coder-observability/templates/configmap-runbooks.yaml +++ b/coder-observability/templates/configmap-runbooks.yaml @@ -1,3 +1,4 @@ +{{- if .Values.runbookViewer.enabled }} --- kind: ConfigMap apiVersion: v1 @@ -7,4 +8,5 @@ metadata: annotations: checksum/config: {{ (.Files.Glob "runbooks/**").AsConfig | indent 2 | sha256sum }} data: -{{ (.Files.Glob "runbooks/**").AsConfig | indent 2 }} \ No newline at end of file +{{ (.Files.Glob "runbooks/**").AsConfig | indent 2 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/_dashboards_coderd.json.tpl b/coder-observability/templates/dashboards/_dashboards_coderd.json.tpl index 20a0ece..5a57987 100644 --- a/coder-observability/templates/dashboards/_dashboards_coderd.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_coderd.json.tpl @@ -1466,9 +1466,9 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Control Plane", - "uid": "coderd", + "title": "Coder Control Plane", + "uid": "coder-coderd", "version": 6, "weekStart": "" } -{{ end }} \ No newline at end of file +{{ end }} diff --git 
a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index a4ba641..9ee3cb0 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -1445,8 +1445,8 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Prebuilds", - "uid": "cej6jysyme22oa", + "title": "Coder Prebuilds", + "uid": "coder-prebuilds", "version": 5 } -{{ end }} \ No newline at end of file +{{ end }} diff --git a/coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl b/coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl index 9b855a5..1bc6efe 100644 --- a/coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_provisionerd.json.tpl @@ -1013,9 +1013,9 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Provisioners", - "uid": "provisionerd", + "title": "Coder Provisioners", + "uid": "coder-provisionerd", "version": 10, "weekStart": "" } -{{ end }} \ No newline at end of file +{{ end }} diff --git a/coder-observability/templates/dashboards/_dashboards_status.json.tpl b/coder-observability/templates/dashboards/_dashboards_status.json.tpl index dd468e3..ef16681 100644 --- a/coder-observability/templates/dashboards/_dashboards_status.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_status.json.tpl @@ -2068,7 +2068,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Status", + "title": "Coder Status", "uid": "coder-status", "version": 1, "weekStart": "" diff --git a/coder-observability/templates/dashboards/_dashboards_workspace_detail.json.tpl b/coder-observability/templates/dashboards/_dashboards_workspace_detail.json.tpl index 713cc9a..890ef15 100644 --- a/coder-observability/templates/dashboards/_dashboards_workspace_detail.json.tpl +++ 
b/coder-observability/templates/dashboards/_dashboards_workspace_detail.json.tpl @@ -1336,8 +1336,8 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Workspace Detail", - "uid": "workspace-detail", + "title": "Coder Workspace Detail", + "uid": "coder-workspace-detail", "version": 9, "weekStart": "" } diff --git a/coder-observability/templates/dashboards/_dashboards_workspaces.json.tpl b/coder-observability/templates/dashboards/_dashboards_workspaces.json.tpl index afd52d8..5e63327 100644 --- a/coder-observability/templates/dashboards/_dashboards_workspaces.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_workspaces.json.tpl @@ -1618,9 +1618,9 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Workspaces", - "uid": "workspaces", + "title": "Coder Workspaces", + "uid": "coder-workspaces", "version": 2, "weekStart": "" } -{{ end }} \ No newline at end of file +{{ end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml b/coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml index 33719f5..5221592 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-coderd.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-coderd + name: coder-dashboard-coderd namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - coderd.json: |- {{- include "coderd-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-coderd.json: |- {{- include "coderd-dashboard.json" . 
| trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml index 14d5908..6a7dae7 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-prebuilds + name: coder-dashboard-prebuilds namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - prebuilds.json: |- {{- include "prebuilds-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-prebuilds.json: |- {{- include "prebuilds-dashboard.json" . | trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml b/coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml index 0c20e83..38df554 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-provisionerd + name: coder-dashboard-provisionerd namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - provisionerd.json: |- {{- include "provisionerd-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-provisionerd.json: |- {{- include "provisionerd-dashboard.json" . 
| trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-status.yaml b/coder-observability/templates/dashboards/configmap-dashboards-status.yaml index e307cc5..4605711 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-status.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-status.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-status + name: coder-dashboard-status namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - status.json: |- {{- include "status-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-status.json: |- {{- include "status-dashboard.json" . | trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml b/coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml index 084c5e1..09256bd 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-workspace_detail.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-workspace-detail + name: coder-dashboard-workspace-detail namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - workspaces-detail.json: |- {{- include "workspace-detail-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-workspaces-detail.json: |- {{- include "workspace-detail-dashboard.json" . 
| trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml b/coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml index bae657d..237789a 100644 --- a/coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml +++ b/coder-observability/templates/dashboards/configmap-dashboards-workspaces.yaml @@ -1,7 +1,12 @@ +{{- if .Values.global.dashboards.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-workspaces + name: coder-dashboard-workspaces namespace: {{ .Release.Namespace }} + {{- with .Values.global.dashboards.configmapLabels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} data: - workspaces.json: |- {{- include "workspaces-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file + coder-workspaces.json: |- {{- include "workspaces-dashboard.json" . | trim | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/prometheusrule.yaml b/coder-observability/templates/prometheusrule.yaml new file mode 100644 index 0000000..8b171bd --- /dev/null +++ b/coder-observability/templates/prometheusrule.yaml @@ -0,0 +1,34 @@ +{{- if and .Values.global.alerts.enabled (eq .Values.global.alerts.kind "prometheusrule") -}} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: coder-prometheus-alerts + namespace: {{ .Release.Namespace }} +spec: + groups: {{ include "coderd-prometheus-alerts" . | nindent 4 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: provisionerd-prometheus-alerts + namespace: {{ .Release.Namespace }} +spec: + groups: {{ include "provisionerd-prometheus-alerts" . | nindent 4 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: enterprise-prometheus-alerts + namespace: {{ .Release.Namespace }} +spec: + groups: {{ include "enterprise-prometheus-alerts" . 
| nindent 4 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: postgres-prometheus-alerts + namespace: {{ .Release.Namespace }} +spec: + groups: {{ include "postgres-prometheus-alerts" . | nindent 4 }} +{{- end }} diff --git a/coder-observability/templates/service-runbook-viewer.yaml b/coder-observability/templates/service-runbook-viewer.yaml index 68c210a..f50b298 100644 --- a/coder-observability/templates/service-runbook-viewer.yaml +++ b/coder-observability/templates/service-runbook-viewer.yaml @@ -1,3 +1,4 @@ +{{- if .Values.runbookViewer.enabled }} --- apiVersion: v1 kind: Service @@ -10,3 +11,4 @@ spec: protocol: TCP selector: app: runbook-viewer +{{- end }} diff --git a/coder-observability/templates/statefulset-runbook-viewer.yaml b/coder-observability/templates/statefulset-runbook-viewer.yaml index 64f50e4..79c7801 100644 --- a/coder-observability/templates/statefulset-runbook-viewer.yaml +++ b/coder-observability/templates/statefulset-runbook-viewer.yaml @@ -1,3 +1,4 @@ +{{- if .Values.runbookViewer.enabled }} --- apiVersion: apps/v1 kind: StatefulSet @@ -32,3 +33,4 @@ spec: - name: runbooks configMap: name: runbooks +{{- end }} diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index 8f44674..89ff315 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -164,16 +164,28 @@ global: warning: 0.8 critical: 0.9 - # global.dashboards -- settings for bundled dashboards + alerts: + # global.alerts.enabled -- enable or disable alerting + enabled: true + # global.alerts.kind -- the container resource kind in which alerts should be created + # valid values are "prometheusrule" or "configmap" + kind: "configmap" + dashboards: + # global.dashboards.enabled -- enable or disable the creation of configmaps for dashboards + enabled: true # global.dashboards.timerange -- how far back dashboards should look timerange: 12h # global.dashboards.refresh -- how often dashboards should 
refresh refresh: 30s # global.dashboards.queryTimeout -- how long until a query in Grafana will timeout after queryTimeout: 900 + # global.dashboards.configmapLabels -- labels to apply to configmaps created for dashboards + configmapLabels: runbookViewer: + # runbookViewer.enabled -- enable or disable the runbook viewer + enabled: true image: "dannyben/madness" sqlExporter: @@ -417,7 +429,7 @@ grafana: autoMigrateOldPanels: true dashboards: # mounted configmap will be synced with sidecar - default_home_dashboard_path: /var/lib/grafana/dashboards/coder/0/status.json + default_home_dashboard_path: /var/lib/grafana/dashboards/coder/0/coder-status.json dataproxy: timeout: '{{ $.Values.global.dashboards.queryTimeout }}' sidecar: @@ -430,29 +442,29 @@ grafana: extraConfigmapMounts: # we can't combine configmaps because of the 1MiB size limit, but Grafana will scan # the /var/lib/grafana/dashboards/coder directory deeply to find dashboards - - name: dashboards-status + - name: coder-dashboard-status mountPath: /var/lib/grafana/dashboards/coder/0 - configMap: dashboards-status + configMap: coder-dashboard-status readOnly: false - - name: dashboards-coderd + - name: coder-dashboard-coderd mountPath: /var/lib/grafana/dashboards/coder/1 - configMap: dashboards-coderd + configMap: coder-dashboard-coderd readOnly: false - - name: dashboards-provisionerd + - name: coder-dashboard-provisionerd mountPath: /var/lib/grafana/dashboards/coder/2 - configMap: dashboards-provisionerd + configMap: coder-dashboard-provisionerd readOnly: false - - name: dashboards-workspaces + - name: coder-dashboard-workspaces mountPath: /var/lib/grafana/dashboards/coder/3 - configMap: dashboards-workspaces + configMap: coder-dashboard-workspaces readOnly: false - - name: dashboards-workspace-detail + - name: coder-dashboard-workspace-detail mountPath: /var/lib/grafana/dashboards/coder/4 - configMap: dashboards-workspace-detail + configMap: coder-dashboard-workspace-detail readOnly: false - - name: 
dashboards-prebuilds + - name: coder-dashboard-prebuilds mountPath: /var/lib/grafana/dashboards/coder/5 - configMap: dashboards-prebuilds + configMap: coder-dashboard-prebuilds readOnly: false prometheus: @@ -486,8 +498,9 @@ prometheus: extraConfigmapMounts: - name: alerts mountPath: /etc/config/alerts - configMap: metrics-alerts + configMap: coder-metrics-alerts readonly: true + optional: true serverFiles: prometheus.yml: @@ -510,8 +523,9 @@ prometheus: extraConfigmapMounts: - name: alerts mountPath: /etc/config/alerts - configMap: metrics-alerts + configMap: coder-metrics-alerts readonly: true + optional: true alertmanager: fullnameOverride: alertmanager diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 5a67083..6a401ea 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -184,7 +184,7 @@ data: org_name = Main Org. org_role = Admin [dashboards] - default_home_dashboard_path = /var/lib/grafana/dashboards/coder/0/status.json + default_home_dashboard_path = /var/lib/grafana/dashboards/coder/0/coder-status.json [dataproxy] timeout = 900 [feature_toggles] @@ -778,18 +778,6 @@ metadata: data: config.river: "\n// Discover k8s nodes\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Discover k8s pods\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n selectors {\n role = \"pod\"\n }\n}\n\ndiscovery.relabel \"pod_logs\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter 
if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_uid\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n action = \"replace\"\n replacement = \"/var/log/pods/*$1/*.log\"\n target_label = \"__path__\"\n }\n rule {\n action = \"replace\"\n source_labels = [\"__meta_kubernetes_pod_container_id\"]\n regex = \"^(\\\\w+):\\\\/\\\\/.+$\"\n replacement = \"$1\"\n target_label = \"tmp_container_runtime\"\n }\n}\n\ndiscovery.relabel \"pod_metrics\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = 
\"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // drop ports that do not expose Prometheus metrics, but might otherwise be exposed by a container which *also*\n // exposes an HTTP port which exposes metrics\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n regex = \"grpc|http-(memberlist|console)\"\n action = \"drop\"\n }\n // adapted from the Prometheus helm chart\n // https://github.com/prometheus-community/helm-charts/blob/862870fc3c847e32479b509e511584d5283126a3/charts/prometheus/values.yaml#L1070\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n regex = \"(.+)\"\n }\n rule {\n source_labels 
= [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\ndiscovery.relabel \"pod_pprof\" {\n targets = discovery.kubernetes.pods.targets\n \n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n // coalesce the following labels and pick the first value; we'll use this to define the \"job\" label\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_component\", \"app\", \"__meta_kubernetes_pod_container_name\"]\n separator = \"/\"\n target_label = \"__meta_app\"\n action = \"replace\"\n regex = \"^/*([^/]+?)(?:/.*)?$\" // split by the delimiter if it exists, we only want the first one\n replacement = \"${1}\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_namespace\", \"__meta_kubernetes_pod_label_app_kubernetes_io_name\", \"__meta_app\"]\n separator = \"/\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n regex = \"__meta_kubernetes_pod_label_(statefulset_kubernetes_io_pod_name|controller_revision_hash)\"\n action = \"labeldrop\"\n }\n rule {\n regex = \"pod_template_generation\"\n action = \"labeldrop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_phase\"]\n regex = \"Pending|Succeeded|Failed|Completed\"\n action = \"drop\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"node\"\n }\n rule {\n action = 
\"labelmap\"\n regex = \"__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)\"\n replacement = \"__param_$1\"\n }\n // The relabeling allows the actual pod scrape endpoint to be configured via the\n // following annotations:\n //\n // * `pyroscope.io/scrape`: Only scrape pods that have a value of `true`.\n // * `pyroscope.io/application-name`: Name of the application being profiled.\n // * `pyroscope.io/scheme`: If the metrics endpoint is secured then you will need\n // to set this to `https` & most likely set the `tls_config` of the scrape config.\n // * `pyroscope.io/port`: Scrape the pod on the indicated port.\n //\n // Kubernetes labels will be added as Pyroscope labels on metrics via the\n // `labelmap` relabeling action.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scrape\"]\n action = \"keep\"\n regex = \"true\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_application_name\"]\n action = \"replace\"\n target_label = \"__name__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_scheme\"]\n action = \"replace\"\n regex = \"(https?)\"\n target_label = \"__scheme__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\"\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_pyroscope_io_port\", \"__meta_kubernetes_pod_ip\"]\n action = \"replace\"\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\"\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n}\n\nlocal.file_match \"pod_logs\" {\n path_targets = discovery.relabel.pod_logs.output\n}\n\nloki.source.file \"pod_logs\" {\n targets = local.file_match.pod_logs.targets\n forward_to = [loki.process.pod_logs.receiver]\n}\n\nloki.process \"pod_logs\" {\n stage.match {\n selector = 
\"{tmp_container_runtime=\\\"containerd\\\"}\"\n // the cri processing stage extracts the following k/v pairs: log, stream, time, flags\n stage.cri {}\n // Set the extract flags and stream values as labels\n stage.labels {\n values = {\n flags = \"\",\n stream = \"\",\n }\n }\n }\n\n // if the label tmp_container_runtime from above is docker parse using docker\n stage.match {\n selector = \"{tmp_container_runtime=\\\"docker\\\"}\"\n // the docker processing stage extracts the following k/v pairs: log, stream, time\n stage.docker {}\n\n // Set the extract stream value as a label\n stage.labels {\n values = {\n stream = \"\",\n }\n }\n }\n\n // drop the temporary container runtime label as it is no longer needed\n stage.label_drop {\n values = [\"tmp_container_runtime\"]\n }\n\n // parse Coder logs and extract level & logger for efficient filtering\n stage.match {\n selector = \"{pod=~\\\"coder.*\\\"}\" // TODO: make configurable\n\n stage.multiline {\n firstline = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\"\n max_wait_time = \"10s\"\n }\n\n stage.regex {\n expression = \"^(?P\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d{3})\\\\s\\\\[(?P\\\\w+)\\\\]\\\\s\\\\s(?P[^:]+):\\\\s(?P.+)\"\n }\n\n stage.timestamp {\n source = \"ts\"\n format = \"2006-01-02 15:04:05.000\"\n action_on_failure = \"fudge\" // rather have inaccurate time than drop the log line\n }\n\n stage.labels {\n values = {\n level = \"\",\n logger = \"\",\n }\n }\n }\n\n forward_to = [loki.write.loki.receiver]\n}\n\nloki.write \"loki\" {\n endpoint {\n url = \"http://loki-gateway.coder-observability.svc/loki/api/v1/push\"\n }\n}\n\n\n\nprometheus.scrape \"pods\" {\n targets = discovery.relabel.pod_metrics.output\n forward_to = [prometheus.relabel.pods.receiver]\n\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n enable_protobuf_negotiation = false\n}\n\n// These are metric_relabel_configs while discovery.relabel are relabel_configs.\n// See 
https://github.com/grafana/agent/blob/main/internal/converter/internal/prometheusconvert/prometheusconvert.go#L95-L106\nprometheus.relabel \"pods\" {\n forward_to = [prometheus.remote_write.default.receiver]\n\n // Drop kube-state-metrics' labels which clash with ours\n rule {\n source_labels = [\"__name__\", \"container\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"container\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"pod\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"pod\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"namespace\"]\n regex = \"kube_pod.+;(.+)\"\n target_label = \"namespace\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_container\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"container\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_pod\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"pod\"\n replacement = \"$1\"\n }\n rule {\n source_labels = [\"__name__\", \"exported_namespace\"]\n // don't replace an empty label\n regex = \"^kube_pod.+;(.+)$\"\n target_label = \"namespace\"\n replacement = \"$1\"\n }\n rule {\n regex = \"^(exported_.*|image_.*|container_id|id|uid)$\"\n action = \"labeldrop\"\n }\n}\n\ndiscovery.relabel \"cadvisor\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n replacement = \"/metrics/cadvisor\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"cadvisor\" {\n targets = discovery.relabel.cadvisor.output\n forward_to = [ prometheus.relabel.cadvisor.receiver ]\n scheme = \"https\"\n tls_config {\n insecure_skip_verify = true\n }\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n scrape_interval = \"15s\"\n scrape_timeout = \"12s\"\n enable_protobuf_negotiation = false\n}\n\nprometheus.relabel \"cadvisor\" {\n forward_to = [ prometheus.remote_write.default.receiver 
]\n\n // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"container\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*)@\"\n action = \"drop\"\n }\n // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688\n rule {\n source_labels = [\"__name__\",\"image\"]\n separator = \"@\"\n regex = \"(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@\"\n action = \"drop\"\n }\n // Drop irrelevant series\n rule {\n source_labels = [\"container\"]\n regex = \"^POD$\"\n action = \"drop\"\n }\n // Drop unnecessary labels\n rule {\n source_labels = [\"id\"]\n target_label = \"id\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"job\"]\n target_label = \"job\"\n replacement = \"\"\n }\n rule {\n source_labels = [\"name\"]\n target_label = \"name\"\n replacement = \"\"\n }\n}\n\nprometheus.remote_write \"default\" {\n endpoint {\n send_native_histograms = false\n url =\"http://prometheus.coder-observability.svc/api/v1/write\"\n\n // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned\n // NOTE: \"__address__\" is mapped to \"instance\", so will contain :\n write_relabel_config {\n regex = \"instance\"\n action = \"labeldrop\"\n }\n }\n}" --- -# Source: coder-observability/templates/configmap-prometheus-alerts.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: metrics-alerts - namespace: coder-observability -data: - coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using 
high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum 
by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - 
alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0\n for: 10m\n 
annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces " - provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, 
namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " - enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " - postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n 
annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" ---- # Source: 
coder-observability/templates/configmap-runbooks.yaml kind: ConfigMap apiVersion: v1 @@ -1024,10 +1012,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-coderd + name: coder-dashboard-coderd namespace: coder-observability data: - coderd.json: |- + coder-coderd.json: |- { "annotations": { "list": [ @@ -2495,8 +2483,8 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Control Plane", - "uid": "coderd", + "title": "Coder Control Plane", + "uid": "coder-coderd", "version": 6, "weekStart": "" } @@ -2505,10 +2493,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-prebuilds + name: coder-dashboard-prebuilds namespace: coder-observability data: - prebuilds.json: |- + coder-prebuilds.json: |- { "annotations": { "list": [ @@ -3955,8 +3943,8 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Prebuilds", - "uid": "cej6jysyme22oa", + "title": "Coder Prebuilds", + "uid": "coder-prebuilds", "version": 5 } --- @@ -3964,10 +3952,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-provisionerd + name: coder-dashboard-provisionerd namespace: coder-observability data: - provisionerd.json: |- + coder-provisionerd.json: |- { "annotations": { "list": [ @@ -4982,8 +4970,8 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Provisioners", - "uid": "provisionerd", + "title": "Coder Provisioners", + "uid": "coder-provisionerd", "version": 10, "weekStart": "" } @@ -4992,10 +4980,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-status + name: coder-dashboard-status namespace: coder-observability data: - status.json: |- + coder-status.json: |- { "annotations": { "list": [ @@ -7065,7 +7053,7 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Status", + "title": "Coder Status", "uid": "coder-status", "version": 1, "weekStart": "" @@ -7075,10 +7063,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-workspace-detail + name: 
coder-dashboard-workspace-detail namespace: coder-observability data: - workspaces-detail.json: |- + coder-workspaces-detail.json: |- { "annotations": { "list": [ @@ -8416,8 +8404,8 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Workspace Detail", - "uid": "workspace-detail", + "title": "Coder Workspace Detail", + "uid": "coder-workspace-detail", "version": 9, "weekStart": "" } @@ -8426,10 +8414,10 @@ data: apiVersion: v1 kind: ConfigMap metadata: - name: dashboards-workspaces + name: coder-dashboard-workspaces namespace: coder-observability data: - workspaces.json: |- + coder-workspaces.json: |- { "annotations": { "list": [ @@ -10049,8 +10037,8 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Workspaces", - "uid": "workspaces", + "title": "Coder Workspaces", + "uid": "coder-workspaces", "version": 2, "weekStart": "" } @@ -11686,7 +11674,7 @@ spec: app.kubernetes.io/name: grafana app.kubernetes.io/instance: coder-observability annotations: - checksum/config: 8143dd78a48b90e972a84c3078e9812a2536a4efff7ca13f404626d5eae08ab2 + checksum/config: f6ba7281c89f89916dc689b99ba0d8ec9cfe7acc5ecbe24d36d543ef18892401 checksum/dashboards-json-config: 010b57348b6dd1f09007330c03d22a0570022534712646511cad39a9e3cb4bb7 checksum/sc-dashboard-provider-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b kubectl.kubernetes.io/default-container: grafana @@ -11752,27 +11740,27 @@ spec: - name: config mountPath: "/etc/grafana/grafana.ini" subPath: grafana.ini - - name: dashboards-status + - name: coder-dashboard-status mountPath: /var/lib/grafana/dashboards/coder/0 subPath: readOnly: false - - name: dashboards-coderd + - name: coder-dashboard-coderd mountPath: /var/lib/grafana/dashboards/coder/1 subPath: readOnly: false - - name: dashboards-provisionerd + - name: coder-dashboard-provisionerd mountPath: /var/lib/grafana/dashboards/coder/2 subPath: readOnly: false - - name: dashboards-workspaces + - name: coder-dashboard-workspaces 
mountPath: /var/lib/grafana/dashboards/coder/3 subPath: readOnly: false - - name: dashboards-workspace-detail + - name: coder-dashboard-workspace-detail mountPath: /var/lib/grafana/dashboards/coder/4 subPath: readOnly: false - - name: dashboards-prebuilds + - name: coder-dashboard-prebuilds mountPath: /var/lib/grafana/dashboards/coder/5 subPath: readOnly: false @@ -11830,24 +11818,24 @@ spec: - name: config configMap: name: grafana - - name: dashboards-status + - name: coder-dashboard-status configMap: - name: dashboards-status - - name: dashboards-coderd + name: coder-dashboard-status + - name: coder-dashboard-coderd configMap: - name: dashboards-coderd - - name: dashboards-provisionerd + name: coder-dashboard-coderd + - name: coder-dashboard-provisionerd configMap: - name: dashboards-provisionerd - - name: dashboards-workspaces + name: coder-dashboard-provisionerd + - name: coder-dashboard-workspaces configMap: - name: dashboards-workspaces - - name: dashboards-workspace-detail + name: coder-dashboard-workspaces + - name: coder-dashboard-workspace-detail configMap: - name: dashboards-workspace-detail - - name: dashboards-prebuilds + name: coder-dashboard-workspace-detail + - name: coder-dashboard-prebuilds configMap: - name: dashboards-prebuilds + name: coder-dashboard-prebuilds - name: dashboards-infra configMap: name: grafana-dashboards-infra @@ -12606,13 +12594,14 @@ spec: name: prometheus - name: configmap-reload-alerts configMap: - name: metrics-alerts + name: coder-metrics-alerts - name: server-alerts configMap: - name: metrics-alerts + name: coder-metrics-alerts - name: alerts configMap: - name: metrics-alerts + name: coder-metrics-alerts + optional: true volumeClaimTemplates: - apiVersion: v1 kind: PersistentVolumeClaim