diff --git a/README.md b/README.md index 523aae6..2197afc 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Logs will be scraped from all pods in the Kubernetes cluster. ```bash helm repo add coder-observability https://helm.coder.com/observability -helm upgrade --install coder-observability coder-observability/coder-observability --version 0.6.0 --namespace coder-observability --create-namespace +helm upgrade --install coder-observability coder-observability/coder-observability --version 0.6.1 --namespace coder-observability --create-namespace ``` ## Requirements diff --git a/coder-observability/Chart.yaml b/coder-observability/Chart.yaml index 6986ce1..aca59e9 100644 --- a/coder-observability/Chart.yaml +++ b/coder-observability/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: coder-observability description: Gain insights into your Coder deployment type: application -version: 0.6.0 +version: 0.6.1 dependencies: - name: pyroscope condition: pyroscope.enabled diff --git a/coder-observability/templates/_prometheus-alerts.tpl b/coder-observability/templates/_prometheus-alerts.tpl index 7f0ce39..5cefc77 100644 --- a/coder-observability/templates/_prometheus-alerts.tpl +++ b/coder-observability/templates/_prometheus-alerts.tpl @@ -218,7 +218,7 @@ rules: {{ $alert := "PostgresDown" }} - alert: {{ $alert }} - expr: pg_up == 0 + expr: pg_up{namespace="{{ $.Release.Namespace }}"} == 0 for: {{ $group.delay }} annotations: summary: The postgres instance {{ `{{ $labels.instance }}` }} is down! @@ -235,7 +235,7 @@ {{ $alert := "PostgresConnectionsRunningLow" }} {{- range $severity, $threshold := .thresholds }} - alert: {{ $alert }} - expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * {{ $threshold }}) + expr: sum by (datname, instance) (pg_stat_activity_count{namespace="{{ $.Release.Namespace }}"}) > on () group_left() (pg_settings_max_connections{namespace="{{ $.Release.Namespace }}"} * {{ $threshold }}) for: {{ $group.delay }} labels: summary: The postgres instance {{ `{{ $labels.instance }}` }} is running low on connections which may impact application performance. diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 077b366..f8bb42d 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -796,7 +796,7 @@ data: coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces" provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas" enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats" - postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" + postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up{namespace=\"coder-observability\"} == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count{namespace=\"coder-observability\"}) > on () group_left() (pg_settings_max_connections{namespace=\"coder-observability\"} * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count{namespace=\"coder-observability\"}) > on () group_left() (pg_settings_max_connections{namespace=\"coder-observability\"} * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count{namespace=\"coder-observability\"}) > on () group_left() (pg_settings_max_connections{namespace=\"coder-observability\"} * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" --- # Source: coder-observability/templates/configmap-runbooks.yaml kind: ConfigMap