cobaltcore-dev · SoWieMarkus · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025
@@ -0,0 +1,38 @@
+# Lints changed Prometheus rule and alert files with promtool on pull requests.
+name: Check Alerts using Promtool
+on:
+  pull_request:
+    paths:
+      - '**/*.rules.yaml'
+      - '**/*.alerts.yaml'
+
+# The job only reads the repository and PR metadata, so keep the token read-only.
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout PR
+        uses: actions/checkout@v5
+
+      # Collect only the rule/alert files touched by this PR.
+      - name: Get changed rule and alert files
+        id: changed
+        uses: tj-actions/changed-files@v46
+        with:
+          files: |
+            **/*.rules.yaml
+            **/*.alerts.yaml
+
+      # Skipped when no matching files are reported as changed.
+      - name: Check changed rule and alert files via promtool
+        if: steps.changed.outputs.any_changed == 'true'
+        uses: peimanja/promtool-github-actions@v0.0.2
+        with:
+          promtool_actions_subcommand: 'rules'
+          promtool_actions_files: ${{ steps.changed.outputs.all_changed_files }}
+          promtool_actions_version: 'latest'
+          promtool_actions_comment: 'false'
@@ -51,17 +51,14 @@ dep_charts = {
         ('dist/chart', 'cortex'),
     ],
     'cortex-nova': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],
     'cortex-manila': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],
     'cortex-cinder': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],

@@ -25,7 +25,6 @@ helm/
 │   ├── cortex-ironcore/         # IronCore scheduling domain
 │   └── cortex-crds/             # CRDs for all operators
 ├── library/                   # Shared library charts
-│   ├── cortex-alerts/           # Common alerting infrastructure
 │   └── cortex-postgres/         # PostgreSQL database
 ├── dev/                       # Development-only charts
 │   └── cortex-prometheus-operator/  # Local monitoring stack
@@ -39,6 +38,7 @@ helm/
 Bundle charts are **umbrella charts** that represent complete deployments for specific scheduling domains. They aggregate operator charts and library charts into deployable units.
 
 **Available bundles:**
+
 - `cortex-nova` - Nova compute scheduling domain
 - `cortex-cinder` - Cinder block storage scheduling domain
 - `cortex-manila` - Manila shared filesystem scheduling domain
@@ -54,10 +54,11 @@ The operator chart contains the core Kubernetes operators built from the Go modu
 Library charts provide **shared, reusable components** that are consumed by bundle charts as dependencies.
 
 **Available library charts:**
-- `cortex-alerts` - Common alerting infrastructure and templates
+
 - `cortex-postgres` - PostgreSQL database deployment with monitoring
 
 **Integration with bundles:**
+
 - Library charts are **included as dependencies** in bundle Chart.yaml files
 - Provide common infrastructure components used across multiple domains
 - Reduce duplication of common services like databases and monitoring
@@ -68,15 +69,18 @@ Library charts provide **shared, reusable components** that are consumed by bund
 Dev charts support **local development and testing** but are not included in production releases.
 
 **Available dev charts:**
+
 - `cortex-prometheus-operator` - Prometheus operator setup for local development
 
 ## Usage Patterns
 
 ### Production Deployment
+
 1. Deploy CRDs first: `helm install cortex-crds bundles/cortex-crds/`
 2. Deploy domain-specific bundle: `helm install cortex-nova bundles/cortex-nova/`
 
 ### Development Setup
+
 1. Deploy monitoring: `helm install prometheus dev/cortex-prometheus-operator/`
 2. Deploy CRDs: `helm install cortex-crds bundles/cortex-crds/`
 3. Deploy and test bundles: `helm install cortex-nova bundles/cortex-nova/`

@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts

@@ -1,10 +1,10 @@
 groups:
 - name: cortex-cinder-alerts
   rules:
-  - alert: CortexCinderInitialPlacementDown
+  - alert: CortexCinderSchedulingDown
     expr: |
-      up{component="cortex-cinder-scheduler", namespace="cortex-cinder"} != 1 or
-      absent(up{component="cortex-cinder-scheduler", namespace="cortex-cinder"})
+      up{pod=~"cortex-cinder-scheduling-.*"} != 1 or
+      absent(up{pod=~"cortex-cinder-scheduling-.*"})
     for: 5m
     labels:
       context: liveness
@@ -14,8 +14,102 @@ groups:
       support_group: workload-management
       playbook: docs/support/playbook/cortex/down
     annotations:
-      summary: "Cortex initial placement for Cinder is down"
+      summary: "Cortex Scheduling for Cinder is down"
       description: >
-        The Cortex initial placement is down. Initial placement requests from Cinder will
+        The Cortex scheduling service is down. Scheduling requests from Cinder will
         not be served. This is no immediate problem, since Cinder will continue
-        placing new volumes. However, the placement will be less desirable.
+        placing new volumes. However, the placement will be less desirable.
+  - alert: CortexCinderKnowledgeDown
+    for: 5m
+    expr: |
+      up{pod=~"cortex-cinder-knowledge-.*"} != 1 or
+      absent(up{pod=~"cortex-cinder-knowledge-.*"})
+    annotations:
+      summary: 'Cortex Knowledge for Cinder is down'
+      description: >
+        The Cortex Knowledge service is down. This is no immediate
+        problem, since cortex is still able to process requests, but the
+        quality of the responses may be affected.
+    labels:
+      context: liveness
+      dashboard: cortex/cortex
+      playbook: docs/support/playbook/cortex/down
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexCinderHttpRequest400sTooHigh
+    for: 5m
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1
+    annotations:
+      summary: 'Cinder Scheduler HTTP request 400 errors too high'
+      description: >
+        Cinder Scheduler is responding to placement requests with HTTP 4xx errors.
+        This is expected when the scheduling request cannot be served by Cortex.
+        However, it could also indicate that the request format has changed and
+        Cortex is unable to parse it.
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  # Fires when the 5xx rate of the Cinder scheduler API stays above 0.1/s for 5m.
+  - alert: CortexCinderSchedulingHttpRequest500sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Cinder Scheduler HTTP request 500 errors too high"
+      description: >
+        Cinder Scheduler is responding to placement requests with HTTP 5xx errors. This is not expected
+        and indicates that Cortex is having some internal problem. Cinder will continue to place new
+        volumes, but the placement will be less desirable. Thus, no immediate action is needed.
+  - alert: CortexCinderHighMemoryUsage
+    for: 5m
+    expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024
+    annotations:
+      summary: '`{{$labels.component}}` uses too much memory'
+      description: >
+        `{{$labels.component}}` should not be using more than 6000 MiB of memory.
+        Usually it should use much less, so there may be a memory leak or other
+        changes that are causing the memory usage to increase significantly.
+    labels:
+      context: memory
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexCinderHighCPUUsage
+    for: 5m
+    expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5
+    annotations:
+      summary: '`{{$labels.component}}` uses too much CPU'
+      description: >
+        `{{$labels.component}}` should not be using more than 50% of a single CPU
+        core. Usually it should use much less, so there may be a CPU leak or other
+        changes that are causing the CPU usage to increase significantly.
+    labels:
+      context: cpu
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexCinderTooManyDBConnectionAttempts
+    for: 5m
+    expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1
+    annotations:
+      summary: '`{{$labels.component}}` is trying to connect to the database too often'
+      description: >
+        `{{$labels.component}}` is trying to connect to the database too often. This
+        may happen when the database is down or the connection parameters are misconfigured.
+    labels:
+      context: db
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
@@ -111,9 +111,3 @@ cortex-knowledge-controllers:
 # Custom configuration for the cortex postgres chart.
 cortex-postgres:
   fullnameOverride: cortex-cinder-postgresql
-
-# Custom configuration for the cortex core chart.
-cortex-alerts:
-  fullnameOverride: cortex-cinder
-  alerts:
-    componentPrefix: cortex-cinder
@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts

@@ -1,10 +1,10 @@
 groups:
 - name: cortex-manila-alerts
   rules:
-  - alert: CortexManilaInitialPlacementDown
+  - alert: CortexManilaSchedulingDown
     expr: |
-      up{component="cortex-manila-scheduler", namespace="cortex-manila"} != 1 or
-      absent(up{component="cortex-manila-scheduler", namespace="cortex-manila"})
+      up{pod=~"cortex-manila-scheduling-.*"} != 1 or
+      absent(up{pod=~"cortex-manila-scheduling-.*"})
     for: 5m
     labels:
       context: liveness
@@ -14,9 +14,102 @@ groups:
       support_group: workload-management
       playbook: docs/support/playbook/cortex/down
     annotations:
-      summary: "Cortex initial placement for Manila is down"
+      summary: "Cortex Scheduling for Manila is down"
       description: >
-        The Cortex initial placement is down. Initial placement requests from Manila will
+        The Cortex scheduling service is down. Scheduling requests from Manila will
         not be served. This is no immediate problem, since Manila will continue
-        placing new shares. However, the placement will be less desirable.
-
+        placing new shares. However, the placement will be less desirable.
+  - alert: CortexManilaKnowledgeDown
+    for: 5m
+    expr: |
+      up{pod=~"cortex-manila-knowledge-.*"} != 1 or
+      absent(up{pod=~"cortex-manila-knowledge-.*"})
+    annotations:
+      summary: 'Cortex Knowledge for Manila is down'
+      description: >
+        The Cortex Knowledge service is down. This is no immediate
+        problem, since cortex is still able to process requests, but the
+        quality of the responses may be affected.
+    labels:
+      context: liveness
+      dashboard: cortex/cortex
+      playbook: docs/support/playbook/cortex/down
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexManilaHttpRequest400sTooHigh
+    for: 5m
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1
+    annotations:
+      summary: 'Manila Scheduler HTTP request 400 errors too high'
+      description: >
+        Manila Scheduler is responding to placement requests with HTTP 4xx errors.
+        This is expected when the scheduling request cannot be served by Cortex.
+        However, it could also indicate that the request format has changed and
+        Cortex is unable to parse it.
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  # Fires when the 5xx rate of the Manila scheduler API stays above 0.1/s for 5m.
+  - alert: CortexManilaSchedulingHttpRequest500sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Manila Scheduler HTTP request 500 errors too high"
+      description: >
+        Manila Scheduler is responding to placement requests with HTTP 5xx errors. This is not expected
+        and indicates that Cortex is having some internal problem. Manila will continue to place new
+        shares, but the placement will be less desirable. Thus, no immediate action is needed.
+  - alert: CortexManilaHighMemoryUsage
+    for: 5m
+    expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024
+    annotations:
+      summary: '`{{$labels.component}}` uses too much memory'
+      description: >
+        `{{$labels.component}}` should not be using more than 6000 MiB of memory.
+        Usually it should use much less, so there may be a memory leak or other
+        changes that are causing the memory usage to increase significantly.
+    labels:
+      context: memory
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexManilaHighCPUUsage
+    for: 5m
+    expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5
+    annotations:
+      summary: '`{{$labels.component}}` uses too much CPU'
+      description: >
+        `{{$labels.component}}` should not be using more than 50% of a single CPU
+        core. Usually it should use much less, so there may be a CPU leak or other
+        changes that are causing the CPU usage to increase significantly.
+    labels:
+      context: cpu
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+  - alert: CortexManilaTooManyDBConnectionAttempts
+    for: 5m
+    expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1
+    annotations:
+      summary: '`{{$labels.component}}` is trying to connect to the database too often'
+      description: >
+        `{{$labels.component}}` is trying to connect to the database too often. This
+        may happen when the database is down or the connection parameters are misconfigured.
+    labels:
+      context: db
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
@@ -111,9 +111,3 @@ cortex-knowledge-controllers:
 # Custom configuration for the cortex postgres chart.
 cortex-postgres:
   fullnameOverride: cortex-manila-postgresql
-
-# Custom configuration for the cortex core chart.
-cortex-alerts:
-  fullnameOverride: cortex-manila
-  alerts:
-    componentPrefix: cortex-manila
@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts