From 962c2b7870169d10aa1360c0eba59079ee0053e5 Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Tue, 14 Oct 2025 13:33:09 +0300 Subject: [PATCH 1/6] Fix alerts --- flux/apps/kube-system/kube-ovn/observability/alerts.yml | 5 ++++- .../kube-state-metrics/app/monitoring/kube-state-alerts.yml | 2 -- .../kube-state-metrics/app/monitoring/kubernetes-alerts.yml | 2 -- .../kube-state-metrics/app/monitoring/kubernetes-rules.yml | 2 -- .../kube-state-metrics/app/monitoring/kustomization.yml | 1 - .../observability/node-exporter/app/monitoring/alerts.yml | 4 +--- .../observability/node-exporter/app/monitoring/rules.yml | 2 -- 7 files changed, 5 insertions(+), 13 deletions(-) diff --git a/flux/apps/kube-system/kube-ovn/observability/alerts.yml b/flux/apps/kube-system/kube-ovn/observability/alerts.yml index 658fbd3..a9cefc6 100644 --- a/flux/apps/kube-system/kube-ovn/observability/alerts.yml +++ b/flux/apps/kube-system/kube-ovn/observability/alerts.yml @@ -83,7 +83,10 @@ spec: summary: OVS reported unhealthy status on at least one node. 
- alert: OVSInterfaceLinkDown - expr: max by (interface) (interface_link_state == 0 and interface=~"br-.*|genev.*|vxlan.*") == 1 + expr: | + max by (interface) ( + interface_link_state{interface=~"br-.*|genev.*|vxlan.*"} + ) == 0 for: 10m labels: severity: critical diff --git a/flux/apps/observability/kube-state-metrics/app/monitoring/kube-state-alerts.yml b/flux/apps/observability/kube-state-metrics/app/monitoring/kube-state-alerts.yml index ed653df..d457a2c 100644 --- a/flux/apps/observability/kube-state-metrics/app/monitoring/kube-state-alerts.yml +++ b/flux/apps/observability/kube-state-metrics/app/monitoring/kube-state-alerts.yml @@ -2,8 +2,6 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: name: kube-state-alerts - labels: - app: kube-state-alerts spec: groups: - name: kube-state-metrics diff --git a/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-alerts.yml b/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-alerts.yml index cc10eef..964f757 100644 --- a/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-alerts.yml +++ b/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-alerts.yml @@ -2,8 +2,6 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: name: kubernetes-alerts - labels: - app: kubernetes-alerts spec: groups: - name: kubernetes-apps diff --git a/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-rules.yml b/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-rules.yml index 1f1b25d..1611c1e 100644 --- a/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-rules.yml +++ b/flux/apps/observability/kube-state-metrics/app/monitoring/kubernetes-rules.yml @@ -2,8 +2,6 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: name: kubernetes-rules - labels: - app: kubernetes-rules spec: groups: - interval: 3m diff --git 
a/flux/apps/observability/kube-state-metrics/app/monitoring/kustomization.yml b/flux/apps/observability/kube-state-metrics/app/monitoring/kustomization.yml index f59701a..02918b7 100644 --- a/flux/apps/observability/kube-state-metrics/app/monitoring/kustomization.yml +++ b/flux/apps/observability/kube-state-metrics/app/monitoring/kustomization.yml @@ -1,6 +1,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: observability resources: - dashboards.yml - kube-state-alerts.yml diff --git a/flux/apps/observability/node-exporter/app/monitoring/alerts.yml b/flux/apps/observability/node-exporter/app/monitoring/alerts.yml index cebbe14..230a650 100644 --- a/flux/apps/observability/node-exporter/app/monitoring/alerts.yml +++ b/flux/apps/observability/node-exporter/app/monitoring/alerts.yml @@ -2,11 +2,9 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: name: node-exporter-alerts - labels: - app: node-exporter-alerts spec: groups: - - name: node-exporter.rules + - name: node-exporter.alerts rules: - alert: NodeFilesystemSpaceFillingUp annotations: diff --git a/flux/apps/observability/node-exporter/app/monitoring/rules.yml b/flux/apps/observability/node-exporter/app/monitoring/rules.yml index 97c2f89..4af73f4 100644 --- a/flux/apps/observability/node-exporter/app/monitoring/rules.yml +++ b/flux/apps/observability/node-exporter/app/monitoring/rules.yml @@ -2,8 +2,6 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: name: node-exporter-rules - labels: - app: node-exporter-rules spec: groups: - name: node-exporter.rules From 8eb06323f5de9f52c0282b49242cc8ad8cd43fb9 Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Wed, 15 Oct 2025 13:09:56 +0300 Subject: [PATCH 2/6] Fix kube-state-metrics job label and add cluster label --- flux/apps/observability/alloy/app/metrics/alloy.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/flux/apps/observability/alloy/app/metrics/alloy.yml 
b/flux/apps/observability/alloy/app/metrics/alloy.yml index 98432f0..c425c34 100644 --- a/flux/apps/observability/alloy/app/metrics/alloy.yml +++ b/flux/apps/observability/alloy/app/metrics/alloy.yml @@ -8,6 +8,9 @@ spec: replicas: 1 alloy: enableReporting: false + extraEnv: + - name: CLUSTER + value: spectrum configMap: content: |- logging { @@ -78,6 +81,7 @@ spec: ksm.scrape "metrics" { targets = ksm.kubernetes.targets.output + job_label = "kube-state-metrics" forward_to = [prometheus.relabel.relable.receiver] } @@ -179,6 +183,11 @@ spec: // RELABELING prometheus.relabel "relable" { + rule { + action = "replace" + target_label = "cluster" + replacement = env("CLUSTER") + } forward_to = [prometheus.remote_write.victoriametrics.receiver] } From 371c7a9e12a994689f9537026340a2ac7bf1913b Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Wed, 15 Oct 2025 13:17:52 +0300 Subject: [PATCH 3/6] Update kube-ovn alerts --- .../kube-ovn/observability/alerts.yml | 551 +++++++++++++----- 1 file changed, 403 insertions(+), 148 deletions(-) diff --git a/flux/apps/kube-system/kube-ovn/observability/alerts.yml b/flux/apps/kube-system/kube-ovn/observability/alerts.yml index a9cefc6..789a9e2 100644 --- a/flux/apps/kube-system/kube-ovn/observability/alerts.yml +++ b/flux/apps/kube-system/kube-ovn/observability/alerts.yml @@ -4,157 +4,412 @@ metadata: name: kube-ovn spec: groups: - - name: kubeovn.capacity.alerts + - name: kubeovn.resources rules: - - alert: ClusterPublicIPsPressure - expr: | - ( - sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) - / - clamp_min( + - alert: LargeOVNLogFile + expr: kube_ovn_log_file_size > 1073741824 + for: 30m + labels: + severity: info + component: ovn + annotations: + summary: "Large OVN log file" + description: "OVN log file {{ $labels.filename }} on {{ $labels.instance }} is {{ $value | humanize1024 }}B" + + - alert: LargeOVSLogFile + expr: log_file_size > 1073741824 + for: 30m + labels: + severity: info + component: 
ovs + annotations: + summary: "Large OVS log file" + description: "OVS log file {{ $labels.filename }} on {{ $labels.instance }} is {{ $value | humanize1024 }}B" + + - alert: LargeOVNDatabaseFile + expr: kube_ovn_db_file_size > 10737418240 + for: 30m + labels: + severity: warning + component: ovn-db + annotations: + summary: "Large OVN database file" + description: "OVN database {{ $labels.database }} on {{ $labels.instance }} is {{ $value | humanize1024 }}B" + + - alert: ClusterPublicIPsPressure + expr: | + ( sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) - + sum(subnet_used_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}), 1 - ) - ) < 0.10 - for: 30m - labels: - severity: warning - annotations: - summary: Public IPv4 pool below 10% free. - - - alert: ClusterPublicIPsExhaustion - expr: | - ( - sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) - / - clamp_min( + / + clamp_min( + sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) + + sum(subnet_used_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}), 1 + ) + ) < 0.10 + for: 30m + labels: + severity: warning + annotations: + summary: Public IPv4 pool below 10% free. + + - alert: ClusterPublicIPsExhaustion + expr: | + ( sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) - + sum(subnet_used_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}), 1 - ) - ) < 0.02 - for: 10m - labels: - severity: critical - annotations: - summary: Public IPv4 pool is nearly exhausted (<2% free). - - - name: kubeovn.ovn.alerts + / + clamp_min( + sum(subnet_available_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}) + + sum(subnet_used_ip_count{subnet_name=~"subnet-.*",protocol="IPv4"}), 1 + ) + ) < 0.02 + for: 10m + labels: + severity: critical + annotations: + summary: Public IPv4 pool is nearly exhausted (<2% free). 
+ + - name: kubeovn.health + rules: + - alert: OVNComponentUnhealthy + expr: kube_ovn_ovn_status == 0 + for: 2m + labels: + severity: critical + component: ovn + annotations: + summary: "OVN component is unhealthy" + description: "OVN component {{ $labels.component }} is unhealthy on {{ $labels.instance }}" + + - alert: OVNDatabaseUnhealthy + expr: kube_ovn_db_status == 0 + for: 2m + labels: + severity: critical + component: ovn-db + annotations: + summary: "OVN database is unhealthy" + description: "OVN {{ $labels.database }} database is unhealthy on {{ $labels.instance }}" + + - alert: OVNChassisDown + expr: kube_ovn_chassis_info == 0 + for: 2m + labels: + severity: warning + component: ovn-chassis + annotations: + summary: "OVN chassis is down" + description: "OVN chassis {{ $labels.chassis }} is down" + + - alert: OVSUnhealthy + expr: ovs_status == 0 + for: 2m + labels: + severity: critical + component: ovs + annotations: + summary: "OVS is unhealthy" + description: "OVS is unhealthy on node {{ $labels.instance }}" + + - alert: OVSPingerDown + expr: pinger_ovs_down == 1 + for: 1m + labels: + severity: critical + component: ovs + annotations: + summary: "OVS is down on node" + description: "OVS is down on node {{ $labels.node_name }}" + + - alert: OVNControllerDown + expr: pinger_ovn_controller_down == 1 + for: 1m + labels: + severity: critical + component: ovn-controller + annotations: + summary: "OVN controller is down" + description: "OVN controller is down on node {{ $labels.node_name }}" + + - name: kubeovn.errors + interval: 30s + rules: + - alert: HighOVNRequestFailureRate + expr: rate(kube_ovn_failed_req_count[5m]) > 0.1 + for: 5m + labels: + severity: warning + component: ovn + annotations: + summary: "High OVN request failure rate" + description: "OVN is experiencing {{ $value }} failed requests per second on {{ $labels.instance }}" + + - alert: HighOVSRequestFailureRate + expr: rate(failed_req_count[5m]) > 0.1 + for: 5m + labels: + severity: warning + 
component: ovs + annotations: + summary: "High OVS request failure rate" + description: "OVS is experiencing {{ $value }} failed requests per second on {{ $labels.instance }}" + + - alert: HighInterfaceRxErrors + expr: rate(interface_rx_errors[5m]) > 10 + for: 5m + labels: + severity: warning + component: ovs-interface + annotations: + summary: "High receive errors on interface" + description: "Interface {{ $labels.interface }} on {{ $labels.instance }} has {{ $value }} RX errors per second" + + - alert: HighInterfaceTxErrors + expr: rate(interface_tx_errors[5m]) > 10 + for: 5m + labels: + severity: warning + component: ovs-interface + annotations: + summary: "High transmit errors on interface" + description: "Interface {{ $labels.interface }} on {{ $labels.instance }} has {{ $value }} TX errors per second" + + - alert: HighInterfaceDroppedPackets + expr: rate(interface_rx_dropped[5m]) > 50 or rate(interface_tx_dropped[5m]) > 50 + for: 5m + labels: + severity: warning + component: ovs-interface + annotations: + summary: "High packet drop rate on interface" + description: "Interface {{ $labels.interface }} on {{ $labels.instance }} is dropping {{ $value }} packets per second" + + - name: kubeovn.network-quality + rules: + - alert: APIServerUnhealthy + expr: pinger_apiserver_unhealthy == 1 + for: 2m + labels: + severity: critical + component: apiserver + annotations: + summary: "API server is unhealthy from node" + description: "API server is unhealthy from node {{ $labels.node_name }}" + + - alert: HighAPIServerLatency + expr: histogram_quantile(0.99, rate(pinger_apiserver_latency_ms_bucket[5m])) > 1000 + for: 5m + labels: + severity: warning + component: apiserver + annotations: + summary: "High API server latency" + description: "P99 API server latency from node {{ $labels.node_name }} is {{ $value }}ms" + + - alert: InternalDNSUnhealthy + expr: pinger_internal_dns_unhealthy == 1 + for: 2m + labels: + severity: warning + component: dns + annotations: + summary: 
"Internal DNS is unhealthy" + description: "Internal DNS is unhealthy from node {{ $labels.node_name }}" + + - alert: ExternalDNSUnhealthy + expr: pinger_external_dns_unhealthy == 1 + for: 2m + labels: + severity: warning + component: dns + annotations: + summary: "External DNS is unhealthy" + description: "External DNS is unhealthy from node {{ $labels.node_name }}" + + - alert: HighInternalDNSLatency + expr: histogram_quantile(0.99, rate(pinger_internal_dns_latency_ms_bucket[5m])) > 500 + for: 5m + labels: + severity: warning + component: dns + annotations: + summary: "High internal DNS latency" + description: "P99 internal DNS latency from node {{ $labels.node_name }} is {{ $value }}ms" + + - alert: HighPodPingLatency + expr: histogram_quantile(0.99, rate(pinger_pod_ping_latency_ms_bucket[5m])) > 100 + for: 5m + labels: + severity: warning + component: pod-network + annotations: + summary: "High pod-to-pod ping latency" + description: "P99 pod-to-pod ping latency on node {{ $labels.node_name }} is {{ $value }}ms" + + - alert: HighPodPingLossRate + expr: rate(pinger_pod_ping_lost_total[5m]) / rate(pinger_pod_ping_count_total[5m]) > 0.01 + for: 5m + labels: + severity: warning + component: pod-network + annotations: + summary: "High pod ping loss rate" + description: "Pod ping loss rate on node {{ $labels.node_name }} is {{ $value | humanizePercentage }}" + + - alert: HighNodePingLatency + expr: histogram_quantile(0.99, rate(pinger_node_ping_latency_ms_bucket[5m])) > 50 + for: 5m + labels: + severity: warning + component: node-network + annotations: + summary: "High pod-to-node ping latency" + description: "P99 pod-to-node ping latency on node {{ $labels.node_name }} is {{ $value }}ms" + + - alert: HighNodePingLossRate + expr: rate(pinger_node_ping_lost_total[5m]) / rate(pinger_node_ping_count_total[5m]) > 0.01 + for: 5m + labels: + severity: warning + component: node-network + annotations: + summary: "High node ping loss rate" + description: "Node ping loss rate 
on {{ $labels.node_name }} is {{ $value | humanizePercentage }}" + + - alert: HighExternalPingLatency + expr: histogram_quantile(0.99, rate(pinger_external_ping_latency_ms_bucket[5m])) > 200 + for: 5m + labels: + severity: info + component: external-network + annotations: + summary: "High external ping latency" + description: "P99 external ping latency from node {{ $labels.node_name }} is {{ $value }}ms" + + - alert: HighExternalPingLossRate + expr: rate(pinger_external_lost_total[5m]) > 0.05 + for: 5m + labels: + severity: warning + component: external-network + annotations: + summary: "High external ping loss rate" + description: "External ping loss rate from node {{ $labels.node_name }} is {{ $value }}" + + - alert: InconsistentPortBindings + expr: pinger_inconsistent_port_binding > 0 + for: 5m + labels: + severity: warning + component: ovn + annotations: + summary: "Inconsistent port bindings detected" + description: "{{ $value }} inconsistent port bindings between OVS and OVN-SB on node {{ $labels.node_name }}" + + - name: kube-ovn-datapath + interval: 30s rules: - - alert: OVNDBUnhealthy - expr: min(kube_ovn_db_status) == 0 - for: 2m - labels: - severity: critical - annotations: - summary: OVN NB/SB DB reported unhealthy. - - - alert: OVNNoLeader - expr: sum(max by (pod) (kube_ovn_cluster_leader_self)) == 0 - for: 1m - labels: - severity: critical - annotations: - summary: No OVN Raft leader detected. - - - alert: OVNUnhealthy - expr: min(kube_ovn_ovn_status) == 0 - for: 2m - labels: - severity: critical - annotations: - summary: OVN reported unhealthy status. - - - alert: OVNFailedRequestsSpike - expr: increase(kube_ovn_failed_req_count[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Failed requests to OVN increased in the last 5 minutes. 
- - - name: kubeovn.ovs.alerts + - alert: HighDatapathFlowLookupMissRate + expr: rate(dp_flows_lookup_missed[5m]) / rate(dp_flows_lookup_hit[5m]) > 0.5 + for: 10m + labels: + severity: warning + component: ovs-datapath + annotations: + summary: "High datapath flow lookup miss rate" + description: "Datapath {{ $labels.datapath }} on {{ $labels.instance }} has high flow lookup miss rate: {{ $value | humanizePercentage }}" + + - alert: DatapathFlowsLost + expr: rate(dp_flows_lookup_lost[5m]) > 1 + for: 5m + labels: + severity: warning + component: ovs-datapath + annotations: + summary: "Datapath flows being lost" + description: "Datapath {{ $labels.datapath }} on {{ $labels.instance }} is losing {{ $value }} flows per second" + + - alert: LowDatapathMaskHitRatio + expr: dp_masks_hit_ratio < 1 + for: 10m + labels: + severity: info + component: ovs-datapath + annotations: + summary: "Low datapath mask hit ratio" + description: "Datapath {{ $labels.datapath }} on {{ $labels.instance }} has low mask hit ratio: {{ $value }}" + + - name: kube-ovn-performance + interval: 30s rules: - - alert: OVSUnhealthy - expr: min(ovs_status) == 0 - for: 2m - labels: - severity: critical - annotations: - summary: OVS reported unhealthy status on at least one node. - - - alert: OVSInterfaceLinkDown - expr: | - max by (interface) ( - interface_link_state{interface=~"br-.*|genev.*|vxlan.*"} - ) == 0 - for: 10m - labels: - severity: critical - annotations: - summary: Critical OVS interface (bridge or tunnel) link down for 10 minutes. - - - alert: OVSFailedRequestsSpike - expr: increase(failed_req_count[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: Failed requests to OVS increased in the last 5 minutes. 
- - - name: kubeovn.pinger.alerts + - alert: HighCNIOperationLatency + expr: histogram_quantile(0.99, rate(cni_op_latency_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + component: cni + annotations: + summary: "High CNI operation latency" + description: "P99 CNI operation latency on {{ $labels.instance }} is {{ $value }}s" + + - alert: HighCNIAddressWaitTime + expr: rate(cni_wait_address_seconds_total[5m]) > 2 + for: 5m + labels: + severity: warning + component: cni + annotations: + summary: "High CNI address wait time" + description: "CNI is waiting {{ $value }}s per operation for address assignment on {{ $labels.instance }}" + + - alert: HighOVSClientLatency + expr: histogram_quantile(0.99, rate(ovs_client_request_latency_milliseconds_bucket[5m])) > 1000 + for: 5m + labels: + severity: warning + component: ovs-client + annotations: + summary: "High OVS client request latency" + description: "P99 OVS client request latency on {{ $labels.instance }} is {{ $value }}ms" + + - alert: HighControllerRESTLatency + expr: histogram_quantile(0.99, rate(rest_client_request_latency_seconds_bucket{job="kube-ovn-controller"}[5m])) > 5 + for: 5m + labels: + severity: warning + component: controller + annotations: + summary: "High controller REST client latency" + description: "P99 REST client latency for {{ $labels.verb }} {{ $labels.url }} is {{ $value }}s" + + - name: kubeovn.cluster + interval: 30s rules: - - alert: NodeAPIServerUnhealthy - expr: max by (node) (pinger_apiserver_unhealthy) == 1 - for: 5m - labels: - severity: critical - annotations: - summary: API server unhealthy on a node. - - - alert: InternalDNSUnhealthy - expr: max by (node) (pinger_internal_dns_unhealthy) == 1 - for: 10m - labels: - severity: critical - annotations: - summary: Internal DNS unhealthy on a node. 
- - - alert: ExternalDNSUnhealthy - expr: max by (node) (pinger_external_dns_unhealthy) == 1 - for: 10m - labels: - severity: critical - annotations: - summary: External DNS unhealthy on a node. - - - alert: PodToPodPacketLossHigh - expr: (increase(pinger_pod_ping_lost_total[5m]) / clamp_min(increase(pinger_pod_ping_count_total[5m]), 1)) > 0.05 - for: 10m - labels: - severity: critical - annotations: - summary: Pod-to-pod packet loss >5% over 10 minutes. - - - alert: PodToNodePacketLossHigh - expr: (increase(pinger_node_ping_lost_total[5m]) / clamp_min(increase(pinger_node_ping_count_total[5m]), 1)) > 0.05 - for: 10m - labels: - severity: critical - annotations: - summary: Pod-to-node packet loss >5% over 10 minutes. - - - alert: ExternalPingPacketLossHigh - expr: (increase(pinger_external_lost_total[5m]) / clamp_min(increase(pinger_external_ping_latency_ms_count[5m]), 1)) > 0.05 - for: 10m - labels: - severity: critical - annotations: - summary: External connectivity packet loss >5% over 10 minutes. - - - alert: ExternalLatencyP99High - expr: histogram_quantile(0.99, sum by (le) (rate(pinger_external_ping_latency_ms_bucket[5m]))) > 200 - for: 10m - labels: - severity: warning - annotations: - summary: External ping P99 latency >200 ms. 
+ - alert: OVNClusterLogNotCommitted + expr: kube_ovn_cluster_log_not_committed > 100 + for: 5m + labels: + severity: warning + component: ovn-cluster + annotations: + summary: "OVN cluster has uncommitted log entries" + description: "OVN cluster server {{ $labels.instance }} has {{ $value }} uncommitted log entries" + + - alert: OVNClusterLogNotApplied + expr: kube_ovn_cluster_log_not_applied > 100 + for: 5m + labels: + severity: warning + component: ovn-cluster + annotations: + summary: "OVN cluster has unapplied log entries" + description: "OVN cluster server {{ $labels.instance }} has {{ $value }} unapplied log entries" + + - alert: HighOVNClusterConnectionErrors + expr: rate(kube_ovn_cluster_inbound_connections_error_total[5m]) > 0.1 or rate(kube_ovn_cluster_outbound_connections_error_total[5m]) > 0.1 + for: 5m + labels: + severity: warning + component: ovn-cluster + annotations: + summary: "High OVN cluster connection error rate" + description: "OVN cluster server {{ $labels.instance }} has {{ $value }} connection errors per second" From f9efa908c101af120ce9f22308c0dfeb3040b373 Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Wed, 15 Oct 2025 13:58:58 +0300 Subject: [PATCH 4/6] No external dns check --- .../kube-ovn/observability/alerts.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/flux/apps/kube-system/kube-ovn/observability/alerts.yml b/flux/apps/kube-system/kube-ovn/observability/alerts.yml index 789a9e2..c9ff2f2 100644 --- a/flux/apps/kube-system/kube-ovn/observability/alerts.yml +++ b/flux/apps/kube-system/kube-ovn/observability/alerts.yml @@ -215,15 +215,15 @@ spec: summary: "Internal DNS is unhealthy" description: "Internal DNS is unhealthy from node {{ $labels.node_name }}" - - alert: ExternalDNSUnhealthy - expr: pinger_external_dns_unhealthy == 1 - for: 2m - labels: - severity: warning - component: dns - annotations: - summary: "External DNS is unhealthy" - description: "External DNS is unhealthy from 
node {{ $labels.node_name }}" + # - alert: ExternalDNSUnhealthy + # expr: pinger_external_dns_unhealthy == 1 + # for: 2m + # labels: + # severity: warning + # component: dns + # annotations: + # summary: "External DNS is unhealthy" + # description: "External DNS is unhealthy from node {{ $labels.node_name }}" - alert: HighInternalDNSLatency expr: histogram_quantile(0.99, rate(pinger_internal_dns_latency_ms_bucket[5m])) > 500 From 44e4646fc6877e22debabeb21db6aa43fca065d7 Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Thu, 16 Oct 2025 09:13:29 +0300 Subject: [PATCH 5/6] Fix node label --- flux/apps/observability/alloy/app/metrics/alloy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flux/apps/observability/alloy/app/metrics/alloy.yml b/flux/apps/observability/alloy/app/metrics/alloy.yml index c425c34..85cbb82 100644 --- a/flux/apps/observability/alloy/app/metrics/alloy.yml +++ b/flux/apps/observability/alloy/app/metrics/alloy.yml @@ -140,7 +140,7 @@ spec: prometheus.scrape "nodeexporter" { targets = discovery.relabel.nodeexporter_targets.output - job_name = "integrations/node_exporter" + job_name = "node" forward_to = [prometheus.relabel.relable.receiver] } From e275ff0521103daa547f5a13783012fe6a1af193 Mon Sep 17 00:00:00 2001 From: Anatolios Laskaris Date: Tue, 21 Oct 2025 09:51:02 +0300 Subject: [PATCH 6/6] No ping alerts for now --- .../kube-ovn/observability/alerts.yml | 118 +++++++++--------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/flux/apps/kube-system/kube-ovn/observability/alerts.yml b/flux/apps/kube-system/kube-ovn/observability/alerts.yml index c9ff2f2..bbf6dea 100644 --- a/flux/apps/kube-system/kube-ovn/observability/alerts.yml +++ b/flux/apps/kube-system/kube-ovn/observability/alerts.yml @@ -235,65 +235,65 @@ spec: summary: "High internal DNS latency" description: "P99 internal DNS latency from node {{ $labels.node_name }} is {{ $value }}ms" - - alert: HighPodPingLatency - expr: 
histogram_quantile(0.99, rate(pinger_pod_ping_latency_ms_bucket[5m])) > 100 - for: 5m - labels: - severity: warning - component: pod-network - annotations: - summary: "High pod-to-pod ping latency" - description: "P99 pod-to-pod ping latency on node {{ $labels.node_name }} is {{ $value }}ms" - - - alert: HighPodPingLossRate - expr: rate(pinger_pod_ping_lost_total[5m]) / rate(pinger_pod_ping_count_total[5m]) > 0.01 - for: 5m - labels: - severity: warning - component: pod-network - annotations: - summary: "High pod ping loss rate" - description: "Pod ping loss rate on node {{ $labels.node_name }} is {{ $value | humanizePercentage }}" - - - alert: HighNodePingLatency - expr: histogram_quantile(0.99, rate(pinger_node_ping_latency_ms_bucket[5m])) > 50 - for: 5m - labels: - severity: warning - component: node-network - annotations: - summary: "High pod-to-node ping latency" - description: "P99 pod-to-node ping latency on node {{ $labels.node_name }} is {{ $value }}ms" - - - alert: HighNodePingLossRate - expr: rate(pinger_node_ping_lost_total[5m]) / rate(pinger_node_ping_count_total[5m]) > 0.01 - for: 5m - labels: - severity: warning - component: node-network - annotations: - summary: "High node ping loss rate" - description: "Node ping loss rate on {{ $labels.node_name }} is {{ $value | humanizePercentage }}" - - - alert: HighExternalPingLatency - expr: histogram_quantile(0.99, rate(pinger_external_ping_latency_ms_bucket[5m])) > 200 - for: 5m - labels: - severity: info - component: external-network - annotations: - summary: "High external ping latency" - description: "P99 external ping latency from node {{ $labels.node_name }} is {{ $value }}ms" - - - alert: HighExternalPingLossRate - expr: rate(pinger_external_lost_total[5m]) > 0.05 - for: 5m - labels: - severity: warning - component: external-network - annotations: - summary: "High external ping loss rate" - description: "External ping loss rate from node {{ $labels.node_name }} is {{ $value }}" + # - alert: 
HighPodPingLatency + # expr: histogram_quantile(0.99, rate(pinger_pod_ping_latency_ms_bucket[5m])) > 100 + # for: 5m + # labels: + # severity: warning + # component: pod-network + # annotations: + # summary: "High pod-to-pod ping latency" + # description: "P99 pod-to-pod ping latency on node {{ $labels.node_name }} is {{ $value }}ms" + # + # - alert: HighPodPingLossRate + # expr: rate(pinger_pod_ping_lost_total[5m]) / rate(pinger_pod_ping_count_total[5m]) > 0.01 + # for: 5m + # labels: + # severity: warning + # component: pod-network + # annotations: + # summary: "High pod ping loss rate" + # description: "Pod ping loss rate on node {{ $labels.node_name }} is {{ $value | humanizePercentage }}" + # + # - alert: HighNodePingLatency + # expr: histogram_quantile(0.99, rate(pinger_node_ping_latency_ms_bucket[5m])) > 50 + # for: 5m + # labels: + # severity: warning + # component: node-network + # annotations: + # summary: "High pod-to-node ping latency" + # description: "P99 pod-to-node ping latency on node {{ $labels.node_name }} is {{ $value }}ms" + # + # - alert: HighNodePingLossRate + # expr: rate(pinger_node_ping_lost_total[5m]) / rate(pinger_node_ping_count_total[5m]) > 0.01 + # for: 5m + # labels: + # severity: warning + # component: node-network + # annotations: + # summary: "High node ping loss rate" + # description: "Node ping loss rate on {{ $labels.node_name }} is {{ $value | humanizePercentage }}" + # + # - alert: HighExternalPingLatency + # expr: histogram_quantile(0.99, rate(pinger_external_ping_latency_ms_bucket[5m])) > 200 + # for: 5m + # labels: + # severity: info + # component: external-network + # annotations: + # summary: "High external ping latency" + # description: "P99 external ping latency from node {{ $labels.node_name }} is {{ $value }}ms" + # + # - alert: HighExternalPingLossRate + # expr: rate(pinger_external_lost_total[5m]) > 0.05 + # for: 5m + # labels: + # severity: warning + # component: external-network + # annotations: + # summary: 
"High external ping loss rate" + # description: "External ping loss rate from node {{ $labels.node_name }} is {{ $value }}" - alert: InconsistentPortBindings expr: pinger_inconsistent_port_binding > 0