# modules/021-cni-cilium/monitoring/prometheus-rules/agent.yaml

# Prometheus alerting rule group for the Cilium agent pods (cni-cilium module).
- name: d8.cni-cilium.agent
  # Each entry below is one alerting rule evaluated per agent pod.
  rules:
  - alert: CiliumAgentUnreachableHealthEndpoints
    # Per agent pod: at least one health endpoint is reported as unreachable.
    #
    # The right-hand side is a cluster-wide gate (joined with `and on()`, i.e.
    # on an empty label set) that only lets the per-pod results through while
    # the set of agent pods is stable: the number of agent `kube_pod_info`
    # series that had samples within the last 2 minutes must equal the number
    # of such series that existed 2 minutes ago. A freshly created agent pod
    # makes the two counts differ and therefore mutes the alert for a while.
    expr: |
      max by (namespace, pod) (cilium_unreachable_health_endpoints) > 0
      and on()
      (
        (
          count(
            changes(kube_pod_info{created_by_name="agent",namespace="d8-cni-cilium"}[2m])
          )
          -
          count(
            kube_pod_info{created_by_name="agent",namespace="d8-cni-cilium"} offset 2m
          )
        ) == 0
      )
    # The condition has to hold continuously for 5 minutes before firing.
    for: 5m
    labels:
      d8_module: cni-cilium
      d8_component: agent
      severity_level: "4"
      tier: cluster
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      # Attach the alert to the shared CiliumAgentMalfunctioning group,
      # creating that group when it does not exist yet.
      plk_grouped_by__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      plk_create_group_if_not_exists__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      summary: Some node's health endpoints are not reachable by agent {{ $labels.namespace }}/{{ $labels.pod }}.
      description: |
        Check what's going on: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`

  - alert: CiliumAgentMetricNotFound
    # Fires for an agent pod that exports exactly one of the two metrics
    # `cilium_unreachable_health_endpoints` / `cilium_endpoint_state`, i.e. one
    # of them is present while the other one is missing.
    #
    # The check is done with `unless on (namespace, pod)` in both directions
    # instead of comparing a series count with 1: `cilium_endpoint_state` has
    # one series per `endpoint_state` label value, so its per-pod count is not
    # a presence indicator, and an `or` between the two counts hides the
    # right-hand metric whenever the left-hand one exists.
    #
    # Limitation: a pod that exports neither metric produces no series at all
    # and is therefore not detected by this rule.
    expr: |
      (
        count by (namespace, pod) (cilium_endpoint_state)
        unless on (namespace, pod)
        cilium_unreachable_health_endpoints
      )
      or
      (
        count by (namespace, pod) (cilium_unreachable_health_endpoints)
        unless on (namespace, pod)
        cilium_endpoint_state
      )
    # The condition has to hold continuously for 5 minutes before firing.
    for: 5m
    labels:
      d8_module: cni-cilium
      d8_component: agent
      severity_level: "4"
      tier: cluster
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      # Attach the alert to the shared CiliumAgentMalfunctioning group,
      # creating that group when it does not exist yet.
      plk_grouped_by__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      plk_create_group_if_not_exists__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      summary: Some of the metrics are not coming from the agent {{ $labels.namespace }}/{{ $labels.pod }}.
      description: |
        Use the following commands to check what's going on:
        - `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`
        - `kubectl -n {{ $labels.namespace }} exec -ti {{ $labels.pod }} cilium-health status`

        We need to cross-check the metrics with the neighboring agent.
        Also the absence of metrics is an indirect sign that new pods cannot be created on the node because of the inability to connect to the agent.
        It is important to get a more specific way of determining the above situation and create a more accurate alert for the inability to connect new pods to the agent.

  - alert: CiliumAgentEndpointsNotReady
    # Fires when fewer than half of the endpoints known to an agent pod are in
    # the `ready` state.
    #
    # Numerator and denominator are aggregated per (namespace, pod) BEFORE the
    # division. Dividing the raw series would match one-to-one on every label,
    # including `endpoint_state`, so the ready series would only ever be
    # divided by itself and the ratio would always be 1.
    #
    # The `or ... * 0` branch supplies a zero numerator for pods that export
    # `cilium_endpoint_state` but have no `endpoint_state="ready"` series, so
    # that such pods are still evaluated. A pod with a total of 0 yields NaN,
    # which does not satisfy `< 0.5` and hence does not fire.
    expr: |
      (
        (
          sum by (namespace, pod) (cilium_endpoint_state{endpoint_state="ready"})
          or
          sum by (namespace, pod) (cilium_endpoint_state) * 0
        )
        /
        sum by (namespace, pod) (cilium_endpoint_state)
      ) < 0.5
    # The condition has to hold continuously for 5 minutes before firing.
    for: 5m
    labels:
      d8_module: cni-cilium
      d8_component: agent
      severity_level: "4"
      tier: cluster
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      # Attach the alert to the shared CiliumAgentMalfunctioning group,
      # creating that group when it does not exist yet.
      plk_grouped_by__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      plk_create_group_if_not_exists__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      summary: More than half of all known Endpoints are not ready in agent {{ $labels.namespace }}/{{ $labels.pod }}.
      description: |
        Check what's going on: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`

  - alert: CiliumAgentMapPressureCritical
    # One result per (namespace, pod, map_name) for which the reported
    # `cilium_bpf_map_pressure` value is above 0.9. The comparison acts as a
    # filter inside the aggregation, so only offending series are summed.
    expr: |
      sum by (namespace, pod, map_name) (
        cilium_bpf_map_pressure > 0.9
      )
    # The condition has to hold continuously for 5 minutes before firing.
    for: 5m
    labels:
      d8_module: cni-cilium
      d8_component: agent
      severity_level: "4"
      # Label values are strings, hence the quoted "true".
      experimental: "true"
      tier: cluster
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      # Attach the alert to the shared CiliumAgentMalfunctioning group,
      # creating that group when it does not exist yet.
      plk_grouped_by__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      plk_create_group_if_not_exists__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      summary: eBPF map {{ $labels.map_name }} is more than 90% full in agent {{ $labels.namespace }}/{{ $labels.pod }}.
      description: We've reached resource limit of eBPF maps. Consult with vendor for possible remediation steps.

  - alert: CiliumAgentPolicyImportErrors
    # One result per agent pod whose `cilium_policy_import_errors_total`
    # counter has a positive per-second rate over the last 2 minutes, i.e. the
    # counter increased within that window.
    expr: |
      sum by (namespace, pod) (
        rate(cilium_policy_import_errors_total[2m]) > 0
      )
    # The condition has to hold continuously for 5 minutes before firing.
    for: 5m
    labels:
      d8_module: cni-cilium
      d8_component: agent
      severity_level: "4"
      tier: cluster
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      # Attach the alert to the shared CiliumAgentMalfunctioning group,
      # creating that group when it does not exist yet.
      plk_grouped_by__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      plk_create_group_if_not_exists__main: CiliumAgentMalfunctioning,prometheus=deckhouse,tier=~tier,kubernetes=~kubernetes
      summary: Agent {{ $labels.namespace }}/{{ $labels.pod }} fails to import policies.
      description: |
        Check what's going on: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`