From c706f62ca2e283d7d5be85b080d6355701febe92 Mon Sep 17 00:00:00 2001 From: Michael Borens Date: Thu, 8 Feb 2024 21:57:00 +0100 Subject: [PATCH] feat: add prometheusRule --- README.md | 26 +++ templates/metrics/_alerts.yaml.tpl | 205 ++++++++++++++++++++++ templates/metrics/_helpers-monitoring.tpl | 18 ++ templates/metrics/metrics-rules.yaml | 16 ++ templates/metrics/metrics-svcmon.yaml | 2 +- values.yaml | 72 ++++++-- 6 files changed, 327 insertions(+), 12 deletions(-) create mode 100644 templates/metrics/_alerts.yaml.tpl create mode 100644 templates/metrics/_helpers-monitoring.tpl create mode 100644 templates/metrics/metrics-rules.yaml diff --git a/README.md b/README.md index e474fdcc6..ce59c2e3b 100644 --- a/README.md +++ b/README.md @@ -378,10 +378,36 @@ The following table lists the configurable parameters of the Harbor chart and th | `metrics.exporter.path` | the url path for exporter metrics | `/metrics` | | `metrics.exporter.port` | the port for exporter metrics | `8001` | | `metrics.serviceMonitor.enabled` | create prometheus serviceMonitor. Requires prometheus CRD's | `false` | +| `metrics.serviceMonitor.matchLabels` | override the default selector `matchLabels` of the serviceMonitor; when empty the chart's own match labels are used | `{}` | | `metrics.serviceMonitor.additionalLabels` | additional labels to upsert to the manifest | `""` | | `metrics.serviceMonitor.interval` | scrape period for harbor metrics | `""` | | `metrics.serviceMonitor.metricRelabelings` | metrics relabel to add/mod/del before ingestion | `[]` | | `metrics.serviceMonitor.relabelings` | relabels to add/mod/del to sample before scrape | `[]` | +| `metrics.rules.enabled` | create prometheus prometheusRule. Requires prometheus CRD's and `metrics.enabled`, `metrics.serviceMonitor.enabled` and `metrics.rules.alerting` set to `true` | `false` | +| `metrics.rules.disabled` | specify which individual alerts should be disabled. 
| `{}` | +| `metrics.rules.alerting` | set to `false` to disable all alerts at once instead of turning them off one by one | `true` | +| `metrics.rules.additionalAggregationLabels` | additional label names added to the `sum by (...)` aggregation of the alert expressions | `[]` | +| `metrics.rules.additionalLabels` | additional labels for PrometheusRule resource | `{}` | +| `metrics.rules.additionalRuleLabels` | Additional labels added to every PrometheusRule alert | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborCoreDown` | Additional labels for specific PrometheusRule alert groups HarborCoreDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborDatabaseDown` | Additional labels for specific PrometheusRule alert groups HarborDatabaseDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborRegistryDown` | Additional labels for specific PrometheusRule alert groups HarborRegistryDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborRedisDown` | Additional labels for specific PrometheusRule alert groups HarborRedisDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborTrivyDown` | Additional labels for specific PrometheusRule alert groups HarborTrivyDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborJobServiceDown` | Additional labels for specific PrometheusRule alert groups HarborJobServiceDown | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborLatency99` | Additional labels for specific PrometheusRule alert groups HarborLatency99 | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborRateErrors` | Additional labels for specific PrometheusRule alert groups HarborRateErrors | `{}` | +| `metrics.rules.additionalRuleGroupLabels.HarborQuotaProjectLimit` | Additional labels for specific PrometheusRule alert groups HarborQuotaProjectLimit | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborCoreDown` | Additional annotations for specific PrometheusRule alert groups HarborCoreDown | `{}` | +| 
`metrics.rules.additionalRuleGroupAnnotations.HarborDatabaseDown` | Additional annotations for specific PrometheusRule alert groups HarborDatabaseDown | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborRegistryDown` | Additional annotations for specific PrometheusRule alert groups HarborRegistryDown | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborRedisDown` | Additional annotations for specific PrometheusRule alert groups HarborRedisDown | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborTrivyDown` | Additional annotations for specific PrometheusRule alert groups HarborTrivyDown | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborJobServiceDown` | Additional annotations for specific PrometheusRule alert groups HarborJobServiceDown | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborLatency99` | Additional annotations for specific PrometheusRule alert groups HarborLatency99 | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborRateErrors` | Additional annotations for specific PrometheusRule alert groups HarborRateErrors | `{}` | +| `metrics.rules.additionalRuleGroupAnnotations.HarborQuotaProjectLimit` | Additional annotations for specific PrometheusRule alert groups HarborQuotaProjectLimit | `{}` | +| `metrics.rules.additionalGroups` | additional groups to add to the rules file | `[]` | | **Trace** | | | | `trace.enabled` | Enable tracing or not | `false` | | `trace.provider` | The tracing provider: `jaeger` or `otel`. 
`jaeger` should be 1.26+ | `jaeger` |
diff --git a/templates/metrics/_alerts.yaml.tpl b/templates/metrics/_alerts.yaml.tpl
new file mode 100644
index 000000000..b5fddb743
--- /dev/null
+++ b/templates/metrics/_alerts.yaml.tpl
@@ -0,0 +1,205 @@
+{{/* Base alerts for Harbor: renders the "harbor_alerts" rule group. Each alert is skipped when .Values.metrics.rules.disabled.<Alert> is true; labels come from additionalRuleLabels and additionalRuleGroupLabels.<Alert>, annotations from additionalRuleGroupAnnotations.<Alert>. HarborLatency99 always keeps `le` in its aggregation because histogram_quantile needs the bucket label. */}}
+{{- define "harbor.rules" -}}
+groups:
+- name: harbor_alerts
+  rules:
+{{- if not (.Values.metrics.rules.disabled.HarborCoreDown | default false) }}
+    - alert: HarborCoreDown
+      annotations:
+        summary: Harbor core is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborCoreDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborCoreDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="core"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborCoreDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborCoreDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborDatabaseDown | default false) }}
+    - alert: HarborDatabaseDown
+      annotations:
+        summary: Harbor database is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborDatabaseDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborDatabaseDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="database"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborDatabaseDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborDatabaseDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborRegistryDown | default false) }}
+    - alert: HarborRegistryDown
+      annotations:
+        summary: Harbor registry is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRegistryDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRegistryDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="registry"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborRegistryDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborRegistryDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborRedisDown | default false) }}
+    - alert: HarborRedisDown
+      annotations:
+        summary: Harbor redis is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRedisDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRedisDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="redis"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborRedisDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborRedisDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborTrivyDown | default false) }}
+    - alert: HarborTrivyDown
+      annotations:
+        summary: Harbor trivy is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborTrivyDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborTrivyDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="trivy"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborTrivyDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborTrivyDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborJobServiceDown | default false) }}
+    - alert: HarborJobServiceDown
+      annotations:
+        summary: Harbor job service is down.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborJobServiceDown }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborJobServiceDown | indent 8 }}
+{{- end }}
+      expr: |-
+        harbor_up{component="jobservice"} == 0
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborJobServiceDown }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborJobServiceDown }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborLatency99 | default false) }}
+    - alert: HarborLatency99
+      annotations:
+        summary: Harbor p99 latency is higher than 10 seconds.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborLatency99 }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborLatency99 | indent 8 }}
+{{- end }}
+      expr: |-
+        histogram_quantile(0.99,
+          sum by ({{ range $.Values.metrics.rules.additionalAggregationLabels }}{{ if ne (toString .) "le" }}{{ . }},{{ end }}{{ end }}le)(
+            rate(registry_http_request_duration_seconds_bucket[30m])))
+        > 10
+      for: 5m
+      labels:
+        severity: warning
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborLatency99 }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborLatency99 }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborRateErrors | default false) }}
+    - alert: HarborRateErrors
+      annotations:
+        summary: Harbor Error Rate is High.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRateErrors }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborRateErrors | indent 8 }}
+{{- end }}
+      expr: |-
+        sum by ({{ range $.Values.metrics.rules.additionalAggregationLabels }}{{ . }},{{ end }})(
+          rate(registry_http_requests_total{code=~"4..|5.."}[5m])
+        )
+        /
+        sum by ({{ range $.Values.metrics.rules.additionalAggregationLabels }}{{ . }},{{ end }})(
+          rate(registry_http_requests_total[5m])
+        )
+        > 0.15
+      for: 5m
+      labels:
+        severity: warning
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborRateErrors }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborRateErrors }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- if not (.Values.metrics.rules.disabled.HarborQuotaProjectLimit | default false) }}
+    - alert: HarborQuotaProjectLimit
+      annotations:
+        summary: Project Quota Is Raising The Limit.
+{{- if .Values.metrics.rules.additionalRuleGroupAnnotations.HarborQuotaProjectLimit }}
+{{ toYaml .Values.metrics.rules.additionalRuleGroupAnnotations.HarborQuotaProjectLimit | indent 8 }}
+{{- end }}
+      expr: |-
+        ((harbor_project_quota_usage_byte > 0) / harbor_quotas_size_bytes) > 0.95
+      for: 5m
+      labels:
+        severity: critical
+{{- if or .Values.metrics.rules.additionalRuleLabels .Values.metrics.rules.additionalRuleGroupLabels.HarborQuotaProjectLimit }}
+{{- with .Values.metrics.rules.additionalRuleLabels }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.metrics.rules.additionalRuleGroupLabels.HarborQuotaProjectLimit }}
+        {{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- end }}
diff --git a/templates/metrics/_helpers-monitoring.tpl b/templates/metrics/_helpers-monitoring.tpl
new file mode 100644
index 000000000..56cfdc630
--- /dev/null
+++ b/templates/metrics/_helpers-monitoring.tpl
@@ -0,0 +1,18 @@
+{{/*
+Convert a recording rule group to yaml
+*/}}
+{{- define "harbor.ruleGroupToYaml" -}}
+{{- range . 
}}
+- name: {{ .name | quote }}
+  rules:
+  {{- toYaml .rules | nindent 2 }}
+{{- end }}
+{{- end }}
+
+{{- define "harbor.serviceMonitorMatchLabels"}}
+{{- if .Values.metrics.serviceMonitor.matchLabels }}
+{{- toYaml .Values.metrics.serviceMonitor.matchLabels }}
+{{- else }}
+{{- include "harbor.matchLabels" $ }}
+{{- end }}
+{{- end }}
diff --git a/templates/metrics/metrics-rules.yaml b/templates/metrics/metrics-rules.yaml
new file mode 100644
index 000000000..efbcca02c
--- /dev/null
+++ b/templates/metrics/metrics-rules.yaml
@@ -0,0 +1,16 @@
+{{- if and .Values.metrics.enabled .Values.metrics.serviceMonitor.enabled .Values.metrics.rules.enabled .Values.metrics.rules.alerting }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "harbor.labels" $ | nindent 4 }}
+    {{- with .Values.metrics.rules.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  name: {{ template "harbor.fullname" . }}
+spec:
+  groups:
+  {{- include "harbor.ruleGroupToYaml" (include "harbor.rules" . | fromYaml).groups | indent 2 }}
+  {{- include "harbor.ruleGroupToYaml" .Values.metrics.rules.additionalGroups | indent 2 }}
+{{- end }}
diff --git a/templates/metrics/metrics-svcmon.yaml b/templates/metrics/metrics-svcmon.yaml
index 1122ef01e..8e6af626e 100644
--- a/templates/metrics/metrics-svcmon.yaml
+++ b/templates/metrics/metrics-svcmon.yaml
@@ -24,5 +24,5 @@ spec:
 {{ toYaml .Values.metrics.serviceMonitor.relabelings | indent 4 }}
 {{- end }}
   selector:
-    matchLabels: {{ include "harbor.matchLabels" . | nindent 6 }}
+    matchLabels: {{ include "harbor.serviceMonitorMatchLabels" . | nindent 6 }}
 {{- end }}
diff --git a/values.yaml b/values.yaml
index 3c4e87081..ed076c472 100644
--- a/values.yaml
+++ b/values.yaml
@@ -398,24 +398,74 @@ metrics:
   ##
   serviceMonitor:
     enabled: false
+    # -- Override base matchLabels
+    matchLabels: {}
+    # -- Additional labels for serviceMonitor
     additionalLabels: {}
     # Scrape interval. 
If not set, the Prometheus default scrape interval is used.
     interval: ""
     # Metric relabel configs to apply to samples before ingestion.
-    metricRelabelings:
-      []
-      # - action: keep
+    metricRelabelings: []
+    # - action: keep
       #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
       #   sourceLabels: [__name__]
     # Relabel configs to apply to samples before ingestion.
-    relabelings:
-      []
-      # - sourceLabels: [__meta_kubernetes_pod_node_name]
-      #   separator: ;
-      #   regex: ^(.*)$
-      #   targetLabel: nodename
-      #   replacement: $1
-      #   action: replace
+    relabelings: []
+    # - sourceLabels: [__meta_kubernetes_pod_node_name]
+    #   separator: ;
+    #   regex: ^(.*)$
+    #   targetLabel: nodename
+    #   replacement: $1
+    #   action: replace
+  rules:
+    enabled: false
+    # -- Set to false to disable all alerts at once instead of turning each one off individually.
+    # If every alert is disabled below while .rules.alerting stays true, the chart will fail to render.
+    alerting: true
+    # -- Specify which individual alerts should be disabled
+    disabled: {}
+    # HarborCoreDown: true
+    # HarborDatabaseDown: true
+    # -- Additional label names for the `sum by (...)` aggregation in alert expressions
+    additionalAggregationLabels: []
+    # -- Additional labels for the rules PrometheusRule resource
+    additionalLabels: {}
+    # -- Additional labels for PrometheusRule alerts
+    additionalRuleLabels: {}
+    # -- Additional labels (additionalRuleGroupLabels) and annotations (additionalRuleGroupAnnotations) for specific PrometheusRule alerts
+    additionalRuleGroupLabels:
+      HarborCoreDown: {}
+      HarborDatabaseDown: {}
+      HarborRegistryDown: {}
+      HarborRedisDown: {}
+      HarborTrivyDown: {}
+      HarborJobServiceDown: {}
+      HarborLatency99: {}
+      HarborRateErrors: {}
+      HarborQuotaProjectLimit: {}
+    additionalRuleGroupAnnotations:
+      HarborCoreDown: {}
+      HarborDatabaseDown: {}
+      HarborRegistryDown: {}
+      HarborRedisDown: {}
+      HarborTrivyDown: {}
+      HarborJobServiceDown: {}
+      HarborLatency99: {}
+      HarborRateErrors: {}
+      HarborQuotaProjectLimit: {}
+    # -- Additional groups to add to the rules file
+    additionalGroups: []
+    # - name: additional-harbor-rules
+    #   rules:
+    #     - record: my:record:name
+    #       expr: my_expression
+    #     - alert: MyAlertName
+    #       expr: my:record:name > 0
+    #       annotations:
+    #         ...
+    #       labels:
+    #         ...
+    #       for: 1m
 
 trace:
   enabled: false