From 832f587f0b1b21b104ea746891c071b77a848283 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Tue, 16 Jul 2024 11:02:27 -0400 Subject: [PATCH] jsonnet: allow rollout-operator to be used as webhook endpoint This change adds jsonnet configuration that allows the rollout-operator to be used as a validating or mutating webhook for changes to statefulsets. This is required to use the `no-downscale` or `prepare-downscale` labels on stateful components. This doesn't make any changes to the functionality used internally at Grafana or make anything more configurable. For example, this jsonnet still has the rollout-operator create self-signed certificates for the HTTPS webhook endpoints. Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 3 +- .../grafana-agent-cluster-role-binding.yaml | 1 - .../grafana-agent-cluster-role.yaml | 1 - .../grafana-agent-cluster-role-binding.yaml | 1 - .../grafana-agent-cluster-role.yaml | 1 - ...teway-automated-downscaling-generated.yaml | 150 ++++++++++++++++ ...tore-gateway-automated-downscaling.jsonnet | 1 + operations/mimir/rollout-operator.libsonnet | 163 +++++++++++++++++- operations/policies/policies.rego | 2 +- 9 files changed, 313 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2212addee1..90a855ba49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,6 +73,7 @@ * `ingest_storage_migration_partition_ingester_zone_c_replicas` * [ENHANCEMENT] Distributor: increase `-distributor.remote-timeout` when the experimental ingest storage is enabled. #8518 * [ENHANCEMENT] Memcached: Update to Memcached 1.6.28 and memcached-exporter 0.14.4. #8557 +* [ENHANCEMENT] Rollout-operator: Allow the rollout-operator to be used as a Kubernetes statefulset webhook to enable `no-downscale` and `prepare-downscale` labels to be used on ingesters or store-gateways. #8743 ### Mimirtool @@ -82,8 +83,8 @@ * [CHANGE] Use test metrics that do not pass through 0 to make identifying incorrect results easier. 
#8630 * [ENHANCEMENT] Include human-friendly timestamps in diffs logged when a test fails. #8630 -* [BUGFIX] Initialize test result metrics to 0 at startup so that alerts can correctly identify the first failure after startup. #8630 * [ENHANCEMENT] Add histograms to measure latency of read and write requests. #8583 +* [BUGFIX] Initialize test result metrics to 0 at startup so that alerts can correctly identify the first failure after startup. #8630 ### Query-tee diff --git a/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml b/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml index 47d8cf59bd..23986ba580 100644 --- a/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml +++ b/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml @@ -4,7 +4,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: {{ include "mimir.resourceName" (dict "ctx" $ "component" "grafana-agent") }} - namespace: {{ .namespace | default $.Release.Namespace | quote }} labels: {{- include "mimir.labels" (dict "ctx" $ "component" "meta-monitoring" ) | nindent 4 }} roleRef: diff --git a/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml b/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml index 678d9165d2..a65e905610 100644 --- a/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml +++ b/operations/helm/charts/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml @@ -4,7 +4,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: {{ include "mimir.resourceName" (dict "ctx" $ "component" "grafana-agent") }} - namespace: {{ .namespace | default $.Release.Namespace | quote }} 
labels: {{- include "mimir.labels" (dict "ctx" $ "component" "meta-monitoring" ) | nindent 4 }} rules: diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml index 9c23474624..9aaf3777b0 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role-binding.yaml @@ -4,7 +4,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: metamonitoring-values-mimir-grafana-agent - namespace: "citestns" labels: app.kubernetes.io/name: mimir app.kubernetes.io/instance: metamonitoring-values diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml index 282c89ff2a..1ee8b0c618 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-agent-cluster-role.yaml @@ -4,7 +4,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: metamonitoring-values-mimir-grafana-agent - namespace: "citestns" labels: app.kubernetes.io/name: mimir app.kubernetes.io/instance: metamonitoring-values diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml 
index 948522c09a..53f3353252 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -201,6 +201,33 @@ metadata: namespace: default --- apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: rollout-operator-default-webhook-cert-update-role +rules: +- apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + - mutatingwebhookconfigurations + verbs: + - list + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rollout-operator-default-webhook-cert-secret-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: rollout-operator-default-webhook-cert-update-role +subjects: +- kind: ServiceAccount + name: rollout-operator + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: rollout-operator-role @@ -232,6 +259,28 @@ rules: - update --- apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: rollout-operator-webhook-cert-secret-role + namespace: default +rules: +- apiGroups: + - "" + resources: + - secrets + verbs: + - create +- apiGroups: + - "" + resourceNames: + - rollout-operator-self-signed-certificate + resources: + - secrets + verbs: + - update + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: rollout-operator-rolebinding @@ -245,6 +294,20 @@ subjects: name: rollout-operator namespace: default --- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: rollout-operator-webhook-cert-secret-rolebinding + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: rollout-operator-webhook-cert-secret-role +subjects: +- kind: ServiceAccount + name: rollout-operator + namespace: default +--- apiVersion: v1 kind: Service metadata: 
@@ -551,6 +614,20 @@ spec: --- apiVersion: v1 kind: Service +metadata: + name: rollout-operator + namespace: default +spec: + ports: + - name: https + port: 443 + protocol: TCP + targetPort: 8443 + selector: + name: rollout-operator +--- +apiVersion: v1 +kind: Service metadata: labels: name: ruler @@ -1015,6 +1092,7 @@ spec: spec: containers: - args: + - --server-tls.enabled=true - -kubernetes.namespace=default image: grafana/rollout-operator:v0.17.0 imagePullPolicy: IfNotPresent @@ -2391,3 +2469,75 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + labels: + grafana.com/inject-rollout-operator-ca: "true" + grafana.com/namespace: default + name: prepare-downscale-default +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: rollout-operator + namespace: default + path: /admission/prepare-downscale + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + name: prepare-downscale-default.grafana.com + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: default + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - UPDATE + resources: + - statefulsets + - statefulsets/scale + scope: Namespaced + sideEffects: NoneOnDryRun + timeoutSeconds: 10 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + labels: + grafana.com/inject-rollout-operator-ca: "true" + grafana.com/namespace: default + name: no-downscale-default +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: rollout-operator + namespace: default + path: /admission/no-downscale + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + name: no-downscale-default.grafana.com + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: default + rules: + - apiGroups: + - apps + apiVersions: + - v1 + operations: + - UPDATE + resources: + - statefulsets + - statefulsets/scale + 
scope: Namespaced + sideEffects: None + timeoutSeconds: 10 diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling.jsonnet b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling.jsonnet index e88020cc6e..cbda8dea4a 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling.jsonnet +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling.jsonnet @@ -1,6 +1,7 @@ // Based on test-multi-zone.jsonnet. (import 'test-multi-zone.jsonnet') { _config+:: { + enable_rollout_operator_webhook: true, store_gateway_automated_downscale_enabled: true, store_gateway_automated_downscale_min_time_between_zones: '20m', }, diff --git a/operations/mimir/rollout-operator.libsonnet b/operations/mimir/rollout-operator.libsonnet index 17eeb59038..a4b19cce56 100644 --- a/operations/mimir/rollout-operator.libsonnet +++ b/operations/mimir/rollout-operator.libsonnet @@ -1,20 +1,45 @@ { + local clusterRole = $.rbac.v1.clusterRole, + local clusterRoleBinding = $.rbac.v1.clusterRoleBinding, local container = $.core.v1.container, local deployment = $.apps.v1.deployment, - local roleBinding = $.rbac.v1.roleBinding, + local mutatingWebhook = $.admissionregistration.v1.mutatingWebhook, + local mutatingWebhookConfiguration = $.admissionregistration.v1.mutatingWebhookConfiguration, + local policyRule = $.rbac.v1.policyRule, local role = $.rbac.v1.role, + local roleBinding = $.rbac.v1.roleBinding, + local service = $.core.v1.service, local serviceAccount = $.core.v1.serviceAccount, - local policyRule = $.rbac.v1.policyRule, + local servicePort = $.core.v1.servicePort, + local validatingWebhook = $.admissionregistration.v1.validatingWebhook, + local validatingWebhookConfiguration = $.admissionregistration.v1.validatingWebhookConfiguration, + + _config+:: { + // Configure the rollout operator to accept webhook requests made as part of scaling + // statefulsets up or down. 
This allows the rollout operator to ensure that stateful + // components (ingesters, store-gateways) are scaled up or down safely. + enable_rollout_operator_webhook: false, + + // ignore_rollout_operator_*_webhook_failures will set the rollout-operator to ignore + // webhook failures. Useful during a rollout to a new cell, where rollout-operator service + // is still not created, as the webhook might be created before the service, and that could + // block other operations that would block the service creation. + ignore_rollout_operator_no_downscale_webhook_failures: false, + ignore_rollout_operator_prepare_downscale_webhook_failures: false, + }, local rollout_operator_enabled = $._config.multi_zone_ingester_enabled || $._config.multi_zone_store_gateway_enabled || $._config.cortex_compactor_concurrent_rollout_enabled || - $._config.ingest_storage_ingester_autoscaling_enabled, + $._config.ingest_storage_ingester_autoscaling_enabled || + $._config.enable_rollout_operator_webhook, rollout_operator_args:: { 'kubernetes.namespace': $._config.namespace, - }, + } + if $._config.enable_rollout_operator_webhook then { + '-server-tls.enabled': 'true', + } else {}, rollout_operator_node_affinity_matchers:: [], @@ -41,6 +66,14 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + $.newMimirNodeAffinityMatchers($.rollout_operator_node_affinity_matchers), + rollout_operator_service: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + service.new( + 'rollout-operator', + { name: 'rollout-operator' }, + servicePort.newNamed('https', 443, 8443) + + servicePort.withProtocol('TCP'), + ), + rollout_operator_role: if !rollout_operator_enabled then null else role.new('rollout-operator-role') + role.mixin.metadata.withNamespace($._config.namespace) + @@ -68,6 +101,128 @@ namespace: $._config.namespace, }), + rollout_operator_webhook_cert_secret_role: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then 
null else + role.new('rollout-operator-webhook-cert-secret-role') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRulesMixin([ + policyRule.withApiGroups('') + + policyRule.withResources(['secrets']) + + policyRule.withVerbs(['create']), + policyRule.withApiGroups('') + + policyRule.withResources(['secrets']) + + policyRule.withVerbs(['update', 'get']) + + policyRule.withResourceNames(['rollout-operator-self-signed-certificate']), + ]), + + rollout_operator_webhook_cert_secret_rolebinding: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + roleBinding.new('rollout-operator-webhook-cert-secret-rolebinding') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withKind('Role') + + roleBinding.mixin.roleRef.withName('rollout-operator-webhook-cert-secret-role') + + roleBinding.withSubjectsMixin({ + kind: 'ServiceAccount', + name: 'rollout-operator', + namespace: $._config.namespace, + }), + + rollout_operator_webhook_cert_update_clusterrole: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + clusterRole.new('rollout-operator-%s-webhook-cert-update-role' % $._config.namespace) + + clusterRole.withRulesMixin([ + policyRule.withApiGroups('admissionregistration.k8s.io') + + policyRule.withResources(['validatingwebhookconfigurations', 'mutatingwebhookconfigurations']) + + policyRule.withVerbs(['list', 'patch']), + ]), + + rollout_operator_webhook_cert_update_clusterrolebinding: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + clusterRoleBinding.new('rollout-operator-%s-webhook-cert-secret-rolebinding' % $._config.namespace) + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withKind('ClusterRole') + + 
clusterRoleBinding.mixin.roleRef.withName('rollout-operator-%s-webhook-cert-update-role' % $._config.namespace) + + clusterRoleBinding.withSubjectsMixin({ + kind: 'ServiceAccount', + name: 'rollout-operator', + namespace: $._config.namespace, + }), + + no_downscale_webhook: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + validatingWebhookConfiguration.new('no-downscale-%s' % $._config.namespace) + + validatingWebhookConfiguration.mixin.metadata.withLabels({ + 'grafana.com/namespace': $._config.namespace, + 'grafana.com/inject-rollout-operator-ca': 'true', + }) + + validatingWebhookConfiguration.withWebhooksMixin([ + validatingWebhook.withName('no-downscale-%s.grafana.com' % $._config.namespace) + + validatingWebhook.withAdmissionReviewVersions(['v1']) + + validatingWebhook.withFailurePolicy(if $._config.ignore_rollout_operator_no_downscale_webhook_failures then 'Ignore' else 'Fail') + + validatingWebhook.withMatchPolicy('Equivalent') + + validatingWebhook.withSideEffects('None') + + validatingWebhook.withTimeoutSeconds(10) + + validatingWebhook.withRulesMixin([ + { + apiGroups: ['apps'], + apiVersions: ['v1'], + operations: ['UPDATE'], + resources: ['statefulsets', 'statefulsets/scale'], + scope: 'Namespaced', + }, + ]) + + { + namespaceSelector: { + matchLabels: { + 'kubernetes.io/metadata.name': $._config.namespace, + }, + }, + clientConfig: { + service: { + name: 'rollout-operator', + namespace: $._config.namespace, + path: '/admission/no-downscale', + port: 443, + }, + }, + }, + ]), + + prepare_downscale_webhook: if !rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + mutatingWebhookConfiguration.new('prepare-downscale-%s' % $._config.namespace) + + mutatingWebhookConfiguration.mixin.metadata.withLabels({ + 'grafana.com/namespace': $._config.namespace, + 'grafana.com/inject-rollout-operator-ca': 'true', + }) + + mutatingWebhookConfiguration.withWebhooksMixin([ + 
mutatingWebhook.withName('prepare-downscale-%s.grafana.com' % $._config.namespace) + + mutatingWebhook.withAdmissionReviewVersions(['v1']) + + mutatingWebhook.withFailurePolicy(if $._config.ignore_rollout_operator_prepare_downscale_webhook_failures then 'Ignore' else 'Fail') + + mutatingWebhook.withMatchPolicy('Equivalent') + + mutatingWebhook.withSideEffects('NoneOnDryRun') + + mutatingWebhook.withTimeoutSeconds(10) + + mutatingWebhook.withRulesMixin([ + { + apiGroups: ['apps'], + apiVersions: ['v1'], + operations: ['UPDATE'], + resources: ['statefulsets', 'statefulsets/scale'], + scope: 'Namespaced', + }, + ]) + + { + namespaceSelector: { + matchLabels: { + 'kubernetes.io/metadata.name': $._config.namespace, + }, + }, + clientConfig: { + service: { + name: 'rollout-operator', + namespace: $._config.namespace, + path: '/admission/prepare-downscale', + port: 443, + }, + }, + }, + ]), + rollout_operator_service_account: if !rollout_operator_enabled then null else serviceAccount.new('rollout-operator'), diff --git a/operations/policies/policies.rego b/operations/policies/policies.rego index 0f6a2d3fb6..74b98f417e 100644 --- a/operations/policies/policies.rego +++ b/operations/policies/policies.rego @@ -8,7 +8,7 @@ should_be_namespaced(contents) { } should_be_namespaced(contents) { - not contents.kind in ["PodSecurityPolicy", "Namespace", "CustomResourceDefinition"] + not contents.kind in ["ClusterRole", "ClusterRoleBinding", "CustomResourceDefinition", "MutatingWebhookConfiguration", "Namespace", "PodSecurityPolicy", "ValidatingWebhookConfiguration"] } metadata_has_namespace(metadata) {