From 2cd739e8b292939e71c69d0117c4b37094dd3c25 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Tue, 25 Nov 2025 10:46:46 +0100 Subject: [PATCH 1/3] Implement multicluster support --- Tiltfile | 14 +- api/v1alpha1/datasource_types.go | 3 + api/v1alpha1/decision_types.go | 3 + api/v1alpha1/descheduling_types.go | 3 + api/v1alpha1/knowledge_types.go | 3 + api/v1alpha1/kpi_types.go | 3 + api/v1alpha1/pipeline_types.go | 3 + api/v1alpha1/reservation_types.go | 3 + api/v1alpha1/step_types.go | 3 + cmd/main.go | 89 +++-- docs/guides/multicluster/cortex-home-crb.yaml | 12 + docs/guides/multicluster/cortex-home.yaml | 23 ++ .../multicluster/cortex-remote-crb.yaml | 18 + docs/guides/multicluster/cortex-remote.yaml | 27 ++ docs/guides/multicluster/readme.md | 104 ++++++ .../cortex-ironcore/templates/pipelines.yaml | 1 - .../datasources/openstack/controller.go | 5 +- .../datasources/prometheus/controller.go | 5 +- internal/knowledge/extractor/controller.go | 5 +- internal/knowledge/extractor/trigger.go | 20 +- internal/knowledge/kpis/controller.go | 33 +- .../reservations/controller/controller.go | 5 +- .../decisions/cinder/pipeline_controller.go | 53 +-- .../decisions/explanation/controller.go | 5 +- .../decisions/machines/pipeline_controller.go | 53 +-- .../decisions/manila/pipeline_controller.go | 53 +-- .../decisions/nova/pipeline_controller.go | 53 +-- .../nova/pipeline_controller_test.go | 27 -- .../scheduling/descheduling/nova/cleanup.go | 5 +- .../scheduling/descheduling/nova/executor.go | 5 +- .../descheduling/nova/executor_test.go | 15 - .../descheduling/nova/pipeline_controller.go | 35 +- .../nova/pipeline_controller_test.go | 32 -- pkg/conf/conf.go | 16 + pkg/multicluster/builder.go | 49 +++ pkg/multicluster/builder_test.go | 61 ++++ pkg/multicluster/client.go | 310 ++++++++++++++++++ pkg/multicluster/client_test.go | 99 ++++++ 38 files changed, 985 insertions(+), 271 deletions(-) create mode 100644 docs/guides/multicluster/cortex-home-crb.yaml create mode 
100644 docs/guides/multicluster/cortex-home.yaml create mode 100644 docs/guides/multicluster/cortex-remote-crb.yaml create mode 100644 docs/guides/multicluster/cortex-remote.yaml create mode 100644 docs/guides/multicluster/readme.md create mode 100644 pkg/multicluster/builder.go create mode 100644 pkg/multicluster/builder_test.go create mode 100644 pkg/multicluster/client.go create mode 100644 pkg/multicluster/client_test.go diff --git a/Tiltfile b/Tiltfile index 1ec900932..965581e7f 100644 --- a/Tiltfile +++ b/Tiltfile @@ -17,7 +17,11 @@ if not os.getenv('TILT_VALUES_PATH'): fail("TILT_VALUES_PATH is not set.") if not os.path.exists(os.getenv('TILT_VALUES_PATH')): fail("TILT_VALUES_PATH "+ os.getenv('TILT_VALUES_PATH') + " does not exist.") -tilt_values = os.getenv('TILT_VALUES_PATH') +tilt_values = [os.getenv('TILT_VALUES_PATH')] + +tilt_overrides = os.getenv('TILT_OVERRIDES_PATH') +if tilt_overrides and os.path.exists(tilt_overrides): + tilt_values.append(tilt_overrides) load('ext://helm_resource', 'helm_resource', 'helm_repo') helm_repo( @@ -106,7 +110,7 @@ k8s_yaml(helm('./helm/bundles/cortex-crds', name='cortex-crds', set=[ if 'nova' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Nova bundle") - k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=[tilt_values])) + k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=tilt_values)) k8s_resource('cortex-nova-postgresql', labels=['Cortex-Nova'], port_forwards=[ port_forward(8000, 5432), ]) @@ -125,7 +129,7 @@ if 'nova' in ACTIVE_DEPLOYMENTS: if 'manila' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Manila bundle") - k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=[tilt_values])) + k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=tilt_values)) k8s_resource('cortex-manila-postgresql', labels=['Cortex-Manila'], port_forwards=[ port_forward(8002, 5432), ]) @@ -142,7 +146,7 @@ if 'manila' in ACTIVE_DEPLOYMENTS: ) if 'cinder' 
in ACTIVE_DEPLOYMENTS: - k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=[tilt_values])) + k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=tilt_values)) k8s_resource('cortex-cinder-postgresql', labels=['Cortex-Cinder'], port_forwards=[ port_forward(8004, 5432), ]) @@ -160,7 +164,7 @@ if 'cinder' in ACTIVE_DEPLOYMENTS: if 'ironcore' in ACTIVE_DEPLOYMENTS: print("Activating Cortex IronCore bundle") - k8s_yaml(helm('./helm/bundles/cortex-ironcore', name='cortex-ironcore', values=[tilt_values])) + k8s_yaml(helm('./helm/bundles/cortex-ironcore', name='cortex-ironcore', values=tilt_values)) k8s_resource('cortex-ironcore-controller-manager', labels=['Cortex-IronCore']) # Deploy resources in machines/samples k8s_yaml('samples/ironcore/machinepool.yaml') diff --git a/api/v1alpha1/datasource_types.go b/api/v1alpha1/datasource_types.go index c994a2424..9ab0e3aa9 100644 --- a/api/v1alpha1/datasource_types.go +++ b/api/v1alpha1/datasource_types.go @@ -293,6 +293,9 @@ type DatasourceList struct { Items []Datasource `json:"items"` } +func (*Datasource) URI() string { return "datasources.cortex.cloud/v1alpha1" } +func (*DatasourceList) URI() string { return "datasources.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Datasource{}, &DatasourceList{}) } diff --git a/api/v1alpha1/decision_types.go b/api/v1alpha1/decision_types.go index ea71c60ff..0e4399ed1 100644 --- a/api/v1alpha1/decision_types.go +++ b/api/v1alpha1/decision_types.go @@ -155,6 +155,9 @@ type DecisionList struct { Items []Decision `json:"items"` } +func (*Decision) URI() string { return "decisions.cortex.cloud/v1alpha1" } +func (*DecisionList) URI() string { return "decisions.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Decision{}, &DecisionList{}) } diff --git a/api/v1alpha1/descheduling_types.go b/api/v1alpha1/descheduling_types.go index 060a7c56d..a4b05bd76 100644 --- a/api/v1alpha1/descheduling_types.go +++ 
b/api/v1alpha1/descheduling_types.go @@ -102,6 +102,9 @@ type DeschedulingList struct { Items []Descheduling `json:"items"` } +func (*Descheduling) URI() string { return "deschedulings.cortex.cloud/v1alpha1" } +func (*DeschedulingList) URI() string { return "deschedulings.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Descheduling{}, &DeschedulingList{}) } diff --git a/api/v1alpha1/knowledge_types.go b/api/v1alpha1/knowledge_types.go index cba0d026c..7e90ff7a9 100644 --- a/api/v1alpha1/knowledge_types.go +++ b/api/v1alpha1/knowledge_types.go @@ -174,6 +174,9 @@ type KnowledgeList struct { Items []Knowledge `json:"items"` } +func (*Knowledge) URI() string { return "knowledges.cortex.cloud/v1alpha1" } +func (*KnowledgeList) URI() string { return "knowledges.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Knowledge{}, &KnowledgeList{}) } diff --git a/api/v1alpha1/kpi_types.go b/api/v1alpha1/kpi_types.go index 59dee1b74..93ccd5f18 100644 --- a/api/v1alpha1/kpi_types.go +++ b/api/v1alpha1/kpi_types.go @@ -97,6 +97,9 @@ type KPIList struct { Items []KPI `json:"items"` } +func (*KPI) URI() string { return "kpis.cortex.cloud/v1alpha1" } +func (*KPIList) URI() string { return "kpis.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&KPI{}, &KPIList{}) } diff --git a/api/v1alpha1/pipeline_types.go b/api/v1alpha1/pipeline_types.go index 20495b150..a54b68ca7 100644 --- a/api/v1alpha1/pipeline_types.go +++ b/api/v1alpha1/pipeline_types.go @@ -98,6 +98,9 @@ type PipelineList struct { Items []Pipeline `json:"items"` } +func (*Pipeline) URI() string { return "pipelines.cortex.cloud/v1alpha1" } +func (*PipelineList) URI() string { return "pipelines.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Pipeline{}, &PipelineList{}) } diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 03386d932..56fdafea0 100644 --- a/api/v1alpha1/reservation_types.go +++ 
b/api/v1alpha1/reservation_types.go @@ -97,6 +97,9 @@ type ReservationList struct { Items []Reservation `json:"items"` } +func (*Reservation) URI() string { return "reservations.cortex.cloud/v1alpha1" } +func (*ReservationList) URI() string { return "reservations.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Reservation{}, &ReservationList{}) } diff --git a/api/v1alpha1/step_types.go b/api/v1alpha1/step_types.go index d4866fc31..012034e28 100644 --- a/api/v1alpha1/step_types.go +++ b/api/v1alpha1/step_types.go @@ -129,6 +129,9 @@ type StepList struct { Items []Step `json:"items"` } +func (*Step) URI() string { return "steps.cortex.cloud/v1alpha1" } +func (*StepList) URI() string { return "steps.cortex.cloud/v1alpha1" } + func init() { SchemeBuilder.Register(&Step{}, &StepList{}) } diff --git a/cmd/main.go b/cmd/main.go index 42ac2742a..64d6cf6ca 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -22,6 +22,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/certwatcher" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -54,6 +55,7 @@ import ( "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/db" "github.com/cobaltcore-dev/cortex/pkg/monitoring" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" "github.com/sapcc/go-bits/httpext" "github.com/sapcc/go-bits/must" corev1 "k8s.io/api/core/v1" @@ -243,6 +245,33 @@ func main() { os.Exit(1) } + homeCluster, err := cluster.New(restConfig, func(o *cluster.Options) { o.Scheme = scheme }) + if err != nil { + setupLog.Error(err, "unable to create home cluster") + os.Exit(1) + } + if err := mgr.Add(homeCluster); err != nil { + setupLog.Error(err, "unable to add home cluster") + os.Exit(1) + } + multiclusterClient := &multicluster.Client{ + HomeCluster: homeCluster, + 
HomeRestConfig: restConfig, + HomeScheme: scheme, + } + for _, override := range config.APIServerOverrides { + cluster, err := multiclusterClient.AddRemote(override.Resource, override.Host, override.CACert) + if err != nil { + setupLog.Error(err, "unable to create cluster for apiserver override", "apiserver", override.Host) + os.Exit(1) + } + // Also tell the manager about this cluster so that controllers can use it. + if err := mgr.Add(cluster); err != nil { + setupLog.Error(err, "unable to add cluster for apiserver override", "apiserver", override.Host) + os.Exit(1) + } + } + // Our custom monitoring registry can add prometheus labels to all metrics. // This is useful to distinguish metrics from different deployments. metrics.Registry = monitoring.WrapRegistry(metrics.Registry, config.Monitoring) @@ -265,14 +294,14 @@ func main() { Conf: config, } // Inferred through the base controller. - decisionController.Client = mgr.GetClient() + decisionController.Client = multiclusterClient decisionController.OperatorName = config.Operator - if err := (decisionController).SetupWithManager(mgr); err != nil { + if err := (decisionController).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler") os.Exit(1) } novashims.NewAPI(config, decisionController).Init(mux) - go decisionsnova.CleanupNovaDecisionsRegularly(ctx, mgr.GetClient(), config) + go decisionsnova.CleanupNovaDecisionsRegularly(ctx, multiclusterClient, config) } if slices.Contains(config.EnabledControllers, "nova-deschedulings-pipeline-controller") { // Deschedulings controller @@ -284,18 +313,18 @@ func main() { CycleDetector: deschedulingnova.NewCycleDetector(), } // Inferred through the base controller. 
- deschedulingsController.Client = mgr.GetClient() + deschedulingsController.Client = multiclusterClient deschedulingsController.OperatorName = config.Operator - if err := (deschedulingsController).SetupWithManager(mgr); err != nil { + if err := (deschedulingsController).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "DeschedulingsReconciler") os.Exit(1) } go deschedulingsController.CreateDeschedulingsPeriodically(ctx) // Deschedulings cleanup on startup if err := (&deschedulingnova.Cleanup{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Cleanup") os.Exit(1) } @@ -306,14 +335,14 @@ func main() { Conf: config, } // Inferred through the base controller. - controller.Client = mgr.GetClient() + controller.Client = multiclusterClient controller.OperatorName = config.Operator - if err := (controller).SetupWithManager(mgr); err != nil { + if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler") os.Exit(1) } manilashims.NewAPI(config, controller).Init(mux) - go decisionsmanila.CleanupManilaDecisionsRegularly(ctx, mgr.GetClient(), config) + go decisionsmanila.CleanupManilaDecisionsRegularly(ctx, multiclusterClient, config) } if slices.Contains(config.EnabledControllers, "cinder-decisions-pipeline-controller") { controller := &decisionscinder.DecisionPipelineController{ @@ -321,14 +350,14 @@ func main() { Conf: config, } // Inferred through the base controller. 
- controller.Client = mgr.GetClient() + controller.Client = multiclusterClient controller.OperatorName = config.Operator - if err := (controller).SetupWithManager(mgr); err != nil { + if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler") os.Exit(1) } cindershims.NewAPI(config, controller).Init(mux) - go decisionscinder.CleanupCinderDecisionsRegularly(ctx, mgr.GetClient(), config) + go decisionscinder.CleanupCinderDecisionsRegularly(ctx, multiclusterClient, config) } if slices.Contains(config.EnabledControllers, "ironcore-decisions-pipeline-controller") { controller := &decisionsmachines.DecisionPipelineController{ @@ -336,9 +365,9 @@ func main() { Conf: config, } // Inferred through the base controller. - controller.Client = mgr.GetClient() + controller.Client = multiclusterClient controller.OperatorName = config.Operator - if err := (controller).SetupWithManager(mgr); err != nil { + if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler") os.Exit(1) } @@ -347,23 +376,23 @@ func main() { // Setup a controller which will reconcile the history and explanation for // decision resources. 
explanationController := &explanation.Controller{ - Client: mgr.GetClient(), + Client: multiclusterClient, OperatorName: config.Operator, } - if err := explanationController.SetupWithManager(mgr); err != nil { + if err := explanationController.SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ExplanationController") os.Exit(1) } } if slices.Contains(config.EnabledControllers, "reservations-controller") { - monitor := reservationscontroller.NewControllerMonitor(mgr.GetClient()) + monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) if err := (&reservationscontroller.ReservationReconciler{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), Conf: config, HypervisorClient: reservationscontroller.NewHypervisorClient(), - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Reservation") os.Exit(1) } @@ -372,20 +401,20 @@ func main() { monitor := datasources.NewMonitor() metrics.Registry.MustRegister(&monitor) if err := (&openstack.OpenStackDatasourceReconciler{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), Monitor: monitor, Conf: config, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "OpenStackDatasourceReconciler") os.Exit(1) } if err := (&prometheus.PrometheusDatasourceReconciler{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), Monitor: monitor, Conf: config, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PrometheusDatasourceReconciler") os.Exit(1) } @@ -394,29 +423,29 @@ func main() { 
monitor := extractor.NewMonitor() metrics.Registry.MustRegister(&monitor) if err := (&extractor.KnowledgeReconciler{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), Monitor: monitor, Conf: config, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "KnowledgeReconciler") os.Exit(1) } if err := (&extractor.TriggerReconciler{ - Client: mgr.GetClient(), + Client: multiclusterClient, Scheme: mgr.GetScheme(), Conf: config, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TriggerReconciler") os.Exit(1) } } if slices.Contains(config.EnabledControllers, "kpis-controller") { if err := (&kpis.Controller{ - Client: mgr.GetClient(), + Client: multiclusterClient, SupportedKPIsByImpl: kpis.SupportedKPIsByImpl, OperatorName: config.Operator, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "KPIController") os.Exit(1) } diff --git a/docs/guides/multicluster/cortex-home-crb.yaml b/docs/guides/multicluster/cortex-home-crb.yaml new file mode 100644 index 000000000..1854084f2 --- /dev/null +++ b/docs/guides/multicluster/cortex-home-crb.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grant-cortex-remote-oidc-access +subjects: +- kind: User + apiGroup: rbac.authorization.k8s.io + name: system:anonymous +roleRef: + kind: ClusterRole + name: system:service-account-issuer-discovery + apiGroup: rbac.authorization.k8s.io diff --git a/docs/guides/multicluster/cortex-home.yaml b/docs/guides/multicluster/cortex-home.yaml new file mode 100644 index 000000000..c1e093807 --- /dev/null +++ b/docs/guides/multicluster/cortex-home.yaml @@ -0,0 +1,23 @@ +kind: 
Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: cortex-home +nodes: + - role: worker + - role: control-plane + extraPortMappings: + - containerPort: 6443 + hostPort: 8443 + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + apiServer: + extraArgs: + service-account-issuer: "https://host.docker.internal:8443" + service-account-jwks-uri: "https://host.docker.internal:8443/openid/v1/jwks" + certSANs: + - api-proxy + - api-proxy.default.svc + - api-proxy.default.svc.cluster.local + - localhost + - 127.0.0.1 + - host.docker.internal diff --git a/docs/guides/multicluster/cortex-remote-crb.yaml b/docs/guides/multicluster/cortex-remote-crb.yaml new file mode 100644 index 000000000..47ea9aa81 --- /dev/null +++ b/docs/guides/multicluster/cortex-remote-crb.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grant-cortex-cluster-admin +subjects: +- kind: User + apiGroup: rbac.authorization.k8s.io + name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-knowledge-controller-manager" +- kind: User + apiGroup: rbac.authorization.k8s.io + name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-scheduling-controller-manager" +- kind: User + apiGroup: rbac.authorization.k8s.io + name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-reservations-controller-manager" +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/docs/guides/multicluster/cortex-remote.yaml b/docs/guides/multicluster/cortex-remote.yaml new file mode 100644 index 000000000..675a14063 --- /dev/null +++ b/docs/guides/multicluster/cortex-remote.yaml @@ -0,0 +1,27 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: cortex-remote +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 6443 + hostPort: 8444 + extraMounts: + - hostPath: /tmp/root-ca-home.pem + 
containerPath: /etc/ca-certificates/root-ca.pem + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + apiServer: + extraArgs: + oidc-client-id: "https://host.docker.internal:8443" # = audience + oidc-issuer-url: "https://host.docker.internal:8443" + oidc-username-claim: sub + oidc-ca-file: /etc/ca-certificates/root-ca.pem + certSANs: + - api-proxy + - api-proxy.default.svc + - api-proxy.default.svc.cluster.local + - localhost + - 127.0.0.1 + - host.docker.internal \ No newline at end of file diff --git a/docs/guides/multicluster/readme.md b/docs/guides/multicluster/readme.md new file mode 100644 index 000000000..fa0fd3c83 --- /dev/null +++ b/docs/guides/multicluster/readme.md @@ -0,0 +1,104 @@ +# Cortex Multi-Cluster Testing + +Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. + +This guide will walk you through setting up a multi-cluster Cortex deployment using [kind](https://kind.sigs.k8s.io/). We will create two kind clusters: `cortex-home` and `cortex-remote`. The `cortex-home` cluster will host the Cortex control plane, while the `cortex-remote` cluster will be used to store CRDs. + +To store its CRDs in the `cortex-remote` cluster, the `cortex-home` cluster needs to be able to authenticate to the `cortex-remote` cluster's API server. We will achieve this by configuring the `cortex-remote` cluster to trust the service account tokens issued by the `cortex-home` cluster. In this way, no external OIDC provider is needed, because the `cortex-home` cluster's own OIDC issuer for service accounts acts as the identity provider. 
+ +Here is a diagram illustrating the authentication flow: + +```mermaid +sequenceDiagram + participant Home as cortex-home + participant Remote as cortex-remote + Home->>Home: Service Account Token Issued + Home->>Remote: API Request with Token + Remote->>Remote: Token Verified Against Home's OIDC Issuer + Remote->>Home: API Response +``` + +## Home Cluster Setup + +First, we set up the `cortex-home` cluster. The provided kind configuration file `cortex-home.yaml` sets up the cluster with the necessary port mappings to allow communication between the two clusters. `cortex-home` will expose its API server on port `8443`, which `cortex-remote` will use to verify service account tokens through `https://host.docker.internal:8443`. + +```bash +kind create cluster --config docs/guides/multicluster/cortex-home.yaml +``` + +Next, we need to expose the OIDC issuer endpoint of the `cortex-home` cluster's API server to the `cortex-remote` cluster. We do this by creating a `ClusterRoleBinding` that grants the `system:service-account-issuer-discovery` cluster role to the `system:anonymous` user in the `cortex-home` cluster, so that the `cortex-remote` API server can fetch the OIDC discovery document and signing keys without authenticating. + +```bash +kubectl --context kind-cortex-home apply -f docs/guides/multicluster/cortex-home-crb.yaml +``` + +To talk back to the `cortex-home` cluster's OIDC endpoint, the `cortex-remote` cluster needs to trust the root CA certificate used by the `cortex-home` cluster's API server. We can extract this certificate from the `extension-apiserver-authentication` config map in the `kube-system` namespace, and save it to a temporary file for later use. + +```bash +kubectl --context kind-cortex-home --namespace kube-system \ + get configmap extension-apiserver-authentication \ + -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-home.pem +``` + +## Remote Cluster Setup + +With all the prerequisites in place, we can now set up the `cortex-remote` cluster. We create the cluster using the provided kind configuration file `cortex-remote.yaml`. 
This configuration will tell the `cortex-remote` cluster to trust the `cortex-home` cluster's API server as the OIDC issuer for service account token verification. Also, the `cortex-remote` cluster will trust the root CA certificate we extracted earlier. The `cortex-remote` apiserver will be accessible at `https://host.docker.internal:8444`. + +```bash +kind create cluster --config docs/guides/multicluster/cortex-remote.yaml +``` + +Next, we need to create a `ClusterRoleBinding` in the `cortex-remote` cluster that grants service accounts coming from the `cortex-home` cluster access to the appropriate resources. We do this by applying the provided `cortex-remote-crb.yaml` file. + +```bash +kubectl --context kind-cortex-remote apply -f docs/guides/multicluster/cortex-remote-crb.yaml +``` + +## Deploying Cortex + +Before we launch cortex, make sure that the CRDs are installed in the `cortex-remote` cluster. + +```bash +kubectl config use-context kind-cortex-remote +helm install helm/bundles/cortex-crds --generate-name +``` + +Also, we need to extract the root CA certificate used by the `cortex-remote` cluster's API server, so that we can configure the cortex pods in the `cortex-home` cluster to trust it. + +```bash +kubectl --context kind-cortex-remote --namespace kube-system \ + get configmap extension-apiserver-authentication \ + -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-remote.pem +``` + +Now we can deploy cortex to the `cortex-home` cluster, configuring it to use the `cortex-remote` cluster for CRD storage. We create a temporary Helm values override file that specifies the API server URL and root CA certificate for the `cortex-remote` cluster. In this example, we are configuring the `decisions.cortex.cloud/v1alpha1` resource to be stored in the `cortex-remote` cluster. 
+ +```bash +export TILT_OVERRIDES_PATH=/tmp/cortex-values.yaml +tee $TILT_OVERRIDES_PATH < Date: Tue, 25 Nov 2025 14:56:09 +0100 Subject: [PATCH 2/3] Revert change in pipelines.yaml --- helm/bundles/cortex-ironcore/templates/pipelines.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/helm/bundles/cortex-ironcore/templates/pipelines.yaml b/helm/bundles/cortex-ironcore/templates/pipelines.yaml index 4a1cdea43..6b087bfbf 100644 --- a/helm/bundles/cortex-ironcore/templates/pipelines.yaml +++ b/helm/bundles/cortex-ironcore/templates/pipelines.yaml @@ -8,6 +8,7 @@ spec: description: | This pipeline is used to schedule ironcore machines onto machinepools. type: filter-weigher + createDecisions: true steps: - ref: {name: machinepools-noop} mandatory: false From 6febba469b1b7de532cbfd2a52a846b96cb9b6f9 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Tue, 25 Nov 2025 15:55:15 +0100 Subject: [PATCH 3/3] Add ironcore serviceaccount to guide crb --- docs/guides/multicluster/cortex-remote-crb.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/guides/multicluster/cortex-remote-crb.yaml b/docs/guides/multicluster/cortex-remote-crb.yaml index 47ea9aa81..9928720d7 100644 --- a/docs/guides/multicluster/cortex-remote-crb.yaml +++ b/docs/guides/multicluster/cortex-remote-crb.yaml @@ -12,6 +12,9 @@ subjects: - kind: User apiGroup: rbac.authorization.k8s.io name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-reservations-controller-manager" +- kind: User + apiGroup: rbac.authorization.k8s.io + name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-ironcore-controller-manager" roleRef: kind: ClusterRole name: cluster-admin