From 2cd739e8b292939e71c69d0117c4b37094dd3c25 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Tue, 25 Nov 2025 10:46:46 +0100
Subject: [PATCH 1/3] Implement multicluster support
---
Tiltfile | 14 +-
api/v1alpha1/datasource_types.go | 3 +
api/v1alpha1/decision_types.go | 3 +
api/v1alpha1/descheduling_types.go | 3 +
api/v1alpha1/knowledge_types.go | 3 +
api/v1alpha1/kpi_types.go | 3 +
api/v1alpha1/pipeline_types.go | 3 +
api/v1alpha1/reservation_types.go | 3 +
api/v1alpha1/step_types.go | 3 +
cmd/main.go | 89 +++--
docs/guides/multicluster/cortex-home-crb.yaml | 12 +
docs/guides/multicluster/cortex-home.yaml | 23 ++
.../multicluster/cortex-remote-crb.yaml | 18 +
docs/guides/multicluster/cortex-remote.yaml | 27 ++
docs/guides/multicluster/readme.md | 104 ++++++
.../cortex-ironcore/templates/pipelines.yaml | 1 -
.../datasources/openstack/controller.go | 5 +-
.../datasources/prometheus/controller.go | 5 +-
internal/knowledge/extractor/controller.go | 5 +-
internal/knowledge/extractor/trigger.go | 20 +-
internal/knowledge/kpis/controller.go | 33 +-
.../reservations/controller/controller.go | 5 +-
.../decisions/cinder/pipeline_controller.go | 53 +--
.../decisions/explanation/controller.go | 5 +-
.../decisions/machines/pipeline_controller.go | 53 +--
.../decisions/manila/pipeline_controller.go | 53 +--
.../decisions/nova/pipeline_controller.go | 53 +--
.../nova/pipeline_controller_test.go | 27 --
.../scheduling/descheduling/nova/cleanup.go | 5 +-
.../scheduling/descheduling/nova/executor.go | 5 +-
.../descheduling/nova/executor_test.go | 15 -
.../descheduling/nova/pipeline_controller.go | 35 +-
.../nova/pipeline_controller_test.go | 32 --
pkg/conf/conf.go | 16 +
pkg/multicluster/builder.go | 49 +++
pkg/multicluster/builder_test.go | 61 ++++
pkg/multicluster/client.go | 310 ++++++++++++++++++
pkg/multicluster/client_test.go | 99 ++++++
38 files changed, 985 insertions(+), 271 deletions(-)
create mode 100644 docs/guides/multicluster/cortex-home-crb.yaml
create mode 100644 docs/guides/multicluster/cortex-home.yaml
create mode 100644 docs/guides/multicluster/cortex-remote-crb.yaml
create mode 100644 docs/guides/multicluster/cortex-remote.yaml
create mode 100644 docs/guides/multicluster/readme.md
create mode 100644 pkg/multicluster/builder.go
create mode 100644 pkg/multicluster/builder_test.go
create mode 100644 pkg/multicluster/client.go
create mode 100644 pkg/multicluster/client_test.go
diff --git a/Tiltfile b/Tiltfile
index 1ec900932..965581e7f 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -17,7 +17,11 @@ if not os.getenv('TILT_VALUES_PATH'):
fail("TILT_VALUES_PATH is not set.")
if not os.path.exists(os.getenv('TILT_VALUES_PATH')):
fail("TILT_VALUES_PATH "+ os.getenv('TILT_VALUES_PATH') + " does not exist.")
-tilt_values = os.getenv('TILT_VALUES_PATH')
+tilt_values = [os.getenv('TILT_VALUES_PATH')]
+
+tilt_overrides = os.getenv('TILT_OVERRIDES_PATH')
+if tilt_overrides and os.path.exists(tilt_overrides):
+ tilt_values.append(tilt_overrides)
load('ext://helm_resource', 'helm_resource', 'helm_repo')
helm_repo(
@@ -106,7 +110,7 @@ k8s_yaml(helm('./helm/bundles/cortex-crds', name='cortex-crds', set=[
if 'nova' in ACTIVE_DEPLOYMENTS:
print("Activating Cortex Nova bundle")
- k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=[tilt_values]))
+ k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=tilt_values))
k8s_resource('cortex-nova-postgresql', labels=['Cortex-Nova'], port_forwards=[
port_forward(8000, 5432),
])
@@ -125,7 +129,7 @@ if 'nova' in ACTIVE_DEPLOYMENTS:
if 'manila' in ACTIVE_DEPLOYMENTS:
print("Activating Cortex Manila bundle")
- k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=[tilt_values]))
+ k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=tilt_values))
k8s_resource('cortex-manila-postgresql', labels=['Cortex-Manila'], port_forwards=[
port_forward(8002, 5432),
])
@@ -142,7 +146,7 @@ if 'manila' in ACTIVE_DEPLOYMENTS:
)
if 'cinder' in ACTIVE_DEPLOYMENTS:
- k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=[tilt_values]))
+ k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=tilt_values))
k8s_resource('cortex-cinder-postgresql', labels=['Cortex-Cinder'], port_forwards=[
port_forward(8004, 5432),
])
@@ -160,7 +164,7 @@ if 'cinder' in ACTIVE_DEPLOYMENTS:
if 'ironcore' in ACTIVE_DEPLOYMENTS:
print("Activating Cortex IronCore bundle")
- k8s_yaml(helm('./helm/bundles/cortex-ironcore', name='cortex-ironcore', values=[tilt_values]))
+ k8s_yaml(helm('./helm/bundles/cortex-ironcore', name='cortex-ironcore', values=tilt_values))
k8s_resource('cortex-ironcore-controller-manager', labels=['Cortex-IronCore'])
# Deploy resources in machines/samples
k8s_yaml('samples/ironcore/machinepool.yaml')
diff --git a/api/v1alpha1/datasource_types.go b/api/v1alpha1/datasource_types.go
index c994a2424..9ab0e3aa9 100644
--- a/api/v1alpha1/datasource_types.go
+++ b/api/v1alpha1/datasource_types.go
@@ -293,6 +293,9 @@ type DatasourceList struct {
Items []Datasource `json:"items"`
}
+func (*Datasource) URI() string { return "datasources.cortex.cloud/v1alpha1" }
+func (*DatasourceList) URI() string { return "datasources.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Datasource{}, &DatasourceList{})
}
diff --git a/api/v1alpha1/decision_types.go b/api/v1alpha1/decision_types.go
index ea71c60ff..0e4399ed1 100644
--- a/api/v1alpha1/decision_types.go
+++ b/api/v1alpha1/decision_types.go
@@ -155,6 +155,9 @@ type DecisionList struct {
Items []Decision `json:"items"`
}
+func (*Decision) URI() string { return "decisions.cortex.cloud/v1alpha1" }
+func (*DecisionList) URI() string { return "decisions.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Decision{}, &DecisionList{})
}
diff --git a/api/v1alpha1/descheduling_types.go b/api/v1alpha1/descheduling_types.go
index 060a7c56d..a4b05bd76 100644
--- a/api/v1alpha1/descheduling_types.go
+++ b/api/v1alpha1/descheduling_types.go
@@ -102,6 +102,9 @@ type DeschedulingList struct {
Items []Descheduling `json:"items"`
}
+func (*Descheduling) URI() string { return "deschedulings.cortex.cloud/v1alpha1" }
+func (*DeschedulingList) URI() string { return "deschedulings.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Descheduling{}, &DeschedulingList{})
}
diff --git a/api/v1alpha1/knowledge_types.go b/api/v1alpha1/knowledge_types.go
index cba0d026c..7e90ff7a9 100644
--- a/api/v1alpha1/knowledge_types.go
+++ b/api/v1alpha1/knowledge_types.go
@@ -174,6 +174,9 @@ type KnowledgeList struct {
Items []Knowledge `json:"items"`
}
+func (*Knowledge) URI() string { return "knowledges.cortex.cloud/v1alpha1" }
+func (*KnowledgeList) URI() string { return "knowledges.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Knowledge{}, &KnowledgeList{})
}
diff --git a/api/v1alpha1/kpi_types.go b/api/v1alpha1/kpi_types.go
index 59dee1b74..93ccd5f18 100644
--- a/api/v1alpha1/kpi_types.go
+++ b/api/v1alpha1/kpi_types.go
@@ -97,6 +97,9 @@ type KPIList struct {
Items []KPI `json:"items"`
}
+func (*KPI) URI() string { return "kpis.cortex.cloud/v1alpha1" }
+func (*KPIList) URI() string { return "kpis.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&KPI{}, &KPIList{})
}
diff --git a/api/v1alpha1/pipeline_types.go b/api/v1alpha1/pipeline_types.go
index 20495b150..a54b68ca7 100644
--- a/api/v1alpha1/pipeline_types.go
+++ b/api/v1alpha1/pipeline_types.go
@@ -98,6 +98,9 @@ type PipelineList struct {
Items []Pipeline `json:"items"`
}
+func (*Pipeline) URI() string { return "pipelines.cortex.cloud/v1alpha1" }
+func (*PipelineList) URI() string { return "pipelines.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Pipeline{}, &PipelineList{})
}
diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go
index 03386d932..56fdafea0 100644
--- a/api/v1alpha1/reservation_types.go
+++ b/api/v1alpha1/reservation_types.go
@@ -97,6 +97,9 @@ type ReservationList struct {
Items []Reservation `json:"items"`
}
+func (*Reservation) URI() string { return "reservations.cortex.cloud/v1alpha1" }
+func (*ReservationList) URI() string { return "reservations.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Reservation{}, &ReservationList{})
}
diff --git a/api/v1alpha1/step_types.go b/api/v1alpha1/step_types.go
index d4866fc31..012034e28 100644
--- a/api/v1alpha1/step_types.go
+++ b/api/v1alpha1/step_types.go
@@ -129,6 +129,9 @@ type StepList struct {
Items []Step `json:"items"`
}
+func (*Step) URI() string { return "steps.cortex.cloud/v1alpha1" }
+func (*StepList) URI() string { return "steps.cortex.cloud/v1alpha1" }
+
func init() {
SchemeBuilder.Register(&Step{}, &StepList{})
}
diff --git a/cmd/main.go b/cmd/main.go
index 42ac2742a..64d6cf6ca 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -22,6 +22,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/cluster"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics"
@@ -54,6 +55,7 @@ import (
"github.com/cobaltcore-dev/cortex/pkg/conf"
"github.com/cobaltcore-dev/cortex/pkg/db"
"github.com/cobaltcore-dev/cortex/pkg/monitoring"
+ "github.com/cobaltcore-dev/cortex/pkg/multicluster"
"github.com/sapcc/go-bits/httpext"
"github.com/sapcc/go-bits/must"
corev1 "k8s.io/api/core/v1"
@@ -243,6 +245,33 @@ func main() {
os.Exit(1)
}
+ homeCluster, err := cluster.New(restConfig, func(o *cluster.Options) { o.Scheme = scheme })
+ if err != nil {
+ setupLog.Error(err, "unable to create home cluster")
+ os.Exit(1)
+ }
+ if err := mgr.Add(homeCluster); err != nil {
+ setupLog.Error(err, "unable to add home cluster")
+ os.Exit(1)
+ }
+ multiclusterClient := &multicluster.Client{
+ HomeCluster: homeCluster,
+ HomeRestConfig: restConfig,
+ HomeScheme: scheme,
+ }
+ for _, override := range config.APIServerOverrides {
+ cluster, err := multiclusterClient.AddRemote(override.Resource, override.Host, override.CACert)
+ if err != nil {
+ setupLog.Error(err, "unable to create cluster for apiserver override", "apiserver", override.Host)
+ os.Exit(1)
+ }
+ // Also tell the manager about this cluster so that controllers can use it.
+ if err := mgr.Add(cluster); err != nil {
+ setupLog.Error(err, "unable to add cluster for apiserver override", "apiserver", override.Host)
+ os.Exit(1)
+ }
+ }
+
// Our custom monitoring registry can add prometheus labels to all metrics.
// This is useful to distinguish metrics from different deployments.
metrics.Registry = monitoring.WrapRegistry(metrics.Registry, config.Monitoring)
@@ -265,14 +294,14 @@ func main() {
Conf: config,
}
// Inferred through the base controller.
- decisionController.Client = mgr.GetClient()
+ decisionController.Client = multiclusterClient
decisionController.OperatorName = config.Operator
- if err := (decisionController).SetupWithManager(mgr); err != nil {
+ if err := (decisionController).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler")
os.Exit(1)
}
novashims.NewAPI(config, decisionController).Init(mux)
- go decisionsnova.CleanupNovaDecisionsRegularly(ctx, mgr.GetClient(), config)
+ go decisionsnova.CleanupNovaDecisionsRegularly(ctx, multiclusterClient, config)
}
if slices.Contains(config.EnabledControllers, "nova-deschedulings-pipeline-controller") {
// Deschedulings controller
@@ -284,18 +313,18 @@ func main() {
CycleDetector: deschedulingnova.NewCycleDetector(),
}
// Inferred through the base controller.
- deschedulingsController.Client = mgr.GetClient()
+ deschedulingsController.Client = multiclusterClient
deschedulingsController.OperatorName = config.Operator
- if err := (deschedulingsController).SetupWithManager(mgr); err != nil {
+ if err := (deschedulingsController).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DeschedulingsReconciler")
os.Exit(1)
}
go deschedulingsController.CreateDeschedulingsPeriodically(ctx)
// Deschedulings cleanup on startup
if err := (&deschedulingnova.Cleanup{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Cleanup")
os.Exit(1)
}
@@ -306,14 +335,14 @@ func main() {
Conf: config,
}
// Inferred through the base controller.
- controller.Client = mgr.GetClient()
+ controller.Client = multiclusterClient
controller.OperatorName = config.Operator
- if err := (controller).SetupWithManager(mgr); err != nil {
+ if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler")
os.Exit(1)
}
manilashims.NewAPI(config, controller).Init(mux)
- go decisionsmanila.CleanupManilaDecisionsRegularly(ctx, mgr.GetClient(), config)
+ go decisionsmanila.CleanupManilaDecisionsRegularly(ctx, multiclusterClient, config)
}
if slices.Contains(config.EnabledControllers, "cinder-decisions-pipeline-controller") {
controller := &decisionscinder.DecisionPipelineController{
@@ -321,14 +350,14 @@ func main() {
Conf: config,
}
// Inferred through the base controller.
- controller.Client = mgr.GetClient()
+ controller.Client = multiclusterClient
controller.OperatorName = config.Operator
- if err := (controller).SetupWithManager(mgr); err != nil {
+ if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler")
os.Exit(1)
}
cindershims.NewAPI(config, controller).Init(mux)
- go decisionscinder.CleanupCinderDecisionsRegularly(ctx, mgr.GetClient(), config)
+ go decisionscinder.CleanupCinderDecisionsRegularly(ctx, multiclusterClient, config)
}
if slices.Contains(config.EnabledControllers, "ironcore-decisions-pipeline-controller") {
controller := &decisionsmachines.DecisionPipelineController{
@@ -336,9 +365,9 @@ func main() {
Conf: config,
}
// Inferred through the base controller.
- controller.Client = mgr.GetClient()
+ controller.Client = multiclusterClient
controller.OperatorName = config.Operator
- if err := (controller).SetupWithManager(mgr); err != nil {
+ if err := (controller).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DecisionReconciler")
os.Exit(1)
}
@@ -347,23 +376,23 @@ func main() {
// Setup a controller which will reconcile the history and explanation for
// decision resources.
explanationController := &explanation.Controller{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
OperatorName: config.Operator,
}
- if err := explanationController.SetupWithManager(mgr); err != nil {
+ if err := explanationController.SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ExplanationController")
os.Exit(1)
}
}
if slices.Contains(config.EnabledControllers, "reservations-controller") {
- monitor := reservationscontroller.NewControllerMonitor(mgr.GetClient())
+ monitor := reservationscontroller.NewControllerMonitor(multiclusterClient)
metrics.Registry.MustRegister(&monitor)
if err := (&reservationscontroller.ReservationReconciler{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Conf: config,
HypervisorClient: reservationscontroller.NewHypervisorClient(),
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Reservation")
os.Exit(1)
}
@@ -372,20 +401,20 @@ func main() {
monitor := datasources.NewMonitor()
metrics.Registry.MustRegister(&monitor)
if err := (&openstack.OpenStackDatasourceReconciler{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Monitor: monitor,
Conf: config,
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "OpenStackDatasourceReconciler")
os.Exit(1)
}
if err := (&prometheus.PrometheusDatasourceReconciler{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Monitor: monitor,
Conf: config,
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "PrometheusDatasourceReconciler")
os.Exit(1)
}
@@ -394,29 +423,29 @@ func main() {
monitor := extractor.NewMonitor()
metrics.Registry.MustRegister(&monitor)
if err := (&extractor.KnowledgeReconciler{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Monitor: monitor,
Conf: config,
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "KnowledgeReconciler")
os.Exit(1)
}
if err := (&extractor.TriggerReconciler{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Conf: config,
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "TriggerReconciler")
os.Exit(1)
}
}
if slices.Contains(config.EnabledControllers, "kpis-controller") {
if err := (&kpis.Controller{
- Client: mgr.GetClient(),
+ Client: multiclusterClient,
SupportedKPIsByImpl: kpis.SupportedKPIsByImpl,
OperatorName: config.Operator,
- }).SetupWithManager(mgr); err != nil {
+ }).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "KPIController")
os.Exit(1)
}
diff --git a/docs/guides/multicluster/cortex-home-crb.yaml b/docs/guides/multicluster/cortex-home-crb.yaml
new file mode 100644
index 000000000..1854084f2
--- /dev/null
+++ b/docs/guides/multicluster/cortex-home-crb.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: grant-cortex-remote-oidc-access
+subjects:
+- kind: User
+ apiGroup: rbac.authorization.k8s.io
+ name: system:anonymous
+roleRef:
+ kind: ClusterRole
+ name: system:service-account-issuer-discovery
+ apiGroup: rbac.authorization.k8s.io
diff --git a/docs/guides/multicluster/cortex-home.yaml b/docs/guides/multicluster/cortex-home.yaml
new file mode 100644
index 000000000..c1e093807
--- /dev/null
+++ b/docs/guides/multicluster/cortex-home.yaml
@@ -0,0 +1,23 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: cortex-home
+nodes:
+ - role: worker
+ - role: control-plane
+ extraPortMappings:
+ - containerPort: 6443
+ hostPort: 8443
+ kubeadmConfigPatches:
+ - |
+ kind: ClusterConfiguration
+ apiServer:
+ extraArgs:
+ service-account-issuer: "https://host.docker.internal:8443"
+ service-account-jwks-uri: "https://host.docker.internal:8443/openid/v1/jwks"
+ certSANs:
+ - api-proxy
+ - api-proxy.default.svc
+ - api-proxy.default.svc.cluster.local
+ - localhost
+ - 127.0.0.1
+ - host.docker.internal
diff --git a/docs/guides/multicluster/cortex-remote-crb.yaml b/docs/guides/multicluster/cortex-remote-crb.yaml
new file mode 100644
index 000000000..47ea9aa81
--- /dev/null
+++ b/docs/guides/multicluster/cortex-remote-crb.yaml
@@ -0,0 +1,18 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: grant-cortex-cluster-admin
+subjects:
+- kind: User
+ apiGroup: rbac.authorization.k8s.io
+ name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-knowledge-controller-manager"
+- kind: User
+ apiGroup: rbac.authorization.k8s.io
+ name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-scheduling-controller-manager"
+- kind: User
+ apiGroup: rbac.authorization.k8s.io
+ name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-reservations-controller-manager"
+roleRef:
+ kind: ClusterRole
+ name: cluster-admin
+ apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
diff --git a/docs/guides/multicluster/cortex-remote.yaml b/docs/guides/multicluster/cortex-remote.yaml
new file mode 100644
index 000000000..675a14063
--- /dev/null
+++ b/docs/guides/multicluster/cortex-remote.yaml
@@ -0,0 +1,27 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: cortex-remote
+nodes:
+ - role: control-plane
+ extraPortMappings:
+ - containerPort: 6443
+ hostPort: 8444
+ extraMounts:
+ - hostPath: /tmp/root-ca-home.pem
+ containerPath: /etc/ca-certificates/root-ca.pem
+ kubeadmConfigPatches:
+ - |
+ kind: ClusterConfiguration
+ apiServer:
+ extraArgs:
+ oidc-client-id: "https://host.docker.internal:8443" # = audience
+ oidc-issuer-url: "https://host.docker.internal:8443"
+ oidc-username-claim: sub
+ oidc-ca-file: /etc/ca-certificates/root-ca.pem
+ certSANs:
+ - api-proxy
+ - api-proxy.default.svc
+ - api-proxy.default.svc.cluster.local
+ - localhost
+ - 127.0.0.1
+ - host.docker.internal
\ No newline at end of file
diff --git a/docs/guides/multicluster/readme.md b/docs/guides/multicluster/readme.md
new file mode 100644
index 000000000..fa0fd3c83
--- /dev/null
+++ b/docs/guides/multicluster/readme.md
@@ -0,0 +1,104 @@
+# Cortex Multi-Cluster Testing
+
+Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster.
+
+This guide will walk you through setting up a multi-cluster Cortex deployment using [kind](https://kind.sigs.k8s.io/). We will create two kind clusters: `cortex-home` and `cortex-remote`. The `cortex-home` cluster will host the Cortex control plane, while the `cortex-remote` cluster will be used to store CRDs.
+
+To store its CRDs in the `cortex-remote` cluster, the `cortex-home` cluster needs to be able to authenticate to the `cortex-remote` cluster's API server. We will achieve this by configuring the `cortex-remote` cluster to trust the service account tokens issued by the `cortex-home` cluster. In this way, no external OIDC provider is needed, because the `cortex-home` cluster's own OIDC issuer for service accounts acts as the identity provider.
+
+Here is a diagram illustrating the authentication flow:
+
+```mermaid
+sequenceDiagram
+ participant Home as cortex-home
+ participant Remote as cortex-remote
+ Home->>Home: Service Account Token Issued
+ Home->>Remote: API Request with Token
+ Remote->>Remote: Token Verified Against Home's OIDC Issuer
+ Remote->>Home: API Response
+```
+
+## Home Cluster Setup
+
+First we set up the `cortex-home` cluster. The provided kind configuration file `cortex-home.yaml` sets up the cluster with the necessary port mappings to allow communication between the two clusters. `cortex-home` will expose its API server on port `8443`, which `cortex-remote` will use to verify service account tokens through `https://host.docker.internal:8443`.
+
+```bash
+kind create cluster --config docs/guides/multicluster/cortex-home.yaml
+```
+
+Next, we need to expose the OIDC issuer endpoint of the `cortex-home` cluster's API server to the `cortex-remote` cluster. We do this by creating a `ClusterRoleBinding` that grants the `system:service-account-issuer-discovery` role to the `system:anonymous` user in the `cortex-home` cluster, so that the `cortex-remote` API server can fetch the OIDC discovery documents without authenticating.
+
+```bash
+kubectl --context kind-cortex-home apply -f docs/guides/multicluster/cortex-home-crb.yaml
+```
+
+To talk back to the `cortex-home` cluster's OIDC endpoint, the `cortex-remote` cluster needs to trust the root CA certificate used by the `cortex-home` cluster's API server. We can extract this certificate from the `extension-apiserver-authentication` config map in the `kube-system` namespace, and save it to a temporary file for later use.
+
+```bash
+kubectl --context kind-cortex-home --namespace kube-system \
+ get configmap extension-apiserver-authentication \
+ -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-home.pem
+```
+
+## Remote Cluster Setup
+
+With all the prerequisites in place, we can now set up the `cortex-remote` cluster. We create the cluster using the provided kind configuration file `cortex-remote.yaml`. This configuration will tell the `cortex-remote` cluster to trust the `cortex-home` cluster's API server as the OIDC issuer for service account token verification. Also, the `cortex-remote` cluster will trust the root CA certificate we extracted earlier. The `cortex-remote` apiserver will be accessible at `https://host.docker.internal:8444`.
+
+```bash
+kind create cluster --config docs/guides/multicluster/cortex-remote.yaml
+```
+
+Next, we need to create a `ClusterRoleBinding` in the `cortex-remote` cluster that grants service accounts coming from the `cortex-home` cluster access to the appropriate resources. We do this by applying the provided `cortex-remote-crb.yaml` file.
+
+```bash
+kubectl --context kind-cortex-remote apply -f docs/guides/multicluster/cortex-remote-crb.yaml
+```
+
+## Deploying Cortex
+
+Before we launch cortex, make sure that the CRDs are installed in the `cortex-remote` cluster.
+
+```bash
+kubectl config use-context kind-cortex-remote
+helm install helm/bundles/cortex-crds --generate-name
+```
+
+Also, we need to extract the root CA certificate used by the `cortex-remote` cluster's API server, so that we can configure the cortex pods in the `cortex-home` cluster to trust it.
+
+```bash
+kubectl --context kind-cortex-remote --namespace kube-system \
+ get configmap extension-apiserver-authentication \
+ -o jsonpath="{.data['client-ca-file']}" > /tmp/root-ca-remote.pem
+```
+
+Now we can deploy cortex to the `cortex-home` cluster, configuring it to use the `cortex-remote` cluster for CRD storage. We create a temporary Helm values override file that specifies the API server URL and root CA certificate for the `cortex-remote` cluster. In this example, we are configuring the `decisions.cortex.cloud/v1alpha1` resource to be stored in the `cortex-remote` cluster.
+
+```bash
+export TILT_OVERRIDES_PATH=/tmp/cortex-values.yaml
+tee $TILT_OVERRIDES_PATH <
Date: Tue, 25 Nov 2025 14:56:09 +0100
Subject: [PATCH 2/3] Revert change in pipelines.yaml
---
helm/bundles/cortex-ironcore/templates/pipelines.yaml | 1 +
1 file changed, 1 insertion(+)
diff --git a/helm/bundles/cortex-ironcore/templates/pipelines.yaml b/helm/bundles/cortex-ironcore/templates/pipelines.yaml
index 4a1cdea43..6b087bfbf 100644
--- a/helm/bundles/cortex-ironcore/templates/pipelines.yaml
+++ b/helm/bundles/cortex-ironcore/templates/pipelines.yaml
@@ -8,6 +8,7 @@ spec:
description: |
This pipeline is used to schedule ironcore machines onto machinepools.
type: filter-weigher
+ createDecisions: true
steps:
- ref: {name: machinepools-noop}
mandatory: false
From 6febba469b1b7de532cbfd2a52a846b96cb9b6f9 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Tue, 25 Nov 2025 15:55:15 +0100
Subject: [PATCH 3/3] Add ironcore serviceaccount to guide crb
---
docs/guides/multicluster/cortex-remote-crb.yaml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/docs/guides/multicluster/cortex-remote-crb.yaml b/docs/guides/multicluster/cortex-remote-crb.yaml
index 47ea9aa81..9928720d7 100644
--- a/docs/guides/multicluster/cortex-remote-crb.yaml
+++ b/docs/guides/multicluster/cortex-remote-crb.yaml
@@ -12,6 +12,9 @@ subjects:
- kind: User
apiGroup: rbac.authorization.k8s.io
name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-nova-reservations-controller-manager"
+- kind: User
+ apiGroup: rbac.authorization.k8s.io
+ name: "https://host.docker.internal:8443#system:serviceaccount:default:cortex-ironcore-controller-manager"
roleRef:
kind: ClusterRole
name: cluster-admin