diff --git a/documentdb-kubectl-plugin/cmd/status.go b/documentdb-kubectl-plugin/cmd/status.go index 2fcf506b..b7545825 100644 --- a/documentdb-kubectl-plugin/cmd/status.go +++ b/documentdb-kubectl-plugin/cmd/status.go @@ -101,11 +101,11 @@ func (o *statusOptions) run(ctx context.Context, cmd *cobra.Command) error { if err != nil { return fmt.Errorf("failed to read spec.clusterReplication.primary: %w", err) } - clusterNames, found, err := unstructured.NestedStringSlice(document.Object, "spec", "clusterReplication", "clusterList") + clusterListRaw, found, err := unstructured.NestedSlice(document.Object, "spec", "clusterReplication", "clusterList") if err != nil { return fmt.Errorf("failed to read spec.clusterReplication.clusterList: %w", err) } - if !found || len(clusterNames) == 0 { + if !found || len(clusterListRaw) == 0 { return errors.New("DocumentDB spec.clusterReplication.clusterList is empty") } @@ -120,8 +120,9 @@ func (o *statusOptions) run(ctx context.Context, cmd *cobra.Command) error { } fmt.Fprintln(cmd.OutOrStdout()) - statuses := make([]clusterStatus, 0, len(clusterNames)) - for _, cluster := range clusterNames { + statuses := make([]clusterStatus, 0, len(clusterListRaw)) + for _, clusterObj := range clusterListRaw { + cluster := clusterObj.(map[string]interface{})["name"].(string) role := "Replica" if cluster == primaryCluster { role = "Primary" diff --git a/documentdb-kubectl-plugin/cmd/status_run_test.go b/documentdb-kubectl-plugin/cmd/status_run_test.go index 7e180d49..f731d99e 100644 --- a/documentdb-kubectl-plugin/cmd/status_run_test.go +++ b/documentdb-kubectl-plugin/cmd/status_run_test.go @@ -42,7 +42,11 @@ func TestStatusRunRendersClusterTable(t *testing.T) { docName := "documentdb-sample" hubDoc := newDocument(docName, namespace, "cluster-a", "Ready") - if err := unstructured.SetNestedStringSlice(hubDoc.Object, []string{"cluster-a", "cluster-b"}, "spec", "clusterReplication", "clusterList"); err != nil { + clusterList := []interface{}{ + map[string]interface{}{"name": "cluster-a"}, + map[string]interface{}{"name": "cluster-b"}, + } + if err := unstructured.SetNestedSlice(hubDoc.Object, clusterList, "spec", "clusterReplication", "clusterList"); err != nil { t.Fatalf("failed to set clusterList: %v", err) } if err := unstructured.SetNestedField(hubDoc.Object, "PrimaryConn", "status", "connectionString"); err != nil { diff --git a/documentdb-playground/aws-setup/scripts/delete-cluster.sh b/documentdb-playground/aws-setup/scripts/delete-cluster.sh index df14baa0..358dde2b 100755 --- a/documentdb-playground/aws-setup/scripts/delete-cluster.sh +++ b/documentdb-playground/aws-setup/scripts/delete-cluster.sh @@ -13,6 +13,7 @@ REGION="us-west-2" DELETE_CLUSTER="${DELETE_CLUSTER:-true}" DELETE_OPERATOR="${DELETE_OPERATOR:-true}" DELETE_INSTANCE="${DELETE_INSTANCE:-true}" +SKIP_CONFIRMATION="false" # Parse command line arguments while [[ $# -gt 0 ]]; do @@ -37,6 +38,10 @@ while [[ $# -gt 0 ]]; do REGION="$2" shift 2 ;; + -y|--yes) + SKIP_CONFIRMATION="true" + shift + ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" @@ -45,12 +50,14 @@ while [[ $# -gt 0 ]]; do echo " --instance-and-operator Delete instances and operator (keep cluster)" echo " --cluster-name NAME EKS cluster name (default: documentdb-cluster)" echo " --region REGION AWS region (default: us-west-2)" + echo " -y, --yes Skip confirmation prompt" echo " -h, --help Show this help message" echo "" echo "Examples:" echo " $0 # Delete everything (default)" echo " $0 --instance-only # Delete only DocumentDB instances" 
echo " $0 --instance-and-operator # Delete instances and operator, keep cluster" + echo " $0 --yes # Delete everything without confirmation" exit 0 ;; *) @@ -86,6 +93,11 @@ error() { # Confirmation prompt confirm_deletion() { + if [ "$SKIP_CONFIRMATION" == "true" ]; then + log "Skipping confirmation (--yes flag provided)" + return 0 + fi + echo "" echo "=======================================" echo " DELETION WARNING" diff --git a/documentdb-playground/multi-clould-setup/multi-cloud-deployment-guide.md b/documentdb-playground/multi-clould-setup/multi-cloud-deployment-guide.md index af952966..c84205a8 100644 --- a/documentdb-playground/multi-clould-setup/multi-cloud-deployment-guide.md +++ b/documentdb-playground/multi-clould-setup/multi-cloud-deployment-guide.md @@ -31,7 +31,7 @@ possible and will be documented as they are tested. - Two kubernetes clusters that are network connected to each other. For example using - [Azure VPN Gatway](https://learn.microsoft.com/en-us/azure/vpn-gateway/vpn-gateway-about-vpngateways) - [Azure ExpressRoute](https://learn.microsoft.com/en-us/azure/expressroute/expressroute-introduction) -- ENV variables `$AZURE_MEMBER` and `$ON_PREM_MEMBER` with the kubectl context names for your clusters +- ENV variables `$CLOUD_MEMBER` and `$ON_PREM_MEMBER` with the kubectl context names for your clusters - (e.g. "azure-documentdb-cluster", "k3s-cluster-context") ## Architecture Overview @@ -94,7 +94,7 @@ Run `kubectl get membercluster -A` again and see `True` under `JOINED` to confir ```bash # Install on primary -kubectl config use-context $AZURE_MEMBER +kubectl config use-context $CLOUD_MEMBER helm repo add jetstack https://charts.jetstack.io helm repo update helm install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --set installCRDs=true @@ -251,7 +251,7 @@ kubectl apply -f ./documentdb-base.yaml After a few seconds, ensure that the operator is running on both of the clusters ```sh -kubectl config use-context $AZURE_MEMBER +kubectl config use-context $CLOUD_MEMBER kubectl get deployment -n documentdb-operator kubectl config use-context $ON_PREM_MEMBER kubectl get deployment -n documentdb-operator @@ -274,21 +274,21 @@ Physical replication provides high availability and disaster recovery capabiliti ```bash kubectl config use-context $ON_PREM_MEMBER kubectl create configmap cluster-name -n kube-system --from-literal=name=on-prem-cluster-name -kubectl config use-context $AZURE_MEMBER -kubectl create configmap cluster-name -n kube-system --from-literal=name=azure-cluster-name +kubectl config use-context $CLOUD_MEMBER +kubectl create configmap cluster-name -n kube-system --from-literal=name=cloud-cluster-name ``` OR ```bash -cat < azure-cluster-name.yaml +cat < cloud-cluster-name.yaml apiVersion: v1 kind: ConfigMap metadata: name: cluster-name namespace: kube-system data: - name: "azure-cluster-name" + name: "cloud-cluster-name" EOF cat < on-prem-name.yaml @@ -301,7 +301,7 @@ data: name: "on-prem-cluster-name" EOF -kubectl config use-context $AZURE_MEMBER +kubectl config use-context $CLOUD_MEMBER kubectl apply -f ./primary-name.yaml kubectl config use-context $ON_PREM_MEMBER kubectl apply -f ./replica-name.yaml @@ -332,10 +332,10 @@ spec: storage: pvcSize: 10Gi clusterReplication: - primary: azure-cluster-name + primary: cloud-cluster-name clusterList: - - azure-cluster-name - - on-prem-cluster-name + - name: cloud-cluster-name + - name: on-prem-cluster-name exposeViaService: serviceType: ClusterIP @@ -364,7 +364,7 @@ kubectl apply -f 
./documentdb-resource.yaml After a few seconds, ensure that the operator is running on both of the clusters ```sh -kubectl config use-context $AZURE_MEMBER +kubectl config use-context $CLOUD_MEMBER kubectl get pods -n documentdb-operator-ns kubectl config use-context $ON_PREM_MEMBER kubectl get pods -n documentdb-operator-ns @@ -374,7 +374,7 @@ Output: ```text NAME READY STATUS RESTARTS AGE -azure-cluster-name-1 2/2 Running 0 3m33s +cloud-cluster-name-1 2/2 Running 0 3m33s ``` ## Testing and Verification @@ -382,8 +382,8 @@ azure-cluster-name-1 2/2 Running 0 3m33s 1. Test connection to DocumentDB: ```bash -# Get the service IP from primary (azure) -kubectl config use-context $AZURE_MEMBER +# Get the service IP from primary (cloud) +kubectl config use-context $CLOUD_MEMBER service_ip=$(kubectl get service documentdb-service-documentdb-preview -n documentdb-preview-ns -o jsonpath="{.status.loadBalancer.ingress[0].ip}") # Connect using mongosh @@ -410,7 +410,7 @@ kubectl config use-context hub kubectl patch documentdb documentdb-preview -n documentdb-preview-ns \ --type='json' -p='[ {"op": "replace", "path": "/spec/clusterReplication/primary", "value":"on-prem-cluster-name"}, - {"op": "replace", "path": "/spec/clusterReplication/clusterList", "value":["on-prem-cluster-name"]} + {"op": "replace", "path": "/spec/clusterReplication/clusterList", "value":[{"name": "on-prem-cluster-name"}]} ]' ``` diff --git a/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml b/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml index a2a92fdc..d8e5fb91 100644 --- a/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml +++ b/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml @@ -56,12 +56,36 @@ spec: description: ClusterList is the list of clusters participating in replication. items: - type: string + properties: + environment: + description: |- + EnvironmentOverride is the cloud environment of the member cluster. + Will default to the global setting + enum: + - eks + - aks + - gke + type: string + name: + description: Name is the name of the member cluster. + type: string + storageClass: + description: StorageClassOverride specifies the storage + class for DocumentDB persistent volumes in this member + cluster. + type: string + required: + - name + type: object type: array - enableFleetForCrossCloud: - description: EnableFleetForCrossCloud determines whether to use - KubeFleet mechanics for the replication - type: boolean + crossCloudNetworkingStrategy: + description: CrossCloudNetworking determines which type of networking + mechanics for the replication + enum: + - AzureFleet + - Istio + - None + type: string highAvailability: description: Whether or not to have replicas on the primary cluster. 
type: boolean diff --git a/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml b/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml index b0f9b2bc..be0d9042 100644 --- a/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml +++ b/operator/documentdb-helm-chart/templates/03_documentdb_wal_replica.yaml @@ -171,4 +171,4 @@ metadata: labels: app.kubernetes.io/name: wal-replia-manager app.kubernetes.io/managed-by: kustomize -{{- end }} \ No newline at end of file +{{- end }} diff --git a/operator/src/api/preview/documentdb_types.go b/operator/src/api/preview/documentdb_types.go index d26c5620..95a65544 100644 --- a/operator/src/api/preview/documentdb_types.go +++ b/operator/src/api/preview/documentdb_types.go @@ -81,16 +81,28 @@ type StorageConfiguration struct { } type ClusterReplication struct { - // EnableFleetForCrossCloud determines whether to use KubeFleet mechanics for the replication - EnableFleetForCrossCloud bool `json:"enableFleetForCrossCloud,omitempty"` + // CrossCloudNetworking determines which type of networking mechanics for the replication + // +kubebuilder:validation:Enum=AzureFleet;Istio;None + CrossCloudNetworkingStrategy string `json:"crossCloudNetworkingStrategy,omitempty"` // Primary is the name of the primary cluster for replication. Primary string `json:"primary"` // ClusterList is the list of clusters participating in replication. - ClusterList []string `json:"clusterList"` + ClusterList []MemberCluster `json:"clusterList"` // Whether or not to have replicas on the primary cluster. HighAvailability bool `json:"highAvailability,omitempty"` } +type MemberCluster struct { + // Name is the name of the member cluster. + Name string `json:"name"` + // EnvironmentOverride is the cloud environment of the member cluster. + // Will default to the global setting + // +kubebuilder:validation:Enum=eks;aks;gke + EnvironmentOverride string `json:"environment,omitempty"` + // StorageClassOverride specifies the storage class for DocumentDB persistent volumes in this member cluster. + StorageClassOverride string `json:"storageClass,omitempty"` +} + type ExposeViaService struct { // ServiceType determines the type of service to expose for DocumentDB. // +kubebuilder:validation:Enum=LoadBalancer;ClusterIP diff --git a/operator/src/api/preview/zz_generated.deepcopy.go b/operator/src/api/preview/zz_generated.deepcopy.go index 94b244b6..6436f3a9 100644 --- a/operator/src/api/preview/zz_generated.deepcopy.go +++ b/operator/src/api/preview/zz_generated.deepcopy.go @@ -16,7 +16,7 @@ func (in *ClusterReplication) DeepCopyInto(out *ClusterReplication) { *out = *in if in.ClusterList != nil { in, out := &in.ClusterList, &out.ClusterList - *out = make([]string, len(*in)) + *out = make([]MemberCluster, len(*in)) copy(*out, *in) } } @@ -143,6 +143,21 @@ func (in *ExposeViaService) DeepCopy() *ExposeViaService { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MemberCluster) DeepCopyInto(out *MemberCluster) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemberCluster. +func (in *MemberCluster) DeepCopy() *MemberCluster { + if in == nil { + return nil + } + out := new(MemberCluster) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Resource) DeepCopyInto(out *Resource) { *out = *in diff --git a/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml b/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml index a2a92fdc..d8e5fb91 100644 --- a/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml +++ b/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml @@ -56,12 +56,36 @@ spec: description: ClusterList is the list of clusters participating in replication. items: - type: string + properties: + environment: + description: |- + EnvironmentOverride is the cloud environment of the member cluster. + Will default to the global setting + enum: + - eks + - aks + - gke + type: string + name: + description: Name is the name of the member cluster. + type: string + storageClass: + description: StorageClassOverride specifies the storage + class for DocumentDB persistent volumes in this member + cluster. + type: string + required: + - name + type: object type: array - enableFleetForCrossCloud: - description: EnableFleetForCrossCloud determines whether to use - KubeFleet mechanics for the replication - type: boolean + crossCloudNetworkingStrategy: + description: CrossCloudNetworking determines which type of networking + mechanics for the replication + enum: + - AzureFleet + - Istio + - None + type: string highAvailability: description: Whether or not to have replicas on the primary cluster. type: boolean diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go index 39fa73b5..38324860 100644 --- a/operator/src/internal/cnpg/cnpg_cluster.go +++ b/operator/src/internal/cnpg/cnpg_cluster.go @@ -15,7 +15,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" ) -func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, documentdb_image string, serviceAccountName string, log logr.Logger) *cnpgv1.Cluster { +func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, documentdb_image, serviceAccountName, storageClass string, log logr.Logger) *cnpgv1.Cluster { sidecarPluginName := documentdb.Spec.SidecarInjectorPluginName if sidecarPluginName == "" { sidecarPluginName = util.DEFAULT_SIDECAR_INJECTOR_PLUGIN @@ -31,9 +31,9 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu } // Configure storage class - use specified storage class or nil for default - var storageClass *string - if documentdb.Spec.Resource.Storage.StorageClass != "" { - storageClass = &documentdb.Spec.Resource.Storage.StorageClass + var storageClassPointer *string + if storageClass != "" { + storageClassPointer = &storageClass } return &cnpgv1.Cluster{ @@ -56,7 +56,7 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu Instances: documentdb.Spec.InstancesPerNode, ImageName: documentdb_image, StorageConfiguration: cnpgv1.StorageConfiguration{ - StorageClass: storageClass, // Use configured storage class or default + StorageClass: storageClassPointer, // Use configured storage class or default Size: documentdb.Spec.Resource.Storage.PvcSize, }, InheritedMetadata: getInheritedMetadataLabels(documentdb.Name), diff --git a/operator/src/internal/controller/documentdb_controller.go b/operator/src/internal/controller/documentdb_controller.go index ddf39466..5f7207f4 100644 --- a/operator/src/internal/controller/documentdb_controller.go +++ b/operator/src/internal/controller/documentdb_controller.go @@ -109,7 +109,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request) 
documentdbImage := util.GetDocumentDBImageForInstance(documentdb) currentCnpgCluster := &cnpgv1.Cluster{} - desiredCnpgCluster := cnpg.GetCnpgClusterSpec(req, documentdb, documentdbImage, documentdb.Name, logger) + desiredCnpgCluster := cnpg.GetCnpgClusterSpec(req, documentdb, documentdbImage, documentdb.Name, replicationContext.StorageClass, logger) if replicationContext.IsReplicating() { err = r.AddClusterReplicationToClusterSpec(ctx, documentdb, replicationContext, desiredCnpgCluster) @@ -133,7 +133,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Check if anything has changed in the generated cnpg spec - err, requeueTime := r.TryUpdateCluster(ctx, currentCnpgCluster, desiredCnpgCluster, documentdb) + err, requeueTime := r.TryUpdateCluster(ctx, currentCnpgCluster, desiredCnpgCluster, documentdb, replicationContext) if err != nil { logger.Error(err, "Failed to update CNPG Cluster") } @@ -157,7 +157,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request) if currentCnpgCluster.Status.Phase == "Cluster in healthy state" && replicationContext.IsPrimary() { grantCommand := "GRANT documentdb_admin_role TO streaming_replica;" - if err := r.executeSQLCommand(ctx, documentdb, req.Namespace, replicationContext.Self, grantCommand, "grant-permissions"); err != nil { + if err := r.executeSQLCommand(ctx, documentdb, replicationContext, grantCommand, "grant-permissions"); err != nil { logger.Error(err, "Failed to grant permissions to streaming_replica") return ctrl.Result{RequeueAfter: RequeueAfterShort}, nil } @@ -302,13 +302,14 @@ func Promote(ctx context.Context, cli client.Client, } // executeSQLCommand creates a pod to execute SQL commands against the azure-cluster-rw service -func (r *DocumentDBReconciler) executeSQLCommand(ctx context.Context, documentdb *dbpreview.DocumentDB, namespace, self, sqlCommand, uniqueName string) error { +// TODO: Should find a less intrusive way to do this with CNPG +func (r *DocumentDBReconciler) executeSQLCommand(ctx context.Context, documentdb *dbpreview.DocumentDB, replicationContext *util.ReplicationContext, sqlCommand, uniqueName string) error { zero := int32(0) - host := self + "-rw" + host := replicationContext.Self + "-rw" sqlPod := &batchv1.Job{ ObjectMeta: ctrl.ObjectMeta{ Name: fmt.Sprintf("%s-%s-sql-executor", documentdb.Name, uniqueName), - Namespace: namespace, + Namespace: documentdb.Namespace, }, Spec: batchv1.JobSpec{ Template: corev1.PodTemplateSpec{ @@ -348,6 +349,15 @@ func (r *DocumentDBReconciler) executeSQLCommand(ctx context.Context, documentdb }, } + if replicationContext.IsIstioNetworking() { + sqlPod.Spec.Template.ObjectMeta = + ctrl.ObjectMeta{ + Annotations: map[string]string{ + "sidecar.istio.io/inject": "false", + }, + } + } + if err := r.Client.Create(ctx, sqlPod); err != nil { if !errors.IsAlreadyExists(err) { return err diff --git a/operator/src/internal/controller/physical_replication.go b/operator/src/internal/controller/physical_replication.go index 1fe5dc20..bcecab46 100644 --- a/operator/src/internal/controller/physical_replication.go +++ b/operator/src/internal/controller/physical_replication.go @@ -33,11 +33,16 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec( ) error { isPrimary := documentdb.Spec.ClusterReplication.Primary == replicationContext.Self - if documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud { + if replicationContext.IsAzureFleetNetworking() { err := r.CreateServiceImportAndExport(ctx, replicationContext, 
documentdb) if err != nil { return err } + } else if replicationContext.IsIstioNetworking() { + err := r.CreateIstioRemoteServices(ctx, replicationContext, documentdb) + if err != nil { + return err + } } // No more errors possible, so we can safely edit the spec @@ -54,7 +59,8 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec( } } else if documentdb.Spec.ClusterReplication.HighAvailability { // If primary and HA we want a local standby and a slot for the WAL replica - cnpgCluster.Spec.Instances = 2 + // TODO change to 2 when WAL replica is available + cnpgCluster.Spec.Instances = 3 cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL = append(cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL, "select * from pg_create_physical_replication_slot('wal_replica');") @@ -90,7 +96,7 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec( Self: replicationContext.Self, } - if documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud { + if replicationContext.IsAzureFleetNetworking() { // need to create services for each of the other clusters cnpgCluster.Spec.Managed = &cnpgv1.ManagedConfiguration{ Services: &cnpgv1.ManagedServices{ @@ -121,7 +127,7 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec( }, }, } - for clusterName, serviceName := range replicationContext.GenerateExternalClusterServices(documentdb.Namespace, documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud) { + for clusterName, serviceName := range replicationContext.GenerateExternalClusterServices(documentdb.Namespace, replicationContext.IsAzureFleetNetworking()) { cnpgCluster.Spec.ExternalClusters = append(cnpgCluster.Spec.ExternalClusters, cnpgv1.ExternalCluster{ Name: clusterName, ConnectionParameters: map[string]string{ @@ -136,6 +142,58 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec( return nil } +func (r *DocumentDBReconciler) CreateIstioRemoteServices(ctx context.Context, replicationContext *util.ReplicationContext, documentdb *dbpreview.DocumentDB) error { + // Create dummy -rw services for remote clusters so DNS resolution works + // These services have non-matching selectors, so they have no local endpoints + // Istio will automatically route traffic through the east-west gateway + for _, remoteCluster := range replicationContext.Others { + // Create the -rw (read-write/primary) service for each remote cluster + serviceNameRW := remoteCluster + "-rw" + foundServiceRW := &corev1.Service{} + err := r.Get(ctx, types.NamespacedName{Name: serviceNameRW, Namespace: documentdb.Namespace}, foundServiceRW) + if err != nil && errors.IsNotFound(err) { + log.Log.Info("Creating Istio dummy service for remote cluster", "service", serviceNameRW, "cluster", remoteCluster) + + serviceRW := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceNameRW, + Namespace: documentdb.Namespace, + Labels: map[string]string{ + "cnpg.io/cluster": remoteCluster, + "replica_type": "primary", + }, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "postgres", + Port: 5432, + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromInt(5432), + }, + }, + Selector: map[string]string{ + // Non-matching selector ensures no local endpoints + "cnpg.io/cluster": "does-not-exist", + "cnpg.io/podRole": "does-not-exist", + }, + SessionAffinity: corev1.ServiceAffinityNone, + Type: corev1.ServiceTypeClusterIP, + }, + } + + err = r.Create(ctx, serviceRW) + if err != nil { + return fmt.Errorf("failed to create Istio dummy service %s: %w", serviceNameRW, err) + } + } 
else if err != nil { + return fmt.Errorf("failed to check for existing service %s: %w", serviceNameRW, err) + } + } + + return nil +} + func (r *DocumentDBReconciler) CreateServiceImportAndExport(ctx context.Context, replicationContext *util.ReplicationContext, documentdb *dbpreview.DocumentDB) error { for serviceName := range replicationContext.GenerateOutgoingServiceNames(documentdb.Namespace) { foundServiceExport := &fleetv1alpha1.ServiceExport{} @@ -185,7 +243,7 @@ func (r *DocumentDBReconciler) CreateServiceImportAndExport(ctx context.Context, return nil } -func (r *DocumentDBReconciler) TryUpdateCluster(ctx context.Context, current, desired *cnpgv1.Cluster, documentdb *dbpreview.DocumentDB) (error, time.Duration) { +func (r *DocumentDBReconciler) TryUpdateCluster(ctx context.Context, current, desired *cnpgv1.Cluster, documentdb *dbpreview.DocumentDB, replicationContext *util.ReplicationContext) (error, time.Duration) { if current.Spec.ReplicaCluster == nil || desired.Spec.ReplicaCluster == nil { // FOR NOW assume that we aren't going to turn on or off physical replication return nil, -1 @@ -250,21 +308,21 @@ func (r *DocumentDBReconciler) TryUpdateCluster(ctx context.Context, current, de } // push out the promotion token - err = r.CreateTokenService(ctx, current.Status.DemotionToken, documentdb.Namespace, documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud) + err = r.CreateTokenService(ctx, current.Status.DemotionToken, documentdb.Namespace, replicationContext) if err != nil { return err, time.Second * 10 } } else if primaryChanged && desired.Spec.ReplicaCluster.Primary == current.Spec.ReplicaCluster.Self { // Replica => primary - // Look for the token + // Look for the token if this is a managed failover oldPrimaryAvailable := slices.Contains( - documentdb.Spec.ClusterReplication.ClusterList, + replicationContext.Others, current.Spec.ReplicaCluster.Primary) replicaClusterConfig := desired.Spec.ReplicaCluster // If the old primary is available, we can read the token from it if oldPrimaryAvailable { - token, err, refreshTime := r.ReadToken(ctx, documentdb.Namespace, documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud) + token, err, refreshTime := r.ReadToken(ctx, documentdb.Namespace, replicationContext) if err != nil || refreshTime > 0 { return err, refreshTime } @@ -339,11 +397,11 @@ func (r *DocumentDBReconciler) TryUpdateCluster(ctx context.Context, current, de return nil, -1 } -func (r *DocumentDBReconciler) ReadToken(ctx context.Context, namespace string, fleetEnabled bool) (string, error, time.Duration) { +func (r *DocumentDBReconciler) ReadToken(ctx context.Context, namespace string, replicationContext *util.ReplicationContext) (string, error, time.Duration) { tokenServiceName := "promotion-token" - // If we are not using fleet, we only need to read the token from the configmap - if !fleetEnabled { + // If we are not using cross-cloud networking, we only need to read the token from the configmap + if !replicationContext.IsAzureFleetNetworking() && !replicationContext.IsIstioNetworking() { configMap := &corev1.ConfigMap{} err := r.Get(ctx, types.NamespacedName{Name: tokenServiceName, Namespace: namespace}, configMap) if err != nil { @@ -355,6 +413,61 @@ func (r *DocumentDBReconciler) ReadToken(ctx context.Context, namespace string, return configMap.Data["index.html"], nil, -1 } + // For Istio, create a dummy service so DNS resolution works + if replicationContext.IsIstioNetworking() { + foundService := &corev1.Service{} + err := r.Get(ctx, 
types.NamespacedName{Name: tokenServiceName, Namespace: namespace}, foundService) + if err != nil && errors.IsNotFound(err) { + log.Log.Info("Creating Istio dummy service for promotion token", "service", tokenServiceName) + + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: tokenServiceName, + Namespace: namespace, + Labels: map[string]string{ + "app": tokenServiceName, + }, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Port: 80, + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromInt(80), + }, + }, + Selector: map[string]string{ + // Non-matching selector ensures no local endpoints + "app": "does-not-exist", + }, + }, + } + + err = r.Create(ctx, service) + if err != nil && !errors.IsAlreadyExists(err) { + return "", fmt.Errorf("failed to create Istio dummy service for promotion token: %w", err), time.Second * 10 + } + } else if err != nil { + return "", fmt.Errorf("failed to check for existing service %s: %w", tokenServiceName, err), time.Second * 10 + } + + // Read token via HTTP through Istio service mesh + tokenRequestUrl := fmt.Sprintf("http://%s.%s.svc", tokenServiceName, namespace) + resp, err := http.Get(tokenRequestUrl) + if err != nil { + return "", fmt.Errorf("failed to get token from service: %w", err), time.Second * 10 + } + defer resp.Body.Close() + + token, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read token: %w", err), time.Second * 10 + } + + return string(token[:]), nil, -1 + } + + // This is the AzureFleet case foundMCS := &fleetv1alpha1.MultiClusterService{} err := r.Get(ctx, types.NamespacedName{Name: tokenServiceName, Namespace: namespace}, foundMCS) if err != nil && errors.IsNotFound(err) { @@ -409,7 +522,7 @@ func (r *DocumentDBReconciler) PromotionTokenNeedsUpdate(ctx context.Context, na return configMap.Data["index.html"] == "", nil } -func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token string, namespace string, fleetEnabled bool) error { +func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token string, namespace string, replicationContext *util.ReplicationContext) error { tokenServiceName := "promotion-token" labels := map[string]string{ "app": tokenServiceName, @@ -443,8 +556,8 @@ func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token str return fmt.Errorf("No token found yet") } - // When not using fleet, just transfer with the configmap - if !fleetEnabled { + // When not using cross-cloud networking, just transfer with the configmap + if !replicationContext.IsAzureFleetNetworking() && !replicationContext.IsIstioNetworking() { return nil } @@ -463,7 +576,7 @@ func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token str Ports: []corev1.ContainerPort{ { ContainerPort: 80, - Protocol: "TCP", + Protocol: corev1.ProtocolTCP, }, }, VolumeMounts: []corev1.VolumeMount{ @@ -507,7 +620,7 @@ func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token str { Port: 80, TargetPort: intstr.FromInt(80), - Protocol: "TCP", + Protocol: corev1.ProtocolTCP, }, }, }, @@ -518,17 +631,19 @@ func (r *DocumentDBReconciler) CreateTokenService(ctx context.Context, token str return fmt.Errorf("failed to create Service: %w", err) } - // Create ServiceExport for fleet networking - serviceExport := &fleetv1alpha1.ServiceExport{ - ObjectMeta: metav1.ObjectMeta{ - Name: tokenServiceName, - Namespace: namespace, - }, - } + // Create ServiceExport only for fleet networking + if 
replicationContext.IsAzureFleetNetworking() { + serviceExport := &fleetv1alpha1.ServiceExport{ + ObjectMeta: metav1.ObjectMeta{ + Name: tokenServiceName, + Namespace: namespace, + }, + } - err = r.Client.Create(ctx, serviceExport) - if err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("failed to create ServiceExport: %w", err) + err = r.Client.Create(ctx, serviceExport) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create ServiceExport: %w", err) + } } return nil diff --git a/operator/src/internal/utils/replication_context.go b/operator/src/internal/utils/replication_context.go index 80b0c30d..084c3c3e 100644 --- a/operator/src/internal/utils/replication_context.go +++ b/operator/src/internal/utils/replication_context.go @@ -14,14 +14,25 @@ import ( ) type ReplicationContext struct { - Self string - Others []string - PrimaryRegion string - currentLocalPrimary string - targetLocalPrimary string - state replicationState + Self string + Others []string + PrimaryRegion string + CrossCloudNetworkingStrategy crossCloudNetworkingStrategy + Environment string + StorageClass string + currentLocalPrimary string + targetLocalPrimary string + state replicationState } +type crossCloudNetworkingStrategy string + +const ( + None crossCloudNetworkingStrategy = "None" + AzureFleet crossCloudNetworkingStrategy = "AzureFleet" + Istio crossCloudNetworkingStrategy = "Istio" +) + type replicationState int32 const ( @@ -31,11 +42,15 @@ const ( ) func GetReplicationContext(ctx context.Context, client client.Client, documentdb dbpreview.DocumentDB) (*ReplicationContext, error) { + singleClusterReplicationContext := ReplicationContext{ + state: NoReplication, + CrossCloudNetworkingStrategy: None, + Environment: documentdb.Spec.Environment, + StorageClass: documentdb.Spec.Resource.Storage.StorageClass, + Self: documentdb.Name, + } if documentdb.Spec.ClusterReplication == nil { - return &ReplicationContext{ - state: NoReplication, - Self: documentdb.Name, - }, nil + return &singleClusterReplicationContext, nil } self, others, err := splitSelfAndOthers(ctx, client, documentdb) @@ -45,26 +60,35 @@ func GetReplicationContext(ctx context.Context, client client.Client, documentdb // If no remote clusters, then just proceed with a regular cluster if len(others) == 0 { - return &ReplicationContext{ - state: NoReplication, - Self: documentdb.Name, - }, nil + return &singleClusterReplicationContext, nil } state := Replica - if documentdb.Spec.ClusterReplication.Primary == self { + if documentdb.Spec.ClusterReplication.Primary == self.Name { state = Primary } primaryRegion := documentdb.Spec.ClusterReplication.Primary + storageClass := documentdb.Spec.Resource.Storage.StorageClass + if self.StorageClassOverride != "" { + storageClass = self.StorageClassOverride + } + environment := documentdb.Spec.Environment + if self.EnvironmentOverride != "" { + environment = self.EnvironmentOverride + } + return &ReplicationContext{ - Self: self, - Others: others, - PrimaryRegion: primaryRegion, - state: state, - targetLocalPrimary: documentdb.Status.TargetPrimary, - currentLocalPrimary: documentdb.Status.LocalPrimary, + Self: self.Name, + Others: others, + CrossCloudNetworkingStrategy: crossCloudNetworkingStrategy(documentdb.Spec.ClusterReplication.CrossCloudNetworkingStrategy), + PrimaryRegion: primaryRegion, + Environment: environment, + StorageClass: storageClass, + state: state, + targetLocalPrimary: documentdb.Status.TargetPrimary, + currentLocalPrimary: 
documentdb.Status.LocalPrimary, }, nil } @@ -114,7 +138,7 @@ func (r ReplicationContext) EndpointEnabled() bool { func (r ReplicationContext) GenerateExternalClusterServices(namespace string, fleetEnabled bool) func(yield func(string, string) bool) { return func(yield func(string, string) bool) { for _, other := range r.Others { - serviceName := r.Self + "-rw." + namespace + ".svc" + serviceName := other + "-rw." + namespace + ".svc" if fleetEnabled { serviceName = namespace + "-" + generateServiceName(other, r.Self, namespace) + ".fleet-system.svc" } @@ -174,24 +198,27 @@ func (r *ReplicationContext) CreateStandbyNamesList() []string { return standbyNames } -func splitSelfAndOthers(ctx context.Context, client client.Client, documentdb dbpreview.DocumentDB) (string, []string, error) { - self := documentdb.Name +func splitSelfAndOthers(ctx context.Context, client client.Client, documentdb dbpreview.DocumentDB) (*dbpreview.MemberCluster, []string, error) { + selfName := documentdb.Name var err error - if documentdb.Spec.ClusterReplication.EnableFleetForCrossCloud { - self, err = GetSelfName(ctx, client) + if documentdb.Spec.ClusterReplication.CrossCloudNetworkingStrategy != string(None) { + selfName, err = GetSelfName(ctx, client) if err != nil { - return "", nil, err + return nil, nil, err } } others := []string{} + var self dbpreview.MemberCluster for _, c := range documentdb.Spec.ClusterReplication.ClusterList { - if c != self { - others = append(others, c) + if c.Name != selfName { + others = append(others, c.Name) + } else { + self = c } } - return self, others, nil + return &self, others, nil } func GetSelfName(ctx context.Context, client client.Client) (string, error) { @@ -208,3 +235,11 @@ func GetSelfName(ctx context.Context, client client.Client) (string, error) { } return self, nil } + +func (r *ReplicationContext) IsAzureFleetNetworking() bool { + return r.CrossCloudNetworkingStrategy == AzureFleet +} + +func (r *ReplicationContext) IsIstioNetworking() bool { + return r.CrossCloudNetworkingStrategy == Istio +} diff --git a/operator/src/internal/utils/util.go b/operator/src/internal/utils/util.go index af2f41e6..90e966a8 100644 --- a/operator/src/internal/utils/util.go +++ b/operator/src/internal/utils/util.go @@ -75,7 +75,7 @@ func GetDocumentDBServiceDefinition(documentdb *dbpreview.DocumentDB, replicatio // Add environment-specific annotations for LoadBalancer services if serviceType == corev1.ServiceTypeLoadBalancer { - service.ObjectMeta.Annotations = getEnvironmentSpecificAnnotations(documentdb.Spec.Environment) + service.ObjectMeta.Annotations = getEnvironmentSpecificAnnotations(replicationContext.Environment) } return service diff --git a/operator/src/scripts/aks-fleet-deployment/deploy-fleet-bicep.sh b/operator/src/scripts/aks-fleet-deployment/deploy-fleet-bicep.sh index 794b9eca..174665a2 100755 --- a/operator/src/scripts/aks-fleet-deployment/deploy-fleet-bicep.sh +++ b/operator/src/scripts/aks-fleet-deployment/deploy-fleet-bicep.sh @@ -130,7 +130,7 @@ fi echo "Fetching kubeconfig contexts..." FIRST_CLUSTER="" set +e -az fleet get-credentials --resource-group "$RESOURCE_GROUP" --name "$FLEET_NAME" --overwrite-existing >/dev/null 2>&1 +az fleet get-credentials --resource-group "$RESOURCE_GROUP" --name "$FLEET_NAME" --overwrite-existing GET_CREDS_RC=$? 
set -e if [ $GET_CREDS_RC -ne 0 ]; then @@ -144,7 +144,7 @@ fi while read -r cluster; do [ -z "$cluster" ] && continue set +e - az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$cluster" --admin --overwrite-existing >/dev/null 2>&1 + az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$cluster" --overwrite-existing >/dev/null 2>&1 rc=$? set -e if [ $rc -eq 0 ]; then diff --git a/operator/src/scripts/aks-fleet-deployment/deploy-multi-region.sh b/operator/src/scripts/aks-fleet-deployment/deploy-multi-region.sh index 9349f0ae..cf9787ac 100755 --- a/operator/src/scripts/aks-fleet-deployment/deploy-multi-region.sh +++ b/operator/src/scripts/aks-fleet-deployment/deploy-multi-region.sh @@ -2,18 +2,19 @@ # filepath: /operator/src/scripts/aks-fleet-deployment/deploy-multi-region.sh set -euo pipefail -# Deploy multi-region DocumentDB using Fleet with Traffic Manager +# Deploy multi-region DocumentDB using Fleet with Azure DNS # Usage: ./deploy-multi-region.sh [password] # # Environment variables: # RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) # DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) -# ENABLE_TRAFFIC_MANAGER: Enable Traffic Manager creation (default: true) -# TRAFFIC_MANAGER_PROFILE_NAME: Traffic Manager profile name (default: ${RESOURCE_GROUP}-documentdb-tm) +# ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) +# AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) +# AZURE_DNS_PARENT_ZONE_RESOURCE_ID: Azure DNS parent zone resource ID (default: multi-cloud.pgmongo-dev.cosmos.windows-int.net) # # Examples: # ./deploy-multi-region.sh -# ENABLE_TRAFFIC_MANAGER=false ./deploy-multi-region.sh mypassword +# ENABLE_AZURE_DNS=false ./deploy-multi-region.sh mypassword # Get the directory where this script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -21,9 +22,10 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Resource group RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" -# Traffic Manager configuration -TRAFFIC_MANAGER_PROFILE_NAME="${TRAFFIC_MANAGER_PROFILE_NAME:-${RESOURCE_GROUP}-documentdb-tm}" -ENABLE_TRAFFIC_MANAGER="${ENABLE_TRAFFIC_MANAGER:-true}" +# Azure DNS configuration +AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" # Set password from argument or environment variable DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" @@ -78,9 +80,11 @@ echo "Selected primary cluster: $PRIMARY_CLUSTER" CLUSTER_LIST="" for cluster in "${CLUSTER_ARRAY[@]}"; do if [ -z "$CLUSTER_LIST" ]; then - CLUSTER_LIST=" - ${cluster}" + CLUSTER_LIST=" - name: ${cluster}" + CLUSTER_LIST="${CLUSTER_LIST}"$'\n'" environment: aks" else - CLUSTER_LIST="${CLUSTER_LIST}"$'\n'" - ${cluster}" + CLUSTER_LIST="${CLUSTER_LIST}"$'\n'" - name: ${cluster}" + CLUSTER_LIST="${CLUSTER_LIST}"$'\n'" environment: aks" fi done @@ -127,19 +131,15 @@ echo "=======================================" # Determine hub context HUB_CONTEXT="${HUB_CONTEXT:-hub}" if ! kubectl config get-contexts "$HUB_CONTEXT" &>/dev/null; then - echo "Hub context not found, trying to find first member cluster..." 
- HUB_CONTEXT="${CLUSTER_ARRAY[0]}" - if [ -z "$HUB_CONTEXT" ]; then - echo "Error: No suitable context found. Please ensure you have credentials for the fleet." - exit 1 - fi + echo "Error: Hub context not found. Please ensure you have credentials for the fleet." + exit 1 fi echo "Using hub context: $HUB_CONTEXT" # Check if resources already exist EXISTING_RESOURCES="" -if kubectl --context "$HUB_CONTEXT" get namespace documentdb-preview-ns &>/dev/null 2>&1; then +if kubectl --context "$HUB_CONTEXT" get namespace documentdb-preview-ns; then EXISTING_RESOURCES="${EXISTING_RESOURCES}namespace " fi if kubectl --context "$HUB_CONTEXT" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null 2>&1; then @@ -169,7 +169,9 @@ if [ -n "$EXISTING_RESOURCES" ]; then kubectl --context "$HUB_CONTEXT" delete clusterresourceplacement documentdb-crp --ignore-not-found=true kubectl --context "$HUB_CONTEXT" delete namespace documentdb-preview-ns --ignore-not-found=true echo "Waiting for namespace deletion to complete..." - kubectl --context "$HUB_CONTEXT" wait --for=delete namespace/documentdb-preview-ns --timeout=60s 2>/dev/null || true + for cluster in "${CLUSTER_ARRAY[@]}"; do + kubectl --context "$cluster" wait --for=delete namespace/documentdb-preview-ns --timeout=60s 2>/dev/null || true + done ;; 2) echo "Updating existing deployment..." @@ -296,44 +298,35 @@ for cluster in "${CLUSTER_ARRAY[@]}"; do fi done -# Step 4: Create Traffic Manager for DocumentDB load balancing -if [ "$ENABLE_TRAFFIC_MANAGER" = "true" ]; then +# Step 4: Create Azure DNS zone for DocumentDB +if [ "$ENABLE_AZURE_DNS" = "true" ]; then echo "" echo "=======================================" - echo "Creating Traffic Manager for DocumentDB..." + echo "Creating Azure DNS zone for DocumentDB..." echo "=======================================" - # Create Traffic Manager profile - echo "Creating Traffic Manager profile: $TRAFFIC_MANAGER_PROFILE_NAME" - if az network traffic-manager profile show --name "$TRAFFIC_MANAGER_PROFILE_NAME" --resource-group "$RESOURCE_GROUP" &>/dev/null; then - echo "Traffic Manager profile already exists, updating..." + parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") + fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" + + # Create Azure DNS zone + if az network dns zone show --name "$fullName" --resource-group "$RESOURCE_GROUP" &>/dev/null; then + echo "Azure DNS zone already exists, updating..." else - az network traffic-manager profile create \ - --name "$TRAFFIC_MANAGER_PROFILE_NAME" \ + az network dns zone create \ + --name "$fullName" \ --resource-group "$RESOURCE_GROUP" \ - --routing-method "Priority" \ - --unique-dns-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ - --ttl 30 \ - --protocol TCP \ - --port 10260 \ - --interval 30 \ - --timeout 10 \ - --max-failures 3 + --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" fi # Wait for DocumentDB services to be ready and create endpoints echo "" echo "Waiting for DocumentDB services to be ready..." 
sleep 30 - - # Create Traffic Manager endpoints for each cluster - for i in "${!CLUSTER_ARRAY[@]}"; do - cluster="${CLUSTER_ARRAY[$i]}" - REGION=$(echo "$cluster" | awk -F- '{print $2}') - ENDPOINT_NAME="documentdb-${REGION}" - - echo "Creating Traffic Manager endpoint: $ENDPOINT_NAME" - + + # Create DNS records for each cluster + for cluster in "${CLUSTER_ARRAY[@]}"; do + echo "Creating DNS record: $cluster" + # Create service name by concatenating documentdb-preview with cluster name (max 63 chars) SERVICE_NAME="documentdb-service-${cluster}" SERVICE_NAME="${SERVICE_NAME:0:63}" @@ -351,73 +344,65 @@ if [ "$ENABLE_TRAFFIC_MANAGER" = "true" ]; then if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then echo " External IP for $cluster: $EXTERNAL_IP" - - # Delete existing endpoint if it exists - az network traffic-manager endpoint delete \ - --name "$ENDPOINT_NAME" \ - --profile-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ + + # Delete existing DNS record if it exists + az network dns record-set a delete \ + --name "$cluster" \ + --zone-name "$fullName" \ --resource-group "$RESOURCE_GROUP" \ - --type ExternalEndpoints &>/dev/null || true - - # Set priority to 1 for primary cluster, 2+ for others - if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then - PRIORITY=1 - else - PRIORITY=$((i + 2)) - fi + --yes - # Create Traffic Manager endpoint - az network traffic-manager endpoint create \ - --name "$ENDPOINT_NAME" \ - --profile-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ + # Create DNS record + az network dns record-set a create \ + --name "$cluster" \ + --zone-name "$fullName" \ --resource-group "$RESOURCE_GROUP" \ - --type ExternalEndpoints \ - --target "$EXTERNAL_IP" \ - --endpoint-location "$REGION" \ - --priority "$PRIORITY" + --ttl 5 + az network dns record-set a add-record \ + --record-set-name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ipv4-address "$EXTERNAL_IP" \ + --ttl 5 - echo " ✓ Created endpoint $ENDPOINT_NAME with priority $PRIORITY" + echo " ✓ Created DNS record $cluster" else echo " ✗ Failed to get external IP for $cluster" fi done + + # Delete and recreate SRV record for MongoDB + az network dns record-set srv delete \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --yes - # Get Traffic Manager FQDN - TRAFFIC_MANAGER_FQDN=$(az network traffic-manager profile show \ - --name "$TRAFFIC_MANAGER_PROFILE_NAME" \ + az network dns record-set srv create \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ttl 1 + + mongoFQDN=$(az network dns record-set srv add-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$fullName" \ --resource-group "$RESOURCE_GROUP" \ - --query dnsConfig.fqdn -o tsv) + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$PRIMARY_CLUSTER.$fullName" | jq -r ".fqdn") echo "" - echo "✓ Traffic Manager created successfully!" - echo " Profile: $TRAFFIC_MANAGER_PROFILE_NAME" - echo " FQDN: $TRAFFIC_MANAGER_FQDN" + echo "✓ DNS zone created successfully!" 
+ echo " Zone Name: $fullName" + echo " MongoDB FQDN: $mongoFQDN" fi echo "" -echo "=======================================" -echo "Connection Information" -echo "=======================================" -echo "" -echo "Username: default_user" -echo "Password: $DOCUMENTDB_PASSWORD" -echo "" - -if [ "$ENABLE_TRAFFIC_MANAGER" = "true" ] && [ -n "${TRAFFIC_MANAGER_FQDN:-}" ]; then - echo "🌐 Connect via Traffic Manager (load balanced):" - echo "mongosh $TRAFFIC_MANAGER_FQDN:10260 -u default_user -p \$DOCUMENTDB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates" - echo "" - echo "Or use port forwarding:" - echo "kubectl --context $PRIMARY_CLUSTER port-forward -n documentdb-preview-ns svc/documentdb-preview 10260:10260" - echo "mongosh localhost:10260 -u default_user -p \$DOCUMENTDB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates" -else - echo "To connect to the primary cluster ($PRIMARY_CLUSTER):" - echo "kubectl --context $PRIMARY_CLUSTER port-forward -n documentdb-preview-ns svc/documentdb-preview 10260:10260" - echo "mongosh localhost:10260 -u default_user -p \$DOCUMENTDB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates" -fi -echo "" -echo "Connection string:" -kubectl --context $PRIMARY_CLUSTER get documentdb -n documentdb-preview-ns -A -o json | jq ".items[0].status.connectionString" +echo "Connection Information:" +echo " Username: default_user" +echo " Password: $DOCUMENTDB_PASSWORD" echo "" # Generate failover commands for all non-primary clusters @@ -436,22 +421,6 @@ echo "" echo "To monitor the deployment:" echo "watch 'kubectl --context $HUB_CONTEXT get clusterresourceplacement documentdb-crp -o wide'" -if [ "$ENABLE_TRAFFIC_MANAGER" = "true" ]; then - echo "" - echo "To manage Traffic Manager:" - echo "# Check Traffic Manager status" - echo "az network traffic-manager profile show --name $TRAFFIC_MANAGER_PROFILE_NAME --resource-group $RESOURCE_GROUP" - echo "" - echo "# List endpoints" - echo "az network traffic-manager endpoint list --profile-name $TRAFFIC_MANAGER_PROFILE_NAME --resource-group $RESOURCE_GROUP" - echo "" - echo "# Test DNS resolution" - echo "nslookup ${TRAFFIC_MANAGER_FQDN:-$TRAFFIC_MANAGER_PROFILE_NAME.trafficmanager.net}" - echo "" - echo "# Delete Traffic Manager (if needed)" - echo "az network traffic-manager profile delete --name $TRAFFIC_MANAGER_PROFILE_NAME --resource-group $RESOURCE_GROUP" -fi - echo "" echo "To check DocumentDB status across all clusters:" # Create a space-separated string from the array diff --git a/operator/src/scripts/aks-fleet-deployment/failover.sh b/operator/src/scripts/aks-fleet-deployment/failover.sh deleted file mode 100755 index 7b528c04..00000000 --- a/operator/src/scripts/aks-fleet-deployment/failover.sh +++ /dev/null @@ -1,71 +0,0 @@ -#/bin/bash - -RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" -DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" -DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" -TRAFFIC_MANAGER_PROFILE_NAME="${TRAFFIC_MANAGER_PROFILE_NAME:-${RESOURCE_GROUP}-documentdb-tm}" -HUB_CONTEXT="${HUB_CONTEXT:-hub}" - -# Get all clusters -echo "Discovering member clusters in resource group: $RESOURCE_GROUP..." 
-MEMBER_CLUSTERS=$(az aks list -g "$RESOURCE_GROUP" -o json | jq -r '.[] | select(.name|startswith("member-")) | .name' | sort) - -if [ -z "$MEMBER_CLUSTERS" ]; then - echo "Error: No member clusters found in resource group $RESOURCE_GROUP" - echo "Please ensure the fleet is deployed first using ./deploy-fleet-bicep.sh" - exit 1 -fi - -PRIMARY_CLUSTER=$(kubectl get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq ".spec.clusterReplication.primary") - -# Convert to array -CLUSTER_ARRAY=($MEMBER_CLUSTERS) -echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" -for cluster in "${CLUSTER_ARRAY[@]}"; do - echo " - $cluster" - if [ "$cluster" == "$PRIMARY_CLUSTER" ]; then - echo " (current primary)" - else - TARGET_CLUSTER="$cluster" - fi -done - -echo "Updating Traffic Manager to point to new primary: $TARGET_CLUSTER..." - -# Find the lowest priority not in use -PRIORITIES=$(az network traffic-manager profile show \ - --resource-group "$RESOURCE_GROUP" \ - --profile-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ - | jq ".endpoints[].priority" \ - | sort -n) -LOWEST_AVAILABLE_PRIORITY=1 -for x in $PRIORITIES; do - if [ "$x" = "$LOWEST_AVAILABLE_PRIORITY" ]; then - LOWEST_AVAILABLE_PRIORITY=$((LOWEST_AVAILABLE_PRIORITY + 1)) - else - break - fi -done - - -PRIMARY_REGION=$(echo "$PRIMARY_CLUSTER" | awk -F- '{print $2}') -TARGET_REGION=$(echo "$TARGET_CLUSTER" | awk -F- '{print $2}') - -# Set the old primary to that priority, set the target to 1 -az network traffic-manager endpoint update \ - --type externalEndpoints \ - --resource-group "$RESOURCE_GROUP" \ - --profile-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ - --name "documentdb-$PRIMARY_REGION" \ - --priority $LOWEST_AVAILABLE_PRIORITY - -az network traffic-manager endpoint update \ - --type externalEndpoints \ - --resource-group "$RESOURCE_GROUP" \ - --profile-name "$TRAFFIC_MANAGER_PROFILE_NAME" \ - --name "documentdb-$TARGET_REGION" \ - --priority 1 - -echo "Initiating failover to $TARGET_CLUSTER..." -kubectl --context "$HUB_CONTEXT" patch documentdb "$DOCUMENTDB_NAME" -n "$DOCUMENTDB_NAMESPACE" \ - --type='merge' -p="{\"spec\":{\"clusterReplication\":{\"primary\":\"$TARGET_CLUSTER\"}}}" diff --git a/operator/src/scripts/aks-fleet-deployment/install-documentdb-operator.sh b/operator/src/scripts/aks-fleet-deployment/install-documentdb-operator.sh index c3205b40..fab66a47 100755 --- a/operator/src/scripts/aks-fleet-deployment/install-documentdb-operator.sh +++ b/operator/src/scripts/aks-fleet-deployment/install-documentdb-operator.sh @@ -10,7 +10,7 @@ set -euo pipefail RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" HUB_CONTEXT=${HUB_CONTEXT:-hub} -CHART_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)/documentdb-helm-chart" +CHART_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)/documentdb-helm-chart" VERSION="${VERSION:-200}" VALUES_FILE="${VALUES_FILE:-}" @@ -37,13 +37,16 @@ fi echo "Applying cert-manager CRDs on hub ($HUB_CONTEXT)..." run kubectl --context "$HUB_CONTEXT" apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.crds.yaml -# Build/package chart if local tgz not present +# Build/package chart CHART_PKG="./documentdb-operator-0.0.${VERSION}.tgz" -if [ ! -f "$CHART_PKG" ]; then - echo "Packaging chart (helm dependency update && helm package)..." 
- run helm dependency update "$CHART_DIR" - run helm package "$CHART_DIR" --version 0.0."${VERSION}" +if [ -f "$CHART_PKG" ]; then + echo "Found existing chart package $CHART_PKG" + rm -f "$CHART_PKG" fi +echo "Packaging chart (helm dependency update && helm package)..." +run helm dependency update "$CHART_DIR" +run helm package "$CHART_DIR" --version 0.0."${VERSION}" + # Install/upgrade operator using the packaged chart if available, otherwise fallback to OCI registry if [ -f "$CHART_PKG" ]; then diff --git a/operator/src/scripts/aks-fleet-deployment/multi-region.yaml b/operator/src/scripts/aks-fleet-deployment/multi-region.yaml index a1eef56c..79e82b96 100644 --- a/operator/src/scripts/aks-fleet-deployment/multi-region.yaml +++ b/operator/src/scripts/aks-fleet-deployment/multi-region.yaml @@ -33,7 +33,7 @@ spec: pvcSize: 10Gi clusterReplication: highAvailability: true - enableFleetForCrossCloud: true + crossCloudNetworkingStrategy: AzureFleet primary: {{PRIMARY_CLUSTER}} clusterList: {{CLUSTER_LIST}} diff --git a/operator/src/scripts/multi-cloud-deployment/.gitignore b/operator/src/scripts/multi-cloud-deployment/.gitignore new file mode 100644 index 00000000..1503cc8a --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/.gitignore @@ -0,0 +1 @@ +certs \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/README.md b/operator/src/scripts/multi-cloud-deployment/README.md new file mode 100644 index 00000000..43ba863b --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/README.md @@ -0,0 +1,608 @@ +# Multi-Cloud DocumentDB Deployment + +This directory contains templates and scripts for deploying DocumentDB across multiple cloud providers (Azure AKS, Google GKE, and AWS EKS) with cross-cloud replication using Istio service mesh and AKS Fleet for resource propagation. 
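+
+Cross-cloud replication is configured through the `spec.clusterReplication` block of the DocumentDB resource that `deploy-documentdb.sh` renders from the `documentdb-cluster.yaml` template. The fragment below is an illustrative sketch only: it assumes the default cluster context names used throughout this README, the default EKS primary, and a hypothetical `storageClass` override.
+
+```yaml
+clusterReplication:
+  crossCloudNetworkingStrategy: Istio
+  primary: eks-documentdb-cluster
+  clusterList:
+    - name: aks-documentdb-cluster
+      environment: aks
+    - name: gke-documentdb-cluster
+      environment: gke
+    - name: eks-documentdb-cluster
+      environment: eks
+      storageClass: gp3   # hypothetical per-member storage class override
+```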
+ +## Architecture + +- **Fleet Resource**: Deployed in East US 2 (management hub for resource propagation) +- **Multi-Cloud Clusters**: + - **AKS**: Single member cluster in configurable region (default: eastus2) + - **GKE**: Cluster in us-central1-a + - **EKS**: Cluster in us-west-2 +- **Network**: + - AKS: Uses default Azure CNI + - GKE: Default GKE networking + - EKS: Default EKS networking with NLB for cross-cloud connectivity +- **Service Mesh**: Istio multi-cluster mesh for cross-cloud service discovery +- **VM Size**: Standard_DS3_v2 for AKS, e2-standard-4 for GKE, m5.large for EKS (configurable) +- **Node Count**: 1-2 nodes per cluster for cost optimization +- **Kubernetes Version**: Uses region default GA version (configurable) +- **DocumentDB**: Multi-cloud deployment with primary/replica architecture and Istio-based replication + +## Prerequisites + +- **Azure**: Azure CLI installed and logged in (`az login`) +- **GCP**: Google Cloud SDK installed and logged in (`gcloud auth login`) + - gke-gcloud-auth-plugin: `sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin` +- **AWS**: AWS CLI installed and configured (`aws configure`) + - eksctl installed for EKS cluster management +- **Kubernetes Tools**: + - kubectl installed + - kubelogin for Azure AD authentication: `az aks install-cli` + - Helm 3.x installed +- **Other Tools**: + - jq for JSON processing: `brew install jq` (macOS) or `apt-get install jq` (Linux) + - openssl for password generation +- **Permissions**: + - Azure: Contributor access to the subscription + - GCP: Container Admin, Compute Network Admin, and Service Account User roles + - AWS: Sufficient IAM permissions to create EKS clusters and IAM roles +- **Quotas**: Sufficient quota in target regions for clusters + +## Quick Start + +### Deploy Everything (One Command) + +```bash +./deploy.sh +``` + +This single script will: +1. **Deploy Infrastructure**: + - Create Azure resource group + - Deploy AKS Fleet resource + - Deploy AKS member cluster + - Deploy GKE cluster (in parallel) + - Deploy EKS cluster with EBS CSI driver and AWS Load Balancer Controller +2. **Configure Multi-Cloud Mesh**: + - Join GKE and EKS clusters to the AKS Fleet + - Install cert-manager on all clusters + - Set up Istio multi-cluster service mesh with shared root CA + - Configure cross-cloud networking with east-west gateways +3. **Deploy DocumentDB Operator**: + - Install DocumentDB operator on hub cluster + - Propagate base resources (CRDs, RBAC) to all member clusters via Fleet +4. 
**Set Up Access**: + - Configure kubectl contexts for all clusters + - Set up RBAC access for Fleet + +### Deploy DocumentDB Database + +After the infrastructure is deployed: + +```bash +# With auto-generated password +./deploy-documentdb.sh + +# With custom password +./deploy-documentdb.sh "MySecureP@ssw0rd" + +# Disable Azure DNS creation (for testing) +ENABLE_AZURE_DNS=false ./deploy-documentdb.sh +``` + +This will: +- Create cluster identification ConfigMaps on each member cluster +- Select a primary cluster (defaults to EKS cluster) +- Deploy DocumentDB with Istio-based cross-cloud replication +- Create Azure DNS zone with records for each cluster (if enabled) +- Create SRV record for MongoDB connection string +- Provide connection information and failover commands + +## Configuration + +### Infrastructure Configuration + +Edit `parameters.bicepparam` to customize AKS deployment: +- Hub cluster name (used for fleet naming) +- Hub region (fleet location) +- Member cluster name and region +- VM sizes +- Node counts +- Kubernetes version + +Or use environment variables for all clouds: + +```bash +# Azure AKS +export RESOURCE_GROUP="my-multi-cloud-rg" +export RG_LOCATION="eastus2" +export HUB_REGION="eastus2" +export AKS_CLUSTER_NAME="aks-documentdb-cluster" +export AKS_REGION="eastus2" +export HUB_VM_SIZE="Standard_D4s_v3" + +# Google GKE +export PROJECT_ID="my-gcp-project-id" +export GCP_USER="user@example.com" +export ZONE="us-central1-a" +export GKE_CLUSTER_NAME="gke-documentdb-cluster" + +# AWS EKS +export EKS_CLUSTER_NAME="eks-documentdb-cluster" +export EKS_REGION="us-west-2" + +# DocumentDB Operator +export VERSION="200" # Operator version +export VALUES_FILE="/path/to/custom/values.yaml" # Optional Helm values + +./deploy.sh +``` + +### DocumentDB Configuration + +Edit `documentdb-cluster.yaml` to customize: +- Database size and instances +- Replication settings (primary cluster, HA mode) +- Cross-cloud networking strategy (Istio) +- Storage class per environment +- Service exposure type +- Log levels + +The template uses placeholders replaced at runtime: +- `{{DOCUMENTDB_PASSWORD}}`: The database password +- `{{PRIMARY_CLUSTER}}`: The selected primary cluster +- `{{CLUSTER_LIST}}`: YAML list of all clusters with their environments + +### Azure DNS Configuration + +```bash +export ENABLE_AZURE_DNS="true" # Enable/disable DNS creation +export AZURE_DNS_ZONE_NAME="my-documentdb-zone" # DNS zone name (default: resource group name) +export AZURE_DNS_PARENT_ZONE_RESOURCE_ID="/subscriptions/.../dnszones/parent.zone" +``` + +## Environment Variables + +The deployment scripts automatically set and export: +- `FLEET_ID`: Full resource ID of the AKS fleet +- `IDENTITY`: Your Azure AD user ID +- `DOCUMENTDB_PASSWORD`: Database password (when deploying DocumentDB) +- `RESOURCE_GROUP`: Resource group name (default: german-aks-fleet-rg) +- `PROJECT_ID`: GCP project ID (default: sanguine-office-475117-s6) +- `ZONE`: GCP zone (default: us-central1-a) +- `EKS_REGION`: AWS region (default: us-west-2) + +## kubectl Contexts + +After deployment, contexts are automatically configured for: +- `hub`: AKS Fleet hub cluster +- `aks-documentdb-cluster`: AKS member cluster (default name) +- `gke-documentdb-cluster`: GKE cluster (default name) +- `eks-documentdb-cluster`: EKS cluster (default name) + +## Management + +### Check Deployment Status + +```bash +# Check operator status on hub +kubectl --context hub get deploy -n documentdb-operator + +# Check DocumentDB base resources propagation +kubectl 
--context hub get clusterresourceplacement documentdb-base -o wide
+
+# Check DocumentDB cluster resources propagation
+kubectl --context hub get clusterresourceplacement documentdb-crp -o wide
+
+# View a specific cluster
+kubectl --context <cluster-name> get documentdb,pods -n documentdb-preview-ns
+```
+
+### Connect to Database
+
+#### Via Port-Forward (for testing)
+
+```bash
+# Connect to primary cluster
+kubectl --context <primary-cluster> port-forward \
+  -n documentdb-preview-ns svc/documentdb-service-<primary-cluster> 10260:10260
+
+mongosh localhost:10260 -u default_user -p <password> \
+  --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates
+```
+
+#### Via Azure DNS (production)
+
+When `ENABLE_AZURE_DNS=true`, use the MongoDB SRV connection string:
+
+```bash
+mongosh "mongodb+srv://default_user:<password>@_mongodb._tcp.<dns-zone-name>/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256"
+```
+
+Example:
+```bash
+mongosh "mongodb+srv://default_user:mypassword@_mongodb._tcp.german-aks-fleet-rg.multi-cloud.pgmongo-dev.cosmos.windows-int.net/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256"
+```
+
+### Failover Operations
+
+Failover is performed using the DocumentDB kubectl plugin:
+
+```bash
+kubectl documentdb promote \
+  --documentdb documentdb-preview \
+  --namespace documentdb-preview-ns \
+  --hub-context hub \
+  --target-cluster <target-cluster> \
+  --cluster-context <target-cluster>
+```
+
+## Fleet Management
+
+```bash
+# Show AKS fleet details
+az fleet show --name <fleet-name> --resource-group $RESOURCE_GROUP
+
+# List fleet members (includes Azure members only, not cross-cloud)
+az fleet member list --fleet-name <fleet-name> --resource-group $RESOURCE_GROUP
+
+# Check all ClusterResourcePlacements
+kubectl --context hub get clusterresourceplacement
+
+# View base resources placement (CRDs, RBAC)
+kubectl --context hub describe clusterresourceplacement documentdb-base
+
+# View DocumentDB cluster placement
+kubectl --context hub describe clusterresourceplacement documentdb-crp
+
+# Check multi-cloud fleet membership (GKE and EKS)
+kubectl --context hub get membercluster
+```
+
+## Multi-Cloud Mesh Management
+
+### Verify Istio Installation
+
+```bash
+# Check Istio components on each cluster
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n istio-system
+  echo
+done
+
+# Verify east-west gateway services
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get svc -n istio-system istio-eastwestgateway
+  echo
+done
+```
+
+### Verify Cross-Cloud Connectivity
+
+```bash
+# Check remote secrets (for service discovery)
+kubectl --context aks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret"
+kubectl --context gke-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret"
+kubectl --context eks-documentdb-cluster get secrets -n istio-system | grep "istio-remote-secret"
+
+# Verify mesh network configuration
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get namespace istio-system --show-labels
+  echo
+done
+```
+
+## DocumentDB Management
+
+### Check Deployment Status
+
+```bash
+# Quick status across all clusters
+for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $c ==="
+  kubectl --context $c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet'
+  echo
+done
+
+# Check operator status on all clusters
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get deploy -n documentdb-operator
+  kubectl --context $cluster get pods -n documentdb-operator
+done
+```
+
+### Monitor Replication
+
+```bash
+# Watch ClusterResourcePlacement status
+watch 'kubectl --context hub get clusterresourceplacement documentdb-crp -o wide'
+
+# Monitor all DocumentDB instances
+watch 'for c in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do \
+  echo "=== $c ==="; \
+  kubectl --context $c get documentdb,pods -n documentdb-preview-ns; \
+  echo; \
+done'
+
+# Check DocumentDB service endpoints
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get svc -n documentdb-preview-ns
+  echo
+done
+```
+
+### Verify Cross-Cloud Replication
+
+```bash
+# Check WAL replica status in Istio mesh
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n documentdb-preview-ns -l component=wal-replica
+  echo
+done
+
+# Verify Istio sidecar injection
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get pods -n documentdb-preview-ns -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}'
+  echo
+done
+```
+
+### Azure DNS Management
+
+```bash
+# List DNS records for DocumentDB
+az network dns record-set list \
+  --zone-name <dns-zone-name> \
+  --resource-group $RESOURCE_GROUP \
+  --output table
+
+# Show SRV record for MongoDB connection
+az network dns record-set srv show \
+  --name "_mongodb._tcp" \
+  --zone-name <dns-zone-name> \
+  --resource-group $RESOURCE_GROUP
+
+# Show A/CNAME records for each cluster
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  az network dns record-set a show --name $cluster --zone-name <dns-zone-name> --resource-group $RESOURCE_GROUP 2>/dev/null || \
+    az network dns record-set cname show --name $cluster --zone-name <dns-zone-name> --resource-group $RESOURCE_GROUP 2>/dev/null || \
+    echo "Record not found"
+  echo
+done
+```
+
+## RBAC Management
+
+The deployment script automatically assigns the "Azure Kubernetes Fleet Manager RBAC Cluster Admin" role for AKS Fleet access. To manage RBAC:
+
+```bash
+# View current role assignment
+az role assignment list --assignee $IDENTITY --scope $FLEET_ID
+
+# Add another user
+az role assignment create --role "Azure Kubernetes Fleet Manager RBAC Cluster Admin" \
+  --assignee <user-object-id> --scope $FLEET_ID
+```
+
+For GCP and AWS, ensure the `gcloud` and `aws` CLIs are configured with appropriate IAM permissions (see the sketch below).
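+
+As a quick sanity check (illustrative only; the roles you actually need depend on your organization's IAM setup), confirm which identities the scripts will act as:
+
+```bash
+# Active gcloud account (should match GCP_USER used by deploy.sh)
+gcloud auth list --filter=status:ACTIVE --format="value(account)"
+
+# AWS principal used by eksctl and the aws CLI
+aws sts get-caller-identity --query Arn --output text
+```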
+
+## Troubleshooting
+
+### Authentication Issues
+
+**Azure AKS:**
+```bash
+# Get fleet credentials
+az fleet get-credentials --resource-group $RESOURCE_GROUP --name <fleet-name>
+
+# If web authentication is blocked, use Azure CLI
+kubelogin convert-kubeconfig -l azurecli
+
+# Use admin credentials for member clusters
+az aks get-credentials --resource-group $RESOURCE_GROUP --name <cluster-name> --admin
+```
+
+**Google GKE:**
+```bash
+# Refresh credentials
+gcloud container clusters get-credentials <cluster-name> --zone <zone>
+
+# Verify authentication
+gcloud auth list
+gcloud config get-value account
+```
+
+**AWS EKS:**
+```bash
+# Update kubeconfig
+aws eks update-kubeconfig --name <cluster-name> --region <region>
+
+# Verify IAM identity
+aws sts get-caller-identity
+```
+
+### Resource Propagation Issues
+
+```bash
+# Check ClusterResourcePlacement status
+kubectl --context hub get clusterresourceplacement documentdb-base -o yaml
+kubectl --context hub get clusterresourceplacement documentdb-crp -o yaml
+
+# Verify fleet members (Azure native)
+az fleet member list --fleet-name <fleet-name> --resource-group $RESOURCE_GROUP
+
+# Verify multi-cloud member clusters
+kubectl --context hub get membercluster
+kubectl --context hub describe membercluster
+
+# Check if resources reached target clusters
+for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do
+  echo "=== $cluster ==="
+  kubectl --context $cluster get documentdb -n documentdb-preview-ns
+  kubectl --context $cluster get pods -n documentdb-preview-ns
+  echo
+done
+```
+
+### Istio Mesh Issues
+
+```bash
+# Verify Istio installation
+istioctl --context <cluster-name> version
+
+# Check proxy status
+istioctl --context <cluster-name> proxy-status
+
+# Verify mesh configuration
+istioctl --context <cluster-name> analyze
+
+# Check east-west gateway connectivity
+kubectl --context <cluster-name> get svc -n istio-system istio-eastwestgateway
+
+# Verify remote secrets
+kubectl --context <cluster-name> get secrets -n istio-system | grep istio-remote-secret
+```
+
+### EKS-Specific Issues
+
+**EBS CSI Driver:**
+```bash
+# Check CSI driver status
+kubectl --context eks-documentdb-cluster get pods -n kube-system -l app=ebs-csi-controller
+
+# Verify storage class
+kubectl --context eks-documentdb-cluster get storageclass documentdb-storage
+```
+
+**AWS Load Balancer Controller:**
+```bash
+# Check controller status
+kubectl --context eks-documentdb-cluster get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller
+
+# Verify subnet tags
+VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text)
+aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" --query 'Subnets[].{ID:SubnetId,Tags:Tags}' --region $EKS_REGION
+```
+
+### DNS Issues
+
+```bash
+# Verify DNS zone exists
+az network dns zone show --name <dns-zone-name> --resource-group $RESOURCE_GROUP
+
+# Check DNS records
+az network dns record-set list --zone-name <dns-zone-name> --resource-group $RESOURCE_GROUP
+
+# Test DNS resolution
+nslookup <cluster-name>.<dns-zone-name>
+nslookup _mongodb._tcp.<dns-zone-name>
-type=SRV +``` + +### Cross-Cloud Connectivity + +```bash +# Deploy test pod with network tools +kubectl --context aks-documentdb-cluster run test-pod --image=nicolaka/netshoot -it --rm -- /bin/bash + +# From within the pod, test connectivity to other clusters +# Using Istio service discovery +curl -v http://documentdb-service-gke-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 +curl -v http://documentdb-service-eks-documentdb-cluster.documentdb-preview-ns.svc.cluster.local:10260 +``` + +### Debugging + +```bash +# Check operator logs on hub +kubectl --context hub logs -n documentdb-operator deployment/documentdb-operator --tail=100 + +# Check operator logs on member clusters +for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do + echo "=== $cluster ===" + kubectl --context $cluster logs -n documentdb-operator deployment/documentdb-operator --tail=50 + echo +done + +# View DocumentDB resource status +kubectl --context describe documentdb documentdb-preview -n documentdb-preview-ns + +# Check Istio sidecar logs +kubectl --context logs -n documentdb-preview-ns -c istio-proxy +``` + +## Clean Up + +```bash +# Delete DocumentDB resources from all clusters +kubectl --context hub delete clusterresourceplacement documentdb-crp +kubectl --context hub delete namespace documentdb-preview-ns + +# Wait for namespace deletion to complete on all clusters +for cluster in aks-documentdb-cluster gke-documentdb-cluster eks-documentdb-cluster; do + kubectl --context $cluster wait --for=delete namespace/documentdb-preview-ns --timeout=60s || true +done + +# Delete base operator resources +kubectl --context hub delete clusterresourceplacement documentdb-base + +# Delete entire Azure resource group (includes AKS fleet and member) +az group delete --name $RESOURCE_GROUP --yes --no-wait + +# Delete GKE cluster +gcloud container clusters delete $GKE_CLUSTER_NAME \ + --zone $ZONE \ + --project $PROJECT_ID \ + --quiet + +# Delete EKS cluster (also deletes associated IAM roles and service accounts) +eksctl delete cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION + +# Delete Azure DNS zone (if created) +az network dns zone delete \ + --name \ + --resource-group $RESOURCE_GROUP \ + --yes + +# Clean up local kubectl contexts +kubectl config delete-context hub +kubectl config delete-context aks-documentdb-cluster +kubectl config delete-context gke-documentdb-cluster +kubectl config delete-context eks-documentdb-cluster +``` + +## Scripts + +- **`deploy.sh`**: All-in-one multi-cloud deployment (AKS Fleet + GKE + EKS + cert-manager + Istio mesh + operator) +- **`deploy-documentdb.sh`**: Deploy multi-cloud DocumentDB with Istio-based replication and optional Azure DNS +- **`main.bicep`**: Bicep template for AKS Fleet and single member cluster +- **`parameters.bicepparam`**: Configuration parameters for AKS deployment +- **`documentdb-base.yaml`**: Fleet ClusterResourcePlacement for base resources (CRDs, RBAC, namespaces) +- **`documentdb-cluster.yaml`**: DocumentDB multi-cloud configuration template with Fleet ClusterResourcePlacement + +## Key Features + +- **Multi-Cloud Architecture**: Deploy across Azure AKS, Google GKE, and AWS EKS +- **Istio Service Mesh**: Cross-cloud service discovery and secure communication +- **Automated Mesh Setup**: Shared root CA, east-west gateways, and remote secrets +- **AKS Fleet Integration**: Resource propagation via ClusterResourcePlacement to all clouds +- **Cross-Cloud Replication**: DocumentDB replication using Istio for 
connectivity +- **Dynamic Discovery**: Automatically configures all clusters and generates failover commands +- **Azure DNS Integration**: Optional DNS zone creation with A/CNAME and SRV records for MongoDB +- **Cloud-Specific Configuration**: + - EKS: EBS CSI driver and AWS Load Balancer Controller + - GKE: Default persistent disk provisioner + - AKS: Azure Disk CSI driver +- **Parallel Deployment**: AKS, GKE, and EKS deployed concurrently for faster setup +- **Smart Defaults**: Sensible defaults with environment variable overrides + +## Additional Resources + +- [Azure AKS Fleet Documentation](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/) +- [AKS Authentication Guide](https://learn.microsoft.com/en-us/azure/aks/kubelogin-authentication) +- [Fleet ClusterResourcePlacement API](https://learn.microsoft.com/en-us/azure/kubernetes-fleet/concepts-resource-propagation) +- [Istio Multi-Cluster Installation](https://istio.io/latest/docs/setup/install/multicluster/) +- [Istio Multi-Primary Multi-Network](https://istio.io/latest/docs/setup/install/multicluster/multi-primary_multi-network/) +- [Google GKE Documentation](https://cloud.google.com/kubernetes-engine/docs) +- [AWS EKS Documentation](https://docs.aws.amazon.com/eks/) +- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/) +- [eksctl Documentation](https://eksctl.io/) +- [DocumentDB Kubernetes Operator Documentation](../../README.md) \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh b/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh new file mode 100755 index 00000000..e5a15630 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/deploy-documentdb.sh @@ -0,0 +1,421 @@ +#!/usr/bin/env bash +# filepath: /Users/geeichbe/Projects/documentdb-kubernetes-operator/scripts/aks-fleet-deployment/deploy-multi-region.sh +set -euo pipefail + +# Deploy multi-region DocumentDB using Fleet with Traffic Manager +# Usage: ./deploy-documentdb.sh [password] +# +# Environment variables: +# RESOURCE_GROUP: Azure resource group (default: german-aks-fleet-rg) +# DOCUMENTDB_PASSWORD: Database password (will be generated if not provided) +# ENABLE_AZURE_DNS: Enable Azure DNS creation (default: true) +# AZURE_DNS_ZONE_NAME: Azure DNS zone name (default: same as resource group) +# AZURE_DNS_PARENT_ZONE_RESOURCE_ID: Azure DNS parent zone resource ID (default: multi-cloud.pgmongo-dev.cosmos.windows-int.net) +# +# Examples: +# ./deploy-multi-region.sh +# ENABLE_AZURE_DNS=false ./deploy-multi-region.sh mypassword + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Resource group +RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" + +AKS_CLUSTER_NAME="${AKS_CLUSTER_NAME:-aks-documentdb-cluster}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" +EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME:-eks-documentdb-cluster}" + +# Azure DNS configuration +AZURE_DNS_ZONE_NAME="${AZURE_DNS_ZONE_NAME:-${RESOURCE_GROUP}}" +AZURE_DNS_PARENT_ZONE_RESOURCE_ID="${AZURE_DNS_PARENT_ZONE_RESOURCE_ID:-/subscriptions/81901d5e-31aa-46c5-b61a-537dbd5df1e7/resourceGroups/alaye-documentdb-dns/providers/Microsoft.Network/dnszones/multi-cloud.pgmongo-dev.cosmos.windows-int.net}" +ENABLE_AZURE_DNS="${ENABLE_AZURE_DNS:-true}" + +# Set password from argument or environment variable +DOCUMENTDB_PASSWORD="${1:-${DOCUMENTDB_PASSWORD:-}}" + +# If no password provided, generate a secure one +if [ -z 
"$DOCUMENTDB_PASSWORD" ]; then + echo "No password provided. Generating a secure password..." + DOCUMENTDB_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-25) + echo "Generated password: $DOCUMENTDB_PASSWORD" + echo "(Save this password - you'll need it to connect to the database)" + echo "" +fi + +# Export for envsubst +export DOCUMENTDB_PASSWORD + + +# Convert to array and add GCP +CLUSTER_ARRAY=("$EKS_CLUSTER_NAME" "$AKS_CLUSTER_NAME" "$GKE_CLUSTER_NAME") +echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo " - $cluster" +done + +PRIMARY_CLUSTER=${CLUSTER_ARRAY[0]} +echo "" +echo "Selected primary cluster: $PRIMARY_CLUSTER" + +# Build the cluster list YAML with proper indentation +CLUSTER_LIST=$(cat </dev/null; then + echo "✗ Context $cluster not found, skipping" + continue + fi + + # Create or update the cluster-name ConfigMap + kubectl --context "$cluster" create configmap cluster-name \ + -n kube-system \ + --from-literal=name="$cluster" \ + --dry-run=client -o yaml | kubectl --context "$cluster" apply -f - + + # Verify the ConfigMap was created + if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then + echo "✓ ConfigMap created/updated for $cluster" + else + echo "✗ Failed to create ConfigMap for $cluster" + fi +done + +# Step 2: Deploy DocumentDB resources via Fleet +echo "" +echo "=======================================" +echo "Deploying DocumentDB multi-region configuration..." +echo "=======================================" + +# Determine hub context +HUB_CONTEXT="${HUB_CONTEXT:-hub}" +if ! kubectl config get-contexts "$HUB_CONTEXT" &>/dev/null; then + echo "Hub context not found, trying to find first member cluster..." + HUB_CONTEXT="${CLUSTER_ARRAY[0]}" + if [ -z "$HUB_CONTEXT" ]; then + echo "Error: No suitable context found. Please ensure you have credentials for the fleet." + exit 1 + fi +fi + +echo "Using hub context: $HUB_CONTEXT" + +# Check if resources already exist +EXISTING_RESOURCES="" +if kubectl --context "$HUB_CONTEXT" get namespace documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}namespace " +fi +if kubectl --context "$HUB_CONTEXT" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}secret " +fi +if kubectl --context "$HUB_CONTEXT" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}documentdb " +fi +if kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp &>/dev/null 2>&1; then + EXISTING_RESOURCES="${EXISTING_RESOURCES}clusterresourceplacement " +fi + +if [ -n "$EXISTING_RESOURCES" ]; then + echo "" + echo "⚠️ Warning: The following resources already exist: $EXISTING_RESOURCES" + echo "" + echo "Options:" + echo "1. Delete existing resources and redeploy ()" + echo "2. Update existing deployment (preserve data)" + echo "3. Cancel" + echo "" + read -p "Choose an option (1/2/3): " CHOICE + + case $CHOICE in + 1) + echo "Deleting existing resources..." + kubectl --context "$HUB_CONTEXT" delete clusterresourceplacement documentdb-crp --ignore-not-found=true + kubectl --context "$HUB_CONTEXT" delete namespace documentdb-preview-ns --ignore-not-found=true + echo "Waiting for namespace deletion to complete..." 
+ for cluster in "${CLUSTER_ARRAY[@]}"; do + kubectl --context "$cluster" wait --for=delete namespace/documentdb-preview-ns --timeout=60s + done + ;; + 2) + echo "Updating existing deployment..." + ;; + 3) + echo "Cancelled." + exit 0 + ;; + *) + echo "Invalid choice. Cancelled." + exit 1 + ;; + esac +fi + +# Create a temporary file with substituted values +TEMP_YAML=$(mktemp) + +# Use sed for safer substitution +sed -e "s/{{DOCUMENTDB_PASSWORD}}/$DOCUMENTDB_PASSWORD/g" \ + -e "s/{{PRIMARY_CLUSTER}}/$PRIMARY_CLUSTER/g" \ + "$SCRIPT_DIR/documentdb-cluster.yaml" | \ +while IFS= read -r line; do + if [[ "$line" == '{{CLUSTER_LIST}}' ]]; then + echo "$CLUSTER_LIST" + else + echo "$line" + fi +done > "$TEMP_YAML" + +# Debug: show the generated YAML section with clusterReplication +echo "" +echo "Generated configuration preview:" +echo "--------------------------------" +echo "Primary cluster: $PRIMARY_CLUSTER" +echo "Cluster list:" +echo "$CLUSTER_LIST" +echo "--------------------------------" + +# cat "$TEMP_YAML" + +# Apply the configuration +echo "" +echo "Applying DocumentDB multi-region configuration..." +kubectl --context "$HUB_CONTEXT" apply -f "$TEMP_YAML" + +# Clean up temp file +rm -f "$TEMP_YAML" + +# Check the ClusterResourcePlacement status +echo "" +echo "Checking ClusterResourcePlacement status..." +kubectl --context "$HUB_CONTEXT" get clusterresourceplacement documentdb-crp -o wide + +# Wait a bit for propagation +echo "" +echo "Waiting for resources to propagate to member clusters..." +sleep 10 + +# Step 3: Verify deployment on each member cluster +echo "" +echo "=======================================" +echo "Checking deployment status on member clusters..." +echo "=======================================" + +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo "" + echo "=== $cluster ===" + + # Check if context exists + if ! 
kubectl config get-contexts "$cluster" &>/dev/null; then + echo "✗ Context not found, skipping" + continue + fi + + # Check ConfigMap + if kubectl --context "$cluster" get configmap cluster-name -n kube-system &>/dev/null; then + CLUSTER_ID=$(kubectl --context "$cluster" get configmap cluster-name -n kube-system -o jsonpath='{.data.name}') + echo "✓ Cluster identified as: $CLUSTER_ID" + else + echo "✗ Cluster identification ConfigMap not found" + fi + + # Check if namespace exists + if kubectl --context "$cluster" get namespace documentdb-preview-ns &>/dev/null; then + echo "✓ Namespace exists" + + # Check if secret exists + if kubectl --context "$cluster" get secret documentdb-credentials -n documentdb-preview-ns &>/dev/null; then + echo "✓ Secret exists" + else + echo "✗ Secret not found" + fi + + # Check if DocumentDB exists + if kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns &>/dev/null; then + echo "✓ DocumentDB resource exists" + + # Get DocumentDB status + STATUS=$(kubectl --context "$cluster" get documentdb documentdb-preview -n documentdb-preview-ns -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + echo " Status: $STATUS" + + # Check if this is the primary or replica + if [ "$cluster" = "$PRIMARY_CLUSTER" ]; then + echo " Role: PRIMARY" + else + echo " Role: REPLICA" + fi + else + echo "✗ DocumentDB resource not found" + fi + + # Check pods + PODS=$(kubectl --context "$cluster" get pods -n documentdb-preview-ns --no-headers 2>/dev/null | wc -l || echo "0") + echo " Pods: $PODS" + + # Show pod status if any exist + if [ "$PODS" -gt 0 ]; then + kubectl --context "$cluster" get pods -n documentdb-preview-ns 2>/dev/null | head -5 + fi + else + echo "✗ Namespace not found (resources may still be propagating)" + fi +done + +# Step 4: Create Azure DNS zone for DocumentDB +if [ "$ENABLE_AZURE_DNS" = "true" ]; then + echo "" + echo "=======================================" + echo "Creating Azure DNS zone for DocumentDB..." + echo "=======================================" + + parentName=$(az network dns zone show --id $AZURE_DNS_PARENT_ZONE_RESOURCE_ID | jq -r ".name") + fullName="${AZURE_DNS_ZONE_NAME}.${parentName}" + + # Create Azure DNS zone + if az network dns zone show --name "$AZURE_DNS_ZONE_NAME" --resource-group "$RESOURCE_GROUP" &>/dev/null; then + echo "Azure DNS zone already exists, updating..." + else + az network dns zone create \ + --name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --parent-name "$AZURE_DNS_PARENT_ZONE_RESOURCE_ID" + fi + + # Wait for DocumentDB services to be ready and create endpoints + echo "" + echo "Waiting for DocumentDB services to be ready..." 
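+    # LoadBalancer provisioning can take a while; give the cloud load balancers a head start before polling for external IPs below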
+ sleep 30 + + # Create DNS records for each cluster + for cluster in "${CLUSTER_ARRAY[@]}"; do + echo "Creating DNS record: $cluster" + + # Create service name by concatenating documentdb-preview with cluster name (max 63 chars) + SERVICE_NAME="documentdb-service-${cluster}" + SERVICE_NAME="${SERVICE_NAME:0:63}" + + # Get the external IP of the DocumentDB service + EXTERNAL_IP="" + for attempt in {1..12}; do # Try for 2 minutes + EXTERNAL_IP=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "") + if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then + break + fi + EXTERNAL_HOSTNAME=$(kubectl --context "$cluster" get svc "$SERVICE_NAME" -n documentdb-preview-ns -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "") + if [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then + break + fi + echo " Waiting for external IP for $cluster (service: $SERVICE_NAME, attempt $attempt/12)..." + sleep 10 + done + + if [ -n "$EXTERNAL_IP" ] && [ "$EXTERNAL_IP" != "" ]; then + echo " External IP for $cluster: $EXTERNAL_IP" + + # TODO Delete existing DNS record if it exists + az network dns record-set a delete \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --yes + + # Create DNS record + az network dns record-set a create \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ttl 5 + az network dns record-set a add-record \ + --record-set-name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ipv4-address "$EXTERNAL_IP" \ + --ttl 5 + + echo " ✓ Created DNS record $cluster" + elif [ -n "$EXTERNAL_HOSTNAME" ] && [ "$EXTERNAL_HOSTNAME" != "" ]; then + echo " External hostname for $cluster: $EXTERNAL_HOSTNAME" + + # TODO Delete existing DNS record if it exists + az network dns record-set cname delete \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --yes + + # Create DNS record + az network dns record-set cname create \ + --name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ttl 5 + az network dns record-set cname set-record \ + --record-set-name "$cluster" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --cname "$EXTERNAL_HOSTNAME" \ + --ttl 5 + + echo " ✓ Created DNS record $cluster" + else + echo " ✗ Failed to get external IP for $cluster" + fi + done + + az network dns record-set srv delete \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --yes + + az network dns record-set srv create \ + --name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --ttl 5 + + mongoFQDN=$(az network dns record-set srv add-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$fullName" \ + --resource-group "$RESOURCE_GROUP" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$PRIMARY_CLUSTER.$fullName" | jq -r ".fqdn") + + echo "" + echo "✓ DNS zone created successfully!" 
+ echo " Zone Name: $fullName" + echo " MongoDB FQDN: $mongoFQDN" +fi + +echo "" +echo "Connection Information:" +echo " Username: default_user" +echo " Password: $DOCUMENTDB_PASSWORD" +echo "" +echo "To monitor the deployment:" +echo "watch 'kubectl --context $HUB_CONTEXT get clusterresourceplacement documentdb-crp -o wide'" +echo "" +echo "To check DocumentDB status across all clusters:" +# Create a space-separated string from the array +CLUSTER_STRING=$(IFS=' '; echo "${CLUSTER_ARRAY[*]}") +echo "for c in $CLUSTER_STRING; do echo \"=== \$c ===\"; kubectl --context \$c get documentdb,pods -n documentdb-preview-ns 2>/dev/null || echo 'Not deployed yet'; echo; done" \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh b/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh new file mode 100644 index 00000000..5e2d8d53 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/deploy-gke.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +PROJECT_ID="${PROJECT_ID:-gke-documentdb-demo}" +GKE_USER="${GKE_USER:-alexanderlaye57@gmail.com}" +CLUSTER_NAME="${CLUSTER_NAME:-gke-documentdb-cluster}" +ZONE="${ZONE:-us-central1-a}" + +# one time +#gcloud projects create $PROJECT_ID +#sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin + +gcloud config set project $PROJECT_ID +gcloud config set account $USER +gcloud auth login --brief + +gcloud services enable container.googleapis.com +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/container.admin" +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/compute.networkAdmin" +gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$USER" --role="roles/iam.serviceAccountUser" + +gcloud container clusters create "$CLUSTER_NAME" \ + --zone "$ZONE" \ + --num-nodes "2" \ + --machine-type "e2-standard-4" \ + --enable-ip-access \ + --project $PROJECT_ID + +gcloud container clusters get-credentials "$CLUSTER_NAME" \ + --location="$ZONE" +kubectl config rename-context "$(kubectl config current-context)" $CLUSTER_NAME + +helm repo add jetstack https://charts.jetstack.io +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.13.2 \ + --set installCRDs=true \ + --set prometheus.enabled=false \ + --set webhook.timeoutSeconds=30 + + +cat < /dev/null; then + echo "ERROR: Azure CLI not found. Please install Azure CLI first." >&2 + exit 1 + fi + + # Check kubectl + if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not found. Please install kubectl first." >&2 + exit 1 + fi + + # Check Helm + if ! command -v helm &> /dev/null; then + echo "ERROR: Helm not found. Please install Helm first." >&2 + exit 1 + fi + + # Check gcloud CLI + if ! command -v gcloud &> /dev/null; then + echo "ERROR: gcloud CLI not found. Please install Google Cloud SDK first." >&2 + exit 1 + fi + + # Check AWS CLI + if ! command -v aws &> /dev/null; then + echo "ERROR: AWS CLI not found. Please install AWS CLI first." >&2 + exit 1 + fi + + # Check eksctl + if ! command -v eksctl &> /dev/null; then + echo "ERROR: eksctl not found. Please install eksctl first." >&2 + exit 1 + fi + + # Check jq + if ! command -v jq &> /dev/null; then + echo "ERROR: jq not found. Please install jq first." >&2 + exit 1 + fi + + # Check Azure login + if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure. Please run 'az login' first." 
>&2 + exit 1 + fi + + # Check gcloud login + gcloud config set account $GCP_USER + if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | grep -q .; then + echo "ERROR: Not logged into Google Cloud. Please run 'gcloud auth login' first." >&2 + exit 1 + fi + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + echo "ERROR: AWS credentials not configured. Please run 'aws configure' first." >&2 + exit 1 + fi + + echo "✅ All prerequisites met" +} + +wait_for_no_inprogress() { + local rg="$1" + echo "Checking for in-progress AKS operations in resource group '$rg'..." + local inprogress + inprogress=$(az aks list -g "$rg" -o json \ + | jq -r '.[] | select(.provisioningState != "Succeeded" and .provisioningState != null) | [.name, .provisioningState] | @tsv') + + if [ -z "$inprogress" ]; then + echo "No in-progress AKS operations detected." + return 0 + fi + + echo "Found clusters still provisioning:" + echo "$inprogress" | while IFS=$'\t' read -r name state; do echo " - $name: $state"; done + echo "Please re-run this script after the above operations complete." >&2 + return 1 +} + +# ============================================================================ +# Step 1: Deploy AKS Fleet Infrastructure +# ============================================================================ + +aks_fleet_deploy() { + echo "Creating or using resource group..." + EXISTING_RG_LOCATION=$(az group show --name "$RESOURCE_GROUP" --query location -o tsv 2>/dev/null || true) + if [ -n "$EXISTING_RG_LOCATION" ]; then + echo "Using existing resource group '$RESOURCE_GROUP' in location '$EXISTING_RG_LOCATION'" + RG_LOCATION="$EXISTING_RG_LOCATION" + else + az group create --name "$RESOURCE_GROUP" --location "$RG_LOCATION" + fi + + echo "Deploying AKS Fleet with Bicep..." + if ! wait_for_no_inprogress "$RESOURCE_GROUP"; then + echo "Exiting without changes due to in-progress operations." >&2 + exit 1 + fi + + PARAMS=( + --parameters "$TEMPLATE_DIR/parameters.bicepparam" + --parameters hubRegion="$HUB_REGION" + --parameters memberRegion="$AKS_REGION" + --parameters memberName="$AKS_CLUSTER_NAME" + ) + + if [ -n "$HUB_VM_SIZE" ]; then + echo "Overriding hubVmSize with: $HUB_VM_SIZE" + PARAMS+=( --parameters hubVmSize="$HUB_VM_SIZE" ) + fi + + DEPLOYMENT_NAME="aks-fleet-$(date +%s)" + az deployment group create \ + --name "$DEPLOYMENT_NAME" \ + --resource-group $RESOURCE_GROUP \ + --template-file "$TEMPLATE_DIR/main.bicep" \ + "${PARAMS[@]}" >/dev/null + + # Retrieve outputs + DEPLOYMENT_OUTPUT=$(az deployment group show \ + --resource-group $RESOURCE_GROUP \ + --name "$DEPLOYMENT_NAME" \ + --query "properties.outputs" -o json) + + FLEET_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetName.value') + FLEET_ID_FROM_OUTPUT=$(echo $DEPLOYMENT_OUTPUT | jq -r '.fleetId.value') + AKS_CLUSTER_NAME=$(echo $DEPLOYMENT_OUTPUT | jq -r '.memberClusterName.value') + + SUBSCRIPTION_ID=$(az account show --query id -o tsv) + export FLEET_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.ContainerService/fleets/${FLEET_NAME}" + + # Set up RBAC + echo "Setting up RBAC access for Fleet..." + export IDENTITY=$(az ad signed-in-user show --query "id" --output tsv) + export ROLE="Azure Kubernetes Fleet Manager RBAC Cluster Admin" + echo "Assigning role '$ROLE' to user '$IDENTITY'..." 
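+    # Errors are ignored so re-runs don't fail when the role assignment already exists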
+ az role assignment create --role "${ROLE}" --assignee ${IDENTITY} --scope ${FLEET_ID} >/dev/null 2>&1 || true + + # Fetch kubeconfig contexts + echo "Fetching kubeconfig contexts..." + az fleet get-credentials --resource-group "$RESOURCE_GROUP" --name "$FLEET_NAME" --overwrite-existing + + az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" --overwrite-existing +} + +# ============================================================================ +# Step 1.2: Deploy GKE Infrastructure +# ============================================================================ + +# TODO move this to a check at the top +# sudo apt-get install google-cloud-cli-gke-gcloud-auth-plugin + +# Create project if it doesn't exist +gke_deploy() { + if ! gcloud projects describe $PROJECT_ID &>/dev/null; then + gcloud projects create $PROJECT_ID + fi + + gcloud config set project $PROJECT_ID + + gcloud services enable container.googleapis.com + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/container.admin" + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/compute.networkAdmin" + gcloud projects add-iam-policy-binding $PROJECT_ID --member="user:$GCP_USER" --role="roles/iam.serviceAccountUser" + + # Delete cluster if it exists + if gcloud container clusters describe "$GKE_CLUSTER_NAME" --zone "$ZONE" --project $PROJECT_ID &>/dev/null; then + gcloud container clusters delete "$GKE_CLUSTER_NAME" \ + --zone "$ZONE" \ + --project $PROJECT_ID \ + --quiet + fi + + gcloud container clusters create "$GKE_CLUSTER_NAME" \ + --zone "$ZONE" \ + --num-nodes "2" \ + --machine-type "e2-standard-4" \ + --enable-ip-access \ + --project $PROJECT_ID + + kubectl config delete-context "$GKE_CLUSTER_NAME" || true + kubectl config delete-cluster "$GKE_CLUSTER_NAME" || true + kubectl config delete-user "$GKE_CLUSTER_NAME" || true + gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" \ + --location="$ZONE" + fullName="gke_${PROJECT_ID}_${ZONE}_${GKE_CLUSTER_NAME}" + # Replace all occurrences of the generated name with GKE_CLUSTER_NAME in kubeconfig + sed -i "s|$fullName|$GKE_CLUSTER_NAME|g" ~/.kube/config +} + + +# ============================================================================ +# Step 1.3: Deploy EKS Infrastructure +# ============================================================================ + +eks_deploy() { + NODE_TYPE="m5.large" + + if eksctl get cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION &> /dev/null; then + echo "Cluster $EKS_CLUSTER_NAME already exists." + else + eksctl create cluster \ + --name $EKS_CLUSTER_NAME \ + --region $EKS_REGION \ + --node-type $NODE_TYPE \ + --nodes 2 \ + --nodes-min 2 \ + --nodes-max 2 \ + --managed \ + --with-oidc + fi + + eksctl create iamserviceaccount \ + --cluster $EKS_CLUSTER_NAME \ + --namespace kube-system \ + --name ebs-csi-controller-sa \ + --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \ + --override-existing-serviceaccounts \ + --approve \ + --region $EKS_REGION + + # Install EBS CSI driver addon + eksctl create addon \ + --name aws-ebs-csi-driver \ + --cluster $EKS_CLUSTER_NAME \ + --region $EKS_REGION \ + --force + + # Wait for EBS CSI driver to be ready + echo "Waiting for EBS CSI driver to be ready..." 
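+    # The addon's controller pods carry the app=ebs-csi-controller label in kube-system; the wait below tolerates slow starts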
+ sleep 5 + kubectl wait --for=condition=ready pod -l app=ebs-csi-controller -n kube-system --timeout=300s || echo "EBS CSI driver pods may still be starting" + + echo "Installing AWS Load Balancer Controller..." + + # Check if already installed + if helm list -n kube-system | grep -q aws-load-balancer-controller; then + echo "AWS Load Balancer Controller already installed. Skipping installation." + else + # Get VPC ID for the cluster + VPC_ID=$(aws eks describe-cluster --name $EKS_CLUSTER_NAME --region $EKS_REGION --query 'cluster.resourcesVpcConfig.vpcId' --output text) + echo "Using VPC ID: $VPC_ID" + + # Verify subnet tags for Load Balancer Controller + echo "Verifying subnet tags for Load Balancer Controller..." + PUBLIC_SUBNETS=$(aws ec2 describe-subnets \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=true" \ + --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) + + PRIVATE_SUBNETS=$(aws ec2 describe-subnets \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=map-public-ip-on-launch,Values=false" \ + --query 'Subnets[].SubnetId' --output text --region $EKS_REGION) + + # Tag public subnets for internet-facing load balancers + if [ -n "$PUBLIC_SUBNETS" ]; then + echo "Tagging public subnets for internet-facing load balancers..." + for subnet in $PUBLIC_SUBNETS; do + aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/elb,Value=1 --region $EKS_REGION 2>/dev/null || true + echo "Tagged public subnet: $subnet" + done + fi + + # Tag private subnets for internal load balancers + if [ -n "$PRIVATE_SUBNETS" ]; then + echo "Tagging private subnets for internal load balancers..." + for subnet in $PRIVATE_SUBNETS; do + aws ec2 create-tags --resources "$subnet" --tags Key=kubernetes.io/role/internal-elb,Value=1 --region $EKS_REGION 2>/dev/null || true + echo "Tagged private subnet: $subnet" + done + fi + + # Download the official IAM policy (latest version) + echo "Downloading AWS Load Balancer Controller IAM policy (latest version)..." + curl -o /tmp/iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json + + # Get account ID + ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + + # Check if policy exists and create/update as needed + if aws iam get-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy &>/dev/null; then + echo "IAM policy already exists, updating to latest version..." + # Delete and recreate to ensure we have the latest version + aws iam delete-policy --policy-arn arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy 2>/dev/null || true + sleep 5 # Wait for deletion to propagate + fi + + # Create IAM policy with latest permissions + echo "Creating IAM policy with latest permissions..." + aws iam create-policy \ + --policy-name AWSLoadBalancerControllerIAMPolicy \ + --policy-document file:///tmp/iam_policy.json 2>/dev/null || \ + echo "IAM policy already exists or was just created" + # Wait a moment for policy to be available + sleep 5 + + # Create IAM service account with proper permissions using eksctl + echo "Creating IAM service account with proper permissions..." 
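+        # IRSA: binds the controller's Kubernetes service account to the IAM policy created above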
+ eksctl create iamserviceaccount \ + --cluster=$EKS_CLUSTER_NAME \ + --namespace=kube-system \ + --name=aws-load-balancer-controller \ + --role-name "AmazonEKSLoadBalancerControllerRole-$EKS_CLUSTER_NAME" \ + --attach-policy-arn=arn:aws:iam::$ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy \ + --approve \ + --override-existing-serviceaccounts \ + --region=$EKS_REGION + + # Add EKS Helm repository + helm repo add eks https://aws.github.io/eks-charts + helm repo update eks + + # Install Load Balancer Controller using the existing service account + helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ + -n kube-system \ + --set clusterName=$EKS_CLUSTER_NAME \ + --set serviceAccount.create=false \ + --set serviceAccount.name=aws-load-balancer-controller \ + --set region=$EKS_REGION \ + --set vpcId=$VPC_ID + + # Wait for Load Balancer Controller to be ready + echo "Waiting for Load Balancer Controller to be ready..." + sleep 5 + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=aws-load-balancer-controller -n kube-system --timeout=300s || echo "Load Balancer Controller pods may still be starting" + + # Clean up temp file + rm -f /tmp/iam_policy.json + + echo "AWS Load Balancer Controller installed" + fi + + if kubectl get storageclass documentdb-storage &> /dev/null; then + echo "DocumentDB storage class already exists. Skipping creation." + else + kubectl apply -f - </dev/null || true +helm repo update >/dev/null 2>&1 + +for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + echo "Installing cert-manager on $cluster..." + kubectl config use-context "$cluster" 2>/dev/null + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set installCRDs=true \ + --wait --timeout=5m >/dev/null 2>&1 || echo " Warning: cert-manager installation issue on $cluster" + echo "✓ cert-manager installed on $cluster" +done + +echo "✅ cert-manager installed on all clusters" + +# ============================================================================ +# Step 5: Install Istio and setup mesh +# ============================================================================ + +# Create an issuer in istio-system namespace on hub +temp_dir=$(mktemp -d) +echo "Temporary directory created at: $temp_dir" + +# Check if istioctl is installed, if not install it to temp_dir +if ! command -v istioctl &> /dev/null; then + echo "istioctl not found, installing to $temp_dir..." 
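+    # Pin a known-good release; the downloadIstio script honors ISTIO_VERSION and TARGET_ARCH from the environment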
+ ISTIO_VERSION="1.24.0" + curl -L https://istio.io/downloadIstio | ISTIO_VERSION=$ISTIO_VERSION TARGET_ARCH=x86_64 sh - -d "$temp_dir" >/dev/null 2>&1 + export PATH="$temp_dir/istio-$ISTIO_VERSION/bin:$PATH" + echo "✓ istioctl installed to $temp_dir/istio-$ISTIO_VERSION/bin" +else + echo "✓ istioctl already installed: $(which istioctl)" +fi + +if [ -z "$ISTIO_DIR" ]; then + git clone https://github.com/istio/istio.git "$temp_dir/istio" + export ISTIO_DIR="$temp_dir/istio" +fi +rm -rf "$TEMPLATE_DIR/certs" +mkdir $TEMPLATE_DIR/certs +pushd $TEMPLATE_DIR/certs +make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" root-ca +index=1 +for cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + make -f "$ISTIO_DIR/tools/certs/Makefile.selfsigned.mk" "${cluster}-cacerts" + kubectl --context "$cluster" delete namespace/istio-system --wait=true --ignore-not-found=true + kubectl --context "$cluster" create namespace istio-system + kubectl --context "$cluster" wait --for=jsonpath='{.status.phase}'=Active namespace/istio-system --timeout=60s + # create certs + kubectl --context "$cluster" create secret generic cacerts -n istio-system \ + --from-file="${cluster}/ca-cert.pem" \ + --from-file="${cluster}/ca-key.pem" \ + --from-file="${cluster}/root-cert.pem" \ + --from-file="${cluster}/cert-chain.pem" + + kubectl --context="${cluster}" label namespace istio-system topology.istio.io/network=network${index} + + #install istio on each cluster + cat < $remoteSecretFile + for other_cluster in ${MEMBER_CLUSTER_NAMES[@]}; do + if [ "$cluster" = "$other_cluster" ]; then + continue + fi + kubectl apply -f $remoteSecretFile --context="${other_cluster}" + done +done + +popd + +# 5.1 add lb tags to istio ew gateway on aws +kubectl --context "$EKS_CLUSTER_NAME" -n istio-system annotate service istio-eastwestgateway \ + service.beta.kubernetes.io/aws-load-balancer-type="nlb" \ + service.beta.kubernetes.io/aws-load-balancer-scheme="internet-facing" \ + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled="true" \ + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type="ip" + +# ============================================================================ +# Step 6: Install DocumentDB Operator +# ============================================================================ + +CHART_DIR="$(cd "$TEMPLATE_DIR/../.." && pwd)/documentdb-chart" +CHART_PKG="$TEMPLATE_DIR/documentdb-operator-0.0.${VERSION}.tgz" + +# Apply cert-manager CRDs on hub +echo "Applying cert-manager CRDs on hub ($HUB_CONTEXT)..." 
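+# Only the CRDs go on the hub here; the full cert-manager install was done on the member clusters above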
+kubectl --context "$HUB_CONTEXT" apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.crds.yaml #>/dev/null 2>&1 + +# Create documentdb-operator namespace with Istio injection on hub +cat </dev/null || echo "0") + DESIRED=$(kubectl --context "$cluster" get deploy documentdb-operator -n documentdb-operator -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") + echo " $cluster: $READY/$DESIRED replicas ready" +done + +# ============================================================================ +# Save environment variables and aliases +# ============================================================================ diff --git a/operator/src/scripts/multi-cloud-deployment/dns_failover.sh b/operator/src/scripts/multi-cloud-deployment/dns_failover.sh new file mode 100755 index 00000000..4e9af07a --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/dns_failover.sh @@ -0,0 +1,55 @@ +#/bin/bash + +RESOURCE_GROUP="${RESOURCE_GROUP:-german-aks-fleet-rg}" +DOCUMENTDB_NAME="${DOCUMENTDB_NAME:-documentdb-preview}" +DOCUMENTDB_NAMESPACE="${DOCUMENTDB_NAMESPACE:-documentdb-preview-ns}" +HUB_CONTEXT="${HUB_CONTEXT:-hub}" +GKE_CLUSTER_NAME="${GKE_CLUSTER_NAME:-gke-documentdb-cluster}" + +MEMBER_CLUSTERS=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.clusterList[].name") +PRIMARY_CLUSTER=$(kubectl --context "$HUB_CONTEXT" get documentdb $DOCUMENTDB_NAME -n $DOCUMENTDB_NAMESPACE -o json | jq -r ".spec.clusterReplication.primary") +TARGET_CLUSTER=$1 + +# Convert to array +CLUSTER_ARRAY=($MEMBER_CLUSTERS) +echo "Found ${#CLUSTER_ARRAY[@]} member clusters:" +for cluster in "${CLUSTER_ARRAY[@]}"; do + echo " - $cluster" + if [ "$cluster" == "$PRIMARY_CLUSTER" ]; then + echo " (current primary)" + elif [ "$cluster" == "$TARGET_CLUSTER" ]; then + echo " (target primary)" + fi +done + + +dnsName=$(az network dns zone list --resource-group $RESOURCE_GROUP --query="[0].name" -o tsv) + +#delete old srv record +az network dns record-set srv remove-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$dnsName" \ + --resource-group "$RESOURCE_GROUP" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$PRIMARY_CLUSTER.$dnsName" \ + --keep-empty-record-set + +#create new one +az network dns record-set srv add-record \ + --record-set-name "_mongodb._tcp" \ + --zone-name "$dnsName" \ + --resource-group "$RESOURCE_GROUP" \ + --priority 0 \ + --weight 0 \ + --port 10260 \ + --target "$TARGET_CLUSTER.$dnsName" + +echo "To initiate failover to $TARGET_CLUSTER run:" +echo "kubectl documentdb promote \\" +echo " --documentdb documentdb-preview \\" +echo " --namespace documentdb-preview-ns \\" +echo " --hub-context $HUB_CONTEXT \\" +echo " --target-cluster $TARGET_CLUSTER \\" +echo " --cluster-context $TARGET_CLUSTER" diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml new file mode 100644 index 00000000..c33dacb6 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/documentdb-base.yaml @@ -0,0 +1,99 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: documentdb-base +spec: + resourceSelectors: + - group: "" + version: v1 + kind: Namespace + name: documentdb-operator + - group: "" + version: v1 + kind: Namespace + name: cnpg-system + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: 
documentdbs.db.microsoft.com + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: publications.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: poolers.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: clusterimagecatalogs.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: imagecatalogs.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: scheduledbackups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: backups.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: subscriptions.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: databases.postgresql.cnpg.io + - group: "apiextensions.k8s.io" + version: v1 + kind: CustomResourceDefinition + name: clusters.postgresql.cnpg.io + # RBAC roles and bindings + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cluster-role + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg-edit + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: documentdb-operator-cloudnative-pg-view + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: documentdb-operator-cluster-rolebinding + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: documentdb-operator-cloudnative-pg + - group: "admissionregistration.k8s.io" + version: v1 + kind: MutatingWebhookConfiguration + name: cnpg-mutating-webhook-configuration + - group: "admissionregistration.k8s.io" + version: v1 + kind: ValidatingWebhookConfiguration + name: cnpg-validating-webhook-configuration + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: wal-replica-manager + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: wal-replica-manager-binding + policy: + placementType: PickAll + strategy: + type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml b/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml new file mode 100644 index 00000000..3a00305d --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/documentdb-cluster.yaml @@ -0,0 +1,61 @@ +# Namespace definition +apiVersion: v1 +kind: Namespace +metadata: + name: documentdb-preview-ns + labels: + istio-injection: enabled + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: documentdb-credentials + namespace: documentdb-preview-ns +type: Opaque +stringData: + username: default_user + password: {{DOCUMENTDB_PASSWORD}} + +--- + +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: documentdb-preview + namespace: documentdb-preview-ns +spec: + nodeCount: 1 + instancesPerNode: 1 + documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16 + resource: + storage: + pvcSize: 10Gi + clusterReplication: + highAvailability: true + primary: {{PRIMARY_CLUSTER}} + 
crossCloudNetworkingStrategy: Istio + clusterList: +{{CLUSTER_LIST}} + exposeViaService: + serviceType: LoadBalancer + logLevel: info + +--- + +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: documentdb-crp +spec: + resourceSelectors: + - group: "" + version: v1 + kind: Namespace + name: documentdb-preview-ns + policy: + placementType: PickAll + strategy: + type: RollingUpdate \ No newline at end of file diff --git a/operator/src/scripts/multi-cloud-deployment/insert_test.py b/operator/src/scripts/multi-cloud-deployment/insert_test.py new file mode 100644 index 00000000..4bac1959 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/insert_test.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +import sys +import time +from pymongo import MongoClient, errors +from datetime import datetime + +if len(sys.argv) != 2: + print(f"Usage: python insert_test.py ") + sys.exit(1) + +connection_string = sys.argv[1] + +client = MongoClient(connection_string) + +db = client.testdb +collection = db.testcollection + +print(f"{'Inserted Document':<30} {'Insert Count':<15}") +print("-" * 77) +start_time = time.time() +end_time = start_time + (10 * 60) # 10 minutes +count = 0 + +while time.time() < end_time: + failed = False + write_result = "" + try: + doc = { + "count": count, + "message": f"Insert operation {count}" + } + result = collection.insert_one(doc) + write_result = result.inserted_id + count += 1 + print(f"{str(write_result):<30} {count:<15}") + except Exception as e: + failed = True + short_err = getattr(getattr(e, 'details', {}), 'get', lambda *_: None)('errmsg') + print(f"Error: {short_err or str(e)}") + + + time.sleep(1) + +print(f"Completed {count} insert operations in 10 minutes") +final_read_count = collection.count_documents({}) +print(f"Final read count: {final_read_count}") +client.close() diff --git a/operator/src/scripts/multi-cloud-deployment/main.bicep b/operator/src/scripts/multi-cloud-deployment/main.bicep new file mode 100644 index 00000000..eb54d572 --- /dev/null +++ b/operator/src/scripts/multi-cloud-deployment/main.bicep @@ -0,0 +1,74 @@ +targetScope = 'resourceGroup' + +@description('Name of the Fleet Hub AKS cluster') +param hubClusterName string = 'aks-fleet-hub' + +@description('Location for the Fleet Hub') +param hubRegion string = 'eastus2' + +@description('Name for member cluster') +param memberName string = 'aks-fleet-member' + +@description('Location for member cluster') +param memberRegion string = 'eastus2' + +@description('Kubernetes version. Leave empty to use the region default GA version.') +param kubernetesVersion string = '' + +@description('VM size for cluster nodes') +param hubVmSize string = 'Standard_DS3_v2' + +@description('Number of nodes per cluster') +param nodeCount int = 1 + +var fleetName = '${hubClusterName}-fleet' + +// Optionally include kubernetesVersion in cluster properties +var maybeK8sVersion = empty(kubernetesVersion) ? 
+
+// Fleet resource
+resource fleet 'Microsoft.ContainerService/fleets@2025-03-01' = {
+  name: fleetName
+  location: hubRegion
+  properties: {
+    hubProfile: {
+      dnsPrefix: fleetName
+    }
+  }
+}
+
+// Member AKS Cluster (using default Azure CNI without custom VNets)
+resource memberCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = {
+  name: memberName
+  location: memberRegion
+  identity: {
+    type: 'SystemAssigned'
+  }
+  properties: union({
+    dnsPrefix: 'member-${memberRegion}-dns'
+    agentPoolProfiles: [
+      {
+        name: 'agentpool'
+        count: nodeCount
+        vmSize: hubVmSize
+        mode: 'System'
+        osType: 'Linux'
+      }
+    ]
+  }, maybeK8sVersion)
+}
+
+// Member clusters fleet membership
+resource memberFleetMembers 'Microsoft.ContainerService/fleets/members@2023-10-15' = {
+  name: 'member-${memberRegion}-${uniqueString(resourceGroup().id, memberRegion)}'
+  parent: fleet
+  properties: {
+    clusterResourceId: memberCluster.id
+  }
+}
+
+// Outputs
+output fleetId string = fleet.id
+output fleetName string = fleet.name
+output memberClusterId string = memberCluster.id
+output memberClusterName string = memberCluster.name
diff --git a/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam b/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam
new file mode 100644
index 00000000..c58e7310
--- /dev/null
+++ b/operator/src/scripts/multi-cloud-deployment/parameters.bicepparam
@@ -0,0 +1,8 @@
+using './main.bicep'
+
+param hubClusterName = 'aks-fleet-hub'
+param hubRegion = 'eastus2'
+param memberRegion = 'eastus2'
+param kubernetesVersion = ''
+param nodeCount = 1
+param hubVmSize = 'Standard_DS3_v2'
diff --git a/operator/src/scripts/multi-cloud-deployment/read_test.py b/operator/src/scripts/multi-cloud-deployment/read_test.py
new file mode 100644
index 00000000..0e8debc3
--- /dev/null
+++ b/operator/src/scripts/multi-cloud-deployment/read_test.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+from pymongo import MongoClient, errors
+from datetime import datetime
+
+if len(sys.argv) != 2:
+    print("Usage: python read_test.py <connection_string>")
+    sys.exit(1)
+
+connection_string = sys.argv[1]
+
+client = MongoClient(connection_string)
+
+db = client.testdb
+collection = db.testcollection
+
+# Perform single insert operation
+print(f"Performing initial insert operation...")
+print(f"Using: {connection_string.split('@')[1] if '@' in connection_string else 'local'}")
+try:
+    doc = {
+        "count": 0,
+        "message": "Initial test document",
+        "timestamp": datetime.now()
+    }
+    result = collection.insert_one(doc)
+    print(f"Successfully inserted document with ID: {result.inserted_id}")
+except Exception as e:
+    print(f"ERROR inserting document:")
+    print(f"  Exception Type: {type(e).__name__}")
+    print(f"  Exception Message: {str(e)}")
+    if hasattr(e, 'details'):
+        print(f"  Details: {e.details}")
+    sys.exit(1)
+
+print()
+print(f"Starting read operations for 10 minutes...")
+print(f"{'Timestamp':<20} {'Read Count':<15} {'Status':<20}")
+print("-" * 80)
+
+start_time = time.time()
+end_time = start_time + (10 * 60)  # 10 minutes
+read_count = 0
+error_count = 0
+
+while time.time() < end_time:
+    timestamp = datetime.now().strftime("%H:%M:%S")
+    try:
+        count = collection.count_documents({})
+        read_count += 1
+        print(f"{timestamp:<20} {count:<15} {'Success':<20}")
+    except Exception as e:
+        error_count += 1
+        print(f"{timestamp:<20} {'N/A':<15} {'ERROR':<20}")
+        print(f"  Exception Type: {type(e).__name__}")
+        print(f"  Exception Message: {str(e)}")
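+        # Dump any server-provided details and the chained cause below, then fall
+        # through to time.sleep(1) so the 10-minute read probe keeps running
+        # through transient errors instead of aborting.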
{str(e)}") + if hasattr(e, 'details'): + print(f" Details: {e.details}") + if hasattr(e, '__cause__'): + print(f" Cause: {e.__cause__}") + print() + + time.sleep(1) + +print() +print(f"Completed {read_count} successful read operations in 10 minutes") +print(f"Total errors: {error_count}") +try: + final_count = collection.count_documents({}) + print(f"Final document count: {final_count}") +except Exception as e: + print(f"ERROR reading final count:") + print(f" Exception Type: {type(e).__name__}") + print(f" Exception Message: {str(e)}") +client.close()