diff --git a/docs/operator-public-documentation/preview/backup-and-restore.md b/docs/operator-public-documentation/preview/backup-and-restore.md
index dfe5e560..06ac2086 100644
--- a/docs/operator-public-documentation/preview/backup-and-restore.md
+++ b/docs/operator-public-documentation/preview/backup-and-restore.md
@@ -7,7 +7,7 @@
 1. Run the CSI driver deployment script **before** installing the documentdb-operator:
 ```bash
-scripts/test-scripts/deploy-csi-driver.sh
+./operator/src/scripts/test-scripts/deploy-csi-driver.sh
 ```
 2. Validate storage and snapshot components:
diff --git a/documentdb-playground/aks-fleet-deployment/README.md b/documentdb-playground/aks-fleet-deployment/README.md
index 3ddf5dce..777cbd83 100644
--- a/documentdb-playground/aks-fleet-deployment/README.md
+++ b/documentdb-playground/aks-fleet-deployment/README.md
@@ -255,6 +255,79 @@ az network vnet peering list --resource-group $RESOURCE_GROUP \
   --vnet-name member-westus3-vnet --output table
 ```

+## Backup and Restore
+### Backup
+
+Create a one-time backup:
+```bash
+kubectl --context hub apply -f - <", 0))
@@ -79,7 +79,7 @@ var _ = Describe("Backup", func() {
 				Phase:     cnpgv1.BackupPhase(""),
 				StartedAt: nil,
 				StoppedAt: nil,
-				Error:     "",
+				Message:   "",
 			},
 		}
@@ -88,7 +88,7 @@ var _ = Describe("Backup", func() {
 			Expect(string(backup.Status.Phase)).To(Equal(cnpgv1.BackupPhaseCompleted))
 			Expect(backup.Status.StartedAt).To(Equal(&startedAt))
 			Expect(backup.Status.StoppedAt).To(Equal(&stoppedAt))
-			Expect(backup.Status.Error).To(Equal("none"))
+			Expect(backup.Status.Message).To(Equal("none"))
 			// ExpiredAt should be StoppedAt + 30 days (default)
 			Expect(backup.Status.ExpiredAt).ToNot(BeNil())
 			Expect(backup.Status.ExpiredAt.Time.Equal(stoppedAt.Time.Add(30 * 24 * time.Hour))).To(BeTrue())
@@ -114,7 +114,7 @@ var _ = Describe("Backup", func() {
 				Phase:     cnpgv1.BackupPhaseCompleted,
 				StartedAt: &startedAt,
 				StoppedAt: &stoppedAt,
-				Error:     "none",
+				Message:   "none",
 				ExpiredAt: &expiredAt,
 			},
 		}
@@ -225,6 +225,13 @@ var _ = Describe("Backup", func() {
 			Expect(status.IsDone()).To(BeTrue())
 		})

+		It("returns true when phase is Skipped", func() {
+			status := &BackupStatus{
+				Phase: BackupPhaseSkipped,
+			}
+			Expect(status.IsDone()).To(BeTrue())
+		})
+
 		It("returns false when phase is Running", func() {
 			status := &BackupStatus{
 				Phase: cnpgv1.BackupPhaseRunning,
diff --git a/operator/src/api/preview/backup_types.go b/operator/src/api/preview/backup_types.go
index 226e876f..0d75d566 100644
--- a/operator/src/api/preview/backup_types.go
+++ b/operator/src/api/preview/backup_types.go
@@ -43,9 +43,11 @@ type BackupStatus struct {
 	// +optional
 	ExpiredAt *metav1.Time `json:"expiredAt,omitempty"`

-	// Error contains error information if the backup failed.
+	// Message contains additional information about the backup status.
+	// For failed backups, this contains the error message.
+	// For skipped backups, this explains why the backup was skipped.
 	// +optional
-	Error string `json:"error,omitempty"`
+	Message string `json:"message,omitempty"`
 }

 // +kubebuilder:object:root=true
@@ -55,7 +57,7 @@ type BackupStatus struct {
 // +kubebuilder:printcolumn:name="StartedAt",type=string,JSONPath=".status.startedAt",description="Backup start time"
 // +kubebuilder:printcolumn:name="StoppedAt",type=string,JSONPath=".status.stoppedAt",description="Backup completion time"
 // +kubebuilder:printcolumn:name="ExpiredAt",type=string,JSONPath=".status.expiredAt",description="Backup expiration time"
-// +kubebuilder:printcolumn:name="Error",type=string,JSONPath=".status.error",description="Backup error information"
+// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=".status.message",description="Backup status message"
 type Backup struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
diff --git a/operator/src/config/crd/bases/db.microsoft.com_backups.yaml b/operator/src/config/crd/bases/db.microsoft.com_backups.yaml
index bc9fc524..a8fd0542 100644
--- a/operator/src/config/crd/bases/db.microsoft.com_backups.yaml
+++ b/operator/src/config/crd/bases/db.microsoft.com_backups.yaml
@@ -35,9 +35,9 @@ spec:
       jsonPath: .status.expiredAt
       name: ExpiredAt
       type: string
-    - description: Backup error information
-      jsonPath: .status.error
-      name: Error
+    - description: Backup status message
+      jsonPath: .status.message
+      name: Message
       type: string
     name: preview
     schema:
@@ -88,14 +88,17 @@ spec:
           status:
             description: BackupStatus defines the observed state of Backup.
             properties:
-              error:
-                description: Error contains error information if the backup failed.
-                type: string
               expiredAt:
                 description: ExpiredAt is the time when the backup is considered
                   expired and can be deleted.
                 format: date-time
                 type: string
+              message:
+                description: |-
+                  Message contains additional information about the backup status.
+                  For failed backups, this contains the error message.
+                  For skipped backups, this explains why the backup was skipped.
+                type: string
              phase:
                description: Phase represents the current phase of the backup operation.
                type: string
diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go
index b87e4746..a9e7aa5d 100644
--- a/operator/src/internal/cnpg/cnpg_cluster.go
+++ b/operator/src/internal/cnpg/cnpg_cluster.go
@@ -16,7 +16,7 @@ import (
 	ctrl "sigs.k8s.io/controller-runtime"
 )

-func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, documentdb_image, serviceAccountName, storageClass string, log logr.Logger) *cnpgv1.Cluster {
+func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, documentdb_image, serviceAccountName, storageClass string, isPrimaryRegion bool, log logr.Logger) *cnpgv1.Cluster {
 	sidecarPluginName := documentdb.Spec.SidecarInjectorPluginName
 	if sidecarPluginName == "" {
 		sidecarPluginName = util.DEFAULT_SIDECAR_INJECTOR_PLUGIN
@@ -88,7 +88,7 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu
 				"host replication all all trust",
 			},
 		},
-		Bootstrap: getBootstrapConfiguration(documentdb, log),
+		Bootstrap: getBootstrapConfiguration(documentdb, isPrimaryRegion, log),
 		LogLevel:  cmp.Or(documentdb.Spec.LogLevel, "info"),
 		Backup: &cnpgv1.BackupConfiguration{
 			VolumeSnapshot: &cnpgv1.VolumeSnapshotConfiguration{
@@ -112,8 +112,8 @@ func getInheritedMetadataLabels(appName string) *cnpgv1.EmbeddedObjectMetadata {
 	}
 }

-func getBootstrapConfiguration(documentdb *dbpreview.DocumentDB, log logr.Logger) *cnpgv1.BootstrapConfiguration {
-	if documentdb.Spec.Bootstrap != nil && documentdb.Spec.Bootstrap.Recovery != nil && documentdb.Spec.Bootstrap.Recovery.Backup.Name != "" {
+func getBootstrapConfiguration(documentdb *dbpreview.DocumentDB, isPrimaryRegion bool, log logr.Logger) *cnpgv1.BootstrapConfiguration {
+	if isPrimaryRegion && documentdb.Spec.Bootstrap != nil && documentdb.Spec.Bootstrap.Recovery != nil && documentdb.Spec.Bootstrap.Recovery.Backup.Name != "" {
 		backupName := documentdb.Spec.Bootstrap.Recovery.Backup.Name
 		log.Info("DocumentDB cluster will be bootstrapped from backup", "backupName", backupName)
 		return &cnpgv1.BootstrapConfiguration{
diff --git a/operator/src/internal/controller/backup_controller.go b/operator/src/internal/controller/backup_controller.go
index 87a4045e..5f75d304 100644
--- a/operator/src/internal/controller/backup_controller.go
+++ b/operator/src/internal/controller/backup_controller.go
@@ -19,6 +19,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"

 	dbpreview "github.com/microsoft/documentdb-operator/api/preview"
+	util "github.com/microsoft/documentdb-operator/internal/utils"
 )

 // BackupReconciler reconciles a Backup object
@@ -54,9 +55,13 @@ func (r *BackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		return ctrl.Result{}, nil
 	}

-	// No further action needed for completed backups
-	if backup.Status.IsDone() {
-		return ctrl.Result{}, nil
+	// If the backup is already done and has an expiration time, requeue so it is revisited when it expires
+	if backup.Status.IsDone() && backup.Status.ExpiredAt != nil {
+		requeueAfter := time.Until(backup.Status.ExpiredAt.Time)
+		if requeueAfter < 0 {
+			requeueAfter = time.Minute
+		}
+		return ctrl.Result{RequeueAfter: requeueAfter}, nil
 	}

 	// Fetch the associated DocumentDB cluster
@@ -82,7 +87,22 @@ func (r *BackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 	}
 	if err := r.Get(ctx, cnpgBackupKey, cnpgBackup); err != nil {
 		if apierrors.IsNotFound(err) {
-			return r.createCNPGBackup(ctx, backup, cluster.Spec.Backup)
+
+			// Skip backup if the cluster is not primary
+			replicationContext, err := util.GetReplicationContext(ctx, r.Client, *cluster)
+			if err != nil {
+				logger.Error(err, "Failed to determine replication context")
+				return ctrl.Result{}, err
+			}
+			if !replicationContext.IsPrimary() {
+				return r.SetBackupPhaseSkipped(ctx, backup, "Backups can only be created from the primary cluster", cluster.Spec.Backup)
+			}
+			if !replicationContext.EndpointEnabled() {
+				logger.Info("Backup deferred: primary cluster endpoint not ready, waiting for promotion to complete")
+				return ctrl.Result{RequeueAfter: time.Minute * 1}, nil
+			}
+
+			return r.createCNPGBackup(ctx, backup, cluster)
 		}
 		logger.Error(err, "Failed to get CNPG Backup")
 		return ctrl.Result{}, err
@@ -155,14 +175,19 @@ func buildVolumeSnapshotClass(environment string) *snapshotv1.VolumeSnapshotClas
 }

 // createCNPGBackup creates a new CNPG Backup resource
-func (r *BackupReconciler) createCNPGBackup(ctx context.Context, backup *dbpreview.Backup, backupConfiguration *dbpreview.BackupConfiguration) (ctrl.Result, error) {
-	cnpgBackup, err := backup.CreateCNPGBackup(r.Scheme)
+func (r *BackupReconciler) createCNPGBackup(ctx context.Context, backup *dbpreview.Backup, cluster *dbpreview.DocumentDB) (ctrl.Result, error) {
+	cnpgClusterName := cluster.Name
+	if cluster.Spec.ClusterReplication != nil && cluster.Spec.ClusterReplication.Primary != "" {
+		cnpgClusterName = cluster.Spec.ClusterReplication.Primary
+	}
+
+	cnpgBackup, err := backup.CreateCNPGBackup(r.Scheme, cnpgClusterName)
 	if err != nil {
-		return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), backupConfiguration)
+		return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), cluster.Spec.Backup)
 	}

 	if err := r.Create(ctx, cnpgBackup); err != nil {
-		return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), backupConfiguration)
+		return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), cluster.Spec.Backup)
 	}

 	r.Recorder.Event(backup, "Normal", "BackupInitialized", "Successfully initialized backup")
@@ -200,7 +225,7 @@ func (r *BackupReconciler) SetBackupPhaseFailed(ctx context.Context, backup *dbp
 	original := backup.DeepCopy()

 	backup.Status.Phase = cnpgv1.BackupPhaseFailed
-	backup.Status.Error = errMessage
+	backup.Status.Message = errMessage
 	backup.Status.ExpiredAt = backup.CalculateExpirationTime(backupConfiguration)

 	if err := r.Status().Patch(ctx, backup, client.MergeFrom(original)); err != nil {
@@ -210,7 +235,32 @@ func (r *BackupReconciler) SetBackupPhaseFailed(ctx context.Context, backup *dbp
 	}

 	r.Recorder.Event(backup, "Warning", "BackupFailed", errMessage)
-	return ctrl.Result{}, nil
+	requeueAfter := time.Until(backup.Status.ExpiredAt.Time)
+	if requeueAfter < 0 {
+		requeueAfter = time.Minute
+	}
+	return ctrl.Result{RequeueAfter: requeueAfter}, nil
+}
+
+func (r *BackupReconciler) SetBackupPhaseSkipped(ctx context.Context, backup *dbpreview.Backup, message string, backupConfiguration *dbpreview.BackupConfiguration) (ctrl.Result, error) {
+	original := backup.DeepCopy()
+
+	backup.Status.Phase = dbpreview.BackupPhaseSkipped
+	backup.Status.Message = message
+	backup.Status.ExpiredAt = backup.CalculateExpirationTime(backupConfiguration)
+
+	if err := r.Status().Patch(ctx, backup, client.MergeFrom(original)); err != nil {
+		logger := log.FromContext(ctx)
+		logger.Error(err, "Failed to patch Backup status")
+		return ctrl.Result{}, err
+	}
+
+	r.Recorder.Event(backup, "Warning", "BackupSkipped", message)
+	requeueAfter := time.Until(backup.Status.ExpiredAt.Time)
+	if requeueAfter < 0 {
+		requeueAfter = time.Minute
+	}
+	return ctrl.Result{RequeueAfter: requeueAfter}, nil
 }

 // SetupWithManager sets up the controller with the Manager.
diff --git a/operator/src/internal/controller/backup_controller_test.go b/operator/src/internal/controller/backup_controller_test.go
index f5479bfb..f6734487 100644
--- a/operator/src/internal/controller/backup_controller_test.go
+++ b/operator/src/internal/controller/backup_controller_test.go
@@ -65,8 +65,17 @@ var _ = Describe("Backup Controller", func() {
 				},
 			}

+			// Create the associated DocumentDB cluster
+			cluster := &dbpreview.DocumentDB{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      clusterName,
+					Namespace: backupNamespace,
+				},
+			}
+			Expect(fakeClient.Create(ctx, cluster)).To(Succeed())
+
 			// Call under test
-			res, err := reconciler.createCNPGBackup(ctx, backup, nil)
+			res, err := reconciler.createCNPGBackup(ctx, backup, cluster)
 			Expect(err).ToNot(HaveOccurred())
 			// controller uses a 5s requeue
 			Expect(res.RequeueAfter).To(Equal(5 * time.Second))
@@ -197,7 +206,7 @@ var _ = Describe("Backup Controller", func() {
 			updated := &dbpreview.Backup{}
 			Expect(fakeClient.Get(ctx, client.ObjectKey{Name: backupName, Namespace: backupNamespace}, updated)).To(Succeed())
 			Expect(string(updated.Status.Phase)).To(Equal(string(cnpgv1.BackupPhaseFailed)))
-			Expect(updated.Status.Error).To(Equal("connection timeout"))
+			Expect(updated.Status.Message).To(Equal("connection timeout"))
 			Expect(updated.Status.StartedAt).ToNot(BeNil())
 			Expect(updated.Status.StoppedAt).ToNot(BeNil())
 			Expect(updated.Status.StartedAt.Time.Unix()).To(Equal(startTime.Unix()))
diff --git a/operator/src/internal/controller/documentdb_controller.go b/operator/src/internal/controller/documentdb_controller.go
index 63cc9709..47065f22 100644
--- a/operator/src/internal/controller/documentdb_controller.go
+++ b/operator/src/internal/controller/documentdb_controller.go
@@ -110,7 +110,7 @@ func (r *DocumentDBReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 	documentdbImage := util.GetDocumentDBImageForInstance(documentdb)

 	currentCnpgCluster := &cnpgv1.Cluster{}
-	desiredCnpgCluster := cnpg.GetCnpgClusterSpec(req, documentdb, documentdbImage, documentdb.Name, replicationContext.StorageClass, logger)
+	desiredCnpgCluster := cnpg.GetCnpgClusterSpec(req, documentdb, documentdbImage, documentdb.Name, replicationContext.StorageClass, replicationContext.IsPrimary(), logger)

 	if replicationContext.IsReplicating() {
 		err = r.AddClusterReplicationToClusterSpec(ctx, documentdb, replicationContext, desiredCnpgCluster)
diff --git a/operator/src/internal/controller/physical_replication.go b/operator/src/internal/controller/physical_replication.go
index bcecab46..efb039a2 100644
--- a/operator/src/internal/controller/physical_replication.go
+++ b/operator/src/internal/controller/physical_replication.go
@@ -61,9 +61,12 @@ func (r *DocumentDBReconciler) AddClusterReplicationToClusterSpec(
 		// If primary and HA we want a local standby and a slot for the WAL replica
 		// TODO change to 2 when WAL replica is available
 		cnpgCluster.Spec.Instances = 3
-		cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL =
-			append(cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL,
+		// Clusters restored from a backup bootstrap via Recovery, so InitDB/PostInitSQL may not be configured
+		if cnpgCluster.Spec.Bootstrap != nil && cnpgCluster.Spec.Bootstrap.InitDB != nil && cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL != nil {
+			cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL = append(
+				cnpgCluster.Spec.Bootstrap.InitDB.PostInitSQL,
 				"select * from pg_create_physical_replication_slot('wal_replica');")
+		}
 		// Also need to configure quorum writes
 		cnpgCluster.Spec.PostgresConfiguration.Synchronous = &cnpgv1.SynchronousReplicaConfiguration{
 			Method: cnpgv1.SynchronousReplicaConfigurationMethodAny,
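
The requeue handling added in this change (the done-backup branch of `Reconcile`, `SetBackupPhaseFailed`, and the new `SetBackupPhaseSkipped`) follows one pattern: schedule the next reconcile for the backup's expiration time, and fall back to one minute when that time is already in the past. A minimal standalone sketch of the pattern is below; the helper name `requeueAfterExpiration` is illustrative only and not part of the patch.

```go
package main

import (
	"fmt"
	"time"
)

// requeueAfterExpiration mirrors the clamping used in the controller above:
// requeue when the backup expires, but never return a negative interval.
func requeueAfterExpiration(expiredAt, now time.Time) time.Duration {
	requeueAfter := expiredAt.Sub(now)
	if requeueAfter < 0 {
		requeueAfter = time.Minute
	}
	return requeueAfter
}

func main() {
	now := time.Now()
	// Backup expiring in 30 days: requeue right at expiration (~720h).
	fmt.Println(requeueAfterExpiration(now.Add(30*24*time.Hour), now))
	// Already-expired backup: re-check after one minute.
	fmt.Println(requeueAfterExpiration(now.Add(-time.Hour), now))
}
```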
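The guard in `AddClusterReplicationToClusterSpec` exists because a cluster bootstrapped from a backup gets a `Recovery` bootstrap from `getBootstrapConfiguration` rather than an `InitDB` one, so the replication-slot SQL can no longer be appended unconditionally. Below is a self-contained sketch of the same nil-guard, using simplified stand-in types rather than the real `cnpgv1` structs.

```go
package main

import "fmt"

// Simplified stand-ins for the CNPG bootstrap types touched in
// physical_replication.go; only the fields needed here are modelled.
type initDB struct{ PostInitSQL []string }
type bootstrapConfiguration struct{ InitDB *initDB }
type clusterSpec struct{ Bootstrap *bootstrapConfiguration }

// addWALReplicaSlot appends the slot-creation SQL only when an InitDB
// bootstrap with a PostInitSQL list is present; a cluster restored from a
// backup has no InitDB section, so the append is skipped instead of panicking.
func addWALReplicaSlot(spec *clusterSpec) {
	if spec.Bootstrap != nil && spec.Bootstrap.InitDB != nil && spec.Bootstrap.InitDB.PostInitSQL != nil {
		spec.Bootstrap.InitDB.PostInitSQL = append(
			spec.Bootstrap.InitDB.PostInitSQL,
			"select * from pg_create_physical_replication_slot('wal_replica');")
	}
}

func main() {
	restored := &clusterSpec{} // bootstrap-from-backup: no InitDB configured
	fresh := &clusterSpec{Bootstrap: &bootstrapConfiguration{InitDB: &initDB{PostInitSQL: []string{}}}}

	addWALReplicaSlot(restored) // no-op, no nil dereference
	addWALReplicaSlot(fresh)    // slot-creation statement appended

	fmt.Println(restored.Bootstrap == nil)               // true
	fmt.Println(len(fresh.Bootstrap.InitDB.PostInitSQL)) // 1
}
```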