From 61bcf374156f0975e6bffa47cdb44547e1b1faf6 Mon Sep 17 00:00:00 2001 From: wenting Date: Wed, 24 Sep 2025 14:16:18 -0400 Subject: [PATCH 1/2] backup and restore --- .../actions/setup-test-environment/action.yml | 7 + .github/workflows/release_images.yml | 12 +- .github/workflows/test-backup-and-restore.yml | 358 ++++++++++++++++++ docs/designs/backup-and-restore-design.md | 177 +++++++++ .../v1/backup-and-restore.md | 291 ++++++++++++++ .../crds/db.microsoft.com_backups.yaml | 115 ++++++ .../crds/db.microsoft.com_documentdbs.yaml | 31 ++ .../db.microsoft.com_scheduledbackups.yaml | 97 +++++ .../templates/05_clusterrole.yaml | 17 +- operator/src/api/preview/backup_funcs.go | 143 +++++++ operator/src/api/preview/backup_funcs_test.go | 357 +++++++++++++++++ operator/src/api/preview/backup_types.go | 78 ++++ operator/src/api/preview/documentdb_types.go | 34 ++ .../src/api/preview/scheduledbackup_funcs.go | 47 +++ .../api/preview/scheduledbackup_funcs_test.go | 107 ++++++ .../src/api/preview/scheduledbackup_types.go | 64 ++++ operator/src/api/preview/suite_test.go | 16 + .../src/api/preview/zz_generated.deepcopy.go | 273 ++++++++++++- operator/src/cmd/main.go | 19 + .../crd/bases/db.microsoft.com_backups.yaml | 115 ++++++ .../bases/db.microsoft.com_documentdbs.yaml | 31 ++ .../db.microsoft.com_scheduledbackups.yaml | 97 +++++ operator/src/go.mod | 8 +- operator/src/go.sum | 2 + operator/src/internal/cnpg/cnpg_cluster.go | 22 +- .../internal/controller/backup_controller.go | 227 +++++++++++ .../controller/backup_controller_test.go | 254 +++++++++++++ .../controller/scheduledbackup_controller.go | 155 ++++++++ .../scheduledbackup_controller_test.go | 86 +++++ .../src/internal/controller/suite_test.go | 16 + .../scripts/test-scripts/deploy-csi-driver.sh | 83 ++++ 31 files changed, 3332 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/test-backup-and-restore.yml create mode 100644 docs/designs/backup-and-restore-design.md create mode 100644 
docs/operator-public-documentation/v1/backup-and-restore.md create mode 100644 operator/documentdb-helm-chart/crds/db.microsoft.com_backups.yaml create mode 100644 operator/documentdb-helm-chart/crds/db.microsoft.com_scheduledbackups.yaml create mode 100644 operator/src/api/preview/backup_funcs.go create mode 100644 operator/src/api/preview/backup_funcs_test.go create mode 100644 operator/src/api/preview/backup_types.go create mode 100644 operator/src/api/preview/scheduledbackup_funcs.go create mode 100644 operator/src/api/preview/scheduledbackup_funcs_test.go create mode 100644 operator/src/api/preview/scheduledbackup_types.go create mode 100644 operator/src/api/preview/suite_test.go create mode 100644 operator/src/config/crd/bases/db.microsoft.com_backups.yaml create mode 100644 operator/src/config/crd/bases/db.microsoft.com_scheduledbackups.yaml create mode 100644 operator/src/internal/controller/backup_controller.go create mode 100644 operator/src/internal/controller/backup_controller_test.go create mode 100644 operator/src/internal/controller/scheduledbackup_controller.go create mode 100644 operator/src/internal/controller/scheduledbackup_controller_test.go create mode 100644 operator/src/internal/controller/suite_test.go create mode 100755 operator/src/scripts/test-scripts/deploy-csi-driver.sh diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index 5e58132d..d0d84e42 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -273,6 +273,12 @@ runs: echo "✓ Cluster resource check completed for ${{ inputs.architecture }}" + - name: Deploy CSI driver + shell: bash + run: | + chmod +x ./scripts/test-scripts/deploy-csi-driver.sh + ./scripts/test-scripts/deploy-csi-driver.sh + - name: Install cert-manager shell: bash run: | @@ -583,6 +589,7 @@ runs: resource: storage: pvcSize: 5Gi + storageClass: csi-hostpath-sc exposeViaService: 
serviceType: ClusterIP EOF diff --git a/.github/workflows/release_images.yml b/.github/workflows/release_images.yml index 60285095..e0a055cc 100644 --- a/.github/workflows/release_images.yml +++ b/.github/workflows/release_images.yml @@ -45,11 +45,19 @@ jobs: image_tag: ${{ inputs.image_tag }} secrets: inherit + test-backup-and-restore: + name: Test Backup and Restore + if: ${{ inputs.run_tests == true }} + uses: ./.github/workflows/test-backup-and-restore.yml + with: + image_tag: ${{ inputs.image_tag }} + secrets: inherit + copy-and-push-manifest: name: Release Images runs-on: ubuntu-latest - needs: [test-e2e, test-integration] - if: always() && (needs.test-e2e.result == 'success' || needs.test-e2e.result == 'skipped') && (needs.test-integration.result == 'success' || needs.test-integration.result == 'skipped') + needs: [test-e2e, test-integration, test-backup-and-restore] + if: always() && (needs.test-e2e.result == 'success' || needs.test-e2e.result == 'skipped') && (needs.test-integration.result == 'success' || needs.test-integration.result == 'skipped') && (needs.test-backup-and-restore.result == 'success' || needs.test-backup-and-restore.result == 'skipped') strategy: matrix: image: [operator, sidecar] diff --git a/.github/workflows/test-backup-and-restore.yml b/.github/workflows/test-backup-and-restore.yml new file mode 100644 index 00000000..ed6527f5 --- /dev/null +++ b/.github/workflows/test-backup-and-restore.yml @@ -0,0 +1,358 @@ +name: Test - Backup and Restore + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main, develop ] + schedule: + - cron: '0 2 * * *' + workflow_dispatch: + inputs: + documentdb_version: + description: 'DocumentDB image version to test' + required: false + default: '16' + node_count: + description: 'Number of DocumentDB nodes' + required: false + default: '1' + image_tag: + description: 'Optional: Use existing image tag instead of building locally' + required: false + type: string + workflow_call: + 
inputs: + image_tag: + description: 'Optional: Use existing image tag instead of building locally' + required: false + type: string + documentdb_version: + description: 'DocumentDB image version to test' + required: false + default: '16' + type: string + node_count: + description: 'Number of DocumentDB nodes' + required: false + default: '1' + type: string + +permissions: + contents: read + actions: read + packages: read + +env: + CERT_MANAGER_NS: cert-manager + OPERATOR_NS: documentdb-operator + DB_NS: documentdb-backup-and-restore-test + DB_NAME: documentdb-backup-and-restore + DB_RESTORE_NAME: documentdb-restore-from-backup + DB_USERNAME: k8s_secret_user + DB_PASSWORD: K8sSecret100 + DB_PORT: 10260 + +jobs: + # Conditional build workflow - only run if image_tag is not provided + build: + name: Build Images and Charts + if: ${{ inputs.image_tag == '' || inputs.image_tag == null }} + uses: ./.github/workflows/test-build-and-package.yml + with: + image_tag_prefix: 'backup-and-restore-test' + chart_version_prefix: '0.1.0' + secrets: inherit + + backup-and-restore-test: + name: Run Backup and Restore Tests + runs-on: ${{ matrix.runner }} + timeout-minutes: 60 + needs: build + if: always() && (needs.build.result == 'success' || needs.build.result == 'skipped') + + strategy: + matrix: + include: + - architecture: amd64 + runner: ubuntu-22.04 + test_scenario_name: "single-node" + node_count: 1 + instances_per_node: 1 + - architecture: arm64 + runner: ubuntu-22.04-arm + test_scenario_name: "single-node" + node_count: 1 + instances_per_node: 1 + env: + # Use provided image tag or outputs from the build workflow + IMAGE_TAG: ${{ inputs.image_tag || needs.build.outputs.image_tag }} + CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Download artifacts + if: ${{ inputs.image_tag == '' || inputs.image_tag == null }} + uses: actions/download-artifact@v4 + with: + pattern: 
'build-*' + path: ./artifacts + + - name: Log test configuration + run: | + echo "## Backup and Restore Test Configuration" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [[ -n "${{ inputs.image_tag }}" ]]; then + echo "- **Mode**: Using provided image tag" >> $GITHUB_STEP_SUMMARY + echo "- **Image Tag**: \`${{ inputs.image_tag }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Source**: External (no local build)" >> $GITHUB_STEP_SUMMARY + else + echo "- **Mode**: Using locally built images" >> $GITHUB_STEP_SUMMARY + echo "- **Image Tag**: \`${{ env.IMAGE_TAG }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Source**: Local build pipeline" >> $GITHUB_STEP_SUMMARY + fi + echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY + + - name: Setup test environment + uses: ./.github/actions/setup-test-environment + with: + architecture: ${{ matrix.architecture }} + runner: ${{ matrix.runner }} + test-scenario-name: ${{ matrix.test_scenario_name }} + node-count: '${{ matrix.node_count }}' + instances-per-node: '${{ matrix.instances_per_node }}' + cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} + operator-namespace: ${{ env.OPERATOR_NS }} + db-namespace: ${{ env.DB_NS }} + db-cluster-name: ${{ env.DB_NAME }} + db-username: ${{ env.DB_USERNAME }} + db-password: ${{ env.DB_PASSWORD }} + db-port: ${{ env.DB_PORT }} + image-tag: ${{ env.IMAGE_TAG }} + chart-version: ${{ env.CHART_VERSION }} + use-external-images: ${{ inputs.image_tag != '' && inputs.image_tag != null }} + github-token: ${{ secrets.GITHUB_TOKEN }} + repository-owner: ${{ github.repository_owner }} + + - name: Setup port forwarding for comprehensive tests + uses: ./.github/actions/setup-port-forwarding + with: + namespace: ${{ env.DB_NS }} + cluster-name: ${{ env.DB_NAME }} + port: ${{ env.DB_PORT }} + architecture: ${{ matrix.architecture }} + test-type: 'comprehensive' + + - name: Insert test data using mongosh + run: | + echo "Inserting test data into DocumentDB cluster..." 
+ if mongosh 127.0.0.1:$DB_PORT \ + -u $DB_USERNAME \ + -p $DB_PASSWORD \ + --authenticationMechanism SCRAM-SHA-256 \ + --tls \ + --tlsAllowInvalidCertificates \ + --eval "for (let i = 1; i <= 100; i++) { db.testCollection.insertOne({ index: i, message: 'This is document ' + i }); }" ; then + echo "✓ Test data insertion completed successfully on ${{ matrix.architecture }}" + else + echo "❌ Test data insertion failed on ${{ matrix.architecture }}" + exit 1 + fi + + echo "Verifying inserted test data..." + count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates) + if [[ "$count" -eq 100 ]]; then + echo "✓ Test data verification completed successfully on ${{ matrix.architecture }}" + else + echo "❌ Test data verification failed on ${{ matrix.architecture }}" + exit 1 + fi + + - name: Create ScheduledBackup to trigger backups + shell: bash + run: | + cat </dev/null || true + rm -f /tmp/pf_pid + fi + + # Clean up output log + rm -f /tmp/pf_output.log + + # Clean up output log + rm -f /tmp/pf_output.log + + - name: Restore from backup + shell: bash + run: | + # Get the latest backup name + backup_name=$(kubectl -n ${{ env.DB_NS }} get backups -o jsonpath='{.items[?(@.status.phase=="completed")].metadata.name}' | tr ' ' '\n' | sort | tail -n 1) + + # Create DocumentDB resource + cat </dev/null || true + rm -f /tmp/pf_pid + fi + + # Clean up output log + rm -f /tmp/pf_output.log + + # Clean up output log + rm -f /tmp/pf_output.log + + - name: Test if expired backups are cleaned up + shell: bash + run: | + echo "Verifying expired backups are cleaned up..." 
+ # pick up one backup name + backup_name=$(kubectl -n $DB_NS get backups -o jsonpath='{.items[0].metadata.name}') + # set expiration time to past + kubectl -n $DB_NS patch backup $backup_name --type='json' --type=merge -p='{"status":{"expiredAt":"2000-01-01T00:00:00Z"}}' --subresource=status + # wait for cleanup + MAX_RETRIES=10 + SLEEP_INTERVAL=15 + ITER=0 + while [ $ITER -lt $MAX_RETRIES ]; do + backup_status=$(kubectl -n $DB_NS get backup $backup_name --ignore-not-found) + if [ -z "$backup_status" ]; then + echo "✓ Expired backup cleaned up successfully." + exit 0 + else + echo "Backup $backup_name still exists. Retrying in $SLEEP_INTERVAL seconds..." + kubectl -n $DB_NS get backup $backup_name + sleep $SLEEP_INTERVAL + fi + ((++ITER)) + done + echo "❌ Expired backup was not cleaned up within expected time." + exit 1 \ No newline at end of file diff --git a/docs/designs/backup-and-restore-design.md b/docs/designs/backup-and-restore-design.md new file mode 100644 index 00000000..2e16f65d --- /dev/null +++ b/docs/designs/backup-and-restore-design.md @@ -0,0 +1,177 @@ +# Backup and Restore Design + +## Backup + +### VolumeSnapshotClass + +A [VolumeSnapshotClass](https://kubernetes.io/docs/concepts/storage/volume-snapshot-classes/) must exist before taking volume snapshots. +It specifies which CSI driver to use for creating volume snapshots. + +**CNPG's Approach:** CNPG requires users to manually create the VolumeSnapshotClass. + +**Our Approach:** The DocumentDB operator automatically creates a VolumeSnapshotClass when a Backup resource is created, if one doesn't already exist. + +#### Current Support + +Currently, we only support **AKS (Azure Kubernetes Service)** with the **`disk.csi.azure.com`** CSI driver. + +The operator will automatically create a VolumeSnapshotClass named `azure-disk-snapclass` configured with the Azure disk CSI driver when you create your first Backup resource. 
+ +### Backup CRD + +We have our own Backup CRD and backup controller in the DocumentDB operator. +When a Backup resource is created, it triggers a [Kubernetes Volume Snapshot](https://kubernetes.io/blog/2020/12/10/kubernetes-1.20-volume-snapshot-moves-to-ga/#what-is-a-volume-snapshot) on the primary instance of a DocumentDB cluster. + +Since DocumentDB uses a [CloudNativePG (CNPG)](https://cloudnative-pg.io/) cluster as the backend, we leverage CNPG's backup functionality. +When users create a DocumentDB Backup resource, the operator automatically creates a corresponding [CNPG Backup](https://cloudnative-pg.io/documentation/current/backup/) resource. + +**Why not use CNPG Backup directly?** + +In this phase, our Backup resource acts as a wrapper around CNPG Backup. We maintain our own CRD to support future enhancements: +- **Next phase:** Multi-region backup support +- **Future:** Multi-node backup capabilities + +### Creating On-Demand Backups + +Create an on-demand backup by applying the following resource: + +```yaml +apiVersion: db.microsoft.com/preview +kind: Backup +metadata: + name: backup-example + namespace: documentdb-preview-ns +spec: + cluster: + name: documentdb-preview +``` + +## Scheduled Backup + +### ScheduledBackup CRD + +The ScheduledBackup CRD enables automated, recurring backups using [cron expressions](https://en.wikipedia.org/wiki/Cron). + +**Why not use CNPG ScheduledBackup?** + +CNPG's [ScheduledBackup](https://cloudnative-pg.io/documentation/current/backup/#scheduled-backups) creates CNPG Backup resources directly. +Since we have our own Backup CRD with custom logic, we need our own ScheduledBackup implementation. 
+ +### Creating Scheduled Backups + +Create a scheduled backup using a cron expression: + +```yaml +apiVersion: db.microsoft.com/preview +kind: ScheduledBackup +metadata: + name: backup-example + namespace: documentdb-preview-ns +spec: + schedule: "0 0 0 * * *" # Daily at midnight + cluster: + name: documentdb-preview +``` + +## Retention Policy + +Retention policies control how long backups are preserved before automatic deletion. +Backups are retained according to their configured retention period, even if the cluster is deleted—allowing users to restore from deleted clusters if needed. + +The DocumentDB operator supports retention policies at three levels, with the following precedence: +1. **Backup-level retention** (highest priority) - applies to individual backups +2. **ScheduledBackup-level retention** - applies to backups created by a schedule +3. **Cluster-level retention** (default) - applies to all backups from a cluster + +### Backup-Level Retention (On-Demand) + +**Field:** `spec.retentionDays` + +**Purpose:** Overrides the cluster-level retention for an individual on-demand backup. Useful for long-term retention of critical backups. + +**Example:** +```yaml +apiVersion: db.microsoft.com/preview +kind: Backup +metadata: + name: backup-example +spec: + cluster: + name: documentdb-preview + retentionDays: 90 # Override: retain this backup for 90 days +``` + +### ScheduledBackup-Level Retention + +**Field:** `spec.retentionDays` + +**Purpose:** Overrides the cluster-level retention for backups created by a scheduled backup job. 
+ +**Example:** +```yaml +apiVersion: db.microsoft.com/preview +kind: ScheduledBackup +metadata: + name: backup-example +spec: + schedule: "0 0 0 * * *" # Daily at midnight + cluster: + name: documentdb-preview + retentionDays: 14 # Override: retain scheduled backups for 14 days +``` + +### Cluster-Level Retention (Default) + +**Field:** `spec.backup.retentionDays` + +**Purpose:** Sets the default retention period for all backups created from this cluster. + +**Example:** +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: documentdb-preview +spec: + backup: + retentionDays: 30 # Default: retain all backups for 30 days + # ...other fields... +``` + + +### Expiration and Cleanup + +- **Expiration Time:** Calculated as `backup.status.stoppedAt + retentionDays` (or `backup.creationTimestamp + retentionDays` if backup failed) +- **Automatic Deletion:** When a backup's expiration time is reached, the operator automatically deletes the Backup resource and its associated volume snapshot +- **Status Field:** `backup.status.expiredAt` indicates when the backup will be automatically deleted + +Note: CNPG does not yet support retention policies for volume snapshots. This is an ongoing discussion in the CNPG community (see [issue #6009](https://github.com/cloudnative-pg/cloudnative-pg/issues/6009)). + + +## Deletion Behavior + +- **Deleting a Backup resource:** Immediately deletes the associated volume snapshot +- **Deleting a ScheduledBackup resource:** Stops creating new backups but does not delete existing backups created by that schedule +- **Deleting a Cluster:** Backups are retained according to the cluster's `retentionPeriod` setting + +## Restore + +### Recovery from Backup + +The operator supports bootstrapping a new cluster from an existing backup. In-place restoration is not currently supported. 
+ +**Recovery Example:** + +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: documentdb-preview-restore + namespace: documentdb-preview-ns +spec: + bootstrap: + recovery: + backup: + name: backup-example + ...... +``` diff --git a/docs/operator-public-documentation/v1/backup-and-restore.md b/docs/operator-public-documentation/v1/backup-and-restore.md new file mode 100644 index 00000000..dfe5e560 --- /dev/null +++ b/docs/operator-public-documentation/v1/backup-and-restore.md @@ -0,0 +1,291 @@ +# Backup and Restore + +## Prerequisites + +### For Kind or Minikube + +1. Run the CSI driver deployment script **before** installing the documentdb-operator: + +```bash +scripts/test-scripts/deploy-csi-driver.sh +``` + +2. Validate storage and snapshot components: + + +```bash +kubectl get storageclass +kubectl get volumesnapshotclasses +``` + +You should see something like: + +StorageClasses: +``` +NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE +csi-hostpath-sc hostpath.csi.k8s.io Delete Immediate true 5d20h +standard (default) rancher.io/local-path Delete WaitForFirstConsumer false 5d20h +``` + +VolumeSnapshotClasses: +``` +NAME DRIVER DELETIONPOLICY AGE +csi-hostpath-snapclass hostpath.csi.k8s.io Delete 5d19h +``` + +If `csi-hostpath-snapclass` isn't present, the deploy script didn’t finish correctly. Re-run it. + +3. When creating a cluster, ensure you set the appropriate storage class: + +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: my-cluster + namespace: default +spec: + resource: + storage: + storageClass: csi-hostpath-sc # Specify your CSI storage class + # ... other configuration +``` + +### AKS + +AKS already provides a CSI driver. 
+ +To allow the documentdb-operator to auto-create a default `VolumeSnapshotClass`, set `spec.environment: aks` in your `DocumentDB` spec: + +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: my-cluster + namespace: default +spec: + environment: aks + # ... other configuration +``` + +### Other Providers (EKS / GKE / Custom) + +Support is emerging; you must manually ensure: +- A CSI driver that supports snapshots +- VolumeSnapshot CRDs installed +- A default `VolumeSnapshotClass` + +Example manual snapshot class (adjust DRIVER accordingly): + +```yaml +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: generic-snapclass + annotations: + snapshot.storage.kubernetes.io/is-default-class: "true" +driver: ebs.csi.aws.com # or pd.csi.storage.gke.io / other +deletionPolicy: Delete +``` + +Apply: +```bash +kubectl apply -f volumesnapshotclass.yaml +``` + +## On-Demand Backup + +An on-demand backup creates a single backup of a DocumentDB cluster. + +### Creating an On-Demand Backup + +Create a `Backup` resource: + +```yaml +apiVersion: db.microsoft.com/preview +kind: Backup +metadata: + name: my-backup + namespace: default # Same namespace as DocumentDB cluster +spec: + cluster: + name: my-documentdb-cluster # Must match the DocumentDB cluster name + retentionDays: 30 # Optional: backup retention period in days +``` + +Apply the resource: + +```bash +kubectl apply -f backup.yaml +``` + +### Monitoring Backup Status + +Check the backup status: + +```bash +kubectl get backups -n default +``` + +View detailed backup information: + +```bash +kubectl describe backup my-backup -n default +``` + +## Scheduled Backups + +Scheduled backups automatically create backups at regular intervals using a cron schedule. 
+ +### Creating a Scheduled Backup + +Create a `ScheduledBackup` resource on yaml file scheduledbackup.yaml + +```yaml +apiVersion: db.microsoft.com/preview +kind: ScheduledBackup +metadata: + name: my-scheduled-backup + namespace: default # Same namespace as DocumentDB +spec: + cluster: + name: my-documentdb-cluster # Must match the DocumentDB cluster name + schedule: "0 2 * * *" # Cron expression: daily at 2:00 AM + retentionDays: 30 # Optional: backup retention period in days +``` + +Apply the resource: + +```bash +kubectl apply -f scheduledbackup.yaml +``` + +### Cron Schedule Format + +The schedule uses standard cron expression format. Common examples: + +| Schedule | Meaning | +|----------|---------| +| `0 2 * * *` | Every day at 2:00 AM | +| `0 */6 * * *` | Every 6 hours | +| `0 0 * * 0` | Every Sunday at midnight | +| `*/15 * * * *` | Every 15 minutes | +| `0 2 1 * *` | First day of every month at 2:00 AM | + +For more details, see [cron expression format](https://pkg.go.dev/github.com/robfig/cron#hdr-CRON_Expression_Format). + +### Monitoring Scheduled Backups + +List all ScheduledBackups: + +```bash +kubectl get scheduledbackups -n default +``` + +Check the generated backups: + +```bash +kubectl get backups -n default +``` + +### Important Notes + +- If a backup is currently running, the next backup will be queued and start after the current one completes +- The operator will automatically create `Backup` resources according to the schedule +- Failed backups do not prevent subsequent backups from being scheduled +- ScheduledBackups are automatically garbage collected when the source cluster is deleted +- Deleting a ScheduledBackup does NOT delete its created Backup objects; they remain until expiration + +## Restore from Backup + +You can restore a backup to a **different DocumentDB cluster**. 
+ +### List Available Backups + +First, identify the backup you want to restore: + +```bash +kubectl get backups -n default +``` + +### Create a New Cluster with Backup Recovery + +Create a new `DocumentDB` resource with recovery configuration: + +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: my-restored-cluster + namespace: default +spec: + bootstrap: + recovery: + backup: + name: my-backup # Reference the backup to restore from + #...... other configurations +``` + +Apply the resource: + +```bash +kubectl apply -f restore.yaml +``` + +## Backup Retention Policy + +Backups don't live forever. Each one gets an expiration time. After that time passes, the operator deletes it automatically. + +### Where the Retention Value Comes From (priority order) +1. `Backup.spec.retentionDays` (per backup override) +2. `ScheduledBackup.spec.retentionDays` (copied into each created Backup) +3. `DocumentDB.spec.backup.retentionDays` (cluster default) +4. Default (if none set): 30 days + +### How it's calculated +- Success: retention starts at `status.stoppedAt` +- Failure: retention starts at `metadata.creationTimestamp` +- Expiration = start + retentionDays * 24h + +### Examples +Per-backup override: +```yaml +apiVersion: db.microsoft.com/preview +kind: Backup +metadata: + name: monthly-audit +spec: + cluster: + name: prod-cluster + retentionDays: 90 +``` + +Scheduled backups (14‑day retention): +```yaml +apiVersion: db.microsoft.com/preview +kind: ScheduledBackup +metadata: + name: nightly +spec: + cluster: + name: prod-cluster + schedule: "0 2 * * *" + retentionDays: 14 +``` + +Cluster default (used when Backup doesn't set retention): +```yaml +apiVersion: db.microsoft.com/preview +kind: DocumentDB +metadata: + name: prod-cluster +spec: + backup: + retentionDays: 30 +``` + +### Important Notes +- Changing retention on a `ScheduledBackup` only affects new backups, not old ones. 
+- Changing `DocumentDB.spec.backup.retentionDays` doesn’t retroactively update existing backups. +- Failed backups still expire (timer starts at creation). +- Deleting the cluster does NOT delete its Backup objects immediately—they still wait for expiration. +- No "keep forever" mode—export externally if you need permanent archival. \ No newline at end of file diff --git a/operator/documentdb-helm-chart/crds/db.microsoft.com_backups.yaml b/operator/documentdb-helm-chart/crds/db.microsoft.com_backups.yaml new file mode 100644 index 00000000..bc9fc524 --- /dev/null +++ b/operator/documentdb-helm-chart/crds/db.microsoft.com_backups.yaml @@ -0,0 +1,115 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: backups.db.microsoft.com +spec: + group: db.microsoft.com + names: + kind: Backup + listKind: BackupList + plural: backups + singular: backup + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Target DocumentDB cluster + jsonPath: .spec.cluster.name + name: Cluster + type: string + - description: Backup phase + jsonPath: .status.phase + name: Phase + type: string + - description: Backup start time + jsonPath: .status.startedAt + name: StartedAt + type: string + - description: Backup completion time + jsonPath: .status.stoppedAt + name: StoppedAt + type: string + - description: Backup expiration time + jsonPath: .status.expiredAt + name: ExpiredAt + type: string + - description: Backup error information + jsonPath: .status.error + name: Error + type: string + name: preview + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BackupSpec defines the desired state of Backup. + properties: + cluster: + description: |- + Cluster specifies the DocumentDB cluster to backup. + The cluster must exist in the same namespace as the Backup resource. + properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + retentionDays: + description: |- + RetentionDays specifies how many days the backup should be retained. + If not specified, the default retention period from the cluster's backup retention policy will be used. + type: integer + required: + - cluster + type: object + x-kubernetes-validations: + - message: BackupSpec is immutable once set + rule: oldSelf == self + status: + description: BackupStatus defines the observed state of Backup. + properties: + error: + description: Error contains error information if the backup failed. + type: string + expiredAt: + description: ExpiredAt is the time when the backup is considered expired + and can be deleted. + format: date-time + type: string + phase: + description: Phase represents the current phase of the backup operation. + type: string + startedAt: + description: StartedAt is the time when the backup operation started. + format: date-time + type: string + stoppedAt: + description: StoppedAt is the time when the backup operation completed. 
+ format: date-time + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml b/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml index a2a92fdc..db773b8c 100644 --- a/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml +++ b/operator/documentdb-helm-chart/crds/db.microsoft.com_documentdbs.yaml @@ -48,6 +48,37 @@ spec: spec: description: DocumentDBSpec defines the desired state of DocumentDB. properties: + backup: + description: Backup configures backup settings for DocumentDB. + properties: + retentionDays: + default: 30 + description: |- + RetentionDays specifies how many days backups should be retained. + If not specified, the default retention period is 30 days. + maximum: 365 + minimum: 1 + type: integer + type: object + bootstrap: + description: Bootstrap configures the initialization of the DocumentDB + cluster. + properties: + recovery: + description: Recovery configures recovery from a backup. + properties: + backup: + description: Backup specifies the source backup to restore + from. + properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + type: object + type: object clusterReplication: description: ClusterReplication configures cross-cluster replication for DocumentDB. 
diff --git a/operator/documentdb-helm-chart/crds/db.microsoft.com_scheduledbackups.yaml b/operator/documentdb-helm-chart/crds/db.microsoft.com_scheduledbackups.yaml new file mode 100644 index 00000000..98052439 --- /dev/null +++ b/operator/documentdb-helm-chart/crds/db.microsoft.com_scheduledbackups.yaml @@ -0,0 +1,97 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: scheduledbackups.db.microsoft.com +spec: + group: db.microsoft.com + names: + kind: ScheduledBackup + listKind: ScheduledBackupList + plural: scheduledbackups + singular: scheduledbackup + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.cluster.name + name: Cluster + type: string + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .spec.retentionDays + name: Retention Days + type: integer + name: preview + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledBackupSpec defines the desired state of ScheduledBackup + properties: + cluster: + description: |- + Cluster specifies the DocumentDB cluster to backup. + The cluster must exist in the same namespace as the ScheduledBackup resource. 
+ properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + retentionDays: + description: |- + RetentionDays specifies how many days the backups should be retained. + If not specified, the default retention period from the cluster's backup retention policy will be used. + type: integer + schedule: + description: |- + Schedule defines when backups should be created using cron expression format. + See https://pkg.go.dev/github.com/robfig/cron#hdr-CRON_Expression_Format + type: string + required: + - cluster + - schedule + type: object + status: + description: ScheduledBackupStatus defines the observed state of ScheduledBackup + properties: + lastScheduledTime: + description: LastScheduledTime is the time when the last backup was + scheduled by this ScheduledBackup. + format: date-time + type: string + nextScheduledTime: + description: NextScheduledTime is the time when the next backup is + scheduled by this ScheduledBackup. + format: date-time + type: string + type: object + required: + - metadata + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/operator/documentdb-helm-chart/templates/05_clusterrole.yaml b/operator/documentdb-helm-chart/templates/05_clusterrole.yaml index e5bae780..e3ed8422 100644 --- a/operator/documentdb-helm-chart/templates/05_clusterrole.yaml +++ b/operator/documentdb-helm-chart/templates/05_clusterrole.yaml @@ -36,4 +36,19 @@ rules: - apiGroups: ["postgresql.cnpg.io"] resources: ["clusters", "publications", "subscriptions", "clusters/status"] verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - +# Backup permissions +- apiGroups: ["db.microsoft.com"] + resources: ["backups", "backups/status", "backups/finalizers"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +# ScheduledBackup permissions +- apiGroups: ["db.microsoft.com"] + resources: ["scheduledbackups", "scheduledbackups/status", 
"scheduledbackups/finalizers"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +# CNPG Backup permissions +- apiGroups: ["postgresql.cnpg.io"] + resources: ["backups", "backups/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +# VolumeSnapshotClass permissions +- apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshotclasses"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] diff --git a/operator/src/api/preview/backup_funcs.go b/operator/src/api/preview/backup_funcs.go new file mode 100644 index 00000000..1c5b3c76 --- /dev/null +++ b/operator/src/api/preview/backup_funcs.go @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +// CreateCNPGBackup creates a CNPG Backup resource based on the DocumentDB Backup spec. +func (backup *Backup) CreateCNPGBackup(scheme *runtime.Scheme) (*cnpgv1.Backup, error) { + cnpgBackup := &cnpgv1.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backup.Name, + Namespace: backup.Namespace, + }, + Spec: cnpgv1.BackupSpec{ + Method: cnpgv1.BackupMethodVolumeSnapshot, + Cluster: cnpgv1.LocalObjectReference{ + Name: backup.Spec.Cluster.Name, + }, + }, + } + // Set owner reference for garbage collection + // This ensures that the CNPG Backup is deleted when the DocumentDB Backup is deleted. + if err := controllerutil.SetControllerReference(backup, cnpgBackup, scheme); err != nil { + return nil, err + } + return cnpgBackup, nil +} + +// UpdateStatus updates the Backup status based on the CNPG Backup status and backup configuration. 
+func (backup *Backup) UpdateStatus(cnpgBackup *cnpgv1.Backup, backupConfiguration *BackupConfiguration) bool { + needsUpdate := false + if backup.Status.Phase != cnpgBackup.Status.Phase { + backup.Status.Phase = cnpgBackup.Status.Phase + needsUpdate = true + } + + if !areTimesEqual(backup.Status.StartedAt, cnpgBackup.Status.StartedAt) { + backup.Status.StartedAt = cnpgBackup.Status.StartedAt + needsUpdate = true + } + + if !areTimesEqual(backup.Status.StoppedAt, cnpgBackup.Status.StoppedAt) { + backup.Status.StoppedAt = cnpgBackup.Status.StoppedAt + needsUpdate = true + } + + if backup.Status.Error != cnpgBackup.Status.Error { + backup.Status.Error = cnpgBackup.Status.Error + needsUpdate = true + } + + expirationTime := backup.CalculateExpirationTime(backupConfiguration) + if !areTimesEqual(backup.Status.ExpiredAt, expirationTime) { + backup.Status.ExpiredAt = expirationTime + needsUpdate = true + } + + return needsUpdate +} + +// CalculateExpirationTime calculates the expiration time of the backup based on retention policy. 
+func (backup *Backup) CalculateExpirationTime(backupConfiguration *BackupConfiguration) *metav1.Time { + if !backup.Status.IsDone() { + return nil + } + + retentionHours := 0 + if backup.Spec.RetentionDays != nil { + retentionHours = *backup.Spec.RetentionDays * 24 + } else if backupConfiguration != nil { + retentionHours = backupConfiguration.RetentionDays * 24 + } else { + retentionHours = 30 * 24 // Default to 30 days + } + + // Determine the start time for retention calculation + // If backup completed, use StoppedAt; + // If backup failed, StoppedAt is not set, use CreationTimestamp + retentionStart := backup.Status.StoppedAt + if retentionStart == nil { + retentionStart = &backup.CreationTimestamp + } + + expirationTime := retentionStart.Time.Add(time.Duration(retentionHours) * time.Hour) + return &metav1.Time{Time: expirationTime} +} + +// areTimesEqual compares two metav1.Time pointers for equality +func areTimesEqual(t1, t2 *metav1.Time) bool { + if t1 == nil && t2 == nil { + return true + } + if t1 == nil || t2 == nil { + return false + } + return t1.Equal(t2) +} + +// IsDone returns true if the backup operation is completed or failed. +func (backupStatus *BackupStatus) IsDone() bool { + return backupStatus.Phase == cnpgv1.BackupPhaseCompleted || backupStatus.Phase == cnpgv1.BackupPhaseFailed +} + +// IsExpired returns true if the backup has expired based on the current time. +func (backupStatus *BackupStatus) IsExpired() bool { + if backupStatus.ExpiredAt == nil { + return false + } + return backupStatus.ExpiredAt.Time.Before(time.Now()) +} + +// IsBackupRunning returns true if any backup in the list is still in progress (not in a terminal state). +func (backupList *BackupList) IsBackupRunning() bool { + for _, backup := range backupList.Items { + if !backup.Status.IsDone() { + return true + } + } + return false +} + +// GetLastBackup returns the most recent Backup from the list, or nil if the list is empty.
+func (backupList *BackupList) GetLastBackup() *Backup { + if len(backupList.Items) == 0 { + return nil + } + + var lastBackup *Backup + for i, backup := range backupList.Items { + if lastBackup == nil || backup.CreationTimestamp.After(lastBackup.CreationTimestamp.Time) { + lastBackup = &backupList.Items[i] + } + } + return lastBackup +} diff --git a/operator/src/api/preview/backup_funcs_test.go b/operator/src/api/preview/backup_funcs_test.go new file mode 100644 index 00000000..cd1170fc --- /dev/null +++ b/operator/src/api/preview/backup_funcs_test.go @@ -0,0 +1,357 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var _ = Describe("Backup", func() { + var intPtr = func(v int) *int { return &v } + + Describe("CreateCNPGBackup", func() { + It("creates a CNPG Backup with expected fields and owner reference", func() { + // prepare scheme with known types so SetControllerReference can find GVKs + scheme := runtime.NewScheme() + Expect(cnpgv1.AddToScheme(scheme)).To(Succeed()) + gv := schema.GroupVersion{Group: "preview.test", Version: "preview"} + scheme.AddKnownTypes(gv, &Backup{}, &BackupList{}) + + backup := &Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-backup", + Namespace: "my-ns", + }, + Spec: BackupSpec{ + Cluster: cnpgv1.LocalObjectReference{Name: "my-cluster"}, + }, + } + + cnpg, err := backup.CreateCNPGBackup(scheme) + Expect(err).To(BeNil()) + Expect(cnpg).ToNot(BeNil()) + + // metadata and spec checks + Expect(cnpg.Name).To(Equal("my-backup")) + Expect(cnpg.Namespace).To(Equal("my-ns")) + Expect(cnpg.Spec.Method).To(Equal(cnpgv1.BackupMethodVolumeSnapshot)) + Expect(cnpg.Spec.Cluster.Name).To(Equal("my-cluster")) + + // 
owner reference set by SetControllerReference + Expect(len(cnpg.OwnerReferences)).To(BeNumerically(">", 0)) + owner := cnpg.OwnerReferences[0] + Expect(owner.Name).To(Equal("my-backup")) + Expect(owner.Kind).To(Equal("Backup")) + Expect(owner.APIVersion).To(Equal(gv.String())) + }) + }) + + Describe("UpdateStatus", func() { + It("updates fields from cnpg backup and computes ExpiredAt when done", func() { + startedAt := metav1.NewTime(time.Date(2025, 4, 1, 1, 0, 0, 0, time.UTC)) + stoppedAt := metav1.NewTime(time.Date(2025, 4, 1, 2, 0, 0, 0, time.UTC)) + + cnpg := &cnpgv1.Backup{ + Status: cnpgv1.BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + StartedAt: &startedAt, + StoppedAt: &stoppedAt, + Error: "none", + }, + } + + backup := &Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-backup", + Namespace: "my-ns", + CreationTimestamp: metav1.NewTime(time.Date(2025, 4, 1, 0, 0, 0, 0, time.UTC)), + }, + Spec: BackupSpec{}, // no retention specified -> default 30 days + Status: BackupStatus{ + Phase: cnpgv1.BackupPhase(""), + StartedAt: nil, + StoppedAt: nil, + Error: "", + }, + } + + needsUpdate := backup.UpdateStatus(cnpg, nil) + Expect(needsUpdate).To(BeTrue()) + Expect(backup.Status.Phase).To(BeEquivalentTo(cnpgv1.BackupPhaseCompleted)) + Expect(backup.Status.StartedAt).To(Equal(&startedAt)) + Expect(backup.Status.StoppedAt).To(Equal(&stoppedAt)) + Expect(backup.Status.Error).To(Equal("none")) + // ExpiredAt should be StoppedAt + 30 days (default) + Expect(backup.Status.ExpiredAt).ToNot(BeNil()) + Expect(backup.Status.ExpiredAt.Time.Equal(stoppedAt.Time.Add(30 * 24 * time.Hour))).To(BeTrue()) + }) + + It("does not update when there are no changes", func() { + startedAt := metav1.NewTime(time.Date(2025, 5, 1, 1, 0, 0, 0, time.UTC)) + stoppedAt := metav1.NewTime(time.Date(2025, 5, 1, 2, 0, 0, 0, time.UTC)) + expiredAt := metav1.NewTime(time.Date(2025, 5, 31, 2, 0, 0, 0, time.UTC)) + + cnpg := &cnpgv1.Backup{ + Status: cnpgv1.BackupStatus{ + Phase: 
cnpgv1.BackupPhaseCompleted, + StartedAt: &startedAt, + StoppedAt: &stoppedAt, + Error: "none", + }, + } + + backup := &Backup{ + Spec: BackupSpec{}, + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + StartedAt: &startedAt, + StoppedAt: &stoppedAt, + Error: "none", + ExpiredAt: &expiredAt, + }, + } + + needsUpdate := backup.UpdateStatus(cnpg, nil) + Expect(needsUpdate).To(BeFalse()) + }) + }) + + Describe("CalculateExpirationTime", func() { + It("returns nil if backup is not done", func() { + backup := &Backup{ + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseRunning, + }, + } + Expect(backup.CalculateExpirationTime(nil)).To(BeNil()) + }) + + It("uses Spec.RetentionDays when specified", func() { + base := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + stopped := metav1.NewTime(base) + backup := &Backup{ + Spec: BackupSpec{ + RetentionDays: intPtr(2), + }, + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + StoppedAt: &stopped, + }, + } + + exp := backup.CalculateExpirationTime(nil) + Expect(exp).ToNot(BeNil()) + Expect(exp.Time.Equal(base.Add(48 * time.Hour))).To(BeTrue()) + }) + + It("uses BackupConfiguration.RetentionDays when Spec.RetentionDays is nil", func() { + base := time.Date(2025, 2, 1, 0, 0, 0, 0, time.UTC) + stopped := metav1.NewTime(base) + backup := &Backup{ + Spec: BackupSpec{}, // RetentionDays nil + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + StoppedAt: &stopped, + }, + } + cfg := &BackupConfiguration{RetentionDays: 3} + + exp := backup.CalculateExpirationTime(cfg) + Expect(exp).ToNot(BeNil()) + Expect(exp.Time.Equal(base.Add(72 * time.Hour))).To(BeTrue()) + }) + + It("defaults to 30 days and uses CreationTimestamp when StoppedAt is nil", func() { + base := time.Date(2025, 3, 1, 12, 0, 0, 0, time.UTC) + backup := &Backup{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(base), + }, + Spec: BackupSpec{}, // RetentionDays nil + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, 
+ StoppedAt: nil, + }, + } + + exp := backup.CalculateExpirationTime(nil) + Expect(exp).ToNot(BeNil()) + Expect(exp.Time.Equal(base.Add(30 * 24 * time.Hour))).To(BeTrue()) + }) + }) + + Describe("areTimesEqual", func() { + It("returns true for nil nil", func() { + Expect(areTimesEqual(nil, nil)).To(BeTrue()) + }) + + It("returns false for nil and non-nil", func() { + t := metav1.NewTime(time.Now()) + Expect(areTimesEqual(nil, &t)).To(BeFalse()) + Expect(areTimesEqual(&t, nil)).To(BeFalse()) + }) + + It("returns true for identical times and false for different", func() { + base := time.Now().Truncate(time.Second) + t1 := metav1.NewTime(base) + t2 := metav1.NewTime(base) + t3 := metav1.NewTime(base.Add(time.Minute)) + + Expect(areTimesEqual(&t1, &t2)).To(BeTrue()) + Expect(areTimesEqual(&t1, &t3)).To(BeFalse()) + }) + }) + + Describe("IsDone", func() { + It("returns true when phase is Completed", func() { + status := &BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + } + Expect(status.IsDone()).To(BeTrue()) + }) + + It("returns true when phase is Failed", func() { + status := &BackupStatus{ + Phase: cnpgv1.BackupPhaseFailed, + } + Expect(status.IsDone()).To(BeTrue()) + }) + + It("returns false when phase is Running", func() { + status := &BackupStatus{ + Phase: cnpgv1.BackupPhaseRunning, + } + Expect(status.IsDone()).To(BeFalse()) + }) + + It("returns false when phase is empty", func() { + status := &BackupStatus{ + Phase: cnpgv1.BackupPhase(""), + } + Expect(status.IsDone()).To(BeFalse()) + }) + }) + + Describe("IsExpired", func() { + It("returns false when ExpiredAt is nil", func() { + status := &BackupStatus{ExpiredAt: nil} + Expect(status.IsExpired()).To(BeFalse()) + }) + + It("returns true when ExpiredAt is in the past", func() { + past := metav1.NewTime(time.Now().Add(-1 * time.Hour)) + status := &BackupStatus{ExpiredAt: &past} + Expect(status.IsExpired()).To(BeTrue()) + }) + + It("returns false when ExpiredAt is in the future", func() { + future := 
metav1.NewTime(time.Now().Add(1 * time.Hour)) + status := &BackupStatus{ExpiredAt: &future} + Expect(status.IsExpired()).To(BeFalse()) + }) + }) +}) + +var _ = Describe("BackupList", func() { + Describe("IsBackupRunning", func() { + It("returns false when all backups are in terminal phases", func() { + backupList := &BackupList{ + Items: []Backup{ + { + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + }, + }, + { + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseFailed, + }, + }, + }, + } + Expect(backupList.IsBackupRunning()).To(BeFalse()) + }) + + It("returns false when backup list is empty", func() { + backupList := &BackupList{ + Items: []Backup{}, + } + Expect(backupList.IsBackupRunning()).To(BeFalse()) + }) + + It("returns true when at least one backup is running among completed backups", func() { + backupList := &BackupList{ + Items: []Backup{ + { + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + }, + }, + { + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseRunning, + }, + }, + { + Status: BackupStatus{ + Phase: cnpgv1.BackupPhaseFailed, + }, + }, + }, + } + Expect(backupList.IsBackupRunning()).To(BeTrue()) + }) + }) + + Describe("GetLastBackup", func() { + It("returns nil for empty list", func() { + backupList := &BackupList{ + Items: []Backup{}, + } + Expect(backupList.GetLastBackup()).To(BeNil()) + }) + + It("returns the most recent backup by CreationTimestamp", func() { + t1 := metav1.NewTime(time.Date(2025, 6, 1, 9, 0, 0, 0, time.UTC)) + t2 := metav1.NewTime(time.Date(2025, 6, 1, 11, 0, 0, 0, time.UTC)) + t3 := metav1.NewTime(time.Date(2025, 6, 1, 10, 0, 0, 0, time.UTC)) + + backupList := &BackupList{ + Items: []Backup{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b1", + CreationTimestamp: t1, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b2", + CreationTimestamp: t2, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b3", + CreationTimestamp: t3, + }, + }, + }, + } + + last := backupList.GetLastBackup() + 
Expect(last).ToNot(BeNil()) + Expect(last.Name).To(Equal("b2")) + // ensure pointer points into the slice + Expect(last).To(Equal(&backupList.Items[1])) + }) + }) +}) diff --git a/operator/src/api/preview/backup_types.go b/operator/src/api/preview/backup_types.go new file mode 100644 index 00000000..226e876f --- /dev/null +++ b/operator/src/api/preview/backup_types.go @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// BackupSpec defines the desired state of Backup. +// +kubebuilder:validation:XValidation:rule="oldSelf == self",message="BackupSpec is immutable once set" +type BackupSpec struct { + // Cluster specifies the DocumentDB cluster to backup. + // The cluster must exist in the same namespace as the Backup resource. + // +kubebuilder:validation:Required + Cluster cnpgv1.LocalObjectReference `json:"cluster"` + + // RetentionDays specifies how many days the backup should be retained. + // If not specified, the default retention period from the cluster's backup retention policy will be used. + // +optional + RetentionDays *int `json:"retentionDays,omitempty"` +} + +// BackupPhaseSkipped indicates that the backup was skipped, +// for example backup won't run for a standby cluster in multi-region setup. +const BackupPhaseSkipped cnpgv1.BackupPhase = "skipped" + +// BackupStatus defines the observed state of Backup. +type BackupStatus struct { + // Phase represents the current phase of the backup operation. + Phase cnpgv1.BackupPhase `json:"phase,omitempty"` + + // StartedAt is the time when the backup operation started. + // +optional + StartedAt *metav1.Time `json:"startedAt,omitempty"` + + // StoppedAt is the time when the backup operation completed. 
+ // +optional + StoppedAt *metav1.Time `json:"stoppedAt,omitempty"` + + // ExpiredAt is the time when the backup is considered expired and can be deleted. + // +optional + ExpiredAt *metav1.Time `json:"expiredAt,omitempty"` + + // Error contains error information if the backup failed. + // +optional + Error string `json:"error,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=".spec.cluster.name",description="Target DocumentDB cluster" +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=".status.phase",description="Backup phase" +// +kubebuilder:printcolumn:name="StartedAt",type=string,JSONPath=".status.startedAt",description="Backup start time" +// +kubebuilder:printcolumn:name="StoppedAt",type=string,JSONPath=".status.stoppedAt",description="Backup completion time" +// +kubebuilder:printcolumn:name="ExpiredAt",type=string,JSONPath=".status.expiredAt",description="Backup expiration time" +// +kubebuilder:printcolumn:name="Error",type=string,JSONPath=".status.error",description="Backup error information" +type Backup struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec BackupSpec `json:"spec,omitempty"` + Status BackupStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// BackupList contains a list of Backup. 
+type BackupList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Backup `json:"items"` +} + +func init() { + SchemeBuilder.Register(&Backup{}, &BackupList{}) +} diff --git a/operator/src/api/preview/documentdb_types.go b/operator/src/api/preview/documentdb_types.go index d26c5620..bdc21415 100644 --- a/operator/src/api/preview/documentdb_types.go +++ b/operator/src/api/preview/documentdb_types.go @@ -4,6 +4,7 @@ package preview import ( + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -64,6 +65,39 @@ type DocumentDBSpec struct { // Overrides default log level for the DocumentDB cluster. LogLevel string `json:"logLevel,omitempty"` + + // Bootstrap configures the initialization of the DocumentDB cluster. + // +optional + Bootstrap *BootstrapConfiguration `json:"bootstrap,omitempty"` + + // Backup configures backup settings for DocumentDB. + // +optional + Backup *BackupConfiguration `json:"backup,omitempty"` +} + +// BootstrapConfiguration defines how to bootstrap a DocumentDB cluster. +type BootstrapConfiguration struct { + // Recovery configures recovery from a backup. + // +optional + Recovery *RecoveryConfiguration `json:"recovery,omitempty"` +} + +// RecoveryConfiguration defines backup recovery settings. +type RecoveryConfiguration struct { + // Backup specifies the source backup to restore from. + // +optional + Backup cnpgv1.LocalObjectReference `json:"backup,omitempty"` +} + +// BackupConfiguration defines backup settings for DocumentDB. +type BackupConfiguration struct { + // RetentionDays specifies how many days backups should be retained. + // If not specified, the default retention period is 30 days. 
+ // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=365 + // +kubebuilder:default=30 + // +optional + RetentionDays int `json:"retentionDays,omitempty"` } type Resource struct { diff --git a/operator/src/api/preview/scheduledbackup_funcs.go b/operator/src/api/preview/scheduledbackup_funcs.go new file mode 100644 index 00000000..5a0e388a --- /dev/null +++ b/operator/src/api/preview/scheduledbackup_funcs.go @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + "fmt" + "time" + + "github.com/robfig/cron" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// CreateBackup generates a new Backup resource for this ScheduledBackup. +// The backup name is generated with a timestamp suffix to ensure uniqueness. +func (scheduledBackup *ScheduledBackup) CreateBackup(now time.Time) *Backup { + // Generate backup name with timestamp + backupName := fmt.Sprintf("%s-%s", scheduledBackup.Name, now.Format("20060102-150405")) + + return &Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: scheduledBackup.Namespace, + Labels: map[string]string{ + "scheduledbackup": scheduledBackup.Name, + }, + }, + Spec: BackupSpec{ + Cluster: scheduledBackup.Spec.Cluster, + RetentionDays: scheduledBackup.Spec.RetentionDays, + }, + } +} + +// GetNextScheduleTime calculates the next scheduled time +func (scheduledBackup *ScheduledBackup) GetNextScheduleTime(schedule cron.Schedule, lastBackup *Backup) time.Time { + // If there is a last backup, calculate the next schedule time based on its creation time + if lastBackup != nil && lastBackup.CreationTimestamp.Time.After(time.Time{}) { + return schedule.Next(lastBackup.CreationTimestamp.Time) + } + + if scheduledBackup.Status.NextScheduledTime != nil { + return scheduledBackup.Status.NextScheduledTime.Time + } + + return schedule.Next(time.Now()) +} diff --git a/operator/src/api/preview/scheduledbackup_funcs_test.go 
b/operator/src/api/preview/scheduledbackup_funcs_test.go new file mode 100644 index 00000000..2174d5bd --- /dev/null +++ b/operator/src/api/preview/scheduledbackup_funcs_test.go @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + "reflect" + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/robfig/cron" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var _ = Describe("ScheduledBackup", func() { + + Describe("CreateBackup", func() { + It("creates a Backup with expected fields", func() { + retentionDays := 7 + sb := &ScheduledBackup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-scheduled-backup", + Namespace: "default", + }, + Spec: ScheduledBackupSpec{ + Cluster: cnpgv1.LocalObjectReference{ + Name: "test-cluster", + }, + RetentionDays: &retentionDays, + }, + } + + fixedTime := time.Date(2025, 10, 20, 15, 30, 45, 0, time.UTC) + backup := sb.CreateBackup(fixedTime) + + Expect(backup.Name).To(Equal("my-scheduled-backup-20251020-153045")) + Expect(backup.Namespace).To(Equal("default")) + Expect(backup.Labels).To(HaveKeyWithValue("scheduledbackup", "my-scheduled-backup")) + Expect(backup.Spec.Cluster.Name).To(Equal("test-cluster")) + Expect(backup.Spec.RetentionDays).ToNot(BeNil()) + Expect(*backup.Spec.RetentionDays).To(Equal(7)) + }) + + It("creates a Backup without RetentionDays when not specified", func() { + sb := &ScheduledBackup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-scheduled-backup", + Namespace: "default", + }, + Spec: ScheduledBackupSpec{ + Cluster: cnpgv1.LocalObjectReference{ + Name: "test-cluster", + }, + }, + } + + fixedTime := time.Date(2025, 10, 20, 15, 30, 45, 0, time.UTC) + backup := sb.CreateBackup(fixedTime) + + Expect(backup.Name).To(Equal("my-scheduled-backup-20251020-153045")) + Expect(backup.Namespace).To(Equal("default")) + 
Expect(backup.Labels).To(HaveKeyWithValue("scheduledbackup", "my-scheduled-backup")) + Expect(backup.Spec.Cluster.Name).To(Equal("test-cluster")) + Expect(reflect.ValueOf(backup.Spec.RetentionDays).IsNil()).To(BeTrue()) + }) + }) + + Describe("getNextScheduleTime", func() { + sb := ScheduledBackup{ + Spec: ScheduledBackupSpec{ + Schedule: "0 0 * * *", + }, + Status: ScheduledBackupStatus{ + NextScheduledTime: &metav1.Time{Time: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + }, + } + schedule, _ := cron.ParseStandard("0 0 * * *") + + It("returns next time based on the last backup", func() { + backup := &Backup{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.Time{Time: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + }, + } + + nextScheduleTime := sb.GetNextScheduleTime(schedule, backup) + Expect(nextScheduleTime).To(Equal(schedule.Next(backup.CreationTimestamp.Time))) + }) + + It("returns Status.NextScheduledTime when no backups exist", func() { + nextScheduleTime := sb.GetNextScheduleTime(schedule, nil) + Expect(nextScheduleTime).To(Equal(sb.Status.NextScheduledTime.Time)) + }) + + It("returns next time based on now when no backups exist and Status.NextScheduledTime is not set", func() { + sb := ScheduledBackup{ + Spec: ScheduledBackupSpec{ + Schedule: "0 0 * * *", + }, + } + nextScheduleTime := sb.GetNextScheduleTime(schedule, nil) + Expect(nextScheduleTime.After(time.Now())).To(BeTrue()) + }) + }) +}) diff --git a/operator/src/api/preview/scheduledbackup_types.go b/operator/src/api/preview/scheduledbackup_types.go new file mode 100644 index 00000000..13f8f683 --- /dev/null +++ b/operator/src/api/preview/scheduledbackup_types.go @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +package preview + +import ( + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ScheduledBackupSpec defines the desired state of ScheduledBackup +type ScheduledBackupSpec struct { + // Cluster specifies the DocumentDB cluster to backup. + // The cluster must exist in the same namespace as the ScheduledBackup resource. + // +kubebuilder:validation:Required + Cluster cnpgv1.LocalObjectReference `json:"cluster"` + + // Schedule defines when backups should be created using cron expression format. + // See https://pkg.go.dev/github.com/robfig/cron#hdr-CRON_Expression_Format + // +kubebuilder:validation:Required + Schedule string `json:"schedule"` + + // RetentionDays specifies how many days the backups should be retained. + // If not specified, the default retention period from the cluster's backup retention policy will be used. + // +optional + RetentionDays *int `json:"retentionDays,omitempty"` +} + +// ScheduledBackupStatus defines the observed state of ScheduledBackup +type ScheduledBackupStatus struct { + + // LastScheduledTime is the time when the last backup was scheduled by this ScheduledBackup. + // +optional + LastScheduledTime *metav1.Time `json:"lastScheduledTime,omitempty"` + + // NextScheduledTime is the time when the next backup is scheduled by this ScheduledBackup. 
+ // +optional + NextScheduledTime *metav1.Time `json:"nextScheduledTime,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Cluster",type="string",JSONPath=".spec.cluster.name" +// +kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".spec.schedule" +// +kubebuilder:printcolumn:name="Retention Days",type="integer",JSONPath=".spec.retentionDays" +type ScheduledBackup struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata"` + + Spec ScheduledBackupSpec `json:"spec"` + Status ScheduledBackupStatus `json:"status,omitempty"` +} + +// ScheduledBackupList contains a list of ScheduledBackup resources +// +kubebuilder:object:root=true +type ScheduledBackupList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ScheduledBackup `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ScheduledBackup{}, &ScheduledBackupList{}) +} diff --git a/operator/src/api/preview/suite_test.go b/operator/src/api/preview/suite_test.go new file mode 100644 index 00000000..41dbc0c3 --- /dev/null +++ b/operator/src/api/preview/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package preview + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestAPIPreview(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "API Preview Suite") +} diff --git a/operator/src/api/preview/zz_generated.deepcopy.go b/operator/src/api/preview/zz_generated.deepcopy.go index 94b244b6..410f21b5 100644 --- a/operator/src/api/preview/zz_generated.deepcopy.go +++ b/operator/src/api/preview/zz_generated.deepcopy.go @@ -8,9 +8,151 @@ package preview import ( - runtime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *Backup) DeepCopyInto(out *Backup) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Backup. +func (in *Backup) DeepCopy() *Backup { + if in == nil { + return nil + } + out := new(Backup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Backup) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackupConfiguration) DeepCopyInto(out *BackupConfiguration) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupConfiguration. +func (in *BackupConfiguration) DeepCopy() *BackupConfiguration { + if in == nil { + return nil + } + out := new(BackupConfiguration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackupList) DeepCopyInto(out *BackupList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Backup, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupList. +func (in *BackupList) DeepCopy() *BackupList { + if in == nil { + return nil + } + out := new(BackupList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *BackupList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackupSpec) DeepCopyInto(out *BackupSpec) { + *out = *in + in.Cluster.DeepCopyInto(&out.Cluster) + if in.RetentionDays != nil { + in, out := &in.RetentionDays, &out.RetentionDays + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupSpec. +func (in *BackupSpec) DeepCopy() *BackupSpec { + if in == nil { + return nil + } + out := new(BackupSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackupStatus) DeepCopyInto(out *BackupStatus) { + *out = *in + if in.StartedAt != nil { + in, out := &in.StartedAt, &out.StartedAt + *out = (*in).DeepCopy() + } + if in.StoppedAt != nil { + in, out := &in.StoppedAt, &out.StoppedAt + *out = (*in).DeepCopy() + } + if in.ExpiredAt != nil { + in, out := &in.ExpiredAt, &out.ExpiredAt + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupStatus. +func (in *BackupStatus) DeepCopy() *BackupStatus { + if in == nil { + return nil + } + out := new(BackupStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BootstrapConfiguration) DeepCopyInto(out *BootstrapConfiguration) { + *out = *in + if in.Recovery != nil { + in, out := &in.Recovery, &out.Recovery + *out = new(RecoveryConfiguration) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BootstrapConfiguration. 
+func (in *BootstrapConfiguration) DeepCopy() *BootstrapConfiguration { + if in == nil { + return nil + } + out := new(BootstrapConfiguration) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClusterReplication) DeepCopyInto(out *ClusterReplication) { *out = *in @@ -101,6 +243,16 @@ func (in *DocumentDBSpec) DeepCopyInto(out *DocumentDBSpec) { } out.ExposeViaService = in.ExposeViaService out.Timeouts = in.Timeouts + if in.Bootstrap != nil { + in, out := &in.Bootstrap, &out.Bootstrap + *out = new(BootstrapConfiguration) + (*in).DeepCopyInto(*out) + } + if in.Backup != nil { + in, out := &in.Backup, &out.Backup + *out = new(BackupConfiguration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DocumentDBSpec. @@ -143,6 +295,22 @@ func (in *ExposeViaService) DeepCopy() *ExposeViaService { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RecoveryConfiguration) DeepCopyInto(out *RecoveryConfiguration) { + *out = *in + in.Backup.DeepCopyInto(&out.Backup) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecoveryConfiguration. +func (in *RecoveryConfiguration) DeepCopy() *RecoveryConfiguration { + if in == nil { + return nil + } + out := new(RecoveryConfiguration) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Resource) DeepCopyInto(out *Resource) { *out = *in @@ -159,6 +327,109 @@ func (in *Resource) DeepCopy() *Resource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ScheduledBackup) DeepCopyInto(out *ScheduledBackup) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledBackup. +func (in *ScheduledBackup) DeepCopy() *ScheduledBackup { + if in == nil { + return nil + } + out := new(ScheduledBackup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledBackup) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledBackupList) DeepCopyInto(out *ScheduledBackupList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ScheduledBackup, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledBackupList. +func (in *ScheduledBackupList) DeepCopy() *ScheduledBackupList { + if in == nil { + return nil + } + out := new(ScheduledBackupList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledBackupList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ScheduledBackupSpec) DeepCopyInto(out *ScheduledBackupSpec) { + *out = *in + in.Cluster.DeepCopyInto(&out.Cluster) + if in.RetentionDays != nil { + in, out := &in.RetentionDays, &out.RetentionDays + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledBackupSpec. +func (in *ScheduledBackupSpec) DeepCopy() *ScheduledBackupSpec { + if in == nil { + return nil + } + out := new(ScheduledBackupSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledBackupStatus) DeepCopyInto(out *ScheduledBackupStatus) { + *out = *in + if in.LastScheduledTime != nil { + in, out := &in.LastScheduledTime, &out.LastScheduledTime + *out = (*in).DeepCopy() + } + if in.NextScheduledTime != nil { + in, out := &in.NextScheduledTime, &out.NextScheduledTime + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledBackupStatus. +func (in *ScheduledBackupStatus) DeepCopy() *ScheduledBackupStatus { + if in == nil { + return nil + } + out := new(ScheduledBackupStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *StorageConfiguration) DeepCopyInto(out *StorageConfiguration) { *out = *in diff --git a/operator/src/cmd/main.go b/operator/src/cmd/main.go index 661d5bd9..966527f0 100644 --- a/operator/src/cmd/main.go +++ b/operator/src/cmd/main.go @@ -200,6 +200,25 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "DocumentDB") os.Exit(1) } + + if err = (&controller.BackupReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("backup-controller"), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Backup") + os.Exit(1) + } + + if err = (&controller.ScheduledBackupReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("scheduled-backup-controller"), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ScheduledBackup") + os.Exit(1) + } + // +kubebuilder:scaffold:builder if metricsCertWatcher != nil { diff --git a/operator/src/config/crd/bases/db.microsoft.com_backups.yaml b/operator/src/config/crd/bases/db.microsoft.com_backups.yaml new file mode 100644 index 00000000..bc9fc524 --- /dev/null +++ b/operator/src/config/crd/bases/db.microsoft.com_backups.yaml @@ -0,0 +1,115 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: backups.db.microsoft.com +spec: + group: db.microsoft.com + names: + kind: Backup + listKind: BackupList + plural: backups + singular: backup + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Target DocumentDB cluster + jsonPath: .spec.cluster.name + name: Cluster + type: string + - description: Backup phase + jsonPath: .status.phase + name: Phase + type: string + - description: Backup start time + jsonPath: .status.startedAt + name: StartedAt + type: string + - description: Backup 
completion time + jsonPath: .status.stoppedAt + name: StoppedAt + type: string + - description: Backup expiration time + jsonPath: .status.expiredAt + name: ExpiredAt + type: string + - description: Backup error information + jsonPath: .status.error + name: Error + type: string + name: preview + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BackupSpec defines the desired state of Backup. + properties: + cluster: + description: |- + Cluster specifies the DocumentDB cluster to backup. + The cluster must exist in the same namespace as the Backup resource. + properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + retentionDays: + description: |- + RetentionDays specifies how many days the backup should be retained. + If not specified, the default retention period from the cluster's backup retention policy will be used. + type: integer + required: + - cluster + type: object + x-kubernetes-validations: + - message: BackupSpec is immutable once set + rule: oldSelf == self + status: + description: BackupStatus defines the observed state of Backup. + properties: + error: + description: Error contains error information if the backup failed. 
+ type: string + expiredAt: + description: ExpiredAt is the time when the backup is considered expired + and can be deleted. + format: date-time + type: string + phase: + description: Phase represents the current phase of the backup operation. + type: string + startedAt: + description: StartedAt is the time when the backup operation started. + format: date-time + type: string + stoppedAt: + description: StoppedAt is the time when the backup operation completed. + format: date-time + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml b/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml index a2a92fdc..db773b8c 100644 --- a/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml +++ b/operator/src/config/crd/bases/db.microsoft.com_documentdbs.yaml @@ -48,6 +48,37 @@ spec: spec: description: DocumentDBSpec defines the desired state of DocumentDB. properties: + backup: + description: Backup configures backup settings for DocumentDB. + properties: + retentionDays: + default: 30 + description: |- + RetentionDays specifies how many days backups should be retained. + If not specified, the default retention period is 30 days. + maximum: 365 + minimum: 1 + type: integer + type: object + bootstrap: + description: Bootstrap configures the initialization of the DocumentDB + cluster. + properties: + recovery: + description: Recovery configures recovery from a backup. + properties: + backup: + description: Backup specifies the source backup to restore + from. + properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + type: object + type: object clusterReplication: description: ClusterReplication configures cross-cluster replication for DocumentDB. 
diff --git a/operator/src/config/crd/bases/db.microsoft.com_scheduledbackups.yaml b/operator/src/config/crd/bases/db.microsoft.com_scheduledbackups.yaml new file mode 100644 index 00000000..98052439 --- /dev/null +++ b/operator/src/config/crd/bases/db.microsoft.com_scheduledbackups.yaml @@ -0,0 +1,97 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: scheduledbackups.db.microsoft.com +spec: + group: db.microsoft.com + names: + kind: ScheduledBackup + listKind: ScheduledBackupList + plural: scheduledbackups + singular: scheduledbackup + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.cluster.name + name: Cluster + type: string + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .spec.retentionDays + name: Retention Days + type: integer + name: preview + schema: + openAPIV3Schema: + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledBackupSpec defines the desired state of ScheduledBackup + properties: + cluster: + description: |- + Cluster specifies the DocumentDB cluster to backup. + The cluster must exist in the same namespace as the ScheduledBackup resource. 
+ properties: + name: + description: Name of the referent. + type: string + required: + - name + type: object + retentionDays: + description: |- + RetentionDays specifies how many days the backups should be retained. + If not specified, the default retention period from the cluster's backup retention policy will be used. + type: integer + schedule: + description: |- + Schedule defines when backups should be created using cron expression format. + See https://pkg.go.dev/github.com/robfig/cron#hdr-CRON_Expression_Format + type: string + required: + - cluster + - schedule + type: object + status: + description: ScheduledBackupStatus defines the observed state of ScheduledBackup + properties: + lastScheduledTime: + description: LastScheduledTime is the time when the last backup was + scheduled by this ScheduledBackup. + format: date-time + type: string + nextScheduledTime: + description: NextScheduledTime is the time when the next backup is + scheduled by this ScheduledBackup. + format: date-time + type: string + type: object + required: + - metadata + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/operator/src/go.mod b/operator/src/go.mod index 9e13850c..ec4b2a1f 100644 --- a/operator/src/go.mod +++ b/operator/src/go.mod @@ -8,6 +8,8 @@ require ( github.com/cloudnative-pg/cloudnative-pg v1.25.1 github.com/cloudnative-pg/machinery v0.1.0 github.com/go-logr/logr v1.4.2 + github.com/onsi/ginkgo/v2 v2.22.2 + github.com/onsi/gomega v1.36.2 go.goms.io/fleet-networking v0.3.0 k8s.io/api v0.32.2 k8s.io/apimachinery v0.32.2 @@ -35,6 +37,7 @@ require ( github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect @@ -42,6 +45,7 @@ require ( 
github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect @@ -49,7 +53,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.11 // indirect - github.com/kubernetes-csi/external-snapshotter/client/v8 v8.2.0 // indirect + github.com/kubernetes-csi/external-snapshotter/client/v8 v8.2.0 github.com/lib/pq v1.10.9 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/moby/spdystream v0.5.0 // indirect @@ -63,6 +67,7 @@ require ( github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect + github.com/robfig/cron v1.2.0 github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/pflag v1.0.6 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect @@ -85,6 +90,7 @@ require ( golang.org/x/term v0.30.0 // indirect golang.org/x/text v0.23.0 // indirect golang.org/x/time v0.9.0 // indirect + golang.org/x/tools v0.28.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a // indirect diff --git a/operator/src/go.sum b/operator/src/go.sum index 2051bf8a..5c6ccbbb 100644 --- a/operator/src/go.sum +++ b/operator/src/go.sum @@ -128,6 +128,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= +github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/operator/src/internal/cnpg/cnpg_cluster.go b/operator/src/internal/cnpg/cnpg_cluster.go index 39fa73b5..3b8917cb 100644 --- a/operator/src/internal/cnpg/cnpg_cluster.go +++ b/operator/src/internal/cnpg/cnpg_cluster.go @@ -84,8 +84,14 @@ func GetCnpgClusterSpec(req ctrl.Request, documentdb *dbpreview.DocumentDB, docu "host replication all all trust", }, }, - Bootstrap: getBootstrapConfiguration(), + Bootstrap: getBootstrapConfiguration(documentdb, log), LogLevel: cmp.Or(documentdb.Spec.LogLevel, "info"), + Backup: &cnpgv1.BackupConfiguration{ + VolumeSnapshot: &cnpgv1.VolumeSnapshotConfiguration{ + SnapshotOwnerReference: "backup", // Set owner reference to 'backup' so that snapshots are deleted when Backup resource is deleted + }, + Target: cnpgv1.BackupTarget("primary"), + }, } spec.MaxStopDelay = getMaxStopDelayOrDefault(documentdb) return spec @@ -102,7 +108,19 @@ func getInheritedMetadataLabels(appName string) *cnpgv1.EmbeddedObjectMetadata { } } -func getBootstrapConfiguration() *cnpgv1.BootstrapConfiguration { +func getBootstrapConfiguration(documentdb *dbpreview.DocumentDB, log logr.Logger) *cnpgv1.BootstrapConfiguration { + if documentdb.Spec.Bootstrap != nil && documentdb.Spec.Bootstrap.Recovery != nil && documentdb.Spec.Bootstrap.Recovery.Backup.Name != "" { + backupName := documentdb.Spec.Bootstrap.Recovery.Backup.Name + log.Info("DocumentDB cluster will be bootstrapped from backup", "backupName", backupName) + 
return &cnpgv1.BootstrapConfiguration{ + Recovery: &cnpgv1.BootstrapRecovery{ + Backup: &cnpgv1.BackupSource{ + LocalObjectReference: cnpgv1.LocalObjectReference{Name: backupName}, + }, + }, + } + } + return &cnpgv1.BootstrapConfiguration{ InitDB: &cnpgv1.BootstrapInitDB{ PostInitSQL: []string{ diff --git a/operator/src/internal/controller/backup_controller.go b/operator/src/internal/controller/backup_controller.go new file mode 100644 index 00000000..87a4045e --- /dev/null +++ b/operator/src/internal/controller/backup_controller.go @@ -0,0 +1,227 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package controller + +import ( + "context" + "fmt" + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + dbpreview "github.com/microsoft/documentdb-operator/api/preview" +) + +// BackupReconciler reconciles a Backup object +type BackupReconciler struct { + client.Client + Scheme *runtime.Scheme + Recorder record.EventRecorder +} + +// Reconcile handles the reconciliation loop for Backup resources. 
+func (r *BackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Fetch the Backup resource + backup := &dbpreview.Backup{} + if err := r.Get(ctx, req.NamespacedName, backup); err != nil { + if apierrors.IsNotFound(err) { + logger.Info("Backup resource not found, might have been deleted") + return ctrl.Result{}, nil + } + logger.Error(err, "Failed to get Backup") + return ctrl.Result{}, err + } + + // Delete the Backup resource if it has expired + if backup.Status.IsExpired() { + r.Recorder.Event(backup, "Normal", "BackupExpired", "Backup has expired and will be deleted") + if err := r.Delete(ctx, backup); err != nil { + r.Recorder.Event(backup, "Warning", "BackupDeleteFailed", "Failed to delete expired Backup: "+err.Error()) + return ctrl.Result{}, err + } + r.Recorder.Event(backup, "Normal", "BackupDeleted", "Expired Backup has been deleted") + return ctrl.Result{}, nil + } + + // No further action needed for completed backups + if backup.Status.IsDone() { + return ctrl.Result{}, nil + } + + // Fetch the associated DocumentDB cluster + cluster := &dbpreview.DocumentDB{} + clusterKey := client.ObjectKey{ + Name: backup.Spec.Cluster.Name, + Namespace: backup.Namespace, + } + if err := r.Get(ctx, clusterKey, cluster); err != nil { + return r.SetBackupPhaseFailed(ctx, backup, "Failed to get associated DocumentDB cluster: "+err.Error(), nil) + } + + // Ensure VolumeSnapshotClass exists + if err := r.ensureVolumeSnapshotClass(ctx, cluster.Spec.Environment); err != nil { + return r.SetBackupPhaseFailed(ctx, backup, "Failed to ensure VolumeSnapshotClass: "+err.Error(), cluster.Spec.Backup) + } + + // Get or create the CNPG Backup + cnpgBackup := &cnpgv1.Backup{} + cnpgBackupKey := client.ObjectKey{ + Name: backup.Name, + Namespace: backup.Namespace, + } + if err := r.Get(ctx, cnpgBackupKey, cnpgBackup); err != nil { + if apierrors.IsNotFound(err) { + return r.createCNPGBackup(ctx, backup, 
cluster.Spec.Backup) + } + logger.Error(err, "Failed to get CNPG Backup") + return ctrl.Result{}, err + } + + // Update status based on CNPG Backup status + return r.updateBackupStatus(ctx, backup, cnpgBackup, cluster.Spec.Backup) +} + +// ensureVolumeSnapshotClass creates a VolumeSnapshotClass based on the cloud environment +func (r *BackupReconciler) ensureVolumeSnapshotClass(ctx context.Context, environment string) error { + logger := log.FromContext(ctx) + + // Check if any VolumeSnapshotClass exists + vscList := &snapshotv1.VolumeSnapshotClassList{} + if err := r.List(ctx, vscList); err != nil { + logger.Error(err, "Failed to list VolumeSnapshotClasses") + return err + } + + for _, vsc := range vscList.Items { + if val, ok := vsc.Annotations["snapshot.storage.kubernetes.io/is-default-class"]; ok && val == "true" { + return nil + } + } + + r.Recorder.Event(nil, "Normal", "VolumeSnapshotClass", "No default VolumeSnapshotClass found, creating one") + vsc := buildVolumeSnapshotClass(environment) + if vsc == nil { + err := fmt.Errorf("Please create a default VolumeSnapshotClass before creating backups") + logger.Error(err, "Failed to build VolumeSnapshotClass", "environment", environment) + return err + } + + if err := r.Create(ctx, vsc); err != nil { + logger.Error(err, "Failed to create VolumeSnapshotClass") + return err + } + + r.Recorder.Event(nil, "Normal", "VolumeSnapshotClass", "Successfully created VolumeSnapshotClass "+vsc.Name) + return nil +} + +// buildVolumeSnapshotClass builds a VolumeSnapshotClass based on cloud provider +func buildVolumeSnapshotClass(environment string) *snapshotv1.VolumeSnapshotClass { + deletionPolicy := snapshotv1.VolumeSnapshotContentDelete + + var driver string + var name string + + switch environment { + case "aks": + driver = "disk.csi.azure.com" + name = "azure-disk-snapclass" + default: + // TODO: add support for other cloud providers + return nil + } + + return &snapshotv1.VolumeSnapshotClass{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: name, + Annotations: map[string]string{ + "snapshot.storage.kubernetes.io/is-default-class": "true", + }, + }, + Driver: driver, + DeletionPolicy: deletionPolicy, + } +} + +// createCNPGBackup creates a new CNPG Backup resource +func (r *BackupReconciler) createCNPGBackup(ctx context.Context, backup *dbpreview.Backup, backupConfiguration *dbpreview.BackupConfiguration) (ctrl.Result, error) { + cnpgBackup, err := backup.CreateCNPGBackup(r.Scheme) + if err != nil { + return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), backupConfiguration) + } + + if err := r.Create(ctx, cnpgBackup); err != nil { + return r.SetBackupPhaseFailed(ctx, backup, "Failed to initialize backup: "+err.Error(), backupConfiguration) + } + + r.Recorder.Event(backup, "Normal", "BackupInitialized", "Successfully initialized backup") + + // Requeue to check status + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil +} + +// updateBackupStatus updates the Backup status based on CNPG Backup status +func (r *BackupReconciler) updateBackupStatus(ctx context.Context, backup *dbpreview.Backup, cnpgBackup *cnpgv1.Backup, backupConfiguration *dbpreview.BackupConfiguration) (ctrl.Result, error) { + original := backup.DeepCopy() + needsUpdate := backup.UpdateStatus(cnpgBackup, backupConfiguration) + + if needsUpdate { + if err := r.Status().Patch(ctx, backup, client.MergeFrom(original)); err != nil { + logger := log.FromContext(ctx) + logger.Error(err, "Failed to patch Backup status") + return ctrl.Result{}, err + } + } + + if backup.Status.IsDone() && backup.Status.ExpiredAt != nil { + requeueAfter := time.Until(backup.Status.ExpiredAt.Time) + if requeueAfter < 0 { + requeueAfter = time.Minute + } + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + // Backup is still in progress, requeue to check status again + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil +} + +func (r *BackupReconciler) SetBackupPhaseFailed(ctx 
context.Context, backup *dbpreview.Backup, errMessage string, backupConfiguration *dbpreview.BackupConfiguration) (ctrl.Result, error) { + original := backup.DeepCopy() + + backup.Status.Phase = cnpgv1.BackupPhaseFailed + backup.Status.Error = errMessage + backup.Status.ExpiredAt = backup.CalculateExpirationTime(backupConfiguration) + + if err := r.Status().Patch(ctx, backup, client.MergeFrom(original)); err != nil { + logger := log.FromContext(ctx) + logger.Error(err, "Failed to patch Backup status") + return ctrl.Result{}, err + } + + r.Recorder.Event(backup, "Warning", "BackupFailed", errMessage) + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *BackupReconciler) SetupWithManager(mgr ctrl.Manager) error { + // Register VolumeSnapshotClass with the scheme + if err := snapshotv1.AddToScheme(mgr.GetScheme()); err != nil { + return err + } + + return ctrl.NewControllerManagedBy(mgr). + For(&dbpreview.Backup{}). + Owns(&cnpgv1.Backup{}). + Complete(r) +} diff --git a/operator/src/internal/controller/backup_controller_test.go b/operator/src/internal/controller/backup_controller_test.go new file mode 100644 index 00000000..f5479bfb --- /dev/null +++ b/operator/src/internal/controller/backup_controller_test.go @@ -0,0 +1,254 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package controller + +import ( + "context" + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + dbpreview "github.com/microsoft/documentdb-operator/api/preview" +) + +var _ = Describe("Backup Controller", func() { + const ( + backupName = "test-backup" + backupNamespace = "default" + clusterName = "test-cluster" + ) + + var ( + ctx context.Context + scheme *runtime.Scheme + recorder record.EventRecorder + ) + + BeforeEach(func() { + ctx = context.Background() + scheme = runtime.NewScheme() + recorder = record.NewFakeRecorder(10) + // register both preview and CNPG types used by the controller + Expect(dbpreview.AddToScheme(scheme)).To(Succeed()) + Expect(cnpgv1.AddToScheme(scheme)).To(Succeed()) + }) + + Describe("createCNPGBackup", func() { + It("creates a CNPG Backup with expected spec and owner reference and requeues", func() { + // fake client + reconciler + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + reconciler := &BackupReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + } + + // input dbpreview Backup + backup := &dbpreview.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Spec: dbpreview.BackupSpec{ + Cluster: cnpgv1.LocalObjectReference{Name: clusterName}, + }, + } + + // Call under test + res, err := reconciler.createCNPGBackup(ctx, backup, nil) + Expect(err).ToNot(HaveOccurred()) + // controller uses a 5s requeue + Expect(res.RequeueAfter).To(Equal(5 * time.Second)) + + // Verify only one CNPG Backup exists in the fake client + cnpgBackupList := &cnpgv1.BackupList{} + Expect(fakeClient.List(ctx, cnpgBackupList)).To(Succeed()) + Expect(len(cnpgBackupList.Items)).To(Equal(1)) + cnpgBackup := &cnpgBackupList.Items[0] + + Expect(cnpgBackup.Name).To(Equal(backupName)) + Expect(cnpgBackup.Namespace).To(Equal(backupNamespace)) + + // Check spec fields + Expect(cnpgBackup.Spec.Method).To(Equal(cnpgv1.BackupMethodVolumeSnapshot)) + Expect(cnpgBackup.Spec.Cluster.Name).To(Equal(clusterName)) + + // Owner reference should reference the dbpreview Backup (by name) + Expect(len(cnpgBackup.OwnerReferences)).To(Equal(1)) + ownerReference := cnpgBackup.OwnerReferences[0] + Expect(ownerReference.Name).To(Equal(backup.Name)) + Expect(ownerReference.Controller).ToNot(BeNil()) + Expect(*ownerReference.Controller).To(BeTrue()) + }) + }) + + Describe("updateBackupStatus", func() { + It("requeues until expiration time when CNPG Backup phase is Completed", func() { + backup := &dbpreview.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Spec: dbpreview.BackupSpec{ + Cluster: cnpgv1.LocalObjectReference{Name: clusterName}, + }, + Status: dbpreview.BackupStatus{ + Phase: cnpgv1.BackupPhasePending, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(backup). + WithStatusSubresource(&dbpreview.Backup{}). 
+ Build() + + reconciler := &BackupReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + } + + now := time.Now().UTC() + cnpgBackup := &cnpgv1.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Status: cnpgv1.BackupStatus{ + Phase: cnpgv1.BackupPhaseCompleted, + StartedAt: &metav1.Time{Time: now.Add(-time.Minute)}, + StoppedAt: &metav1.Time{Time: now}, + }, + } + + res, err := reconciler.updateBackupStatus(ctx, backup, cnpgBackup, nil) + Expect(err).ToNot(HaveOccurred()) + Expect(res.RequeueAfter).NotTo(BeZero()) + + // Verify status was updated with times + updated := &dbpreview.Backup{} + Expect(fakeClient.Get(ctx, client.ObjectKey{Name: backupName, Namespace: backupNamespace}, updated)).To(Succeed()) + Expect(string(updated.Status.Phase)).To(Equal(string(cnpgv1.BackupPhaseCompleted))) + Expect(updated.Status.StartedAt).ToNot(BeNil()) + Expect(updated.Status.StoppedAt).ToNot(BeNil()) + Expect(updated.Status.StartedAt.Time.Unix()).To(Equal(cnpgBackup.Status.StartedAt.Time.Unix())) + Expect(updated.Status.StoppedAt.Time.Unix()).To(Equal(cnpgBackup.Status.StoppedAt.Time.Unix())) + }) + + It("stops reconciling (returns zero result) when CNPG Backup phase is Failed", func() { + backup := &dbpreview.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Spec: dbpreview.BackupSpec{ + Cluster: cnpgv1.LocalObjectReference{Name: clusterName}, + }, + Status: dbpreview.BackupStatus{ + Phase: cnpgv1.BackupPhaseStarted, + StartedAt: &metav1.Time{Time: time.Now().UTC().Add(-5 * time.Minute)}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(backup). + WithStatusSubresource(&dbpreview.Backup{}). 
+ Build() + + reconciler := &BackupReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + } + + startTime := time.Now().UTC().Add(-10 * time.Minute) + stopTime := time.Now().UTC() + cnpgBackup := &cnpgv1.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Status: cnpgv1.BackupStatus{ + Phase: cnpgv1.BackupPhaseFailed, + StartedAt: &metav1.Time{Time: startTime}, + StoppedAt: &metav1.Time{Time: stopTime}, + Error: "connection timeout", + }, + } + + res, err := reconciler.updateBackupStatus(ctx, backup, cnpgBackup, nil) + Expect(err).ToNot(HaveOccurred()) + Expect(res.RequeueAfter).To(BeZero()) + + // Verify status was updated with error + updated := &dbpreview.Backup{} + Expect(fakeClient.Get(ctx, client.ObjectKey{Name: backupName, Namespace: backupNamespace}, updated)).To(Succeed()) + Expect(string(updated.Status.Phase)).To(Equal(string(cnpgv1.BackupPhaseFailed))) + Expect(updated.Status.Error).To(Equal("connection timeout")) + Expect(updated.Status.StartedAt).ToNot(BeNil()) + Expect(updated.Status.StoppedAt).ToNot(BeNil()) + Expect(updated.Status.StartedAt.Time.Unix()).To(Equal(startTime.Unix())) + Expect(updated.Status.StoppedAt.Time.Unix()).To(Equal(stopTime.Unix())) + }) + + It("does not update status when phase hasn't changed", func() { + backup := &dbpreview.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Spec: dbpreview.BackupSpec{ + Cluster: cnpgv1.LocalObjectReference{Name: clusterName}, + }, + Status: dbpreview.BackupStatus{ + Phase: cnpgv1.BackupPhaseRunning, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(backup). + WithStatusSubresource(&dbpreview.Backup{}). 
+ Build() + + reconciler := &BackupReconciler{ + Client: fakeClient, + Scheme: scheme, + } + + // CNPG Backup has same phase + cnpgBackup := &cnpgv1.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: backupName, + Namespace: backupNamespace, + }, + Status: cnpgv1.BackupStatus{ + Phase: cnpgv1.BackupPhaseRunning, + }, + } + + res, err := reconciler.updateBackupStatus(ctx, backup, cnpgBackup, nil) + Expect(err).ToNot(HaveOccurred()) + // Still in progress, requeue + Expect(res.RequeueAfter).To(Equal(10 * time.Second)) + + // Phase should remain unchanged + updated := &dbpreview.Backup{} + Expect(fakeClient.Get(ctx, client.ObjectKey{Name: backupName, Namespace: backupNamespace}, updated)).To(Succeed()) + Expect(string(updated.Status.Phase)).To(Equal(string(cnpgv1.BackupPhaseRunning))) + }) + }) +}) diff --git a/operator/src/internal/controller/scheduledbackup_controller.go b/operator/src/internal/controller/scheduledbackup_controller.go new file mode 100644 index 00000000..6e37cc64 --- /dev/null +++ b/operator/src/internal/controller/scheduledbackup_controller.go @@ -0,0 +1,155 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package controller + +import ( + "context" + "time" + + "github.com/robfig/cron" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + + dbpreview "github.com/microsoft/documentdb-operator/api/preview" +) + +// ScheduledBackupReconciler reconciles a ScheduledBackup object +type ScheduledBackupReconciler struct { + client.Client + Scheme *runtime.Scheme + Recorder record.EventRecorder +} + +// Reconcile handles the reconciliation loop for ScheduledBackup resources. 
+func (r *ScheduledBackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Fetch the ScheduledBackup resource + scheduledBackup := &dbpreview.ScheduledBackup{} + if err := r.Get(ctx, req.NamespacedName, scheduledBackup); err != nil { + if apierrors.IsNotFound(err) { + logger.Info("ScheduledBackup resource not found, might have been deleted") + return ctrl.Result{}, nil + } + logger.Error(err, "Failed to get ScheduledBackup") + return ctrl.Result{}, err + } + + // Ensure ScheduledBackup is owned by the referenced Cluster so it's garbage collected when the Cluster is deleted. + err := r.ensureOwnerReference(ctx, scheduledBackup) + if err != nil { + logger.Error(err, "Failed to ensure owner reference on ScheduledBackup") + return ctrl.Result{}, err + } + + // Parse cron schedule + schedule, err := cron.ParseStandard(scheduledBackup.Spec.Schedule) + if err != nil { + logger.Error(err, "Failed to parse schedule", "schedule", scheduledBackup.Spec.Schedule) + r.Recorder.Event(scheduledBackup, "Warning", "InvalidSchedule", "Failed to parse schedule: "+err.Error()) + return ctrl.Result{}, err + } + + // If there is an ongoing backup, wait for it to finish before starting a new one + backupList := &dbpreview.BackupList{} + if err := r.List(ctx, backupList, client.InNamespace(scheduledBackup.Namespace), client.MatchingFields{"spec.cluster": scheduledBackup.Spec.Cluster.Name}); err != nil { + logger.Error(err, "Failed to list backups") + return ctrl.Result{}, err + } + + if backupList.IsBackupRunning() { + // If a backup is currently running, requeue after a short delay + logger.Info("A backup is currently running, requeuing") + return ctrl.Result{RequeueAfter: time.Minute}, nil + } + + // If it's time to create a backup + nextScheduleTime := scheduledBackup.GetNextScheduleTime(schedule, backupList.GetLastBackup()) + now := time.Now() + if !now.Before(nextScheduleTime) { + backup := 
scheduledBackup.CreateBackup(now) + logger.Info("Creating new backup", "backupName", backup.Name) + if err := r.Create(ctx, backup); err != nil { + logger.Error(err, "Failed to create backup", "backupName", backup.Name) + r.Recorder.Event(scheduledBackup, "Warning", "BackupCreation", "Failed to create backup: "+err.Error()) + return ctrl.Result{}, err + } + + scheduledBackup.Status.LastScheduledTime = &metav1.Time{Time: now} + + // Calculate next run time + nextScheduleTime = schedule.Next(now) + } + + scheduledBackup.Status.NextScheduledTime = &metav1.Time{Time: nextScheduleTime} + if err := r.Status().Update(ctx, scheduledBackup); err != nil { + logger.Error(err, "Failed to update ScheduledBackup status with next scheduled time") + return ctrl.Result{}, err + } + + logger.Info("Next backup scheduled", "time", nextScheduleTime) + r.Recorder.Event(scheduledBackup, "Normal", "BackupSchedule", "Next backup scheduled at: "+nextScheduleTime.String()) + + // Requeue at next schedule time + requeueAfter := time.Until(nextScheduleTime) + if requeueAfter < 0 { + requeueAfter = time.Minute + } + return ctrl.Result{RequeueAfter: requeueAfter}, nil +} + +func (r *ScheduledBackupReconciler) ensureOwnerReference(ctx context.Context, scheduledBackup *dbpreview.ScheduledBackup) error { + if len(scheduledBackup.OwnerReferences) > 0 { + // Owner reference already set + return nil + } + + // Fetch the associated DocumentDB cluster + cluster := &dbpreview.DocumentDB{} + clusterKey := client.ObjectKey{ + Name: scheduledBackup.Spec.Cluster.Name, + Namespace: scheduledBackup.Namespace, + } + if err := r.Get(ctx, clusterKey, cluster); err != nil { + r.Recorder.Event(scheduledBackup, "Warning", "ClusterNotFound", "Failed to find associated DocumentDB cluster: "+err.Error()) + return err + } + + // Set owner reference + if err := controllerutil.SetControllerReference(cluster, scheduledBackup, r.Scheme); err != nil { + logger := log.FromContext(ctx) + logger.Error(err, "Failed to set owner 
reference on ScheduledBackup") + return err + } + + // Update the ScheduledBackup with the new owner reference + if err := r.Update(ctx, scheduledBackup); err != nil { + logger := log.FromContext(ctx) + logger.Error(err, "Failed to update ScheduledBackup with owner reference") + return err + } + + return nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *ScheduledBackupReconciler) SetupWithManager(mgr ctrl.Manager) error { + // Register field index for spec.cluster so we can query Backups by cluster name + if err := mgr.GetFieldIndexer().IndexField(context.Background(), &dbpreview.Backup{}, "spec.cluster", func(rawObj client.Object) []string { + backup := rawObj.(*dbpreview.Backup) + return []string{backup.Spec.Cluster.Name} + }); err != nil { + return err + } + + return ctrl.NewControllerManagedBy(mgr). + For(&dbpreview.ScheduledBackup{}). + Complete(r) +} diff --git a/operator/src/internal/controller/scheduledbackup_controller_test.go b/operator/src/internal/controller/scheduledbackup_controller_test.go new file mode 100644 index 00000000..1fedcbbf --- /dev/null +++ b/operator/src/internal/controller/scheduledbackup_controller_test.go @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package controller + +import ( + "context" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + dbpreview "github.com/microsoft/documentdb-operator/api/preview" +) + +var _ = Describe("ScheduledBackup Controller", func() { + const ( + scheduledBackupName = "test-scheduled-backup" + scheduledBackupNamespace = "default" + clusterName = "test-cluster" + ) + + var ( + ctx context.Context + scheme *runtime.Scheme + recorder record.EventRecorder + ) + + BeforeEach(func() { + ctx = context.Background() + scheme = runtime.NewScheme() + recorder = record.NewFakeRecorder(10) + Expect(dbpreview.AddToScheme(scheme)).To(Succeed()) + }) + + It("returns error for invalid cron schedule", func() { + invalidSchedule := "invalid cron expression" + cluster := &dbpreview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: scheduledBackupNamespace, + }, + } + scheduledBackup := &dbpreview.ScheduledBackup{ + ObjectMeta: metav1.ObjectMeta{ + Name: scheduledBackupName, + Namespace: scheduledBackupNamespace, + }, + Spec: dbpreview.ScheduledBackupSpec{ + Schedule: invalidSchedule, + Cluster: cnpgv1.LocalObjectReference{ + Name: clusterName, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(scheduledBackup, cluster). 
+ Build() + + reconciler := &ScheduledBackupReconciler{ + Client: fakeClient, + Scheme: scheme, + Recorder: recorder, + } + + result, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: scheduledBackupName, + Namespace: scheduledBackupNamespace, + }, + }) + + // expect err: invalid cron expression + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid cron expression")) + Expect(result.Requeue).To(BeFalse()) + }) +}) diff --git a/operator/src/internal/controller/suite_test.go b/operator/src/internal/controller/suite_test.go new file mode 100644 index 00000000..0f9dc624 --- /dev/null +++ b/operator/src/internal/controller/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package controller + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Controllers Suite") +} diff --git a/operator/src/scripts/test-scripts/deploy-csi-driver.sh b/operator/src/scripts/test-scripts/deploy-csi-driver.sh new file mode 100755 index 00000000..be55f802 --- /dev/null +++ b/operator/src/scripts/test-scripts/deploy-csi-driver.sh @@ -0,0 +1,83 @@ +# Defaults +# renovate: datasource=github-releases depName=kubernetes-csi/csi-driver-host-path +CSI_DRIVER_HOST_PATH_DEFAULT_VERSION=v1.17.0 +# renovate: datasource=github-releases depName=kubernetes-csi/external-snapshotter +EXTERNAL_SNAPSHOTTER_VERSION=v8.4.0 +# renovate: datasource=github-releases depName=kubernetes-csi/external-provisioner +EXTERNAL_PROVISIONER_VERSION=v6.0.0 +# renovate: datasource=github-releases depName=kubernetes-csi/external-resizer +EXTERNAL_RESIZER_VERSION=v2.0.0 +# renovate: datasource=github-releases depName=kubernetes-csi/external-attacher +EXTERNAL_ATTACHER_VERSION=v4.10.0 + 
+CSI_DRIVER_HOST_PATH_VERSION=${CSI_DRIVER_HOST_PATH_VERSION:-$CSI_DRIVER_HOST_PATH_DEFAULT_VERSION} + +TEMP_DIR="$(mktemp -d)" + +# Colors (only if using a terminal) +bright= +reset= +if [ -t 1 ]; then + bright=$(tput bold 2>/dev/null || true) + reset=$(tput sgr0 2>/dev/null || true) +fi + +echo "${bright}Starting deployment of CSI driver plugin... ${reset}" +CSI_BASE_URL=https://raw.githubusercontent.com/kubernetes-csi + +## Install external snapshotter CRD +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/client/config/crd/snapshot.storage.k8s.io_volumesnapshotclasses.yaml +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/client/config/crd/snapshot.storage.k8s.io_volumesnapshotcontents.yaml +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/client/config/crd/snapshot.storage.k8s.io_volumesnapshots.yaml +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/deploy/kubernetes/snapshot-controller/rbac-snapshot-controller.yaml +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml +kubectl apply -f "${CSI_BASE_URL}"/external-snapshotter/"${EXTERNAL_SNAPSHOTTER_VERSION}"/deploy/kubernetes/csi-snapshotter/rbac-csi-snapshotter.yaml + +## Install external provisioner +kubectl apply -f "${CSI_BASE_URL}"/external-provisioner/"${EXTERNAL_PROVISIONER_VERSION}"/deploy/kubernetes/rbac.yaml + +## Install external attacher +kubectl apply -f "${CSI_BASE_URL}"/external-attacher/"${EXTERNAL_ATTACHER_VERSION}"/deploy/kubernetes/rbac.yaml + +## Install external resizer +kubectl apply -f "${CSI_BASE_URL}"/external-resizer/"${EXTERNAL_RESIZER_VERSION}"/deploy/kubernetes/rbac.yaml + +## Install driver and plugin +## Create a temporary file for the modified plugin deployment. 
This is needed +## because csi-driver-host-path plugin yaml tends to lag behind a few versions. +plugin_file="${TEMP_DIR}/csi-hostpath-plugin.yaml" +curl -sSL "${CSI_BASE_URL}/csi-driver-host-path/${CSI_DRIVER_HOST_PATH_VERSION}/deploy/kubernetes-1.30/hostpath/csi-hostpath-plugin.yaml" | + sed "s|registry.k8s.io/sig-storage/hostpathplugin:.*|registry.k8s.io/sig-storage/hostpathplugin:${CSI_DRIVER_HOST_PATH_VERSION}|g" > "${plugin_file}" + +kubectl apply -f "${CSI_BASE_URL}"/csi-driver-host-path/"${CSI_DRIVER_HOST_PATH_VERSION}"/deploy/kubernetes-1.30/hostpath/csi-hostpath-driverinfo.yaml +kubectl apply -f "${plugin_file}" +rm "${plugin_file}" + +## create volumesnapshotclass +kubectl apply -f "${CSI_BASE_URL}"/csi-driver-host-path/"${CSI_DRIVER_HOST_PATH_VERSION}"/deploy/kubernetes-1.30/hostpath/csi-hostpath-snapshotclass.yaml +kubectl patch volumesnapshotclass csi-hostpath-snapclass -p '{"metadata":{"annotations":{"snapshot.storage.kubernetes.io/is-default-class":"true"}}}' --type merge + +## Prevent VolumeSnapshot E2E tests from failing when taking a +## snapshot of a running PostgreSQL instance +kubectl patch volumesnapshotclass csi-hostpath-snapclass -p '{"parameters":{"ignoreFailedRead":"true"}}' --type merge + +## create storage class +kubectl apply -f "${CSI_BASE_URL}"/csi-driver-host-path/"${CSI_DRIVER_HOST_PATH_VERSION}"/examples/csi-storageclass.yaml +kubectl annotate storageclass csi-hostpath-sc storage.kubernetes.io/default-snapshot-class=csi-hostpath-snapclass --overwrite + +echo "${bright} CSI driver plugin deployment has started. Waiting for the CSI plugin to be ready... 
${reset}" +ITER=0 +while true; do + if [[ $ITER -ge 300 ]]; then + echo "${bright}Timeout: The CSI plugin did not become ready within the expected time.${reset}" + exit 1 + fi + NUM_SPEC=$(kubectl get statefulset csi-hostpathplugin -o jsonpath='{.spec.replicas}') + NUM_STATUS=$(kubectl get statefulset csi-hostpathplugin -o jsonpath='{.status.availableReplicas}') + if [[ "$NUM_SPEC" == "$NUM_STATUS" ]]; then + echo "${bright}Success: The CSI plugin is deployed and ready.${reset}" + break + fi + sleep 1 + ((++ITER)) +done From df496fb4692c0d0703f2f9e77403b06c7bb6dd6e Mon Sep 17 00:00:00 2001 From: wenting Date: Mon, 3 Nov 2025 15:39:00 -0500 Subject: [PATCH 2/2] fix up --- .github/actions/setup-test-environment/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index d0d84e42..660b05d7 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -276,8 +276,8 @@ runs: - name: Deploy CSI driver shell: bash run: | - chmod +x ./scripts/test-scripts/deploy-csi-driver.sh - ./scripts/test-scripts/deploy-csi-driver.sh + chmod +x ./operator/src/scripts/test-scripts/deploy-csi-driver.sh + ./operator/src/scripts/test-scripts/deploy-csi-driver.sh - name: Install cert-manager shell: bash