Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions rollback/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Ansible configuration for the rollback playbooks.
[defaults]
# Ansible run output is appended to this log file.
log_path = /opt/omnia/log/core/playbooks/rollback.log
# Temporary directory used on managed hosts.
remote_tmp = /opt/omnia/tmp/.ansible/tmp/
# SSH host key verification is turned off.
host_key_checking = false
# Maximum number of parallel worker processes.
forks = 5
# Connection timeout (seconds per the Ansible config reference).
timeout = 180
# Shell used to run commands.
executable = /bin/bash
# Colon-separated search paths; roles are shared with the upgrade and utils trees.
# NOTE(review): relative paths are presumably resolved against this file's
# directory - confirm against the Ansible configuration documentation.
roles_path = ../upgrade/roles:../utils/roles
# Custom modules and their shared helper code come from the common library tree.
library = ../common/library/modules
module_utils = ../common/library/module_utils

[persistent_connection]
# Both persistent-connection timeouts use the same 180 value as [defaults] timeout.
command_timeout = 180
connect_timeout = 180

[ssh_connection]
# Number of times a failed SSH connection attempt is retried.
retries = 3
# Reuse SSH control connections (kept for 60s) and cap each connect attempt at 60s.
ssh_args = -o ControlMaster=auto -o ControlPersist=60 -o ConnectTimeout=60
67 changes: 67 additions & 0 deletions rollback/playbooks/rollback_cloud_init_bss.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Rolls back the cloud_init_bss component and tracks its progress in the
# rollback manifest under component_status.<component_name>.
# Flow: read manifest -> skip if already 'completed' -> record 'in-progress'
# -> rollback steps (placeholder for now) -> record 'completed'.
- name: Rollback Cloud-Init and BSS parameters
  hosts: localhost
  connection: local
  gather_facts: false
  vars:
    rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
    component_name: cloud_init_bss
  tasks:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    - name: Read rollback_manifest.yml
      ansible.builtin.slurp:
        src: "{{ rollback_manifest_path }}"
      register: raw_rollback_manifest

    - name: Parse rollback manifest
      ansible.builtin.set_fact:
        rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}"

    # A component with no recorded status is treated as 'pending', so only an
    # explicit 'completed' ends the play early.
    - name: Skip if cloud_init_bss already rolled back
      ansible.builtin.meta: end_play
      when:
        - rollback_manifest.component_status[component_name] | default('pending') == 'completed'

    # recursive=true merges into an existing component_status mapping and
    # creates it when the manifest has none, matching the tolerance of the
    # skip check above (a plain nested combine fails on an undefined mapping).
    - name: Set cloud_init_bss rollback status to in-progress
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'in-progress'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'

    # TODO: Implement Cloud-Init/BSS rollback steps per ESpec §4.9/§4.11:
    #   1. Read pre-upgrade BSS state from backup files
    #   2. For each functional group: PUT /boot/v1/bootparameters with backup payload
    #   3. Set operation_type to 'reboot' for all groups
    #   4. Validate BSS entries match pre-upgrade state
    - name: Cloud-Init/BSS rollback placeholder
      ansible.builtin.debug:
        msg: "Cloud-Init/BSS rollback tasks to be implemented (restore BSS params from backup)"

    # NOTE(review): while the step above is only a placeholder, this still
    # records 'completed', so later runs skip this component - confirm that
    # is intended before the real rollback steps land.
    # The write is based on the manifest as parsed at the start of the play.
    - name: Mark cloud_init_bss rollback as completed
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'completed'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'
69 changes: 69 additions & 0 deletions rollback/playbooks/rollback_k8s.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Rolls back the k8s component and tracks its progress in the rollback
# manifest under component_status.<component_name>.
# Flow: read manifest -> skip if already 'completed' -> record 'in-progress'
# -> rollback steps (placeholder for now) -> record 'completed'.
- name: Rollback Kubernetes cluster
  hosts: localhost
  connection: local
  gather_facts: false
  vars:
    rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
    component_name: k8s
  tasks:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    - name: Read rollback_manifest.yml
      ansible.builtin.slurp:
        src: "{{ rollback_manifest_path }}"
      register: raw_rollback_manifest

    - name: Parse rollback manifest
      ansible.builtin.set_fact:
        rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}"

    # A component with no recorded status is treated as 'pending', so only an
    # explicit 'completed' ends the play early.
    - name: Skip if K8s already rolled back
      ansible.builtin.meta: end_play
      when:
        - rollback_manifest.component_status[component_name] | default('pending') == 'completed'

    # recursive=true merges into an existing component_status mapping and
    # creates it when the manifest has none, matching the tolerance of the
    # skip check above (a plain nested combine fails on an undefined mapping).
    - name: Set K8s rollback status to in-progress
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'in-progress'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'

    # TODO: Implement K8s rollback steps per ESpec §4.11.4:
    #   1. Restore etcd from pre-upgrade snapshot
    #   2. Deploy cluster with previous version image
    #   3. Restore /etc/kubernetes configs from backup
    #   4. Restart K8s services on all nodes
    #   5. Validate cluster health (nodes Ready, etcd quorum, pods running)
    #   6. Rollback BSS + cloud-init to pre-upgrade state
    - name: K8s rollback placeholder
      ansible.builtin.debug:
        msg: "K8s rollback tasks to be implemented (etcd restore, cluster redeploy, BSS rollback)"

    # NOTE(review): while the step above is only a placeholder, this still
    # records 'completed', so later runs skip this component - confirm that
    # is intended before the real rollback steps land.
    # The write is based on the manifest as parsed at the start of the play.
    - name: Mark K8s rollback as completed
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'completed'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'
76 changes: 76 additions & 0 deletions rollback/playbooks/rollback_oim.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Rolls back the oim component and tracks its progress in the rollback
# manifest under component_status.<component_name>.
# Flow: read manifest -> skip if already 'completed' -> record 'in-progress'
# -> rollback steps (placeholder for now) -> record 'completed'.
- name: Rollback OIM (includes OpenCHAMI + BuildStream)
  hosts: localhost
  connection: local
  gather_facts: false
  vars:
    rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
    component_name: oim
  tasks:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    - name: Read rollback_manifest.yml
      ansible.builtin.slurp:
        src: "{{ rollback_manifest_path }}"
      register: raw_rollback_manifest

    - name: Parse rollback manifest
      ansible.builtin.set_fact:
        rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}"

    # A component with no recorded status is treated as 'pending', so only an
    # explicit 'completed' ends the play early.
    - name: Skip if OIM already rolled back
      ansible.builtin.meta: end_play
      when:
        - rollback_manifest.component_status[component_name] | default('pending') == 'completed'

    # recursive=true merges into an existing component_status mapping and
    # creates it when the manifest has none, matching the tolerance of the
    # skip check above (a plain nested combine fails on an undefined mapping).
    - name: Set OIM rollback status to in-progress
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'in-progress'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'

    # TODO: Implement OIM rollback steps per ESpec §4.11.3:
    #   1. Stop BuildStream container (systemctl stop buildstream)
    #   2. Restore buildstream.container quadlet from backup
    #   3. systemctl daemon-reload
    #   4. Start BuildStream container (systemctl start buildstream)
    #   5. Validate container healthy
    #   6. Restore OpenCHAMI quadlet files from backup
    #   7. Restore configs_vars.yaml from backup
    #   8. Restore version.yml from backup
    #   9. Checkout previous deployment-recipes version
    #   10. Restore PostgreSQL from pg_dump backup
    #   11. systemctl daemon-reload
    #   12. Restart all OpenCHAMI services
    #   13. Validate (SMD, BSS, S3, node inventory)
    - name: OIM rollback placeholder
      ansible.builtin.debug:
        msg: "OIM rollback tasks to be implemented (BuildStream + OpenCHAMI restore)"

    # NOTE(review): while the step above is only a placeholder, this still
    # records 'completed', so later runs skip this component - confirm that
    # is intended before the real rollback steps land.
    # The write is based on the manifest as parsed at the start of the play.
    - name: Mark OIM rollback as completed
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'completed'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'
70 changes: 70 additions & 0 deletions rollback/playbooks/rollback_slurm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Rolls back the slurm component and tracks its progress in the rollback
# manifest under component_status.<component_name>.
# Flow: read manifest -> skip if already 'completed' -> record 'in-progress'
# -> rollback steps (placeholder for now) -> record 'completed'.
- name: Rollback Slurm feature updates
  hosts: localhost
  connection: local
  gather_facts: false
  vars:
    rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
    component_name: slurm
  tasks:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    - name: Read rollback_manifest.yml
      ansible.builtin.slurp:
        src: "{{ rollback_manifest_path }}"
      register: raw_rollback_manifest

    - name: Parse rollback manifest
      ansible.builtin.set_fact:
        rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}"

    # A component with no recorded status is treated as 'pending', so only an
    # explicit 'completed' ends the play early.
    - name: Skip if slurm already rolled back
      ansible.builtin.meta: end_play
      when:
        - rollback_manifest.component_status[component_name] | default('pending') == 'completed'

    # recursive=true merges into an existing component_status mapping and
    # creates it when the manifest has none, matching the tolerance of the
    # skip check above (a plain nested combine fails on an undefined mapping).
    - name: Set slurm rollback status to in-progress
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'in-progress'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'

    # TODO: Implement Slurm rollback steps per ESpec §4.11.5:
    #   1. Pre-Rollback: Backup current state (slurmdbd, configs, GRE, mounts)
    #   2. Point old image using BSS command
    #   3. Restore Slurm database, configs, GRE configs
    #   4. Rollback HPC Tools (login/compiler nodes)
    #   5. Restore mount configurations
    #   6. Validate cluster health
    #   7. Rollback BSS + cloud-init to pre-upgrade state
    - name: Slurm rollback placeholder
      ansible.builtin.debug:
        msg: "Slurm rollback tasks to be implemented (BSS repoint, DB restore, config rollback)"

    # NOTE(review): while the step above is only a placeholder, this still
    # records 'completed', so later runs skip this component - confirm that
    # is intended before the real rollback steps land.
    # The write is based on the manifest as parsed at the start of the play.
    - name: Mark slurm rollback as completed
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'completed'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'
70 changes: 70 additions & 0 deletions rollback/playbooks/rollback_telemetry.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Rolls back the telemetry component and tracks its progress in the rollback
# manifest under component_status.<component_name>.
# Flow: read manifest -> skip if already 'completed' -> record 'in-progress'
# -> rollback steps (placeholder for now) -> record 'completed'.
- name: Rollback Telemetry components
  hosts: localhost
  connection: local
  gather_facts: false
  vars:
    rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
    component_name: telemetry
  tasks:
    # slurp returns the file base64-encoded; it is decoded and parsed below.
    - name: Read rollback_manifest.yml
      ansible.builtin.slurp:
        src: "{{ rollback_manifest_path }}"
      register: raw_rollback_manifest

    - name: Parse rollback manifest
      ansible.builtin.set_fact:
        rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}"

    # A component with no recorded status is treated as 'pending', so only an
    # explicit 'completed' ends the play early.
    - name: Skip if telemetry already rolled back
      ansible.builtin.meta: end_play
      when:
        - rollback_manifest.component_status[component_name] | default('pending') == 'completed'

    # recursive=true merges into an existing component_status mapping and
    # creates it when the manifest has none, matching the tolerance of the
    # skip check above (a plain nested combine fails on an undefined mapping).
    - name: Set telemetry rollback status to in-progress
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'in-progress'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'

    # TODO: Implement telemetry rollback steps per ESpec §4.8.5:
    #   1. Helm uninstall new components (powerscale, vast, victorialogs, ufm)
    #   2. Rollback Strimzi operator + Kafka brokers to previous version
    #   3. Rollback VictoriaMetrics StatefulSet(s) to previous version
    #   4. Rollback iDRAC telemetry receiver + pump images
    #   5. Restore LDMS sampler/aggregator configs from backup
    #   6. Rolling restart LDMS pods
    #   7. Validate: all telemetry pods Running, metrics/logs flowing
    - name: Telemetry rollback placeholder
      ansible.builtin.debug:
        msg: "Telemetry rollback tasks to be implemented (Helm uninstall, component rollback)"

    # NOTE(review): while the step above is only a placeholder, this still
    # records 'completed', so later runs skip this component - confirm that
    # is intended before the real rollback steps land.
    # The write is based on the manifest as parsed at the start of the play.
    - name: Mark telemetry rollback as completed
      ansible.builtin.copy:
        content: >-
          {{ rollback_manifest
             | combine({'component_status': {component_name: 'completed'}},
                       recursive=true)
             | to_nice_yaml }}
        dest: "{{ rollback_manifest_path }}"
        mode: '0644'
Loading
Loading