diff --git a/build_stream/api/jobs/routes.py b/build_stream/api/jobs/routes.py index 8a480bf41d..5a711d1571 100644 --- a/build_stream/api/jobs/routes.py +++ b/build_stream/api/jobs/routes.py @@ -354,6 +354,7 @@ async def get_job( error_code=s.error_code, error_summary=s.error_summary, log_file_path=s.log_file_path, + result_detail=s.result_detail, ) for s in filtered_stages ] diff --git a/build_stream/api/jobs/schemas.py b/build_stream/api/jobs/schemas.py index 971a3d8dd9..3ab6f94610 100644 --- a/build_stream/api/jobs/schemas.py +++ b/build_stream/api/jobs/schemas.py @@ -84,6 +84,7 @@ class GetStageResponse(BaseModel): error_code: Optional[str] = Field(default=None, description="Error code if failed") error_summary: Optional[str] = Field(default=None, description="Error summary if failed") log_file_path: Optional[str] = Field(default=None, description="Ansible log file path on OIM host (NFS share)") + result_detail: Optional[Dict[str, Any]] = Field(default=None, description="Detailed stage results (JSONB) including log_path, test_summary, artifact_dir") class CreateJobResponse(BaseModel): diff --git a/build_stream/orchestrator/common/result_poller.py b/build_stream/orchestrator/common/result_poller.py index 0fb0336126..5aa562d4f6 100644 --- a/build_stream/orchestrator/common/result_poller.py +++ b/build_stream/orchestrator/common/result_poller.py @@ -842,7 +842,8 @@ def _build_validate_result_detail(self, result: PlaybookResult, outcome: str) -> "test_summary": result.test_summary or {"total": 0, "passed": 0, "failed": 0, "skipped": 0, "errors": 0}, "duration_seconds": result.duration_seconds, "artifact_dir": artifact_dir, - "report_path": str(Path(artifact_dir) / "test_report.html") if artifact_dir else "", + "log_path": str(Path(artifact_dir) / "molecule_output.log") if artifact_dir else "", + "report_path": str(Path(artifact_dir) / "test_report.json") if artifact_dir else "", "correlation_id": str(result.request_id), } if outcome == "FAILED": diff --git 
a/build_stream/orchestrator/validate/use_cases/validate.py b/build_stream/orchestrator/validate/use_cases/validate.py index 932ae70763..e3a99f9746 100644 --- a/build_stream/orchestrator/validate/use_cases/validate.py +++ b/build_stream/orchestrator/validate/use_cases/validate.py @@ -212,6 +212,7 @@ def _create_stage(self, command: ValidateCommand, attempt: int) -> Stage: existing_stage.error_code = None # Clear error fields from previous attempt existing_stage.error_summary = None existing_stage.ended_at = None # Clear ended_at from previous attempt + existing_stage.log_file_path = None # Clear log_file_path from previous attempt existing_stage.result_detail = None # Clear result_detail from previous attempt self._stage_repo.save(existing_stage) if hasattr(self._stage_repo, 'session'): diff --git a/build_stream/playbook-watcher/playbook_watcher_service.py b/build_stream/playbook-watcher/playbook_watcher_service.py index 7e7705b8d6..e943f16fb2 100644 --- a/build_stream/playbook-watcher/playbook_watcher_service.py +++ b/build_stream/playbook-watcher/playbook_watcher_service.py @@ -1052,44 +1052,96 @@ def execute_molecule(request_data: Dict[str, Any]) -> Dict[str, Any]: except OSError: log_secure_info("warning", "Failed to write molecule output log", job_id) - # Parse test summary from molecule_output.log (avoids stale reports from shared directory) + # Parse metadata from molecule_output.log (report_id, suites) test_summary = {"total": 0, "passed": 0, "failed": 0, "skipped": 0, "errors": 0} + report_id = None if os.path.exists(log_file_path): try: import re with open(log_file_path, 'r') as f: log_content = f.read() - # Parse summary line: "Results: 10 passed, 1 failed, 11 skipped" - results_match = re.search(r'Results:\s+(\d+)\s+passed,\s+(\d+)\s+failed,\s+(\d+)\s+skipped', log_content) - if results_match: - passed = int(results_match.group(1)) - failed = int(results_match.group(2)) - skipped = int(results_match.group(3)) - test_summary = { - "total": passed + failed 
+ skipped, - "passed": passed, - "failed": failed, - "skipped": skipped, - "errors": 0, - } + + # Extract report_id: "Report ID: 2b4ade78" + report_id_match = re.search(r'Report ID:\s+([a-f0-9]+)', log_content) + if report_id_match: + report_id = report_id_match.group(1) + + # Extract top-level Suite from header (e.g., 'Suite : build_stream') + # Strip ANSI color codes first + try: + sanitized = re.sub(r'\x1B\[[0-?]*[ -/]*[@-~]', '', log_content) + except re.error: + sanitized = log_content + header_suite_match = re.search(r'(?m)^\s*Suite\s*:\s*([\w\-.]+)', sanitized) + if header_suite_match: + test_summary["suite"] = header_suite_match.group(1) else: - # Fallback: try parsing pytest summary line: "1 failed, 10 passed, 11 skipped" - pytest_match = re.search(r'(\d+)\s+failed,\s+(\d+)\s+passed,\s+(\d+)\s+skipped', log_content) - if pytest_match: - failed = int(pytest_match.group(1)) - passed = int(pytest_match.group(2)) - skipped = int(pytest_match.group(3)) - test_summary = { - "total": passed + failed + skipped, - "passed": passed, - "failed": failed, - "skipped": skipped, - "errors": 0, - } + # Fallback: parse from 'Suite/Marker: -m ' line + marker_match = re.search(r'(?m)^\s*Suite/Marker\s*:\s*.*?-m\s+([\w\-.]+)', sanitized) + if marker_match: + test_summary["suite"] = marker_match.group(1) + except (OSError, IOError, ValueError) as e: log_secure_info("warning", f"Failed to parse molecule_output.log: {e}", job_id) + # Extract current run from shared test_report.json by report_id and save to artifact_dir + report_source_path = "/opt/omnia/automation/reports/test_report.json" + if report_id and os.path.exists(report_source_path): + try: + # Load full report from shared location + with open(report_source_path, 'r') as f: + full_report = json.load(f) + + if "servers" in full_report and "" in full_report["servers"]: + runs = full_report["servers"][""].get("runs", []) + # Find run matching report_id + current_run = None + for run in runs: + if run.get("report_id") == 
report_id: + current_run = run + break + + if current_run: + # Populate test_summary from JSON (enforce order: identifiers, duration, counts, tests) + modules = current_run.get("modules", []) + if modules: + module_info = modules[0] + scenario = module_info.get("module", "unknown") + molecule_command = module_info.get("molecule_command", "verify") + duration_seconds = module_info.get("duration_seconds", 0) + results = module_info.get("results", []) + tests = [{"name": r.get("test_name"), "status": r.get("status")} for r in results if r.get("test_name")] + test_summary["scenario"] = scenario + test_summary["molecule_command"] = molecule_command + test_summary["report_id"] = report_id + test_summary["duration_seconds"] = duration_seconds + test_summary["tests"] = tests + summary_block = current_run.get("summary", {}) + if isinstance(summary_block, dict): + test_summary["total"] = summary_block.get("total", 0) + test_summary["passed"] = summary_block.get("passed", 0) + test_summary["failed"] = summary_block.get("failed", 0) + test_summary["skipped"] = summary_block.get("skipped", 0) + test_summary["errors"] = summary_block.get("errors", 0) + log_secure_info('info', f"Test scenario: {scenario}, command: {molecule_command}, duration: {duration_seconds}s, tests: {len(tests)}, report_id: {report_id}", job_id) + + # Save filtered report to artifact_dir + filtered_report = { + "servers": { + "": { + "runs": [current_run], + "hostname": "" + } + } + } + dest_path = os.path.join(artifact_dir, "test_report.json") + with open(dest_path, 'w') as f: + json.dump(filtered_report, f, indent=2) + log_secure_info('info', f"Extracted report {report_id} to artifact directory", job_id) + except (OSError, json.JSONDecodeError) as e: + log_secure_info('warning', f"Failed to extract report: {e}", job_id) + # Determine status: if any test failed, mark as failed regardless of exit code if test_summary["failed"] > 0 or test_summary["errors"] > 0: status = "failed" @@ -1117,6 +1169,7 @@ def 
execute_molecule(request_data: Dict[str, Any]) -> Dict[str, Any]: "duration_seconds": int(duration_seconds), "test_summary": test_summary, "artifact_dir": artifact_dir, + "log_file_path": log_file_path, "started_at": started_at.isoformat(), "completed_at": completed_at.isoformat(), "timestamp": completed_at.isoformat(), @@ -1127,7 +1180,22 @@ def execute_molecule(request_data: Dict[str, Any]) -> Dict[str, Any]: if exit_code == 124: result_data["error_summary"] = f"Molecule execution timed out after {timeout_minutes} minutes" elif test_summary["failed"] > 0: - result_data["error_summary"] = f"Test failures: {test_summary['failed']} failed, {test_summary['errors']} errors" + # Parse specific test failures from molecule_output.log + failed_tests = [] + if os.path.exists(log_file_path): + try: + with open(log_file_path, 'r') as f: + log_content = f.read() + # Parse FAILED test lines: "FAILED path/to/test_file.py::test_function" + failed_matches = re.findall(r'^FAILED (.+)$', log_content, re.MULTILINE) + failed_tests = failed_matches[:5] # Include up to 5 specific failures + except (OSError, IOError): + pass + + if failed_tests: + result_data["error_summary"] = f"Test failures: {test_summary['failed']} failed. 
Failed tests: {', '.join(failed_tests)}" + else: + result_data["error_summary"] = f"Test failures: {test_summary['failed']} failed, {test_summary['errors']} errors" else: result_data["error_summary"] = f"Molecule exited with code {exit_code}" @@ -1148,6 +1216,7 @@ def execute_molecule(request_data: Dict[str, Any]) -> Dict[str, Any]: "exit_code": 124, "error_summary": f"Molecule execution timed out after {timeout_minutes} minutes", "artifact_dir": artifact_dir, + "log_file_path": os.path.join(artifact_dir, "molecule_output.log"), "started_at": started_at.isoformat(), "completed_at": completed_at.isoformat(), "duration_seconds": int(duration_seconds), @@ -1169,6 +1238,7 @@ def execute_molecule(request_data: Dict[str, Any]) -> Dict[str, Any]: "exit_code": -1, "error_summary": f"System error during molecule execution: {str(e)}", "artifact_dir": artifact_dir, + "log_file_path": os.path.join(artifact_dir, "molecule_output.log"), "started_at": started_at.isoformat(), "completed_at": completed_at.isoformat(), "duration_seconds": int(duration_seconds), diff --git a/common/library/module_utils/input_validation/schema/provision_config.json b/common/library/module_utils/input_validation/schema/provision_config.json index 79977c296c..0f154d8870 100644 --- a/common/library/module_utils/input_validation/schema/provision_config.json +++ b/common/library/module_utils/input_validation/schema/provision_config.json @@ -16,6 +16,11 @@ "description": "Default lease time for DHCP.", "pattern": "^[0-9]+$", "default": "86400" + }, + "dns_enabled": { + "type": "boolean", + "description": "Enable DNS-based hostname resolution via coresmd.", + "default": false } }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 98efc3637f..20c69cf94e 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ 
b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -1442,3 +1442,21 @@ def _ranges_overlap(range_a, range_b): return a_start <= b_end and b_start <= a_end except (ValueError, TypeError): return False + + + +def validate_dns_config(data): + """ + Validates dns_config input parameters. + + dns_config.yml only contains dns_enabled (boolean). + The cluster domain is read from OIM metadata (domain_name). + + Args: + data (dict): The dns_config dict from dns_config.yml. + + Returns: + list: Validation error messages (currently empty; schema + validation handles the dns_enabled type check). + """ + return [] diff --git a/common/library/modules/k8s_upgrade_status.py b/common/library/modules/k8s_upgrade_status.py new file mode 100644 index 0000000000..6db50b9c72 --- /dev/null +++ b/common/library/modules/k8s_upgrade_status.py @@ -0,0 +1,292 @@ +#!/usr/bin/python +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=import-error,no-name-in-module + +import os +import copy +import fcntl +import tempfile +from ansible.module_utils.basic import AnsibleModule + +try: + import yaml + HAS_YAML = True +except ImportError: + HAS_YAML = False + +""" +Ansible module to efficiently update Kubernetes upgrade status file. 
+ +This module optimizes status file updates by: +- Direct file manipulation on the remote host (no temp files on controller) +- File locking to prevent race conditions +- Atomic write operations +- Connection reuse for better performance +- Preserves YAML key order for readability +""" + +DOCUMENTATION = r''' +--- +module: k8s_upgrade_status + +short_description: Update Kubernetes upgrade status file efficiently + +version_added: "2.1.0" + +description: + - Updates the Kubernetes upgrade status YAML file on kube_vip host + - Supports both node-specific and general status updates + - Uses file locking and atomic operations for safety + - Optimized for performance with direct remote file manipulation + +options: + status_file: + description: Path to the status file on kube_vip host + required: true + type: str + kube_vip: + description: Target host where status file is stored + required: true + type: str + node_name: + description: Name of the node to update (for node-specific updates) + required: false + type: str + node_status_update: + description: Dictionary to merge into the node's status + required: false + type: dict + status_update: + description: Dictionary to merge into general status (non-node updates) + required: false + type: dict + +author: + - Dell Omnia Team +''' + +EXAMPLES = r''' +# Update node-specific status +- name: Mark kubeadm_install as in_progress + k8s_upgrade_status: + status_file: /mnt/nfs/upgrade/upgrade_status.yml + kube_vip: 192.168.1.100 + node_name: kcp1 + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "2026-05-17T12:00:00Z" + +# Update general status +- name: Mark etcd backup as completed + k8s_upgrade_status: + status_file: /mnt/nfs/upgrade/upgrade_status.yml + kube_vip: 192.168.1.100 + status_update: + etcd_backup: + status: completed + timestamp: "2026-05-17T12:00:00Z" +''' + +RETURN = r''' +changed: + description: Whether the status file was modified + type: bool + returned: always 
+merged_status: + description: The complete merged status after update + type: dict + returned: always +''' + + +def merge_dicts(base, update): + """ + Recursively merge two dictionaries. + + Args: + base: Base dictionary + update: Dictionary with updates to merge + + Returns: + Merged dictionary + """ + result = base.copy() + for key, value in update.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = merge_dicts(result[key], value) + else: + result[key] = value + return result + + +def read_status_file(file_path): + """ + Read and parse the status YAML file with file locking. + + Args: + file_path: Path to the status file + + Returns: + Parsed status dictionary, or empty dict if file doesn't exist + """ + if not os.path.exists(file_path): + return {} + + try: + with open(file_path, 'r', encoding='utf-8') as f: + # Acquire shared lock for reading + fcntl.flock(f.fileno(), fcntl.LOCK_SH) + try: + content = f.read() + if not content.strip(): + return {} + return yaml.safe_load(content) or {} + finally: + fcntl.flock(f.fileno(), fcntl.LOCK_UN) + except (OSError, IOError, yaml.YAMLError) as e: + raise IOError(f"Failed to read status file: {str(e)}") from e + + +def write_status_file(file_path, status_data): + """ + Write status data to YAML file atomically with file locking. 
+ + Args: + file_path: Path to the status file + status_data: Dictionary to write + """ + # Ensure directory exists + os.makedirs(os.path.dirname(file_path), mode=0o755, exist_ok=True) + + # Write to temporary file first + temp_fd, temp_path = tempfile.mkstemp( + dir=os.path.dirname(file_path), + prefix='.upgrade_status_', + suffix='.tmp' + ) + + try: + with os.fdopen(temp_fd, 'w', encoding='utf-8') as f: + # Acquire exclusive lock for writing + fcntl.flock(f.fileno(), fcntl.LOCK_EX) + try: + # Use safe_dump with custom settings to preserve order and readability + yaml.safe_dump( + status_data, + f, + default_flow_style=False, + sort_keys=False, + indent=2, + width=120 + ) + finally: + fcntl.flock(f.fileno(), fcntl.LOCK_UN) + + # Atomic rename + os.chmod(temp_path, 0o644) + os.rename(temp_path, file_path) + + except (OSError, IOError, yaml.YAMLError) as e: + # Clean up temp file on error + if os.path.exists(temp_path): + os.unlink(temp_path) + raise IOError(f"Failed to write status file: {str(e)}") from e + + +def run_module(): + """ + Main module execution. 
+ """ + module_args = { + 'status_file': {'type': 'str', 'required': True}, + 'kube_vip': {'type': 'str', 'required': True}, + 'node_name': {'type': 'str', 'required': False, 'default': None}, + 'node_status_update': {'type': 'dict', 'required': False, 'default': None}, + 'status_update': {'type': 'dict', 'required': False, 'default': None}, + } + + result = { + 'changed': False, + 'merged_status': {}, + } + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True, + mutually_exclusive=[ + ['node_status_update', 'status_update'] + ], + required_one_of=[ + ['node_status_update', 'status_update'] + ] + ) + + if not HAS_YAML: + module.fail_json(msg='PyYAML is required for this module') + + status_file = module.params['status_file'] + node_name = module.params['node_name'] + node_status_update = module.params['node_status_update'] + status_update = module.params['status_update'] + + # Validate node-specific update has node_name + if node_status_update and not node_name: + module.fail_json(msg='node_name is required when node_status_update is provided') + + try: + # Read current status + current_status = read_status_file(status_file) + + # Deep-copy before mutation so the original is preserved for + # the changed-detection comparison below. 
+ original_status = copy.deepcopy(current_status) + + # Build merged status + if node_status_update: + # Node-specific update + nodes = copy.deepcopy(current_status.get('nodes', {})) + node_data = nodes.get(node_name, {}) + updated_node_data = merge_dicts(node_data, node_status_update) + nodes[node_name] = updated_node_data + merged_status = merge_dicts(current_status, {'nodes': nodes}) + else: + # General update + merged_status = merge_dicts(current_status, status_update) + + # Check if anything changed + if merged_status != original_status: + result['changed'] = True + + # Write updated status (unless in check mode) + if not module.check_mode: + write_status_file(status_file, merged_status) + + result['merged_status'] = merged_status + module.exit_json(**result) + + except (OSError, IOError, yaml.YAMLError) as e: + module.fail_json(msg=str(e), **result) + + +def main(): + """Main entry point.""" + run_module() + + +if __name__ == '__main__': + main() diff --git a/common/vars/upgrade_vars.yml b/common/vars/upgrade_vars.yml index 96a36bbda1..2c3669b4af 100644 --- a/common/vars/upgrade_vars.yml +++ b/common/vars/upgrade_vars.yml @@ -1,87 +1,87 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -# ============================================================================ -# OMNIA UPGRADE CONFIGURATION (Production Recommended) -# ============================================================================ -# Upgrade metadata (source and target Omnia versions) is read automatically -# from /opt/omnia/.data/oim_metadata.yml after omnia_core execution. -# -# Design: -# omnia_upgrade_paths — defines the Omnia version upgrade sequence. -# Each entry specifies the software versions for the NEXT Omnia version. -# -# components — defines each software component and its valid version sequence. -# supported_versions — ordered list used for automatic intermediate hop -# generation when a K8s version gap is detected. -# -# Automatic intermediate hop detection: -# When the target K8s version skips one or more entries in supported_versions, -# the system auto-generates one K8s hop per intermediate version. -# -# Example: Omnia 2.1.0.0 (K8s 1.34.1) → Omnia 2.3.0.0 (K8s 1.37.1) -# Omnia path : 2.1.0.0 → 2.2.0.0 → 2.3.0.0 -# K8s hops : 1.34.1 → 1.35.1 (Omnia 2.1→2.2, direct) -# 1.35.1 → 1.36.1 (auto-generated, within 2.2→2.3) -# 1.36.1 → 1.37.1 (Omnia 2.2→2.3, final) -# ============================================================================ - -# ============================================================================ -# OMNIA VERSION UPGRADE PATHS -# ============================================================================ -# Each entry: -# "": -# next_omnia_version: "" -# software_versions: -# : "" -# ============================================================================ -omnia_upgrade_paths: - "2.1.0.0": - next_omnia_version: "2.2.0.0" - software_versions: - service_k8s: "1.35.1" - # Uncomment to enable multi-hop upgrade to Omnia 2.3.0.0: - # K8s 1.35.1 -> 1.37.1 will auto-generate intermediate hop via 1.36.1 - # "2.2.0.0": - # next_omnia_version: "2.3.0.0" - # software_versions: - # service_k8s: "1.37.1" - -# 
============================================================================ -# COMPONENT CONFIGURATION -# ============================================================================ -# Each component: -# json_file — base name for versioned JSON files -# (e.g., "service_k8s" → service_k8s_v1.35.1.json) -# enabled — whether this component participates in upgrade -# supported_versions — ordered list of all valid software versions. -# When target skips versions, intermediate hops are -# auto-generated in sequence order. -# ============================================================================ -components: - service_k8s: - json_file: "service_k8s" - enabled: true - supported_versions: - - "1.34.1" - - "1.35.1" - # Additional components (placeholders) - # slurm_custom: - # json_file: "slurm_custom" - # enabled: false - # supported_versions: - # - "24.05" - # - "25.11" - # - "26.05" +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# OMNIA UPGRADE CONFIGURATION (Production Recommended) +# ============================================================================ +# Upgrade metadata (source and target Omnia versions) is read automatically +# from /opt/omnia/.data/oim_metadata.yml after omnia_core execution. +# +# Design: +# omnia_upgrade_paths — defines the Omnia version upgrade sequence. 
+# Each entry specifies the software versions for the NEXT Omnia version. +# +# components — defines each software component and its valid version sequence. +# supported_versions — ordered list used for automatic intermediate hop +# generation when a K8s version gap is detected. +# +# Automatic intermediate hop detection: +# When the target K8s version skips one or more entries in supported_versions, +# the system auto-generates one K8s hop per intermediate version. +# +# Example: Omnia 2.1.0.0 (K8s 1.34.1) → Omnia 2.3.0.0 (K8s 1.37.1) +# Omnia path : 2.1.0.0 → 2.2.0.0 → 2.3.0.0 +# K8s hops : 1.34.1 → 1.35.1 (Omnia 2.1→2.2, direct) +# 1.35.1 → 1.36.1 (auto-generated, within 2.2→2.3) +# 1.36.1 → 1.37.1 (Omnia 2.2→2.3, final) +# ============================================================================ + +# ============================================================================ +# OMNIA VERSION UPGRADE PATHS +# ============================================================================ +# Each entry: +# "": +# next_omnia_version: "" +# software_versions: +# : "" +# ============================================================================ +omnia_upgrade_paths: + "2.1.0.0": + next_omnia_version: "2.2.0.0" + software_versions: + service_k8s: "1.35.1" + # Uncomment to enable multi-hop upgrade to Omnia 2.3.0.0: + # K8s 1.35.1 -> 1.37.1 will auto-generate intermediate hop via 1.36.1 + # "2.2.0.0": + # next_omnia_version: "2.3.0.0" + # software_versions: + # service_k8s: "1.37.1" + +# ============================================================================ +# COMPONENT CONFIGURATION +# ============================================================================ +# Each component: +# json_file — base name for versioned JSON files +# (e.g., "service_k8s" → service_k8s_v1.35.1.json) +# enabled — whether this component participates in upgrade +# supported_versions — ordered list of all valid software versions. 
+# When target skips versions, intermediate hops are +# auto-generated in sequence order. +# ============================================================================ +components: + service_k8s: + json_file: "service_k8s" + enabled: true + supported_versions: + - "1.34.1" + - "1.35.1" + # Additional components (placeholders) + # slurm_custom: + # json_file: "slurm_custom" + # enabled: false + # supported_versions: + # - "24.05" + # - "25.11" + # - "26.05" diff --git a/gitlab/roles/hosted_gitlab/files/.gitlab-ci-deploy-child-template.yml b/gitlab/roles/hosted_gitlab/files/.gitlab-ci-deploy-child-template.yml index 7d8955fe44..0a1ec99afc 100644 --- a/gitlab/roles/hosted_gitlab/files/.gitlab-ci-deploy-child-template.yml +++ b/gitlab/roles/hosted_gitlab/files/.gitlab-ci-deploy-child-template.yml @@ -376,7 +376,6 @@ deploy: artifacts: reports: dotenv: deploy.env - junit: test-results.xml paths: - status_response.json @@ -793,21 +792,12 @@ validate: printf " %-22s %s\n" "Validate State:" "${VALIDATE_RESULT}" printf " %-22s %s\n" "Job State:" "${JOB_STATE_VALIDATE}" - # Extract and display test summary + # Extract and display test summary (raw JSON from API) TEST_SUMMARY=$(jq -r '.stages[]? | select(.stage_name == "validate") | .result_detail.test_summary // empty' status_response.json 2>/dev/null) if [ -n "${TEST_SUMMARY}" ]; then echo " ------------------------------------------------------------" echo " Test Summary:" - TOTAL=$(echo "${TEST_SUMMARY}" | jq -r '.total // 0') - PASSED=$(echo "${TEST_SUMMARY}" | jq -r '.passed // 0') - FAILED=$(echo "${TEST_SUMMARY}" | jq -r '.failed // 0') - SKIPPED=$(echo "${TEST_SUMMARY}" | jq -r '.skipped // 0') - ERRORS=$(echo "${TEST_SUMMARY}" | jq -r '.errors // 0') - printf " %-22s %s\n" "Total Tests:" "${TOTAL}" - printf " %-22s %s\n" "Passed:" "${PASSED}" - printf " %-22s %s\n" "Failed:" "${FAILED}" - printf " %-22s %s\n" "Skipped:" "${SKIPPED}" - printf " %-22s %s\n" "Errors:" "${ERRORS}" + echo "${TEST_SUMMARY}" | jq '.' 
fi # Display artifact directory if available diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json deleted file mode 100644 index 60ec2d5c68..0000000000 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json +++ /dev/null @@ -1,122 +0,0 @@ -{ - "service_k8s": { - "cluster": [ - { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, - { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, - { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, - { "package": "git", "type": "rpm", "repo_name": "appstream"}, - { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, - { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, - { "package": "podman", "type": "rpm", "repo_name": "appstream"}, - { "package": "kubeadm-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, - { "package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, - { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, - { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, - { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, - { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, - { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, - { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, - { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, - { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.50.0" }, - { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.50.0" }, - { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, - { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, - { "package": 
"docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, - { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, - { "package": "docker.io/library/python", "type": "image", "tag": "3.12-slim" }, - { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, - { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, - { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, - { "package": "cryptography==45.0.7", "type": "pip_module" }, - { "package": "omsdk==1.2.518", "type": "pip_module" }, - { "package": "cffi==1.17.1", "type": "pip_module" }, - { "package": "prometheus_client==0.20.0", "type": "pip_module" }, - { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, - { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, - { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, - { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, - { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.143.1", "type": "image" }, - { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, - { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, - { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, - { "package": "quay.io/jetstack/cert-manager-controller", "tag": "v1.10.0", "type": "image" }, - { "package": "quay.io/jetstack/cert-manager-cainjector", "tag": "v1.10.0", "type": "image" }, - { "package": "quay.io/jetstack/cert-manager-webhook", "tag": "v1.10.0", "type": "image" }, - 
{ "package": "quay.io/jetstack/cert-manager-acmesolver", "tag": "v1.10.0", "type": "image" }, - { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, - { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, - { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, - { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, - { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, - { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, - { "package": "docker.io/timberio/vector", "tag": "0.54.0-debian", "type": "image" }, - { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }, - { "package": "iscsi-initiator-utils", "type": "rpm", "repo_name": "baseos" }, - { "package": "device-mapper-multipath", "type": "rpm", "repo_name": "baseos" }, - { "package": "sg3_utils", "type": "rpm", "repo_name": "baseos" }, - { "package": "lsscsi", "type": "rpm", "repo_name": "baseos" } - ] - }, - "service_kube_control_plane": { - "cluster": [ - { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, - { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", 
"type": "image" }, - { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, - { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, - { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, - { "package": "quay.io/calico/cni", "tag": "v3.30.3", "type": "image" }, - { "package": "quay.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, - { "package": "quay.io/calico/node", "tag": "v3.30.3", "type": "image" }, - { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, - { "package": "prettytable==3.14.0", "type": "pip_module" }, - { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, - { "package": "git", "type": "rpm", "repo_name": "appstream"}, - { "package": "kubernetes==35.0.0", "type": "pip_module" }, - { "package": "PyMySQL==1.1.2", "type": "pip_module" } - - ] - }, - "service_kube_control_plane_first": { - "cluster": [ - { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, - { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, - { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, - { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, - { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, - { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, - { "package": "quay.io/calico/cni", "tag": "v3.30.3", "type": "image" }, - { "package": "quay.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, - { "package": "quay.io/calico/node", "tag": "v3.30.3", 
"type": "image" }, - { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { "package": "calico-v3.30.3","type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" }, - { "package": "metallb-native-v0.15.2", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" }, - { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, - { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, - { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, - { "package": "prettytable==3.14.0", "type": "pip_module" }, - { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, - { "package": "git", "type": "rpm", "repo_name": "appstream"}, - { "package": "kubernetes==35.0.0", "type": "pip_module" }, - { "package": "PyMySQL==1.1.2", "type": "pip_module" } - ] - }, - - "service_kube_node": { - "cluster": [ - { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, - { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } - ] - } -} - diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 36e83d7fb8..5a1a4d53f8 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -70,9 +70,9 @@ { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, { "package": 
"registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, - { "package": "quay.io/calico/cni", "tag": "v3.31.4", "type": "image" }, - { "package": "quay.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, - { "package": "quay.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, { "package": "prettytable==3.14.0", "type": "pip_module" }, @@ -94,9 +94,9 @@ { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, - { "package": "quay.io/calico/cni", "tag": "v3.31.4", "type": "image" }, - { "package": "quay.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, - { "package": "quay.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, { "package": "calico-v3.31.4","type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" }, { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" }, diff --git a/input/provision_config.yml b/input/provision_config.yml index 6b8f17c6aa..14b946ad8a 100644 --- 
a/input/provision_config.yml +++ b/input/provision_config.yml @@ -38,3 +38,11 @@ language: "en_US.UTF-8" # Default: 86400 # Max: 31536000 default_lease_time: "86400" + +#### Optional +# Enable DNS-based hostname resolution for compute nodes. +# When true, nodes use coresmd (CoreDNS + OpenCHAMI SMD plugin) instead of /etc/hosts. +# DNS records are generated automatically from SMD inventory. +# The cluster domain is read from OIM metadata (domain_name). +# Default: false +dns_enabled: false diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml index 73e093c6d6..160ff38073 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml @@ -85,14 +85,6 @@ delegate_to: localhost connection: local -- name: Deploy coredhcp template with multi-subnet support - ansible.builtin.copy: - src: "{{ openchami_coredhcp_template }}" - dest: "{{ openchami_coredhcp_target }}" - mode: "{{ file_permissions_644 }}" - delegate_to: localhost - connection: local - - name: Load the openchami configs vars ansible.builtin.template: src: "{{ openchami_config_vars_template }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 deleted file mode 100644 index 523d4be376..0000000000 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 +++ /dev/null @@ -1,35 +0,0 @@ -server4: - listen: - - "%{{ cluster_boot_interface }}" - plugins: - - server_id: {{ coredhcp_server_id }} - - dns: {{ coredhcp_dns_server }} - - router: {{ coredhcp_router }} - - netmask: {{ coredhcp_netmask }} -{% if coredhcp_subnets | default([]) | length > 0 %} - # Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support) - - coresmd: | - 
svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 - ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 - ca_cert=/root_ca/root_ca.crt - cache_valid={{ coredhcp_cache_validity }} - lease_time={{ coredhcp_lease_duration }} - single_port={{ coredhcp_tftp_single_port_mode | lower }} -{% for s in coredhcp_subnets %} - subnet={{ s.cidr }},{{ s.router }} -{% endfor %} - rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=hostname:unknown-{{'{'}}04d{{'}'}} - - bootloop: | - lease_file=/tmp/coredhcp.db - script_path={{ coredhcp_custom_ipxe }} - lease_time={{ coredhcp_tmp_lease_duration }} -{% for sp in coredhcp_subnet_pools %} - subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }} -{% endfor %} -{% else %} - # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} -{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 index 2b0e180422..523d4be376 100644 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 @@ -6,5 +6,30 @@ server4: - dns: {{ coredhcp_dns_server }} - router: {{ coredhcp_router }} - netmask: {{ coredhcp_netmask }} +{% if coredhcp_subnets | default([]) | length > 0 %} + # Multi-subnet mode: uses key=value config format (requires coresmd 
with multi-subnet support) + - coresmd: | + svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 + ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 + ca_cert=/root_ca/root_ca.crt + cache_valid={{ coredhcp_cache_validity }} + lease_time={{ coredhcp_lease_duration }} + single_port={{ coredhcp_tftp_single_port_mode | lower }} +{% for s in coredhcp_subnets %} + subnet={{ s.cidr }},{{ s.router }} +{% endfor %} + rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=hostname:unknown-{{'{'}}04d{{'}'}} + - bootloop: | + lease_file=/tmp/coredhcp.db + script_path={{ coredhcp_custom_ipxe }} + lease_time={{ coredhcp_tmp_lease_duration }} +{% for sp in coredhcp_subnet_pools %} + subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }} +{% endfor %} +{% else %} + # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} +{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2b88daffe8..dfdf99a745 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -27,8 +27,6 @@ openchami_inventory_template: "{{ role_path }}/templates/inventory.yaml.j2" openchami_inventory_file: "{{ openchami_clone_path }}/dell/podman-quadlets/inventory/01-ochami" openchami_config_vars_path: "/opt/omnia/openchami/configs_vars.yaml" openchami_config_vars_template: "{{ 
role_path }}/templates/configs.yaml.j2" -openchami_coredhcp_template: "{{ role_path }}/templates/coredhcp.yaml.j2" -openchami_coredhcp_target: "{{ openchami_clone_path }}/dell/podman-quadlets/roles/configs/templates/coredhcp/coredhcp.yaml.j2" openchami_install_fail_msg: "Failed to install OpenCHAMI" network_spec: "{{ hostvars['localhost']['input_project_dir'] }}/network_spec.yml" network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 60b0a47616..2e81733240 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -131,12 +131,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index a4b89e1efa..de69e4f556 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -130,12 +130,22 @@ content: | {{ lookup('template', 
'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index ad767a2e59..156608de44 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -101,12 +101,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index faa5c234b6..51296e3c29 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -100,12 +100,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - 
path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 37e05d2b16..50db7f9fed 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -605,6 +605,32 @@ # Patch: append nameservers after /etc/resolv.conf using Jinja list "dns" sed -i 's|/etc/resolv.conf|/etc/resolv.conf{% for ns in dns %} {{ ns }}{% endfor %}|' "$cfg" +{% if dns_enabled | default(false) | bool %} + # Forward cluster-internal DNS domain to OIM CoreDNS + # This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS + python3 - "$cfg" << 'PYEOF' +import sys, yaml +cfg_path = sys.argv[1] +with open(cfg_path) as f: + doc = yaml.safe_load(f) +corefile = doc['data']['Corefile'] +fwd_block = """{{ domain_name }}:53 { + errors + cache 30 + forward . 
{{ admin_nic_ip }} +} +""" +if '{{ domain_name }}:53' not in corefile: + corefile = fwd_block + corefile + doc['data']['Corefile'] = corefile + with open(cfg_path, 'w') as f: + yaml.dump(doc, f, default_flow_style=False) + print("Added {{ domain_name }} forward zone to K8s CoreDNS") +else: + print("{{ domain_name }} forward zone already present in K8s CoreDNS") +PYEOF +{% endif %} + # Apply the patched ConfigMap kubectl apply -f "$cfg" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2ee561109c..ba8fcfad03 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -107,12 +107,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /root/init_slurm_db.sql permissions: '{{ file_mode_600 }}' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 2d4b7ad001..44f188e51b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -343,12 +343,22 @@ echo "[INFO] ===== Completed firewall and service configuration (aarch64) =====" +{% if dns_enabled | default(false) | 
bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 3cae337b69..6baef46c43 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -148,12 +148,22 @@ {% endif %} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root permissions: '0644' diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index b6136b185d..0050048928 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -263,7 +263,7 @@ - name: Untar helm tarball to version-specific directory ansible.builtin.unarchive: src: "{{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz" - dest: "{{ k8s_client_mount_path }}/helm/" + dest: "{{ k8s_client_mount_path }}/helm/linux-amd64-helm-v{{ helm_version }}/" remote_src: true extra_opts: - "--transform" diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml 
b/provision/roles/provision_validations/tasks/include_software_config.yml index b2480d2c6e..2895762f8e 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -45,6 +45,11 @@ ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" +- name: Set dns_enabled default when not defined + ansible.builtin.set_fact: + dns_enabled: false + when: dns_enabled is not defined + - name: Initialise variables ansible.builtin.set_fact: service_k8s_support: false diff --git a/provision/roles/provision_validations/tasks/update_hosts.yml b/provision/roles/provision_validations/tasks/update_hosts.yml index bd046032bc..8110097cbe 100644 --- a/provision/roles/provision_validations/tasks/update_hosts.yml +++ b/provision/roles/provision_validations/tasks/update_hosts.yml @@ -19,19 +19,22 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs and hostnames that are being updated - ansible.builtin.shell: | - set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ - grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp - cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} - rm -f {{ hosts_file_path }}.tmp - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" +- name: Update OIM /etc/hosts (skipped when CoreDNS is enabled) + when: not (dns_enabled | default(false) | bool) + block: + - name: Remove stale entries for IPs and hostnames that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path 
}}.tmp + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Add hosts file entry for cluster - ansible.builtin.shell: | - set -o pipefail - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" + - name: Add hosts file entry for cluster + ansible.builtin.shell: | + set -o pipefail + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" diff --git a/provision/roles/slurm_config/tasks/update_hosts_munge.yml b/provision/roles/slurm_config/tasks/update_hosts_munge.yml index 29683159ad..783d821edd 100644 --- a/provision/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/provision/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Edit /etc/hosts file till DNS +- name: Edit /etc/hosts file (skipped when CoreDNS is enabled) ignore_unreachable: true delegate_to: "{{ slurmhost_ip }}" + when: not (dns_enabled | default(false) | bool) block: - name: Remove deleted nodes if any hostname exists in /etc/hosts ansible.builtin.lineinfile: diff --git a/provision/roles/telemetry/tasks/deploy_vector_ome.yml b/provision/roles/telemetry/tasks/deploy_vector_ome.yml index 31965270a6..5a91dee0df 100644 --- a/provision/roles/telemetry/tasks/deploy_vector_ome.yml +++ b/provision/roles/telemetry/tasks/deploy_vector_ome.yml @@ -25,46 +25,6 @@ # - Logs path requires: victoria_logs_support=true # - KafkaUser: Dedicated vector-ome-user (NOT shared kafkapump) -# ============================================================================ -# Generate OME KafkaTopic manifests (5 topics for OME telemetry) -# ============================================================================ -- name: Build OME topic list for Vector-OME bridge - 
ansible.builtin.set_fact: - ome_identifier: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}" - ome_kafka_topics_to_create: - - name: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}.events" - partitions: 3 - filename: "kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_events.yaml" - - name: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}.alerts" - partitions: 2 - filename: "kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_alerts.yaml" - - name: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}.inventory" - partitions: 2 - filename: "kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_inventory.yaml" - - name: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}.logs" - partitions: 2 - filename: "kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_logs.yaml" - - name: "{{ telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') }}.telemetry" - partitions: 3 - filename: "kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_telemetry.yaml" - -- name: Generate OME KafkaTopic manifests - ansible.builtin.template: - src: 'telemetry/kafka/kafka.topic.yaml.j2' - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.filename }}" - mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - loop: "{{ ome_kafka_topics_to_create }}" - vars: - topic_name: "{{ item.name }}" - topic_partitions: "{{ item.partitions }}" - tags: telemetry_deployment - -# 
============================================================================ -# NOTE: Kubernetes verification and deployment removed from this task -# Rationale: This task runs during provision.yml (template generation phase) -# Actual Kafka cluster and Vector deployment happens later via telemetry.sh -# during cloud-init execution on control plane nodes -# ============================================================================ # ============================================================================ # Render Vector-OME KafkaUser CR @@ -129,7 +89,6 @@ ansible.builtin.debug: msg: > Vector-OME templates generated successfully. - KafkaTopic CRs: {{ ome_kafka_topics_to_create | length }} topics. ConfigMap: {{ vector_ome_configmap_rendered.dest }}. KafkaUser: {{ vector_ome_kafkauser_rendered.dest }}. Deployment: {{ vector_ome_deployment_rendered.dest }}. diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index c55f7a5f74..3b1555e8ba 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -87,10 +87,4 @@ resources: - vector/vector-ome-configmap.yaml - vector/vector-ome-deployment.yaml - vector/vector-ome-service.yaml -# OME KafkaTopic CRs (5 topics for OME telemetry) - - kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_events.yaml - - kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_alerts.yaml - - kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_inventory.yaml - - kafka.topic_{{ (telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_logs.yaml - - kafka.topic_{{ 
(telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome')) | replace('.', '_') }}_telemetry.yaml {% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ome-config.toml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ome-config.toml.j2 index f9a4d2c896..1228ba52b6 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ome-config.toml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ome-config.toml.j2 @@ -1,9 +1,14 @@ +{% set ome_id = telemetry_config.telemetry_bridges.vector_ome.ome_identifier | default('ome') %} # Vector-OME Configuration -# Purpose: Consume OME telemetry from Kafka topics matching ome.* pattern, +# Purpose: Consume OME telemetry from Kafka topics matching {{ ome_id }}.* pattern, # route metrics to VictoriaMetrics via vmagent-vector and # route logs/events to VictoriaLogs via vlagent-vector -# Architecture: Kafka 'ome.*' topics → Vector-OME → vmagent-vector:8429 (metrics) -# → vlagent-vector:9427 (logs) +# Architecture: Kafka '{{ ome_id }}.*' topics → Vector-OME → vmagent-vector:8429 (metrics) +# → vlagent-vector:9427 (logs) +# Topics (pre-existing in OME Kafka — NOT created by Omnia): +# Metrics → VictoriaMetrics: {{ ome_id }}.inventory, {{ ome_id }}.health, {{ ome_id }}.telemetry +# Logs → VictoriaLogs: {{ ome_id }}.alerts, {{ ome_id }}.auditlogs, {{ ome_id }}.logs +# Unknown → VictoriaMetrics (best-effort Redfish processing) # Q2 Status: Active — Omnia-deployed # Spec Reference: Vector HLD Engineering Spec (ESPEC-VECTOR-2026-001) §4.1.3.2, CD-10 @@ -15,7 +20,8 @@ data_dir = "/var/lib/vector" # ============================================================================ # SOURCE: Kafka Consumer for OME Topics (regex subscription) # ============================================================================ -# OME publishes to 5 topics: ome.events, ome.alerts, ome.inventory, ome.logs, ome.telemetry, +# OME publishes to topics: {{ ome_id 
}}.inventory, {{ ome_id }}.health, {{ ome_id }}.telemetry, +# {{ ome_id }}.alerts, {{ ome_id }}.auditlogs, {{ ome_id }}.logs # Uses regex subscription to capture all OME topics dynamically (Constraint C-05) [sources.kafka_ome] type = "kafka" @@ -53,57 +59,82 @@ source = ''' # ============================================================================ # TRANSFORM: OME Topic Router # ============================================================================ -# Step 2: Route by topic — metric topics to VictoriaMetrics, event/log topics to VictoriaLogs -# ome.inventory + ome.telemetry → metrics path (Redfish System[].Metric[] format) -# ome.events + ome.alerts + ome.logs → logs path (JSON events) +# Step 2: Route by topic name (prefix from ome_identifier input variable): +# metrics → {{ ome_id }}.inventory, {{ ome_id }}.health, {{ ome_id }}.telemetry → VictoriaMetrics +# logs → {{ ome_id }}.alerts, {{ ome_id }}.auditlogs, {{ ome_id }}.logs → VictoriaLogs +# _unmatched → VictoriaMetrics (best-effort for any unknown topics) [transforms.ome_topic_router] type = "route" inputs = ["parse_ome_topic"] [transforms.ome_topic_router.route] {% if telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool %} - metrics = '.source_topic == "ome.inventory" || .source_topic == "ome.telemetry"' + metrics = '.source_topic == "{{ ome_id }}.inventory" || .source_topic == "{{ ome_id }}.health" || .source_topic == "{{ ome_id }}.telemetry"' {% endif %} {% if telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool %} - logs = '.source_topic == "ome.events" || .source_topic == "ome.alerts" || .source_topic == "ome.logs"' + logs = '.source_topic == "{{ ome_id }}.alerts" || .source_topic == "{{ ome_id }}.auditlogs" || .source_topic == "{{ ome_id }}.logs"' {% endif %} {% if telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool %} # ============================================================================ 
-# TRANSFORM: Metric Enricher — Flatten OME Redfish System[].Metric[] Arrays +# TRANSFORM: Metric Enricher — Unified OME Metric Extraction # ============================================================================ -# Step 3: OME telemetry/inventory data uses Redfish nested array format. -# Each message has System[] with Metric[] sub-arrays. This transform flattens -# the nested structure into individual metric events for log_to_metric consumption. +# Step 3: Handles two OME metric data formats: +# Redfish format ({{ ome_id }}.inventory, {{ ome_id }}.telemetry): +# System[] array with nested Metric[] sub-arrays — flattened to individual metric events. +# Health format ({{ ome_id }}.health): +# System[] array with flat fields — all numeric/boolean fields are dynamically extracted as metrics. +# _unmatched: best-effort Redfish processing — events not matching System[] are dropped. [transforms.metric_enricher] type = "remap" -inputs = ["ome_topic_router.metrics"] +inputs = ["ome_topic_router.metrics", "ome_topic_router._unmatched"] source = ''' all_metrics = [] SystemArr = array!(.System) for_each(SystemArr) -> |_index, system| { - for_each(array!(system.Metric)) -> |_index1, metric| { - id_replaced = replace!(metric.MetricId, ".", "_") - parts = split(id_replaced, "_") - part_count = length(parts) - sliced, _ = slice(parts, 1, (part_count - 2)) - metric_name, _ = join(sliced, "_") - componentId = if exists(metric.ComponentId) && metric.ComponentId != null { - metric.ComponentId - } else { - "System" - } - parsed_metric, _ = [{ - "MetricName": metric_name, - "MetricValue": to_float(metric.MetricValue[0]), - "ComponentId": componentId, - "TimeStamp": metric.TimeStamp[0], - "Identifier": system.Identifier, - "Type": parts[0] - }] - - all_metrics = append(all_metrics, parsed_metric) + identifier = to_string(system.Identifier) ?? 
"unknown" + + if system.Metric != null && is_array(system.Metric) { + for_each(array!(system.Metric)) -> |_index1, metric| { + id_replaced = replace!(metric.MetricId, ".", "_") + parts = split(id_replaced, "_") + part_count = length(parts) + sliced, _ = slice(parts, 1, (part_count - 2)) + metric_name, _ = join(sliced, "_") + componentId = if exists(metric.ComponentId) && metric.ComponentId != null { + metric.ComponentId + } else { + "System" + } + parsed_metric, _ = [{ + "MetricName": metric_name, + "MetricValue": to_float(metric.MetricValue[0]), + "ComponentId": componentId, + "TimeStamp": metric.TimeStamp[0], + "Identifier": identifier, + "Type": parts[0] + }] + + all_metrics = append(all_metrics, parsed_metric) + } + } else { + ts = to_string(system.CollectionTime) ?? "" + sys_keys = keys!(system) + for_each(sys_keys) -> |_i, key| { + val = get!(system, [key]) + if is_float(val) || is_integer(val) || is_boolean(val) { + parsed_metric = [{ + "MetricName": key, + "MetricValue": to_float(val) ?? 0.0, + "ComponentId": "System", + "TimeStamp": ts, + "Identifier": identifier, + "Type": "health" + }] + all_metrics = append(all_metrics, parsed_metric) + } + } } } @@ -131,44 +162,65 @@ inputs = ["metric_enricher"] identifier = "{{ Identifier }}" type = "{{ Type }}" {% endraw %} - source_subsystem = "ome" + source_subsystem = "{{ ome_id }}" {% endif %} {% if telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool %} # ============================================================================ -# TRANSFORM: Log Enricher — Enrich Event/Alert/Log Messages for VictoriaLogs +# TRANSFORM: Log Enricher — Enrich Alert/AuditLog Messages for VictoriaLogs # ============================================================================ -# Step 5: Enrich non-metric events with VictoriaLogs-compatible fields +# Step 5: Enrich log events with VictoriaLogs-compatible fields. 
+# OME wraps log events in a Data[] array: +# {{ ome_id }}.alerts: {"Data": [{"AlertId": ..., "Severity": ..., "Timestamp": "...", ...}]} +# {{ ome_id }}.auditlogs: {"Data": [{"Id": ..., "Message": ..., "CreateDate": "20251126T185211Z", ...}]} +# {{ ome_id }}.logs: {"Data": [{"Id": ..., "Message": ..., "CreateDate": "...", ...}]} # _msg: full JSON-encoded message body -# _time: timestamp from OME event or current time -# _msg_source: "ome" for source identification -# _msg_topic: original Kafka topic name -# severity: extracted from OME Severity field +# _time: from Data[0].CreateDate (auditlogs) or Data[0].Timestamp (alerts), falls back to now() +# severity: from Data[0].Severity if available [transforms.ome_log_enricher] type = "remap" inputs = ["ome_topic_router.logs"] source = ''' ._msg = encode_json(.) - raw_ts = to_string(.Timestamp) ?? "" - parsed_ts, parse_err = parse_timestamp(raw_ts, "%Y%m%dT%H%M%SZ") - if parse_err == null { - ._time = format_timestamp!(parsed_ts, "%Y-%m-%dT%H:%M:%SZ") + data_arr = array(.Data) ?? [] + raw_ts = "" + if length(data_arr) > 0 { + first = data_arr[0] + raw_ts = if first.CreateDate != null { + to_string!(first.CreateDate) + } else if first.Timestamp != null { + to_string!(first.Timestamp) + } else { + "" + } + } + ts_parsed, ts_err = parse_timestamp(raw_ts, "%Y%m%dT%H%M%SZ") + if ts_err != null { + ts_parsed, ts_err = parse_timestamp(raw_ts, "%Y-%m-%dT%H:%M:%SZ") + } + ._time = if ts_err == null { + format_timestamp!(ts_parsed, "%Y-%m-%dT%H:%M:%SZ") + } else { + to_string(now()) + } + ._msg_source = "{{ ome_id }}" + ._msg_topic = to_string(.source_topic) ?? "{{ ome_id }}" + .severity = if length(data_arr) > 0 { + to_string(data_arr[0].Severity) ?? "info" } else { - ._time = to_string(now()) + "info" } - ._msg_source = "ome" - ._msg_topic = to_string(.source_topic) ?? "ome" - .severity = to_string(.Severity) ?? 
"info" ''' {% endif %} +{% if not (telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool) %} # ============================================================================ -# SINK: Drop unmatched topics (suppresses _unmatched warning) +# SINK: Drop unmatched topics (metrics disabled — no VictoriaMetrics sink available) # ============================================================================ -# Topics not matching metrics or logs routes (e.g. ome.health) are dropped [sinks.drop_unmatched] type = "blackhole" inputs = ["ome_topic_router._unmatched"] +{% endif %} # ============================================================================ # SINKS diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 829c74ef47..fa5cd3c490 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -85,19 +85,10 @@ kafka: ldms: name: "ldms" consumer_group: "ldms-consumer-group" - # # OME topics (provisioned when Vector-OME bridge is enabled) - # ome_events: - # name: "{{ ome_identifier }}.events" - # partitions: 3 - # ome_alerts: - # name: "{{ ome_identifier }}.alerts" - # partitions: 2 - # ome_inventory: - # name: "{{ ome_identifier }}.inventory" - # partitions: 2 - # ome_logs: - # name: "{{ ome_identifier }}.logs" - # partitions: 2 + # OME topics are pre-existing in the OME Kafka broker — NOT created by Omnia. + # Vector-OME subscribes to all {{ ome_identifier }}.* topics via regex pattern. 
+ # Known topics: {{ ome_identifier }}.inventory, {{ ome_identifier }}.health, {{ ome_identifier }}.telemetry (→ VictoriaMetrics) + # {{ ome_identifier }}.alerts, {{ ome_identifier }}.auditlogs, {{ ome_identifier }}.logs (→ VictoriaLogs) # Dynamic image configuration from service_k8s_v.json # Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s_v.json @@ -603,8 +594,8 @@ vector: service_name: "vector-ome" container_name: "vector-ome" # Dynamic pattern based on ome_identifier from telemetry_config.yml - # Example: if ome_identifier="ome", pattern="^ome\\..*$" (matches ome.events, ome.alerts, etc.) - # Example: if ome_identifier="dell_ome", pattern="^dell_ome\\..*$" (matches dell_ome.events, etc.) + # Example: if ome_identifier="ome", pattern="^ome\\..*$" (matches ome.inventory, ome.health, etc.) + # Example: if ome_identifier="OME-637-2-100.100.5.8", pattern matches OME-637-2-100.100.5.8.* topics kafka_topics_pattern: "^{{ telemetry_bridges.vector_ome.ome_identifier | default('ome') | regex_escape }}\\..*$" consumer_group: "vector-ome-group" kafka_user: "vector-ome-user" # Dedicated KafkaUser for OME @@ -649,7 +640,7 @@ vector: image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" replicas: 2 pvc_size: "5Gi" # Disk buffer - remote_write_url: "https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9428/internal/insert" + remote_write_url: "https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert" tmp_data_path: "/vlagent-buffer" resources: requests: diff --git a/upgrade/playbooks/reboot_and_verify_cloud_init.yml b/upgrade/playbooks/reboot_and_verify_cloud_init.yml new file mode 100644 index 0000000000..a740859c00 --- /dev/null +++ b/upgrade/playbooks/reboot_and_verify_cloud_init.yml @@ -0,0 +1,98 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# reboot_and_verify_cloud_init.yml — Reboot node and verify cloud-init +# ============================================================================ +# This playbook reboots a target node and waits for cloud-init to complete +# successfully after the reboot. +# +# Variables (passed via -e): +# target_host — Required. Name of the host to reboot (must be in inventory) +# cloud_init_timeout — Optional. Timeout in seconds to wait for cloud-init (default: 600) +# +# Usage: +# ansible-playbook reboot_and_verify_cloud_init.yml \ +# -i inventory.ini \ +# -e target_host=kcp1 \ +# -e cloud_init_timeout=600 +# ============================================================================ + +- name: Reboot node and verify cloud-init completion + hosts: all + gather_facts: false + vars: + cloud_init_timeout: 600 + cloud_init_check_interval: 10 + cloud_init_success_marker: "Cloud-Init finished successfully after the reboot."
+ tasks: + - name: Validate target_host is provided + ansible.builtin.assert: + that: + - target_host is defined + - target_host | length > 0 + fail_msg: "target_host must be provided via -e target_host=" + + - name: Skip hosts other than target_host + ansible.builtin.meta: end_host + when: inventory_hostname != target_host + + - name: Display reboot target + ansible.builtin.debug: + msg: "Rebooting node {{ inventory_hostname }} and waiting for cloud-init to complete" + + - name: Reboot the node + ansible.builtin.reboot: + reboot_timeout: 600 + pre_reboot_delay: 5 + post_reboot_delay: 30 + msg: "Rebooting node after BSS/cloud-init update" + connect_timeout: 10 + test_command: whoami + + - name: Wait for cloud-init to complete + ansible.builtin.shell: + cmd: | + timeout={{ cloud_init_timeout }} + elapsed=0 + interval={{ cloud_init_check_interval }} + while [ $elapsed -lt $timeout ]; do + if grep -q "{{ cloud_init_success_marker }}" /var/log/cloud-init-output.log 2>/dev/null; then + echo "{{ cloud_init_success_marker }}" + exit 0 + fi + sleep $interval + elapsed=$((elapsed + interval)) + echo "Waiting for cloud-init to complete... ($elapsed/$timeout seconds)" + done + echo "Timeout waiting for cloud-init to complete after $timeout seconds" + tail -50 /var/log/cloud-init-output.log + exit 1 + executable: /bin/bash + register: cloud_init_result + changed_when: false + # Do not fail here on a non-zero rc: the next task reports the timeout with a clearer message. + failed_when: false + + - name: Fail if cloud-init did not complete successfully + ansible.builtin.fail: + msg: >- + Cloud-init did not complete successfully on {{ inventory_hostname }}. + Check /var/log/cloud-init-output.log for details.
+ Last output: {{ cloud_init_result.stdout_lines | default([]) | last | default('N/A') }} + when: cloud_init_result.rc != 0 + + - name: Display cloud-init success + ansible.builtin.debug: + msg: "Cloud-init completed successfully on {{ inventory_hostname }}" diff --git a/upgrade/playbooks/update_k8s_cloud_init_bss.yml b/upgrade/playbooks/update_k8s_cloud_init_bss.yml new file mode 100644 index 0000000000..4fc3a06485 --- /dev/null +++ b/upgrade/playbooks/update_k8s_cloud_init_bss.yml @@ -0,0 +1,93 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# update_k8s_cloud_init_bss.yml — Update cloud-init and BSS for K8s groups +# ============================================================================ +# This playbook updates cloud-init and BSS configurations for K8s functional +# groups after their upgrade is complete. 
+# +# Required variables (passed via -e): +# functional_group_name — Name of the functional group: +# - service_kube_control_plane_first_x86_64 +# - service_kube_control_plane_x86_64 +# - service_kube_node_x86_64 +# +# Usage: +# ansible-playbook update_k8s_cloud_init_bss.yml \ +# -e functional_group_name=service_kube_control_plane_first_x86_64 +# ============================================================================ + +# ============================================================================ +# Play 1: Include input project directory and set up variables +# ============================================================================ +- name: Include input project directory + ansible.builtin.import_playbook: ../../utils/include_input_dir.yml + vars: + openchami_vars_suppport: true + omnia_metadata_support: true + +# ============================================================================ +# Play 2: Create OIM host group (needed for cloud-init/BSS update on OIM) +# ============================================================================ +- name: Create OIM host group for cloud-init/BSS update + ansible.builtin.import_playbook: ../../utils/create_container_group.yml + vars: + oim_group: true + +# ============================================================================ +# Play 3: Update cloud-init and BSS for the specified K8s functional group +# ============================================================================ +- name: Update cloud-init and BSS for K8s functional group + hosts: oim + connection: ssh + gather_facts: false + vars: + openchami_work_dir: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir" + tasks: + - name: Validate functional_group_name is provided + ansible.builtin.assert: + that: + - functional_group_name is defined + - functional_group_name | length > 0 + fail_msg: "functional_group_name must be provided via -e functional_group_name=" + + - name: Display update target + ansible.builtin.debug: + 
msg: "Updating cloud-init and BSS for functional group: {{ functional_group_name }}" + + - name: Refresh OpenCHAMI access token + ansible.builtin.include_tasks: "{{ playbook_dir }}/../../common/tasks/common/openchami_auth.yml" + vars: + oim_node_name: "{{ hostvars['localhost']['oim_node_name'] }}" + + - name: Update cloud-init and BSS for {{ functional_group_name }} + environment: "{{ hostvars['oim']['ochami_env'] }}" + block: + - name: Include update_cloud_init_bss role + ansible.builtin.include_role: + name: "{{ playbook_dir }}/../../utils/roles/update_cloud_init_bss" + vars: + bss_file_path: "{{ openchami_work_dir }}/boot/bss-{{ functional_group_name }}.yaml" + cloud_init_file_path: "{{ openchami_work_dir }}/cloud-init/ci-group-{{ functional_group_name }}.yaml" + ci_defaults_file_path: "{{ openchami_work_dir }}/cloud-init/ci-defaults.yaml" + ci_common_file_path: "{{ openchami_work_dir }}/cloud-init/ci-group-common.yaml" + hostname_file_path: "{{ openchami_work_dir }}/nodes/hostname.yaml" + update_ci_defaults: true + update_ci_common: true + update_hostname: true + + - name: Display update completion + ansible.builtin.debug: + msg: "Successfully updated cloud-init and BSS for {{ functional_group_name }}" diff --git a/upgrade/playbooks/upgrade_cp.yml b/upgrade/playbooks/upgrade_cp.yml new file mode 100644 index 0000000000..f973f0d020 --- /dev/null +++ b/upgrade/playbooks/upgrade_cp.yml @@ -0,0 +1,496 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Additional control planes upgrade playbook +# +# Error Handling: +# - Each step has its own block/rescue to capture errors +# - Failed steps are marked with status: failed and error description +# - On subsequent runs, steps with status pending/in_progress/failed will be retried +# - Only steps with status: completed are skipped + +- name: Upgrade additional control planes + hosts: k8s_control_plane + serial: 1 + tasks: + - name: Check if upgrade status file exists on kube_vip + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_check + + - name: Load upgrade status from kube_vip + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + when: status_file_check.stat.exists | default(false) + + - name: Parse upgrade status + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + when: status_file_check.stat.exists | default(false) + + - name: Abort if upgrade status file is missing on kube_vip + ansible.builtin.fail: + msg: >- + Upgrade status file is missing on kube_vip ({{ kube_vip }}). + Expected: {{ status_file }} + This file should be created during the orchestration phase (load_status.yml). + when: not (status_file_check.stat.exists | default(false)) + + - name: Set current node name + ansible.builtin.set_fact: + current_node_name: "{{ inventory_hostname }}" + + - name: Set node IP from upgrade status + ansible.builtin.set_fact: + node_ip: "{{ upgrade_status.nodes[current_node_name].ip }}" + + - name: "Skip node if already completed - {{ current_node_name }}" + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." 
+ when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + + - name: Upgrade control plane {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + # ── kubeadm_install ────────────────────────────────────────── + - name: Run kubeadm_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute kubeadm install + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubeadm_install.yml" + + - name: Mark kubeadm_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ 
ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubeadm_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubeadm install failed')) }}" + - name: Fail kubeadm_install step + ansible.builtin.fail: + msg: "kubeadm_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubeadm_upgrade_node ────────────────────────────────────── + - name: Run kubeadm_upgrade_node on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute kubeadm upgrade node + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_upgrade_node.yml" + + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_upgrade_node failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + 
kubeadm_upgrade_node: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('kubeadm upgrade node failed')) }}" + - name: Fail kubeadm_upgrade_node step + ansible.builtin.fail: + msg: "kubeadm_upgrade_node failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── drain (cordon only for production safety) ──────────────── + - name: Run drain on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute drain + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_drain.yml" + + - name: Mark drain completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark drain failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + drain: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('drain/cordon failed')) }}" + - name: Fail drain step + ansible.builtin.fail: + msg: "drain failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── kubelet_install 
────────────────────────────────────────── + - name: Run kubelet_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute kubelet install + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubelet_install.yml" + + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubelet_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet install failed')) }}" + - name: Fail kubelet_install step + ansible.builtin.fail: + msg: "kubelet_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── crio_install ───────────────────────────────────────────── + - name: Run crio_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir 
}}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute crio install + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_crio_install.yml" + + - name: Mark crio_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark crio_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + crio_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('cri-o install failed')) }}" + - name: Fail crio_install step + ansible.builtin.fail: + msg: "crio_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubelet_restart ────────────────────────────────────────── + - name: Run kubelet_restart on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute kubelet restart + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubelet_restart.yml" + 
+ - name: Mark kubelet_restart completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_restart failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubelet_restart: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet restart failed')) }}" + - name: Fail kubelet_restart step + ansible.builtin.fail: + msg: "kubelet_restart failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── uncordon ───────────────────────────────────────────────── + - name: Run uncordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute uncordon + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_uncordon.yml" + + - name: Mark uncordon completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark uncordon failed + 
ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + uncordon: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('uncordon failed')) }}" + - name: Fail uncordon step + ansible.builtin.fail: + msg: "uncordon failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── validation ─────────────────────────────────────────────── + - name: Run validation on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Mark validation in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute node validation + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_validate_node.yml" + + - name: Mark validation completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark validation failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + validation: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('validation failed')) }}" + - name: 
Fail validation step + ansible.builtin.fail: + msg: "validation failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── etcd_health_check ──────────────────────────────────────── + - name: Run etcd health check after {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.etcd_health_check.status | default('pending')) != 'completed' + block: + - name: Mark etcd_health_check in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + etcd_health_check: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Execute etcd health check + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_etcd_health_check.yml" + + - name: Mark etcd_health_check completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + etcd_health_check: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark etcd_health_check failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + etcd_health_check: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('etcd health check failed')) }}" + - name: Fail etcd_health_check step + ansible.builtin.fail: + msg: "etcd_health_check failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── Mark node completed ────────────────────────────────────── + - name: "Mark node completed {{ current_node_name }}" + 
ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + rescue: + - name: "Mark node as failed (outer rescue) {{ current_node_name }}" + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + + - name: Warn about CP failure (continues to next CP) + ansible.builtin.debug: + msg: "WARNING: Control plane {{ current_node_name }} upgrade failed. Will retry on next run." diff --git a/upgrade/playbooks/upgrade_cp_first.yml b/upgrade/playbooks/upgrade_cp_first.yml new file mode 100644 index 0000000000..0221bbef4e --- /dev/null +++ b/upgrade/playbooks/upgrade_cp_first.yml @@ -0,0 +1,496 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+# First control plane upgrade playbook (runs on k8s_control_plane_first, one host at a time).
+# Resumable: reads {{ status_file }} from kube_vip and passes per-step progress to update_node_status.yml.
+# Error handling:
+# - Each step has its own block/rescue; a failure is recorded (status: failed + error text) and re-raised.
+# - On subsequent runs, steps with status pending/in_progress/failed are retried; only completed steps are skipped.
+# Timestamps use now(utc=true, ...) instead of ansible_date_time, which is captured once at fact-gathering
+# time and would stamp every step with the play's start time (and is undefined when facts are not gathered).
+
+- name: Upgrade first control plane
+  hosts: k8s_control_plane_first
+  serial: 1
+  tasks:
+    - name: Check if upgrade status file exists on kube_vip
+      ansible.builtin.stat:
+        path: "{{ status_file }}"
+      delegate_to: "{{ kube_vip }}"
+      register: status_file_check
+
+    - name: Load upgrade status from kube_vip
+      ansible.builtin.slurp:
+        src: "{{ status_file }}"
+      delegate_to: "{{ kube_vip }}"
+      register: status_slurp
+      when: status_file_check.stat.exists | default(false)
+
+    - name: Parse upgrade status
+      ansible.builtin.set_fact:
+        upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}"
+      when: status_file_check.stat.exists | default(false)
+
+    - name: Abort if upgrade status file is missing on kube_vip
+      ansible.builtin.fail:
+        msg: >-
+          Upgrade status file is missing on kube_vip ({{ kube_vip }}).
+          Expected: {{ status_file }}
+          This file should be created during the orchestration phase (load_status.yml).
+      when: not (status_file_check.stat.exists | default(false))
+
+    - name: Set current node name
+      ansible.builtin.set_fact:
+        current_node_name: "{{ inventory_hostname }}"
+
+    - name: Set node IP from upgrade status
+      ansible.builtin.set_fact:
+        node_ip: "{{ upgrade_status.nodes[current_node_name].ip }}"
+
+    - name: "Skip node if already completed - {{ current_node_name }}"
+      ansible.builtin.debug:
+        msg: "Node {{ current_node_name }} already completed — skipping."
+      when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed'
+
+    - name: Upgrade first control plane {{ current_node_name }}
+      when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed'
+      block:
+        - name: Mark node in_progress
+          ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+          vars:
+            node_name: "{{ current_node_name }}"
+            node_status_update:
+              status: in_progress
+
+        # ── setup_repos ──────────────────────────────────────────────────
+        # NOTE: setup_repos is now done globally in upgrade_k8s.yml before the Execute play.
+        # It is only marked as completed here so the per-node status stays complete.
+        - name: Mark setup_repos completed (done globally)
+          ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+          vars:
+            node_name: "{{ current_node_name }}"
+            node_status_update:
+              steps:
+                setup_repos:
+                  status: completed
+                  timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                  error:
+
+        # ── kubeadm_install ──────────────────────────────────────────
+        - name: Run kubeadm_install on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed'
+          block:
+            - name: Mark kubeadm_install in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubeadm_install:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute kubeadm install
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubeadm_install.yml"
+
+            - name: Mark kubeadm_install completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubeadm_install:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark kubeadm_install failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    kubeadm_install:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubeadm install failed')) }}"
+            - name: Fail kubeadm_install step
+              ansible.builtin.fail:
+                msg: "kubeadm_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── kubeadm_upgrade_apply (first CP only) ────────────────────
+        - name: Run kubeadm_upgrade_apply on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_apply.status | default('pending')) != 'completed'
+          block:
+            - name: Mark kubeadm_upgrade_apply in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubeadm_upgrade_apply:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute kubeadm upgrade apply
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_upgrade_apply.yml"
+
+            - name: Mark kubeadm_upgrade_apply completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubeadm_upgrade_apply:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark kubeadm_upgrade_apply failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    kubeadm_upgrade_apply:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('kubeadm upgrade apply failed')) }}"
+            - name: Fail kubeadm_upgrade_apply step
+              ansible.builtin.fail:
+                msg: "kubeadm_upgrade_apply failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}"
+
+        # ── drain (cordon only for production safety) ────────────────
+        - name: Run drain on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed'
+          block:
+            - name: Mark drain in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    drain:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute drain
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_drain.yml"
+
+            - name: Mark drain completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    drain:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark drain failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    drain:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('drain/cordon failed')) }}"
+            - name: Fail drain step
+              ansible.builtin.fail:
+                msg: "drain failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}"
+
+        # ── kubelet_install ──────────────────────────────────────────
+        - name: Run kubelet_install on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed'
+          block:
+            - name: Mark kubelet_install in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubelet_install:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute kubelet install
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubelet_install.yml"
+
+            - name: Mark kubelet_install completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubelet_install:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark kubelet_install failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    kubelet_install:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet install failed')) }}"
+            - name: Fail kubelet_install step
+              ansible.builtin.fail:
+                msg: "kubelet_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── crio_install ─────────────────────────────────────────────
+        - name: Run crio_install on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed'
+          block:
+            - name: Mark crio_install in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    crio_install:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute crio install
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_crio_install.yml"
+
+            - name: Mark crio_install completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    crio_install:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark crio_install failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    crio_install:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('cri-o install failed')) }}"
+            - name: Fail crio_install step
+              ansible.builtin.fail:
+                msg: "crio_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── kubelet_restart ──────────────────────────────────────────
+        - name: Run kubelet_restart on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed'
+          block:
+            - name: Mark kubelet_restart in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubelet_restart:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute kubelet restart
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_kubelet_restart.yml"
+
+            - name: Mark kubelet_restart completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    kubelet_restart:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark kubelet_restart failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    kubelet_restart:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet restart failed')) }}"
+            - name: Fail kubelet_restart step
+              ansible.builtin.fail:
+                msg: "kubelet_restart failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── uncordon ─────────────────────────────────────────────────
+        - name: Run uncordon on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed'
+          block:
+            - name: Mark uncordon in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    uncordon:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute uncordon
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_uncordon.yml"
+
+            - name: Mark uncordon completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    uncordon:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark uncordon failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    uncordon:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('uncordon failed')) }}"
+            - name: Fail uncordon step
+              ansible.builtin.fail:
+                msg: "uncordon failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}"
+
+        # ── validation ───────────────────────────────────────────────
+        - name: Run validation on {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed'
+          block:
+            - name: Mark validation in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    validation:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute node validation
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_validate_node.yml"
+
+            - name: Mark validation completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    validation:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark validation failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    validation:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('validation failed')) }}"
+            - name: Fail validation step
+              ansible.builtin.fail:
+                msg: "validation failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── etcd_health_check ────────────────────────────────────────
+        - name: Run etcd health check after {{ current_node_name }}
+          when: (upgrade_status.nodes[current_node_name].steps.etcd_health_check.status | default('pending')) != 'completed'
+          block:
+            - name: Mark etcd_health_check in_progress
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    etcd_health_check:
+                      status: in_progress
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+
+            - name: Execute etcd health check
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/step_etcd_health_check.yml"
+
+            - name: Mark etcd_health_check completed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  steps:
+                    etcd_health_check:
+                      status: completed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error:
+          rescue:
+            - name: Mark etcd_health_check failed
+              ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+              vars:
+                node_name: "{{ current_node_name }}"
+                node_status_update:
+                  status: failed
+                  steps:
+                    etcd_health_check:
+                      status: failed
+                      timestamp: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}"
+                      error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('etcd health check failed')) }}"
+            - name: Fail etcd_health_check step
+              ansible.builtin.fail:
+                msg: "etcd_health_check failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}"
+
+        # ── Mark node completed (reached only when no step above failed) ──
+        - name: Mark node completed {{ current_node_name }}
+          ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+          vars:
+            node_name: "{{ current_node_name }}"
+            node_status_update:
+              status: completed
+              version_current: "{{ k8s_target_version }}"
+
+      rescue:
+        - name: Mark node as failed (outer rescue) {{ current_node_name }}
+          ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml"
+          vars:
+            node_name: "{{ current_node_name }}"
+            node_status_update:
+              status: failed
+
+        - name: Fail after marking status
+          ansible.builtin.fail:
+            msg: "First control plane {{ current_node_name }} upgrade failed. Aborting."
diff --git a/upgrade/playbooks/upgrade_k8s.yml b/upgrade/playbooks/upgrade_k8s.yml
index 6bd9959e29..b214208acd 100644
--- a/upgrade/playbooks/upgrade_k8s.yml
+++ b/upgrade/playbooks/upgrade_k8s.yml
@@ -12,28 +12,167 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+# NOTE: input_project_dir is defined in each play's vars section.
+# To override, use: --extra-vars "input_project_dir=/custom/path" -- name: Upgrade Kubernetes cluster +# ══════════════════════════════════════════════════════════════════════════════ +# Pre-check: Verify service_k8s is configured before proceeding +# ══════════════════════════════════════════════════════════════════════════════ +- name: "Kubernetes Upgrade - Pre-check service_k8s configuration" hosts: localhost connection: local gather_facts: false vars: + input_project_dir: "/opt/omnia/input/project_default" + oim_data_path: "/opt/omnia/.data" manifest_path: /opt/omnia/.data/upgrade_manifest.yml component_name: k8s + tasks: - - name: Read upgrade_manifest.yml + - name: "Load upgrade_vars.yml for supported versions" + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + + - name: "Load software_config.json" ansible.builtin.slurp: - src: "{{ manifest_path }}" - register: raw_manifest + path: "{{ input_project_dir }}/software_config.json" + register: _sw_config_slurp + + - name: "Parse software_config.json" + ansible.builtin.set_fact: + _software_config: "{{ _sw_config_slurp.content | b64decode | from_json }}" + + - name: "Check if service_k8s is configured in software_config.json" + ansible.builtin.set_fact: + k8s_upgrade_enabled: "{{ _software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0 }}" + + - name: "Extract K8s version from software_config.json" + ansible.builtin.set_fact: + _k8s_version_in_config: "{{ _software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | map(attribute='version') | first }}" + _k8s_supported_version: "{{ components.service_k8s.supported_versions | last }}" + when: k8s_upgrade_enabled + + - name: "Validate K8s version is supported" + ansible.builtin.fail: + msg: | + ════════════════════════════════════════════════════════════════════════ + [UPGRADE] UNSUPPORTED K8s VERSION + 
════════════════════════════════════════════════════════════════════════ + Version in software_config.json: {{ _k8s_version_in_config }} + Supported version for upgrade: {{ _k8s_supported_version }} + + Only K8s version {{ _k8s_supported_version }} is supported for upgrade. + Please update software_config.json to use version {{ _k8s_supported_version }}. + ════════════════════════════════════════════════════════════════════════ + when: + - k8s_upgrade_enabled + - _k8s_version_in_config != _k8s_supported_version + + - name: "Load upgrade manifest" + ansible.builtin.set_fact: + manifest: "{{ lookup('file', manifest_path) | from_yaml }}" + ignore_errors: true + register: manifest_load + + - name: "Mark as skipped — service_k8s not configured" + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'skipped' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + when: + - not k8s_upgrade_enabled + - manifest_load is succeeded - - name: Parse manifest + - name: "Display skip message — service_k8s not configured" + ansible.builtin.debug: + msg: | + ════════════════════════════════════════════════════════════════════════ + [UPGRADE] Component '{{ component_name }}' — SKIPPED + ════════════════════════════════════════════════════════════════════════ + Reason: service_k8s is not present in software_config.json softwares list. + K8s cluster was not provisioned, skipping K8s upgrade. 
+ ════════════════════════════════════════════════════════════════════════ + when: not k8s_upgrade_enabled + +- name: "Kubernetes Upgrade - Load Configuration & Version Detection" + hosts: localhost + connection: local + gather_facts: true + vars: + input_project_dir: "/opt/omnia/input/project_default" + oim_data_path: "/opt/omnia/.data" + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: k8s + + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: "Load upgrade_vars.yml" + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + + - name: "Extract K8s target version from upgrade configuration" ansible.builtin.set_fact: - manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + k8s_target_version: "{{ components.service_k8s.supported_versions | last }}" - - name: Skip if k8s already upgraded + - name: "Check if k8s already upgraded" + ansible.builtin.set_fact: + manifest: "{{ lookup('file', oim_data_path ~ '/upgrade_manifest.yml') | from_yaml }}" + ignore_errors: true + register: manifest_load + + - name: "Fail if upgrade manifest does not exist" + ansible.builtin.fail: + msg: | + ════════════════════════════════════════════════════════════════════════ + [UPGRADE] UPGRADE MANIFEST NOT FOUND + ════════════════════════════════════════════════════════════════════════ + The upgrade manifest file does not exist at: + {{ oim_data_path }}/upgrade_manifest.yml + + This file is required to verify prerequisite components are completed. 
+ Please run: ansible-playbook upgrade/upgrade.yml + ════════════════════════════════════════════════════════════════════════ + when: manifest_load is failed + + - name: "Skip if already completed" ansible.builtin.meta: end_play when: - - manifest.component_status[component_name] | default('pending') == 'completed' + - manifest_load is succeeded + - manifest.component_status.k8s | default('pending') == 'completed' + + - name: "Verify component dependencies" + ansible.builtin.fail: + msg: | + ════════════════════════════════════════════════════════════════════════ + [UPGRADE] MISSING PREREQUISITE COMPONENTS + ════════════════════════════════════════════════════════════════════════ + K8s upgrade requires the following components to be completed first: + - OIM + - provision + - local_repo + - build_image + + Current component status: + - OIM: {{ manifest.component_status.oim | default('pending') }} + - provision: {{ manifest.component_status.provision | default('pending') }} + - local_repo: {{ manifest.component_status.local_repo | default('pending') }} + - build_image: {{ manifest.component_status.build_image | default('pending') }} + + Please run: ansible-playbook upgrade/upgrade.yml + ════════════════════════════════════════════════════════════════════════ + when: + - manifest.component_status.oim | default('pending') != 'completed' or + manifest.component_status.provision | default('pending') != 'completed' or + manifest.component_status.local_repo | default('pending') != 'completed' or + manifest.component_status.build_image | default('pending') != 'completed' - name: "Mark as skipped — BuildStream terminal gate active (C-24)" ansible.builtin.copy: @@ -60,38 +199,1209 @@ content: >- {{ manifest | combine({ 'component_status': manifest.component_status | combine({ - component_name: 'in-progress' + 'k8s': 'in-progress' }) }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' + dest: "{{ oim_data_path }}/upgrade_manifest.yml" + mode: "0644" + when: 
manifest_load is succeeded - name: "Display upgrade status in-progress — {{ component_name }}" ansible.builtin.debug: msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: in-progress" - # TODO: Implement per ESpec §4.7: - # 1. Validation gates (Pulp repos, SSH, cluster health, etcd, version chain, backup) - # 2. etcd snapshot + backup /etc/kubernetes - # 3. Sequential CP upgrade (kubeadm upgrade apply/node) - # 4. Addon upgrade (Calico, MetalLB, Helm charts) - # 5. Rolling worker upgrade (drain→upgrade→uncordon, batch configurable) - # 6. BSS + cloud-init update per functional group post-upgrade - # 7. Validation: all nodes Ready, pods Running - - name: K8s upgrade placeholder + - name: "Load software_config.json" + ansible.builtin.slurp: + path: "{{ input_project_dir }}/software_config.json" + register: _sw_config_slurp + + - name: "Parse software_config.json" + ansible.builtin.set_fact: + _software_config: "{{ _sw_config_slurp.content | b64decode | from_json }}" + + - name: "Set cluster OS variables" + ansible.builtin.set_fact: + cluster_os_type: "{{ _software_config.cluster_os_type }}" + cluster_os_version: "{{ _software_config.cluster_os_version }}" + + - name: "Load HA config" + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/high_availability_config.yml" + name: ha_config_data + + - name: "Extract kube_vip" + ansible.builtin.set_fact: + kube_vip: "{{ ha_config_data.service_k8s_cluster_ha[0].virtual_ip_address }}" + +- name: "Kubernetes Upgrade - Add kube_vip to inventory" + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: "Load HA config" + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/high_availability_config.yml" + name: ha_config_data + + - name: 
"Extract kube_vip" + ansible.builtin.set_fact: + kube_vip: "{{ ha_config_data.service_k8s_cluster_ha[0].virtual_ip_address }}" + + - name: "Add kube_vip to inventory" + ansible.builtin.add_host: + name: "{{ kube_vip }}" + ansible_host: "{{ kube_vip }}" + ansible_connection: ssh + ansible_user: root + ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + groups: kube_vip_group + k8s_target_version: "{{ k8s_target_version }}" + +- name: "Kubernetes Upgrade - Cache configuration files" + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Cache storage_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/storage_config.yml" + name: cached_storage_config + + - name: Cache omnia_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/omnia_config.yml" + name: cached_omnia_config + + - name: Cache high_availability_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/high_availability_config.yml" + name: cached_ha_config + + - name: Set cached config flag + ansible.builtin.set_fact: + configs_cached: true + cacheable: true + +- name: "Kubernetes Upgrade - Detect current version" + hosts: kube_vip_group + gather_facts: false + pre_tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: "Verify SSH connectivity to kube_vip" + ansible.builtin.wait_for_connection: + delay: 5 + timeout: 30 + register: ssh_check + ignore_errors: true + + - name: "Abort if SSH connection fails" + ansible.builtin.fail: + msg: "Failed to establish SSH connection to kube_vip {{ ansible_host }}" + when: ssh_check is failed 
+ + tasks: + - name: "Get all node versions" + ansible.builtin.raw: /usr/bin/kubectl get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' + register: _all_versions_raw + changed_when: false + + - name: "Parse node versions" + ansible.builtin.set_fact: + _all_versions: "{{ _all_versions_raw.stdout_lines | map('regex_replace', '^v', '') | list }}" + + # Pick the minimum with the version test: a plain string min ranks "1.31.10" below "1.31.9". + - name: "Set k8s_from_version" + ansible.builtin.set_fact: + k8s_from_version: "{% set ns = namespace(lowest=_all_versions | first) %}{% for candidate in _all_versions %}{% if candidate is version(ns.lowest, '<') %}{% set ns.lowest = candidate %}{% endif %}{% endfor %}{{ ns.lowest }}" + + # Separate task: a fact set in the same set_fact call is not visible to its sibling keys. + - name: "Set k8s_from_minor" + ansible.builtin.set_fact: + k8s_from_minor: "{{ k8s_from_version | regex_replace('\\.[0-9]+$', '') }}" + +- name: "Kubernetes Upgrade - Detect hop chain" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: "Load upgrade_vars.yml" + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + + - name: Set k8s_from_version from kube_vip + ansible.builtin.set_fact: + k8s_from_version: "{{ hostvars[kube_vip]['k8s_from_version'] }}" + k8s_from_minor: "{{ hostvars[kube_vip]['k8s_from_minor'] }}" + + - name: "Detect upgrade hop chain" + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/detect_hop_chain_from_manifest.yml" + + # Emit version scalars through to_json so YAML does not reload a minor such as 1.30 as the float 1.3. + - name: Save variables to file on localhost + ansible.builtin.copy: + content: | + k8s_from_version: {{ k8s_from_version | to_json }} + k8s_from_minor: {{ k8s_from_minor | to_json }} + k8s_target_version: {{ k8s_target_version | to_json }} + hop_chain: {{ hop_chain | to_json }} + is_multi_hop: {{ is_multi_hop }} + dest: /tmp/k8s_vars.yml + mode: "0644" + + - name: Copy k8s_vars.yml to kube_vip for remote plays + ansible.builtin.copy: + src: /tmp/k8s_vars.yml + dest: /tmp/k8s_vars.yml + mode: "0644" + delegate_to: "{{ kube_vip }}" + +- name: "Kubernetes Upgrade - Detect hop chain from cluster" + hosts: kube_vip_group + gather_facts: false + strategy: linear + tasks: 
+ - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Load variables from file + ansible.builtin.include_vars: + file: /tmp/k8s_vars.yml + + - name: "Load upgrade_vars.yml" + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + + - name: "Get current cluster version" + ansible.builtin.command: /usr/bin/kubectl get nodes -o jsonpath='{.items[*].status.nodeInfo.kubeletVersion}' + register: _all_versions_raw + changed_when: false + + # Pick the minimum with the version test: a plain string sort ranks "1.31.10" below "1.31.9". + - name: "Set current version facts (use minimum version across all nodes)" + ansible.builtin.set_fact: + _all_versions: "{{ _all_versions_raw.stdout.split() | map('regex_replace', '^v', '') | list }}" + k8s_from_version: "{% set found = _all_versions_raw.stdout.split() | map('regex_replace', '^v', '') | list %}{% set ns = namespace(lowest=found | first) %}{% for candidate in found %}{% if candidate is version(ns.lowest, '<') %}{% set ns.lowest = candidate %}{% endif %}{% endfor %}{{ ns.lowest }}" + + # Separate task: a fact set in the same set_fact call is not visible to its sibling keys. + - name: "Set current minor version fact" + ansible.builtin.set_fact: + k8s_from_minor: "{{ k8s_from_version | regex_replace('\\.[0-9]+$', '') }}" + + - name: "Display detected cluster version" ansible.builtin.debug: - msg: "K8s upgrade tasks to be implemented (validation gates, CP upgrade, worker rolling upgrade)" + msg: | + Detected node versions: {{ _all_versions }} + Minimum version (upgrade from): {{ k8s_from_version }} + Target version: {{ components.service_k8s.supported_versions | last }} + + - name: "Detect upgrade hop chain from cluster" + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/detect_hop_chain_from_manifest.yml" + +- name: "Kubernetes Upgrade - Get cluster version for status" + hosts: kube_vip_group + gather_facts: false + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Get current cluster version + ansible.builtin.command: /usr/bin/kubectl get nodes -o 
jsonpath='{.items[0].status.nodeInfo.kubeletVersion}' + register: cluster_version_raw + changed_when: false + + - name: Set cluster version facts + ansible.builtin.set_fact: + k8s_from_version: "{{ cluster_version_raw.stdout | regex_replace('^v', '') }}" + k8s_from_minor: "{{ cluster_version_raw.stdout | regex_replace('^v', '') | regex_replace('\\.[0-9]+$', '') }}" + +- name: "Kubernetes Upgrade - Setup upgrade directory on kube_vip" + hosts: kube_vip_group + gather_facts: false + strategy: linear + vars: + input_project_dir: "/opt/omnia/input/project_default" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Use cached storage_config + ansible.builtin.set_fact: + storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}" + + - name: Use cached omnia_config + ansible.builtin.set_fact: + omnia_config: "{{ hostvars['localhost']['cached_omnia_config'] }}" + + - name: Set k8s_nfs_storage_name + ansible.builtin.set_fact: + k8s_nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Set k8s_client_mount_path + ansible.builtin.set_fact: + k8s_client_mount_path: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }} + + - name: Ensure upgrade directory exists + ansible.builtin.file: + path: "{{ k8s_client_mount_path }}/upgrade" + state: directory + mode: "0755" + + - name: Set fact on localhost + ansible.builtin.set_fact: + k8s_client_mount_path_kube_vip: "{{ k8s_client_mount_path }}" + +- name: "Kubernetes Upgrade - Initialize upgrade status file" + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + nodes_yaml_path: "/opt/omnia/openchami/workdir/nodes/nodes.yaml" + group_cp_first: "service_kube_control_plane_first_x86_64" + group_cp: 
"service_kube_control_plane_x86_64" + group_worker: "service_kube_node_x86_64" + status_file: "{{ hostvars[kube_vip]['k8s_client_mount_path_kube_vip'] }}/upgrade/upgrade_status.yml" + kube_vip: "{{ hostvars[kube_vip]['ansible_host'] }}" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Use cached storage_config + ansible.builtin.set_fact: + storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}" + + - name: Use cached omnia_config + ansible.builtin.set_fact: + omnia_config: "{{ hostvars['localhost']['cached_omnia_config'] }}" + + - name: Set k8s_nfs_storage_name + ansible.builtin.set_fact: + k8s_nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Set k8s_client_mount_path + ansible.builtin.set_fact: + k8s_client_mount_path: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }} + + - name: Read nodes.yaml + ansible.builtin.slurp: + src: "{{ nodes_yaml_path }}" + register: nodes_slurp + changed_when: false + + - name: Parse nodes.yaml + ansible.builtin.set_fact: + parsed_nodes: "{{ nodes_slurp.content | b64decode | from_yaml }}" + + - name: Build node lists by role + ansible.builtin.set_fact: + groups_cp_first: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp_first) + | map(attribute='name') | list }} + groups_cp: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp) + | map(attribute='name') | list }} + groups_worker: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_worker) + | map(attribute='name') | list }} + all_upgrade_nodes: >- + {{ (parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp_first) + | map(attribute='name') | list) + + (parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp) + | map(attribute='name') | list) + + (parsed_nodes.nodes 
+ | selectattr('group', 'equalto', group_worker) + | map(attribute='name') | list) }} + + - name: Build node IP map + ansible.builtin.set_fact: + node_ips: >- + {{ node_ips | default({}) | combine({ + item.name: (item.interfaces | first).ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first + }) }} + loop: "{{ parsed_nodes.nodes }}" + loop_control: + label: "{{ item.name }}" + + - name: Load variables from file + ansible.builtin.include_vars: + file: /tmp/k8s_vars.yml - - name: Mark k8s upgrade as completed + - name: Check if upgrade status file already exists + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_stat + + - name: Read existing status file if it exists + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: existing_status_slurp + when: status_file_stat.stat.exists | default(false) + + - name: Parse existing status + ansible.builtin.set_fact: + existing_status: "{{ existing_status_slurp.content | b64decode | from_yaml | default({}, true) }}" + when: status_file_stat.stat.exists | default(false) + + # Steps are stored as dictionary for reliable YAML handling + # Execution order: setup_repos -> kubeadm_install -> kubeadm_upgrade_apply/node -> + # drain -> kubelet_install -> crio_install -> kubelet_restart -> uncordon -> validation -> etcd_health_check + - name: Build initial node status entries + ansible.builtin.set_fact: + initial_nodes: >- + {{ initial_nodes | default({}) | combine({ + item.0: { + 'role': item.1, + 'ip': node_ips[item.0], + 'version_before': k8s_from_version, + 'version_current': k8s_from_version, + 'status': 'pending', + 'steps': ( + { + 'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_upgrade_apply': {'status': 'pending', 'timestamp': None, 'error': None}, + 'drain': {'status': 'pending', 
'timestamp': None, 'error': None}, + 'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'crio_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None}, + 'uncordon': {'status': 'pending', 'timestamp': None, 'error': None}, + 'validation': {'status': 'pending', 'timestamp': None, 'error': None}, + 'etcd_health_check': {'status': 'pending', 'timestamp': None, 'error': None} + } + if item.1 == 'control_plane_first' + else { + 'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_upgrade_node': {'status': 'pending', 'timestamp': None, 'error': None}, + 'drain': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'crio_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None}, + 'uncordon': {'status': 'pending', 'timestamp': None, 'error': None}, + 'validation': {'status': 'pending', 'timestamp': None, 'error': None}, + 'etcd_health_check': {'status': 'pending', 'timestamp': None, 'error': None} + } + if item.1 == 'control_plane' + else { + 'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubeadm_upgrade_node': {'status': 'pending', 'timestamp': None, 'error': None}, + 'drain': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'crio_install': {'status': 'pending', 'timestamp': None, 'error': None}, + 'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None}, + 'uncordon': {'status': 'pending', 'timestamp': None, 'error': None}, + 'validation': {'status': 'pending', 'timestamp': 
None, 'error': None} + } + if item.1 == 'worker' + else {} + ) + } + }) }} + loop: >- + {{ (groups_cp_first | zip_longest([], fillvalue='control_plane_first')) + + (groups_cp | zip_longest([], fillvalue='control_plane')) + + (groups_worker | zip_longest([], fillvalue='worker')) }} + loop_control: + label: "{{ item.0 }}" + + - name: Build complete upgrade status structure + ansible.builtin.set_fact: + complete_upgrade_status: + upgrade: + from_version: "{{ k8s_from_version }}" + target_version: "{{ k8s_target_version }}" + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + completed_at: + etcd_backup: + status: pending + path: + timestamp: + error: + k8s_config_backup: + status: pending + path: + timestamp: + error: + addon_upgrade: + status: pending + calico: + status: pending + metallb: + status: pending + helm: + status: pending + bss_update: + service_kube_control_plane_first: + status: pending + service_kube_control_plane: + status: pending + service_kube_node: + status: pending + nodes: "{{ initial_nodes }}" + + - name: Merge with existing status if it exists + ansible.builtin.set_fact: + final_upgrade_status: "{{ complete_upgrade_status | combine(existing_status, recursive=true, list_merge='replace') }}" + when: status_file_stat.stat.exists | default(false) + + - name: Use complete status if no existing file + ansible.builtin.set_fact: + final_upgrade_status: "{{ complete_upgrade_status }}" + when: not (status_file_stat.stat.exists | default(false)) + + - name: Write status to JSON file locally ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'completed' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' + content: "{{ final_upgrade_status | to_json }}" + dest: /tmp/upgrade_status_init.json + mode: "0644" + + - name: Convert JSON to YAML + ansible.builtin.shell: + cmd: >- + python3 -c "import json, yaml; + f = 
open('/tmp/upgrade_status_init.json'); + data = json.load(f); f.close(); + f = open('/tmp/upgrade_status_init.yml', 'w'); + yaml.dump(data, f, default_flow_style=False, sort_keys=False); + f.close()" + changed_when: true + + - name: Ensure status file directory exists on kube_vip + ansible.builtin.file: + path: "{{ status_file | dirname }}" + state: directory + mode: "0755" + delegate_to: "{{ kube_vip }}" + + - name: Copy status file to kube_vip + ansible.builtin.copy: + src: /tmp/upgrade_status_init.yml + dest: "{{ status_file }}" + mode: "0644" + delegate_to: "{{ kube_vip }}" + + - name: Clean up temp files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - /tmp/upgrade_status_init.json + - /tmp/upgrade_status_init.yml + + - name: Display status initialization result + ansible.builtin.debug: + msg: "Upgrade status file initialized at {{ status_file }} with complete structure including all nodes and steps" + +- name: "Kubernetes Upgrade - Load node groups for backup and setup_repos" + hosts: localhost + connection: local + gather_facts: false + strategy: linear + vars: + input_project_dir: "/opt/omnia/input/project_default" + nodes_yaml_path: "/opt/omnia/openchami/workdir/nodes/nodes.yaml" + group_cp_first: "service_kube_control_plane_first_x86_64" + group_cp: "service_kube_control_plane_x86_64" + group_worker: "service_kube_node_x86_64" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Use cached storage_config + ansible.builtin.set_fact: + storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}" + + - name: Use cached omnia_config + ansible.builtin.set_fact: + omnia_config: "{{ hostvars['localhost']['cached_omnia_config'] }}" + + - name: Set k8s_nfs_storage_name + ansible.builtin.set_fact: + k8s_nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Read 
nodes.yaml + ansible.builtin.slurp: + src: "{{ nodes_yaml_path }}" + register: nodes_slurp + changed_when: false + + - name: Parse nodes.yaml + ansible.builtin.set_fact: + parsed_nodes: "{{ nodes_slurp.content | b64decode | from_yaml }}" + + - name: Build first control plane list + ansible.builtin.set_fact: + groups_cp_first: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp_first) + | map(attribute='name') | list }} + + - name: Build additional control plane list + ansible.builtin.set_fact: + groups_cp: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_cp) + | map(attribute='name') | list }} + + - name: Build worker list + ansible.builtin.set_fact: + groups_worker: >- + {{ parsed_nodes.nodes + | selectattr('group', 'equalto', group_worker) + | map(attribute='name') | list }} + + - name: Add first control plane to inventory + ansible.builtin.add_host: + name: "{{ item }}" + groups: + - k8s_control_plane_first + - k8s_control_planes + ansible_host: >- + {{ (parsed_nodes.nodes | selectattr('name', 'equalto', item) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first }} + loop: "{{ groups_cp_first }}" + + - name: Add additional control planes to inventory + ansible.builtin.add_host: + name: "{{ item }}" + groups: + - k8s_control_plane + - k8s_control_planes + ansible_host: >- + {{ (parsed_nodes.nodes | selectattr('name', 'equalto', item) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first }} + loop: "{{ groups_cp }}" + + - name: Add workers to inventory + ansible.builtin.add_host: + name: "{{ item }}" + groups: k8s_workers + ansible_host: >- + {{ (parsed_nodes.nodes | selectattr('name', 'equalto', item) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first }} + loop: "{{ groups_worker }}" + + - name: Write dynamic inventory file for 
subprocess calls (created once) + ansible.builtin.copy: + content: | + {% set ssh_args = '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -%} + [k8s_control_plane_first] + {% for host in groups_cp_first %} + {% set host_ip = (parsed_nodes.nodes + | selectattr('name', 'equalto', host) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first -%} + {{ host }} ansible_host={{ host_ip }} ansible_user=root ansible_ssh_common_args="{{ ssh_args }}" + {% endfor %} + + [k8s_control_plane] + {% for host in groups_cp %} + {% set host_ip = (parsed_nodes.nodes + | selectattr('name', 'equalto', host) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first -%} + {{ host }} ansible_host={{ host_ip }} ansible_user=root ansible_ssh_common_args="{{ ssh_args }}" + {% endfor %} + + [k8s_workers] + {% for host in groups_worker %} + {% set host_ip = (parsed_nodes.nodes + | selectattr('name', 'equalto', host) + | first).interfaces.0.ip_addrs + | selectattr('name', 'equalto', 'management') + | map(attribute='ip_addr') | first -%} + {{ host }} ansible_host={{ host_ip }} ansible_user=root ansible_ssh_common_args="{{ ssh_args }}" + {% endfor %} + + [kube_vip_group] + {% set vip = hostvars['localhost']['kube_vip'] -%} + {{ vip }} ansible_host={{ vip }} ansible_user=root ansible_ssh_common_args="{{ ssh_args }}" + dest: /tmp/k8s_upgrade_inventory.ini + mode: "0644" + + - name: Set backup paths for control planes + ansible.builtin.set_fact: + k8s_config_backup_dir: "{{ (storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | first).mount_point }}/upgrade/backup/configs" + +- name: "Kubernetes Upgrade - Backup etcd on kube_vip" + hosts: kube_vip_group + gather_facts: false + strategy: linear + vars: + input_project_dir: "/opt/omnia/input/project_default" + cluster_os_version: "{{ _software_config.cluster_os_version }}" + tasks: + - name: "Skip 
all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Load variables from file + ansible.builtin.include_vars: + file: /tmp/k8s_vars.yml + + - name: Use cached storage_config + ansible.builtin.set_fact: + storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}" + + - name: Use cached omnia_config + ansible.builtin.set_fact: + omnia_config: "{{ hostvars['localhost']['cached_omnia_config'] }}" + + - name: Set k8s_nfs_storage_name + ansible.builtin.set_fact: + k8s_nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Set backup paths (using k8s_from_version as backup is taken before upgrade) + ansible.builtin.set_fact: + k8s_client_mount_path: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }} + backup_dir_client: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }}/upgrade/backup/{{ k8s_from_version }} + etcd_members_file: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }}/upgrade/backup/etcd-members.json + status_file: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }}/upgrade/upgrade_status.yml + kube_vip: "{{ hostvars['localhost']['kube_vip'] }}" + + - name: Run etcd backup + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/backup_etcd.yml" + +- name: "Kubernetes Upgrade - Backup K8s config on control planes" + hosts: k8s_control_planes + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + k8s_client_mount_path: "{{ hostvars[groups.kube_vip_group[0]]['k8s_client_mount_path_kube_vip'] }}" + kube_vip: "{{ hostvars[groups.kube_vip_group[0]]['ansible_host'] }}" + tasks: + - name: "Skip all tasks — service_k8s 
not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Set backup paths from hostvars + ansible.builtin.set_fact: + k8s_config_backup_dir: "{{ hostvars['localhost']['k8s_config_backup_dir'] }}" + + - name: Run K8s config backup + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/backup_k8s_config.yml" + +- name: "Kubernetes Upgrade - Load setup_repos configuration on localhost" + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + oim_provision_path: "/opt/omnia/provision" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Load software_config.json + ansible.builtin.slurp: + path: "{{ input_project_dir }}/software_config.json" + register: _sw_config_slurp + + - name: Parse software_config.json + ansible.builtin.set_fact: + _software_config: "{{ _sw_config_slurp.content | b64decode | from_json }}" + + - name: Load local_repo_access.yml + ansible.builtin.slurp: + path: "{{ oim_provision_path }}/local_repo_access.yml" + register: _local_repo_slurp + + - name: Parse local_repo_access.yml + ansible.builtin.set_fact: + _local_repo_access: "{{ _local_repo_slurp.content | b64decode | from_yaml }}" + + - name: Extract pulp protocol + ansible.builtin.set_fact: + pulp_protocol: "{{ _local_repo_access.offline_tarball_path | regex_replace('^(https?)://.*', '\\1') }}" + + - name: Extract Pulp server IP from local_repo_access + ansible.builtin.set_fact: + pulp_server_ip: "{{ _local_repo_access.offline_tarball_path | regex_replace('^(https?)://([^:]+):.*', '\\2') }}" + + - name: Set admin_nic_ip from Pulp server IP + ansible.builtin.set_fact: + admin_nic_ip: "{{ pulp_server_ip }}" + + - name: Set cluster OS version + ansible.builtin.set_fact: + cluster_os_version: "{{ 
_software_config.cluster_os_version }}" + + - name: Load HA config + ansible.builtin.slurp: + path: "{{ input_project_dir }}/high_availability_config.yml" + register: _ha_config_slurp + + - name: Parse HA config + ansible.builtin.set_fact: + _ha_config_data: "{{ _ha_config_slurp.content | b64decode | from_yaml }}" + + - name: Extract kube_vip from HA config + ansible.builtin.set_fact: + kube_vip: "{{ _ha_config_data.service_k8s_cluster_ha[0].virtual_ip_address }}" + + - name: Load upgrade_vars.yml + ansible.builtin.slurp: + path: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + register: _upgrade_config_slurp + + - name: Parse upgrade_vars.yml + ansible.builtin.set_fact: + _upgrade_config: "{{ _upgrade_config_slurp.content | b64decode | from_yaml }}" + + - name: Extract K8s target version + ansible.builtin.set_fact: + k8s_target_version: "{{ _upgrade_config.components.service_k8s.supported_versions | last }}" + + - name: Set k8s_target_minor + ansible.builtin.set_fact: + k8s_target_minor: "{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') }}" + + - name: Set intermediate variables for repo generation + ansible.builtin.set_fact: + _pulp_protocol: "{{ _local_repo_access.offline_tarball_path | regex_replace('^(https?)://.*', '\\1') }}" + _pulp_server_ip: "{{ pulp_server_ip }}" + _cluster_os_version: "{{ _software_config.cluster_os_version }}" + _k8s_target_version: "{{ _upgrade_config.components.service_k8s.supported_versions | last }}" + _k8s_target_minor: "{{ _upgrade_config.components.service_k8s.supported_versions | last | regex_replace('\\.[0-9]+$', '') }}" + + - name: Set pulp base URLs + ansible.builtin.set_fact: + pulp_content_base: "{{ _pulp_protocol }}://{{ _pulp_server_ip }}:2225/pulp/content" + pulp_repo_base: >- + {{ _pulp_protocol }}://{{ _pulp_server_ip }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/rhel/{{ _cluster_os_version }}/rpms + + - name: Generate upgrade repo file content + ansible.builtin.set_fact: + repo_content: | 
+ # Omnia K8s Upgrade Repository Configuration + # Generated by upgrade playbook for K8s {{ _k8s_target_version }} + # This file configures access to Pulp repositories for upgrade packages + + [x86_64_rhel_{{ _cluster_os_version }}_kubernetes-v{{ _k8s_target_minor | replace('.', '-') }}] + name=Kubernetes {{ _k8s_target_version }} Repository + baseurl={{ pulp_repo_base }}/x86_64_rhel_{{ _cluster_os_version }}_kubernetes-v{{ _k8s_target_minor | replace('.', '-') }}/ + enabled=1 + gpgcheck=0 + + [x86_64_rhel_{{ _cluster_os_version }}_cri-o-v{{ _k8s_target_minor | replace('.', '-') }}] + name=CRI-O {{ _k8s_target_version }} Repository + baseurl={{ pulp_repo_base }}/x86_64_rhel_{{ _cluster_os_version }}_cri-o-v{{ _k8s_target_minor | replace('.', '-') }}/ + enabled=1 + gpgcheck=0 + +- name: "Kubernetes Upgrade - Setup repos on all nodes" + hosts: k8s_control_planes:k8s_workers + gather_facts: false + strategy: linear + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Deploy upgrade repo file to node + ansible.builtin.copy: + content: "{{ hostvars['localhost']['repo_content'] }}" + dest: "/etc/yum.repos.d/omnia-upgrade.repo" + mode: "0644" + + - name: Verify repo file exists + ansible.builtin.stat: + path: "/etc/yum.repos.d/omnia-upgrade.repo" + register: repo_file_check + + - name: Fail if repo file was not created + ansible.builtin.fail: + msg: "Failed to create repository file /etc/yum.repos.d/omnia-upgrade.repo on {{ inventory_hostname }}" + when: not repo_file_check.stat.exists + + - name: Display repo file status + ansible.builtin.debug: + msg: "Repo file created on {{ inventory_hostname }}: {{ repo_file_check.stat.exists }}" + + - name: Test repository URL reachability (Kubernetes repo) + ansible.builtin.uri: + url: >- + {{ hostvars['localhost']['pulp_repo_base'] }}/x86_64_rhel_{{ + hostvars['localhost']['_cluster_os_version'] + 
}}_kubernetes-v{{ + hostvars['localhost']['_k8s_target_minor'] + | replace('.', '-') }}/ + method: HEAD + validate_certs: false + timeout: 10 + register: k8s_repo_url_check + failed_when: false + changed_when: false + + - name: Test repository URL reachability (CRI-O repo) + ansible.builtin.uri: + url: >- + {{ hostvars['localhost']['pulp_repo_base'] }}/x86_64_rhel_{{ + hostvars['localhost']['_cluster_os_version'] + }}_cri-o-v{{ + hostvars['localhost']['_k8s_target_minor'] + | replace('.', '-') }}/ + method: HEAD + validate_certs: false + timeout: 10 + register: crio_repo_url_check + failed_when: false + changed_when: false + + - name: Fail if repository URLs are not reachable + ansible.builtin.fail: + msg: | + {% set repo_base = hostvars['localhost']['pulp_repo_base'] -%} + {% set os_ver = hostvars['localhost']['_cluster_os_version'] -%} + {% set k8s_minor = hostvars['localhost']['_k8s_target_minor'] -%} + ═══════════════════════════════════════════════════════════════════════════ + REPOSITORY URL VALIDATION FAILED ON {{ inventory_hostname }} + ═══════════════════════════════════════════════════════════════════════════ + + The upgrade repository URLs are not reachable from this node. + + Kubernetes Repository: + URL: {{ repo_base }}/x86_64_rhel_{{ os_ver }}_kubernetes-v{{ k8s_minor | replace('.', '-') }}/ + Status: {{ k8s_repo_url_check.status | default('UNREACHABLE') }} + {% if k8s_repo_url_check.msg is defined %} + Error: {{ k8s_repo_url_check.msg }} + {% endif %} + + CRI-O Repository: + URL: {{ repo_base }}/x86_64_rhel_{{ os_ver }}_cri-o-v{{ k8s_minor | replace('.', '-') }}/ + Status: {{ crio_repo_url_check.status | default('UNREACHABLE') }} + {% if crio_repo_url_check.msg is defined %} + Error: {{ crio_repo_url_check.msg }} + {% endif %} + + REQUIRED ACTIONS: + ┌─────────────────────────────────────────────────────────────────────┐ + │ 1. Verify Pulp server is running and accessible │ + │ 2. 
Check network connectivity from {{ inventory_hostname }} to Pulp server │ + │ 3. Verify firewall rules allow access to port 2225 │ + │ 4. Ensure the repository paths exist on the Pulp server │ + └─────────────────────────────────────────────────────────────────────┘ + + UPGRADE STOPPED: Repository must be reachable before proceeding. + ═══════════════════════════════════════════════════════════════════════════ + when: >- + k8s_repo_url_check.status is not defined or + crio_repo_url_check.status is not defined or + k8s_repo_url_check.status not in [200, 301, 302] or + crio_repo_url_check.status not in [200, 301, 302] + + - name: Check if kubeadm package is available + ansible.builtin.command: + cmd: dnf list available --showduplicates kubeadm-{{ hostvars['localhost']['_k8s_target_version'] }} + register: kubeadm_check + changed_when: false + failed_when: false + + - name: Check if CRI-O package is available + ansible.builtin.command: + cmd: dnf list available --showduplicates cri-o-{{ hostvars['localhost']['_k8s_target_version'] }} + register: crio_check + changed_when: false + failed_when: false + + - name: Set repository validation facts + ansible.builtin.set_fact: + repo_validation_passed: "{{ kubeadm_check.rc == 0 and crio_check.rc == 0 }}" + repo_validation_errors: [] + delegate_to: localhost + + - name: Collect repository validation errors + ansible.builtin.set_fact: + repo_validation_errors: >- + {{ repo_validation_errors + [{'node': inventory_hostname, + 'kubeadm_rc': kubeadm_check.rc, 'crio_rc': crio_check.rc, + 'kubeadm_error': kubeadm_check.stderr, + 'crio_error': crio_check.stderr}] }} + delegate_to: localhost + when: kubeadm_check.rc != 0 or crio_check.rc != 0 + + - name: Display package availability for successful nodes + ansible.builtin.debug: + msg: | + Repository validation PASSED on {{ inventory_hostname }}: + - kubeadm package available: {{ kubeadm_check.stdout_lines | first if kubeadm_check.rc == 0 else 'NOT AVAILABLE' }} + - CRI-O package 
available: {{ crio_check.stdout_lines | first if crio_check.rc == 0 else 'NOT AVAILABLE' }} + when: kubeadm_check.rc == 0 and crio_check.rc == 0 + + - name: Fail playbook if any repository validation errors exist + ansible.builtin.fail: + msg: | + ═══════════════════════════════════════════════════════════════════════════ + REPOSITORY VALIDATION FAILED - UPGRADE CANNOT PROCEED + ═══════════════════════════════════════════════════════════════════════════ + + The following nodes cannot access required Kubernetes packages: + {% for error in repo_validation_errors %} + + Node: {{ error.node }} + ├─ kubeadm package: {{ 'FAILED' if error.kubeadm_rc != 0 else 'OK' }} + ├─ CRI-O package: {{ 'FAILED' if error.crio_rc != 0 else 'OK' }} + {% if error.kubeadm_rc != 0 %} + └─ kubeadm error: {{ error.kubeadm_error | regex_replace('\\n', '\\n │ ') }} + {% endif %} + {% if error.crio_rc != 0 %} + └─ CRI-O error: {{ error.crio_error | regex_replace('\\n', '\\n │ ') }} + {% endif %} + {% endfor %} + + REQUIRED ACTIONS: + ┌─────────────────────────────────────────────────────────────────────┐ + │ 1. Verify Pulp server (182.10.5.150:2225) is accessible from ALL nodes │ + │ 2. Check network connectivity and firewall rules │ + │ 3. Validate repository URLs in /etc/yum.repos.d/omnia-upgrade.repo │ + │ 4. Ensure all required repositories are available on Pulp server │ + │ 5. Fix repository connectivity issues before retrying upgrade │ + └─────────────────────────────────────────────────────────────────────┘ + + UPGRADE STOPPED: All nodes must have reliable repository access. 
+ ═══════════════════════════════════════════════════════════════════════════ + delegate_to: localhost + when: repo_validation_errors | length > 0 + + - name: Display repository validation success + ansible.builtin.debug: + msg: | + ═══════════════════════════════════════════════════════════════════════════ + REPOSITORY VALIDATION PASSED - ALL NODES CAN ACCESS PACKAGES + ═══════════════════════════════════════════════════════════════════════════ + + All {{ ansible_play_hosts_all | length }} nodes successfully validated: + ✅ kubeadm package version {{ hostvars['localhost']['_k8s_target_version'] }} + ✅ CRI-O package version {{ hostvars['localhost']['_k8s_target_version'] }} + + Repository connectivity is confirmed. Proceeding with upgrade... + ═══════════════════════════════════════════════════════════════════════════ + delegate_to: localhost + when: repo_validation_errors | length == 0 + +- name: "Kubernetes Upgrade - Execute Prep and Orchestration" + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + cluster_os_version: "{{ _software_config.cluster_os_version }}" + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Load omnia_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/omnia_config.yml" + name: omnia_config + + - name: Set nfs_storage_name from omnia_config + ansible.builtin.set_fact: + nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Call upgrade_k8s role + ansible.builtin.include_role: + name: ../roles/upgrade_k8s + +- name: "Kubernetes Upgrade - Post-Validation" + hosts: localhost + connection: local + gather_facts: true + vars: + input_project_dir: "/opt/omnia/input/project_default" + oim_data_path: "/opt/omnia/.data" + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + 
component_name: k8s + tasks: + - name: "Skip all tasks — service_k8s not configured" + ansible.builtin.meta: end_play + when: not (hostvars['localhost']['k8s_upgrade_enabled'] | default(true)) + + - name: Load HA config + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/high_availability_config.yml" + name: ha_config_data + + - name: Extract kube_vip + ansible.builtin.set_fact: + kube_vip: "{{ ha_config_data.service_k8s_cluster_ha[0].virtual_ip_address }}" + + - name: Load upgrade_vars.yml + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../../common/vars/upgrade_vars.yml" + + - name: Set k8s_target_version + ansible.builtin.set_fact: + k8s_target_version: "{{ components.service_k8s.supported_versions | last }}" + + - name: Use cached storage_config + ansible.builtin.set_fact: + storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}" + + - name: Use cached omnia_config + ansible.builtin.set_fact: + omnia_config: "{{ hostvars['localhost']['cached_omnia_config'] }}" + + - name: Derive k8s_client_mount_path from storage config + ansible.builtin.set_fact: + k8s_nfs_storage_name: "{{ omnia_config.service_k8s_cluster[0].nfs_storage_name }}" + + - name: Set k8s_client_mount_path + ansible.builtin.set_fact: + k8s_client_mount_path: >- + {{ (storage_config.mounts + | selectattr('name', 'equalto', k8s_nfs_storage_name) + | first).mount_point }} + + - name: Set status_file path + ansible.builtin.set_fact: + status_file: "{{ k8s_client_mount_path }}/upgrade/upgrade_status.yml" + + - name: Read nodes.yaml for all_upgrade_nodes + ansible.builtin.slurp: + src: "/opt/omnia/openchami/workdir/nodes/nodes.yaml" + register: nodes_slurp + + - name: Parse nodes.yaml + ansible.builtin.set_fact: + parsed_nodes: "{{ nodes_slurp.content | b64decode | from_yaml }}" + + - name: Build all_upgrade_nodes list + ansible.builtin.set_fact: + all_upgrade_nodes: >- + {{ (parsed_nodes.nodes + | selectattr('group', 'equalto', 
'service_kube_control_plane_first_x86_64') + | map(attribute='name') | list) + + (parsed_nodes.nodes + | selectattr('group', 'equalto', 'service_kube_control_plane_x86_64') + | map(attribute='name') | list) + + (parsed_nodes.nodes + | selectattr('group', 'equalto', 'service_kube_node_x86_64') + | map(attribute='name') | list) }} + + - name: Get k8s_from_version from status file + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + + - name: Parse status file + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + + - name: Set k8s_from_version + ansible.builtin.set_fact: + k8s_from_version: "{{ upgrade_status.upgrade.from_version }}" + + - name: Run post-validation + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/post_validation.yml" + + - name: Update upgrade and multi_hop status to completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_step.yml" + vars: + status_update: + upgrade: + status: completed + completed_at: "{{ ansible_date_time.iso8601 }}" + multi_hop: + current_hop: "{{ upgrade_status.multi_hop.current_hop | default(0) }}" + hops: >- + {{ (upgrade_status.multi_hop.hops | default([])) + | map('combine', {'status': 'completed', 'completed_at': ansible_date_time.iso8601}) + | list }} + + - name: Update upgrade manifest + block: + - name: Load current manifest + ansible.builtin.set_fact: + manifest: "{{ lookup('file', oim_data_path ~ '/upgrade_manifest.yml') | from_yaml }}" + ignore_errors: true + register: manifest_load + + - name: Update k8s component status + ansible.builtin.copy: + content: | + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + 'k8s': 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ oim_data_path }}/upgrade_manifest.yml" + mode: "0644" + when: manifest_load is succeeded + + - name: Display upgrade completion message + 
ansible.builtin.debug: + msg: | + ============================================================ + Kubernetes Upgrade Complete! + ============================================================ + From version: {{ k8s_from_version }} + To version: {{ k8s_target_version }} + Nodes upgraded: {{ all_upgrade_nodes | length }} + + Post-validation passed: + - All nodes Ready + - All nodes at target version + - kube-system pods Running + - etcd cluster healthy + - Calico pods Running + - MetalLB pods Running + - API server reachable + - DNS resolution working + ============================================================ - name: "Display upgrade status completed — {{ component_name }}" ansible.builtin.debug: diff --git a/upgrade/playbooks/upgrade_oim.yml b/upgrade/playbooks/upgrade_oim.yml index 0d37f3ff40..9e39726d7d 100644 --- a/upgrade/playbooks/upgrade_oim.yml +++ b/upgrade/playbooks/upgrade_oim.yml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- - # ============================================================================ # upgrade_oim.yml — Internal playbook (imported by upgrade.yml --tags oim) # ============================================================================ @@ -39,7 +38,6 @@ input_project_dir: "/opt/omnia/input/project_default" build_stream_config_path: "/opt/omnia/input/project_default/build_stream_config.yml" tasks: - # ── Pre-flight: manifest read + idempotency ───────────────────────── - name: Read upgrade_manifest.yml ansible.builtin.slurp: @@ -59,7 +57,7 @@ # Bypass with: -e skip_approval=true (for CI/CD automation) - name: Display OIM upgrade summary and request approval ansible.builtin.pause: - prompt: | + prompt: |2 ══════════════════════════════════════════════════════════════ OIM UPGRADE — APPROVAL REQUIRED @@ -88,7 +86,7 @@ }) }) | to_nice_yaml }} dest: "{{ manifest_path }}" - mode: '0644' + mode: "0644" - name: "Display upgrade status in-progress — {{ component_name }}" ansible.builtin.debug: @@ -191,7 +189,7 @@ }) }) | to_nice_yaml }} dest: "{{ manifest_path }}" - mode: '0644' + mode: "0644" - name: "Display upgrade status completed — {{ component_name }}" ansible.builtin.debug: diff --git a/upgrade/playbooks/upgrade_workers.yml b/upgrade/playbooks/upgrade_workers.yml new file mode 100644 index 0000000000..1c041ac424 --- /dev/null +++ b/upgrade/playbooks/upgrade_workers.yml @@ -0,0 +1,1022 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Worker upgrade playbook - runs directly on worker nodes via SSH +# Called from execute_single_hop.yml via ansible-playbook command +# +# Batch Processing: +# - Worker-1 is always upgraded first (Component Spec PHASE 6 requirement) +# - Remaining workers are upgraded in batches controlled by worker_parallel_count +# - Default worker_parallel_count=1 (serial), can be overridden with --extra-vars +# +# Error Handling: +# - Each step has its own block/rescue to capture errors +# - Failed steps are marked with status: failed and error description +# - On subsequent runs, steps with status pending/in_progress/failed will be retried +# - Only steps with status: completed are skipped + +- name: Upgrade worker-1 first (Component Spec PHASE 6) + hosts: k8s_workers[0] + serial: 1 + tasks: + - name: Check if upgrade status file exists on kube_vip + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_check + + - name: Load upgrade status from kube_vip + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + when: status_file_check.stat.exists | default(false) + + - name: Parse upgrade status + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + when: status_file_check.stat.exists | default(false) + + - name: Abort if upgrade status file is missing on kube_vip + ansible.builtin.fail: + msg: >- + Upgrade status file is missing on kube_vip ({{ kube_vip }}). + Expected: {{ status_file }} + This file should be created during the orchestration phase (load_status.yml). 
+ when: not (status_file_check.stat.exists | default(false)) + + - name: Set current node name + ansible.builtin.set_fact: + current_node_name: "{{ inventory_hostname }}" + + - name: Set node IP from upgrade status + ansible.builtin.set_fact: + node_ip: "{{ upgrade_status.nodes[current_node_name].ip }}" + + - name: "Skip node if already completed - {{ current_node_name }}" + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." + when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + + - name: Upgrade worker {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + # ── kubeadm_install ────────────────────────────────────────── + - name: Run kubeadm_install on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + 
timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install kubeadm {{ k8s_target_version }} + ansible.builtin.dnf: + name: "kubeadm-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: kubeadm_install_result + + - name: Mark kubeadm_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubeadm_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubeadm install failed')) }}" + - name: Fail kubeadm_install step + ansible.builtin.fail: + msg: "kubeadm_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubeadm_upgrade_node ────────────────────────────────────── + - name: Run kubeadm_upgrade_node on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Run kubeadm upgrade node + ansible.builtin.command: kubeadm 
upgrade node + register: upgrade_node_result + changed_when: false + + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_upgrade_node failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubeadm_upgrade_node: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('kubeadm upgrade node failed')) }}" + - name: Fail kubeadm_upgrade_node step + ansible.builtin.fail: + msg: "kubeadm_upgrade_node failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── cordon (production-safe: cordon only, no eviction) ──────── + # Instead of drain (which evicts pods and can be blocked by PDBs), + # we cordon the node to prevent new pods, then let existing pods + # restart in-place after kubelet upgrade. This is safer for stateful + # workloads like Kafka that have strict PodDisruptionBudgets. 
+ - name: Run cordon on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Cordon node (prevent new pod scheduling) {{ current_node_name }} + ansible.builtin.command: kubectl cordon {{ node_ip }} + delegate_to: "{{ kube_vip }}" + register: cordon_result + changed_when: false + + - name: Mark drain completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark drain failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + drain: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('cordon failed')) }}" + - name: Fail drain step + ansible.builtin.fail: + msg: "cordon/drain failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── kubelet_install ────────────────────────────────────────── + - name: Run kubelet_install on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ 
current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install kubelet and kubectl {{ k8s_target_version }} + ansible.builtin.dnf: + name: + - "kubelet-{{ k8s_target_version }}" + - "kubectl-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: kubelet_install_result + + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubelet_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet install failed')) }}" + - name: Fail kubelet_install step + ansible.builtin.fail: + msg: "kubelet_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── crio_install ───────────────────────────────────────────── + - name: Run crio_install on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress 
+ timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install cri-o {{ k8s_target_version }} + ansible.builtin.dnf: + name: "cri-o-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: crio_install_result + + - name: Reload systemd after cri-o install + ansible.builtin.systemd: + daemon_reload: true + + - name: Restart cri-o service + ansible.builtin.systemd: + name: crio + state: restarted + enabled: true + + - name: Mark crio_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark crio_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + crio_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('cri-o install failed')) }}" + - name: Fail crio_install step + ansible.builtin.fail: + msg: "crio_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubelet_restart ────────────────────────────────────────── + - name: Run kubelet_restart on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + 
kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: Restart kubelet + ansible.builtin.systemd: + name: kubelet + state: restarted + enabled: true + + - name: Wait for node to become Ready with target version + ansible.builtin.command: >- + kubectl get node {{ node_ip }} + -o jsonpath="{.status.nodeInfo.kubeletVersion}:{range .status.conditions[?(@.type==\"Ready\")]}{.status}{end}" + delegate_to: "{{ kube_vip }}" + register: node_ready + until: + - node_ready.rc == 0 + - "'v' + k8s_target_version + ':True' in node_ready.stdout" + retries: "{{ kubelet_ready_retries }}" + delay: "{{ kubelet_ready_delay }}" + changed_when: false + + - name: Mark kubelet_restart completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_restart failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubelet_restart: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet restart failed')) }}" + - name: Fail kubelet_restart step + ansible.builtin.fail: + msg: "kubelet_restart failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── uncordon ───────────────────────────────────────────────── + - name: Run uncordon on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + 
ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Uncordon node {{ current_node_name }} + ansible.builtin.command: kubectl uncordon {{ node_ip }} + delegate_to: "{{ kube_vip }}" + register: uncordon_result + changed_when: false + + - name: Mark uncordon completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark uncordon failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + uncordon: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('uncordon failed')) }}" + - name: Fail uncordon step + ansible.builtin.fail: + msg: "uncordon failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── validation ─────────────────────────────────────────────── + - name: Run validation on node {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Mark validation in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Verify node version + ansible.builtin.command: > + kubectl get node {{ 
node_ip }} -o jsonpath='{.status.nodeInfo.kubeletVersion}' + delegate_to: "{{ kube_vip }}" + register: node_version + failed_when: node_version.stdout != ("v" + k8s_target_version) + changed_when: false + + - name: Mark validation completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark validation failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + validation: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "Node version mismatch. Expected: v{{ k8s_target_version }}, Got: {{ node_version.stdout | default('unknown') }}" + - name: Fail validation step + ansible.builtin.fail: + msg: "validation failed: Node version mismatch. Expected: v{{ k8s_target_version }}, Got: {{ node_version.stdout | default('unknown') }}" + + # ── Mark node completed ────────────────────────────────────── + - name: Mark node completed {{ current_node_name }} + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + rescue: + - name: Mark node as failed (outer rescue) {{ current_node_name }} + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + + - name: Fail after marking status + ansible.builtin.fail: + msg: "Worker {{ current_node_name }} upgrade failed." 
+ +# ============================================================================ +# Second Play: Upgrade remaining workers in batches +# ============================================================================ +- name: Upgrade remaining workers in batches + hosts: k8s_workers[1:] + serial: "{{ worker_parallel_count | default(1) }}" + tasks: + - name: Check if upgrade status file exists on kube_vip + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_check + + - name: Load upgrade status from kube_vip + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + when: status_file_check.stat.exists | default(false) + + - name: Parse upgrade status + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + when: status_file_check.stat.exists | default(false) + + - name: Abort if upgrade status file is missing on kube_vip + ansible.builtin.fail: + msg: >- + Upgrade status file is missing on kube_vip ({{ kube_vip }}). + Expected: {{ status_file }} + This file should be created during the orchestration phase (load_status.yml). + when: not (status_file_check.stat.exists | default(false)) + + - name: Set current node name + ansible.builtin.set_fact: + current_node_name: "{{ inventory_hostname }}" + + - name: Set node IP from upgrade status + ansible.builtin.set_fact: + node_ip: "{{ upgrade_status.nodes[current_node_name].ip }}" + + - name: "Skip node if already completed - {{ current_node_name }}" + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." 
+ when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + + - name: Upgrade worker {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + # ── kubeadm_install ────────────────────────────────────────── + - name: Run kubeadm_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install kubeadm {{ k8s_target_version }} + ansible.builtin.dnf: + name: "kubeadm-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: kubeadm_install_result + + - name: Mark kubeadm_install completed + ansible.builtin.include_tasks: "{{ playbook_dir 
}}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubeadm_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubeadm install failed')) }}" + - name: Fail kubeadm_install step + ansible.builtin.fail: + msg: "kubeadm_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubeadm_upgrade_node ────────────────────────────────────── + - name: Run kubeadm_upgrade_node on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Run kubeadm upgrade node + ansible.builtin.command: kubeadm upgrade node + register: upgrade_node_result + changed_when: false + + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubeadm_upgrade_node failed + 
ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubeadm_upgrade_node: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('kubeadm upgrade node failed')) }}" + - name: Fail kubeadm_upgrade_node step + ansible.builtin.fail: + msg: "kubeadm_upgrade_node failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── cordon (production-safe: cordon only, no eviction) ──────── + # Instead of drain (which evicts pods and can be blocked by PDBs), + # we cordon the node to prevent new pods, then let existing pods + # restart in-place after kubelet upgrade. This is safer for stateful + # workloads like Kafka that have strict PodDisruptionBudgets. + - name: Run cordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Cordon node (prevent new pod scheduling) {{ current_node_name }} + ansible.builtin.command: kubectl cordon {{ node_ip }} + delegate_to: "{{ kube_vip }}" + register: cordon_result + changed_when: false + + - name: Mark drain completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark drain failed + ansible.builtin.include_tasks: "{{ 
playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + drain: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('cordon failed')) }}" + - name: Fail drain step + ansible.builtin.fail: + msg: "cordon/drain failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── kubelet_install ────────────────────────────────────────── + - name: Run kubelet_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install kubelet and kubectl {{ k8s_target_version }} + ansible.builtin.dnf: + name: + - "kubelet-{{ k8s_target_version }}" + - "kubectl-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: kubelet_install_result + + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + 
node_status_update: + status: failed + steps: + kubelet_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet install failed')) }}" + - name: Fail kubelet_install step + ansible.builtin.fail: + msg: "kubelet_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── crio_install ───────────────────────────────────────────── + - name: Run crio_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Install cri-o {{ k8s_target_version }} + ansible.builtin.dnf: + name: "cri-o-{{ k8s_target_version }}" + state: present + disablerepo: "*" + enablerepo: "x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}" + register: crio_install_result + + - name: Reload systemd after cri-o install + ansible.builtin.systemd: + daemon_reload: true + + - name: Restart cri-o service + ansible.builtin.systemd: + name: crio + state: restarted + enabled: true + + - name: Mark crio_install completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark crio_install failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + 
node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + crio_install: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('cri-o install failed')) }}" + - name: Fail crio_install step + ansible.builtin.fail: + msg: "crio_install failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── kubelet_restart ────────────────────────────────────────── + - name: Run kubelet_restart on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + + - name: Restart kubelet + ansible.builtin.systemd: + name: kubelet + state: restarted + enabled: true + + - name: Wait for node to become Ready with target version + ansible.builtin.command: >- + kubectl get node {{ node_ip }} + -o jsonpath="{.status.nodeInfo.kubeletVersion}:{range .status.conditions[?(@.type==\"Ready\")]}{.status}{end}" + delegate_to: "{{ kube_vip }}" + register: node_ready + until: + - node_ready.rc == 0 + - "'v' + k8s_target_version + ':True' in node_ready.stdout" + retries: "{{ kubelet_ready_retries }}" + delay: "{{ kubelet_ready_delay }}" + changed_when: false + + - name: Mark kubelet_restart completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ 
ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark kubelet_restart failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + kubelet_restart: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('kubelet restart failed')) }}" + - name: Fail kubelet_restart step + ansible.builtin.fail: + msg: "kubelet_restart failed: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown error')) }}" + + # ── uncordon ───────────────────────────────────────────────── + - name: Run uncordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Uncordon node {{ current_node_name }} + ansible.builtin.command: kubectl uncordon {{ node_ip }} + delegate_to: "{{ kube_vip }}" + register: uncordon_result + changed_when: false + + - name: Mark uncordon completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark uncordon failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + uncordon: + status: failed + timestamp: "{{ 
ansible_date_time.iso8601 }}" + error: "{{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('uncordon failed')) }}" + - name: Fail uncordon step + ansible.builtin.fail: + msg: "uncordon failed: {{ ansible_failed_result.stderr | default(ansible_failed_result.msg | default('unknown error')) }}" + + # ── validation ─────────────────────────────────────────────── + - name: Run validation on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Mark validation in_progress + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: Verify node version + ansible.builtin.command: > + kubectl get node {{ node_ip }} -o jsonpath='{.status.nodeInfo.kubeletVersion}' + delegate_to: "{{ kube_vip }}" + register: node_version + failed_when: node_version.stdout != ("v" + k8s_target_version) + changed_when: false + + - name: Mark validation completed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark validation failed + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + steps: + validation: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: "Node version mismatch. 
Expected: v{{ k8s_target_version }}, Got: {{ node_version.stdout | default('unknown') }}" + - name: Fail validation step + ansible.builtin.fail: + msg: "validation failed: Node version mismatch. Expected: v{{ k8s_target_version }}, Got: {{ node_version.stdout | default('unknown') }}" + + # ── Mark node completed ────────────────────────────────────── + - name: Mark node completed {{ current_node_name }} + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + rescue: + - name: Mark node as failed (outer rescue) {{ current_node_name }} + ansible.builtin.include_tasks: "{{ playbook_dir }}/../roles/upgrade_k8s/tasks/update_node_status.yml" + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + + - name: Fail after marking status + ansible.builtin.fail: + msg: "Worker {{ current_node_name }} upgrade failed." diff --git a/upgrade/roles/upgrade_k8s/defaults/main.yml b/upgrade/roles/upgrade_k8s/defaults/main.yml new file mode 100644 index 0000000000..1d3c08145c --- /dev/null +++ b/upgrade/roles/upgrade_k8s/defaults/main.yml @@ -0,0 +1,32 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# Overridable defaults +input_project_dir: "/opt/omnia/input" +oim_shared_path: "/opt/omnia" +oim_data_path: "/opt/omnia/.data" +oim_provision_path: "/opt/omnia/provision" +tmp_path: "/tmp" +cluster_os_version: "10.0" +admin_nic_ip: "{{ hostvars['localhost']['admin_nic_ip'] | default('127.0.0.1') }}" +admin_nic_cidr: "{{ hostvars['localhost']['admin_nic_cidr'] | default('10.0.0.0/24') }}" + +# MinIO S3 for boot images +minio_ip: "{{ hostvars['localhost']['minio_ip'] | default(admin_nic_ip) }}" +minio_port: "9000" +minio_bucket: "omnia" + +# Worker drain options +worker_drain_delete_emptydir: true diff --git a/upgrade/roles/upgrade_k8s/handlers/main.yml b/upgrade/roles/upgrade_k8s/handlers/main.yml new file mode 100644 index 0000000000..7fd6d96f6f --- /dev/null +++ b/upgrade/roles/upgrade_k8s/handlers/main.yml @@ -0,0 +1,11 @@ +--- +- name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + listen: Reload systemd daemon + +- name: Restart CRI-O service + ansible.builtin.systemd: + name: crio + state: restarted + listen: Restart CRI-O service diff --git a/upgrade/roles/upgrade_k8s/tasks/acquire_lock.yml b/upgrade/roles/upgrade_k8s/tasks/acquire_lock.yml new file mode 100644 index 0000000000..21ff0631d0 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/acquire_lock.yml @@ -0,0 +1,134 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Ensure upgrade directory exists on kube_vip client NFS mount + ansible.builtin.file: + path: "{{ upgrade_dir_client }}" + state: directory + mode: "0755" + delegate_to: "{{ kube_vip }}" + +- name: Check if lock file exists + ansible.builtin.stat: + path: "{{ lock_file }}" + delegate_to: "{{ kube_vip }}" + register: lock_check + +- name: Handle existing lock + when: lock_check.stat.exists + block: + - name: Read existing lock file + ansible.builtin.slurp: + src: "{{ lock_file }}" + delegate_to: "{{ kube_vip }}" + register: lock_content + + - name: Parse existing lock + ansible.builtin.set_fact: + existing_lock: "{{ lock_content.content | b64decode | from_yaml }}" + when: + - lock_content.content | length > 0 + - (lock_content.content | b64decode | from_yaml) is mapping + + - name: Set empty lock when file is empty or invalid + ansible.builtin.set_fact: + existing_lock: {} + when: + - lock_content.content | length == 0 or (lock_content.content | b64decode | from_yaml) is not mapping + + - name: Check if lock holder process is alive (same host) + ansible.builtin.command: + cmd: "kill -0 {{ existing_lock.pid }}" + register: pid_check + changed_when: false + failed_when: false + when: + - existing_lock is defined + - existing_lock.host is defined + - existing_lock.host == ansible_hostname + - existing_lock.pid is defined + + - name: Abort if lock held by live process on same host + ansible.builtin.fail: + msg: "{{ msg_lock_held_same_host }}" + when: + - existing_lock is defined + - existing_lock.host is defined + - existing_lock.host == ansible_hostname + - pid_check is defined + - pid_check.rc | default(1) == 0 + + - name: Abort if lock held by different host + ansible.builtin.fail: + msg: "{{ msg_lock_held_other_host }}" + when: + - existing_lock is defined + - existing_lock.host is defined + - existing_lock.host != ansible_hostname + + - name: Remove stale lock (same host, dead PID) + ansible.builtin.file: + path: "{{ lock_file }}" + state: absent + delegate_to: "{{
kube_vip }}" + when: + - existing_lock is defined + - existing_lock is mapping + - existing_lock.host is defined + - existing_lock.host == ansible_hostname + - pid_check is defined + - pid_check.rc is defined + - pid_check.rc != 0 + + - name: Warn about stale lock removal + ansible.builtin.debug: + msg: "{{ msg_stale_lock_removed }}" + when: + - existing_lock is defined + - existing_lock is mapping + - existing_lock.host is defined + - existing_lock.host == ansible_hostname + - pid_check is defined + - pid_check.rc is defined + - pid_check.rc != 0 + +- name: Write new lock file + ansible.builtin.copy: + content: | + pid: {{ ansible_pid | default(lookup('pipe', 'echo $PPID')) }} + host: {{ ansible_hostname }} + started_at: {{ ansible_date_time.iso8601 }} + playbook: upgrade_k8s.yml + dest: "{{ lock_file }}" + mode: "0644" + delegate_to: "{{ kube_vip }}" + register: lock_write_result + +- name: Verify lock file ownership + ansible.builtin.slurp: + src: "{{ lock_file }}" + delegate_to: "{{ kube_vip }}" + register: verify_lock + +- name: Parse verified lock + ansible.builtin.set_fact: + verified_lock: "{{ verify_lock.content | b64decode | from_yaml }}" + +- name: Abort if lock verification failed + ansible.builtin.fail: + msg: "{{ msg_lock_verification_failed }}" + when: + - verified_lock is defined + - verified_lock.host is defined + - verified_lock.host != ansible_hostname diff --git a/upgrade/roles/upgrade_k8s/tasks/add_nodes_to_inventory.yml b/upgrade/roles/upgrade_k8s/tasks/add_nodes_to_inventory.yml new file mode 100644 index 0000000000..f42efbf1a8 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/add_nodes_to_inventory.yml @@ -0,0 +1,68 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Add all K8s nodes to Ansible inventory for proper SSH delegation + +- name: Add first control plane nodes to inventory + ansible.builtin.add_host: + name: "{{ item }}" + ansible_host: "{{ node_ips[item] }}" + ansible_user: root + ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + groups: + - k8s_nodes + - k8s_control_plane_first + node_role: control_plane_first + node_ip: "{{ node_ips[item] }}" + loop: "{{ groups_cp_first }}" + loop_control: + label: "{{ item }} ({{ node_ips[item] }})" + +- name: Add additional control plane nodes to inventory + ansible.builtin.add_host: + name: "{{ item }}" + ansible_host: "{{ node_ips[item] }}" + ansible_user: root + ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + groups: + - k8s_nodes + - k8s_control_plane + node_role: control_plane + node_ip: "{{ node_ips[item] }}" + loop: "{{ groups_cp }}" + loop_control: + label: "{{ item }} ({{ node_ips[item] }})" + +- name: Add worker nodes to inventory + ansible.builtin.add_host: + name: "{{ item }}" + ansible_host: "{{ node_ips[item] }}" + ansible_user: root + ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + groups: + - k8s_nodes + - k8s_workers + node_role: worker + node_ip: "{{ node_ips[item] }}" + loop: "{{ groups_worker }}" + loop_control: + label: "{{ item }} ({{ node_ips[item] }})" + +- name: Display added nodes + ansible.builtin.debug: + msg: | + Added {{ all_upgrade_nodes | length }} nodes to inventory: + - Control plane (first): {{ groups_cp_first 
| join(', ') }} + - Control plane (additional): {{ groups_cp | join(', ') }} + - Workers: {{ groups_worker | join(', ') }} diff --git a/upgrade/roles/upgrade_k8s/tasks/backup_etcd.yml b/upgrade/roles/upgrade_k8s/tasks/backup_etcd.yml new file mode 100644 index 0000000000..350add08c5 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/backup_etcd.yml @@ -0,0 +1,189 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Check if etcd backup is already completed + ansible.builtin.debug: + msg: "etcd backup already completed (status: {{ upgrade_status.etcd_backup.status | default('pending') }}) - skipping backup" + when: + - upgrade_status.etcd_backup.status | default('pending') == 'completed' + +- name: Skip etcd backup if already completed + ansible.builtin.meta: noop + when: upgrade_status.etcd_backup.status | default('pending') == 'completed' + +- name: Check if etcd backup already exists + ansible.builtin.stat: + path: "{{ backup_dir_client }}/etcd-snapshot.db" + register: etcd_backup_check + failed_when: false + when: upgrade_status.etcd_backup.status | default('pending') != 'completed' + +- name: Skip backup if already exists + ansible.builtin.debug: + msg: "etcd backup already exists at {{ backup_dir_client }}/etcd-snapshot.db - skipping backup" + when: + - etcd_backup_check.stat.exists | default(false) + - upgrade_status.etcd_backup.status | default('pending') != 'completed' + +- name: Update etcd
backup status (existing backup) + ansible.builtin.include_tasks: update_node_step.yml + run_once: true + when: + - etcd_backup_check.stat.exists | default(false) + - upgrade_status.etcd_backup.status | default('pending') != 'completed' + vars: + status_update: + etcd_backup: + status: completed + path: "{{ backup_dir_client }}/etcd-snapshot.db" + timestamp: "{{ now(utc=true).isoformat() }}" + error: + +- name: Perform etcd backup + when: + - not (etcd_backup_check.stat.exists | default(false)) + - upgrade_status.etcd_backup.status | default('pending') != 'completed' + block: + - name: Ensure backup directory exists on client side (for kube_vip) + ansible.builtin.file: + path: "{{ backup_dir_client }}" + state: directory + mode: "0755" + + - name: Get etcd pod name + ansible.builtin.command: /usr/bin/kubectl get pods -n kube-system -l component=etcd -o jsonpath='{.items[0].metadata.name}' + register: etcd_pod + changed_when: false + + - name: Set etcd backup paths (use client path for kube_vip) + ansible.builtin.set_fact: + etcd_snapshot_file_client: "{{ backup_dir_client }}/etcd-snapshot.db" + etcdctl_binary_client: "{{ backup_dir_client }}/etcdctl" + when: backup_dir_client is defined + + - name: Get etcd container ID + ansible.builtin.shell: set -o pipefail && /usr/bin/crictl ps | grep etcd | grep -v pause | awk '{print $1}' | head -1 + args: + executable: /bin/bash + register: etcd_container_id + changed_when: false + + - name: Find etcdctl in CRI-O overlay storage + ansible.builtin.shell: set -o pipefail && find /tmp/crio-storage/overlay -name etcdctl -type f 2>/dev/null | head -1 + args: + executable: /bin/bash + register: etcdctl_overlay_path + changed_when: false + + - name: Find etcdutl in CRI-O overlay storage + ansible.builtin.shell: set -o pipefail && find /tmp/crio-storage/overlay -name etcdutl -type f 2>/dev/null | head -1 + args: + executable: /bin/bash + register: etcdutl_overlay_path + changed_when: false + + - name: Copy etcdctl to /usr/local/bin on host + ansible.builtin.copy: +
src: "{{ etcdctl_overlay_path.stdout }}" + dest: /usr/local/bin/etcdctl + mode: "0755" + remote_src: true + when: etcdctl_overlay_path.stdout | length > 0 + + - name: Copy etcdutl to /usr/local/bin on host + ansible.builtin.copy: + src: "{{ etcdutl_overlay_path.stdout }}" + dest: /usr/local/bin/etcdutl + mode: "0755" + remote_src: true + when: etcdutl_overlay_path.stdout | length > 0 + + - name: Copy etcdctl to backup directory + ansible.builtin.copy: + src: "{{ etcdctl_overlay_path.stdout }}" + dest: "{{ backup_dir_client }}/etcdctl" + mode: "0755" + remote_src: true + when: etcdctl_overlay_path.stdout | length > 0 + + - name: Copy etcdutl to backup directory + ansible.builtin.copy: + src: "{{ etcdutl_overlay_path.stdout }}" + dest: "{{ backup_dir_client }}/etcdutl" + mode: "0755" + remote_src: true + when: etcdutl_overlay_path.stdout | length > 0 + + - name: Take etcd snapshot to host etcd data directory + ansible.builtin.command: + cmd: >- + etcdctl snapshot save /var/lib/etcd/snapshot.db + --endpoints=https://127.0.0.1:2379 + --cacert=/etc/kubernetes/pki/etcd/ca.crt + --cert=/etc/kubernetes/pki/etcd/server.crt + --key=/etc/kubernetes/pki/etcd/server.key + register: etcd_snapshot_result + retries: 12 + delay: 10 + until: etcd_snapshot_result.rc == 0 + changed_when: true + + - name: Copy etcd snapshot from host to NFS backup + ansible.builtin.copy: + src: /var/lib/etcd/snapshot.db + dest: "{{ backup_dir_client }}/etcd-snapshot.db" + mode: "0644" + remote_src: true + + - name: Remove temporary snapshot file + ansible.builtin.file: + path: /var/lib/etcd/snapshot.db + state: absent + + - name: Verify etcd snapshot exists on NFS + ansible.builtin.stat: + path: "{{ backup_dir_client }}/etcd-snapshot.db" + register: snapshot_verify + failed_when: not snapshot_verify.stat.exists + + - name: Save etcd member list + ansible.builtin.command: + cmd: >- + etcdctl member list -w json + --endpoints=https://127.0.0.1:2379 + --cacert=/etc/kubernetes/pki/etcd/ca.crt + 
--cert=/etc/kubernetes/pki/etcd/server.crt + --key=/etc/kubernetes/pki/etcd/server.key + register: etcd_members + changed_when: false + retries: 12 + delay: 10 + until: etcd_members.rc == 0 + + - name: Write etcd members to NFS + ansible.builtin.copy: + content: "{{ etcd_members.stdout }}" + dest: "{{ etcd_members_file }}" + mode: "0644" + + - name: Update etcd backup status + ansible.builtin.include_tasks: update_node_step.yml + run_once: true + vars: + status_update: + etcd_backup: + status: completed + path: "{{ backup_dir_client }}/etcd-snapshot.db" + timestamp: "{{ now(utc=true).isoformat() }}" + error: diff --git a/upgrade/roles/upgrade_k8s/tasks/backup_k8s_config.yml b/upgrade/roles/upgrade_k8s/tasks/backup_k8s_config.yml new file mode 100644 index 0000000000..532bdd7c4c --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/backup_k8s_config.yml @@ -0,0 +1,86 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+- name: Load upgrade status from kube_vip
+  ansible.builtin.slurp:
+    src: "{{ k8s_client_mount_path }}/upgrade/upgrade_status.yml"
+  delegate_to: "{{ kube_vip }}"
+  register: status_slurp
+
+- name: Parse upgrade status
+  ansible.builtin.set_fact:  # slurp returns base64 content, so decode before parsing the YAML
+    upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}"
+
+- name: Determine whether k8s config backup is already completed
+  ansible.builtin.set_fact:  # computed once so every guard below sees the same answer
+    _k8s_config_backup_done: "{{ (upgrade_status.k8s_config_backup.status | default('pending')) == 'completed' }}"
+
+- name: Check if k8s config backup is already completed
+  ansible.builtin.debug:
+    msg: "k8s config backup already completed (status: {{ upgrade_status.k8s_config_backup.status | default('pending') }}) - skipping backup"
+  when:
+    - _k8s_config_backup_done | bool
+
+- name: Ensure config backup directory exists
+  ansible.builtin.file:
+    path: "{{ k8s_config_backup_dir }}"
+    state: directory
+    mode: "0755"
+  when: not (_k8s_config_backup_done | bool)
+
+- name: Create node-specific backup directories
+  ansible.builtin.file:
+    path: "{{ k8s_config_backup_dir }}/{{ inventory_hostname }}"
+    state: directory
+    mode: "0755"
+  when: not (_k8s_config_backup_done | bool)
+
+- name: Check if k8s-config backup already exists
+  ansible.builtin.stat:
+    path: "{{ k8s_config_backup_dir }}/{{ inventory_hostname }}/k8s-config.tar.gz"
+  register: k8s_config_backup_stat
+  when: not (_k8s_config_backup_done | bool)
+
+- name: Archive /etc/kubernetes to NFS mount (only if backup doesn't exist)
+  ansible.builtin.archive:
+    path: /etc/kubernetes
+    dest: "{{ k8s_config_backup_dir }}/{{ inventory_hostname }}/k8s-config.tar.gz"
+    format: gz
+    mode: "0644"
+  register: k8s_config_archive
+  when:
+    - not (_k8s_config_backup_done | bool)
+    - k8s_config_backup_stat.stat is defined
+    - not k8s_config_backup_stat.stat.exists
+
+- name: Set permissions on archived k8s config
+  ansible.builtin.file:
+    path: "{{ k8s_config_backup_dir }}/{{ inventory_hostname }}/k8s-config.tar.gz"
+    mode: "0644"
+  when:
+    - k8s_config_archive is not skipped  # a registered variable is always defined, even when the task was skipped
+    - k8s_config_archive.dest is defined
+
+- name: Update k8s config backup status
+  ansible.builtin.include_tasks: update_status.yml
+  run_once: true
+  when: not (_k8s_config_backup_done | bool)  # stable fact: not re-evaluated against upgrade_status inside the include
+  vars:
+    status_file: "{{ k8s_client_mount_path }}/upgrade/upgrade_status.yml"
+    status_update:
+      k8s_config_backup:
+        status: completed
+        path: "{{ k8s_config_backup_dir }}"
+        timestamp: "{{ now(utc=true).isoformat() }}"
+        error:
diff --git a/upgrade/roles/upgrade_k8s/tasks/build_squashfs.yml b/upgrade/roles/upgrade_k8s/tasks/build_squashfs.yml
new file mode 100644
index 0000000000..365e2c9a9a
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/build_squashfs.yml
@@ -0,0 +1,124 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Build squashfs boot image for a specific target version.
+#
+# Creates a per-hop staging directory whose layout mirrors input_project_dir:
+#   - Version-specific files (software_config.json, service_k8s.json)
+#     are symlinked from artifacts/<_hop_artifacts_dir>/
+#   - All other files are symlinked from the real input_project_dir
+# Then calls build_image_x86_64.yml with the staging dir.
+#
+# Each squashfs must contain exactly one version's RPMs, so this is
+# called once per hop (cannot be merged like local_repo).
+#
+# Inputs:
+#   _hop_target_version — full target version (e.g. "1.35.1")
+#   _hop_artifacts_dir  — artifacts directory name (e.g. "v1.35.1")
+#   input_project_dir   — real input directory (shared files)
+
+# Commented out MinIO check to always proceed with build
+# - name: "build_squashfs [{{ _hop_target_version }}] — Check if image already exists in MinIO"
+#   ansible.builtin.command:
+#     cmd: >-
+#       mc stat minio/boot-images/k8s-{{ _hop_target_version }}/squashfs.img
+#   register: _squashfs_check
+#   changed_when: false
+#   failed_when: false
+
+# - name: "build_squashfs [{{ _hop_target_version }}] — Skip if image already exists"
+#   ansible.builtin.debug:
+#     msg: "Squashfs image for v{{ _hop_target_version }} already exists in MinIO — skipping build."
+#   when: _squashfs_check.rc == 0
+
+- name: "Build squashfs — Build and upload image"
+  # when: _squashfs_check.rc != 0
+  vars:
+    _artifacts_abs: "{{ role_path }}/../../artifacts/{{ _hop_artifacts_dir }}"
+  block:
+    - name: "Build squashfs — Create staging directory"
+      ansible.builtin.tempfile:
+        state: directory
+        prefix: "upgrade_build_{{ _hop_target_version }}_"
+      register: _build_staging
+
+    # -- Version-specific files from artifacts --
+    - name: "Build squashfs — Symlink software_config.json from artifacts"
+      ansible.builtin.file:
+        src: "{{ _artifacts_abs }}/software_config.json"
+        dest: "{{ _build_staging.path }}/software_config.json"
+        state: link
+
+    - name: "Build squashfs — Create config subdir for service_k8s.json"
+      ansible.builtin.file:
+        path: "{{ _build_staging.path }}/config/x86_64/rhel/{{ cluster_os_version }}"
+        state: directory
+        mode: "{{ dir_perm_755 }}"
+
+    - name: "Build squashfs — Symlink service_k8s.json from artifacts"
+      ansible.builtin.file:
+        src: "{{ _artifacts_abs }}/service_k8s.json"
+        dest: "{{ _build_staging.path }}/config/x86_64/rhel/{{ cluster_os_version }}/service_k8s.json"
+        state: link
+
+    # -- Shared files from real input_project_dir --
+    - name: "Build squashfs — Symlink shared config files"
+      ansible.builtin.file:
+        src: "{{ input_project_dir }}/{{ item }}"
+        dest: "{{ _build_staging.path }}/{{ item }}"
+        state: link
+        force: false
+      loop:
+        - network_spec.yml
+        - build_stream_config.yml
+        - omnia_config_credentials.yml
+        - storage_config.yml
+        - high_availability_config.yml
+      failed_when: false  # best effort: a link that cannot be created is ignored
+
+    - name: "Build squashfs — Symlink shared package manifests"
+      ansible.builtin.file:
+        src: "{{ input_project_dir }}/config/x86_64/rhel/{{ cluster_os_version }}/{{ item }}"
+        dest: "{{ _build_staging.path }}/config/x86_64/rhel/{{ cluster_os_version }}/{{ item }}"
+        state: link
+        force: false
+      loop:
+        - default_packages.json
+        - additional_packages.json
+        - admin_debug_packages.json
+      failed_when: false  # best effort: a link that cannot be created is ignored
+
+    - name: "Build squashfs — Run build_image playbook with staging dir"
+      ansible.builtin.command:
+        cmd: >-
+          ansible-playbook
+          {{ role_path }}/../../build_image_x86_64/build_image_x86_64.yml
+          --extra-vars "input_project_dir={{ _build_staging.path }}"
+          --extra-vars "project_dir_status=true"
+          --extra-vars "target_k8s_version={{ _hop_target_version }}"
+      register: _build_result
+      changed_when: "_build_result.stdout is search('changed=[1-9]')"  # the recap always prints 'changed=', so require a non-zero count
+
+    - name: "Build squashfs — Verify image uploaded to MinIO"
+      ansible.builtin.command:
+        cmd: >-
+          mc stat minio/boot-images/k8s-{{ _hop_target_version }}/squashfs.img
+      changed_when: false
+
+  always:
+    - name: "Build squashfs — Clean up staging directory"
+      ansible.builtin.file:
+        path: "{{ _build_staging.path }}"
+        state: absent
+      when: _build_staging.path is defined  # tempfile itself may have failed before registering a path
diff --git a/upgrade/roles/upgrade_k8s/tasks/check_and_mark_hop_completed.yml b/upgrade/roles/upgrade_k8s/tasks/check_and_mark_hop_completed.yml
new file mode 100644
index 0000000000..4f3e829e4c
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/check_and_mark_hop_completed.yml
@@ -0,0 +1,55 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Mark the hop as completed once every node reports the target kubelet version.
+#
+# Inputs:
+#   kube_vip           — host kubectl is run on (delegate_to)
+#   k8s_target_version — version every node must report (a leading "v" is ignored)
+- name: Get all node versions
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}'
+  register: all_node_versions
+  changed_when: false
+
+# Whole version strings are compared; a substring test would also accept
+# e.g. "1.35.10" when the target is "1.35.1".
+- name: Check if all nodes are at target version
+  ansible.builtin.set_fact:
+    _all_nodes_upgraded: >-
+      {{ (all_node_versions.stdout_lines
+      | map('trim') | reject('equalto', '')
+      | map('regex_replace', '^v', '')
+      | unique | list)
+      == [k8s_target_version | string | regex_replace('^v', '')] }}
+
+# now() is evaluated when the task runs, unlike the gathered ansible_date_time fact.
+- name: Update hop_status to completed if all nodes upgraded
+  ansible.builtin.include_tasks: update_node_step.yml
+  vars:
+    status_update:
+      hop_status: >-
+        {{ {k8s_target_version: {'status': 'completed', 'completed_at': now(utc=true).strftime('%Y-%m-%dT%H:%M:%SZ')}} }}
+  when: _all_nodes_upgraded | bool
+
+- name: Display hop completion status
+  ansible.builtin.debug:
+    msg: >-
+      {% if _all_nodes_upgraded | bool %}
+      Hop to {{ k8s_target_version }} COMPLETED - All nodes upgraded successfully.
+      {% else %}
+      Hop to {{ k8s_target_version }} IN PROGRESS - Some nodes still pending upgrade.
+      {% endif %}
diff --git a/upgrade/roles/upgrade_k8s/tasks/detect_addon_versions.yml b/upgrade/roles/upgrade_k8s/tasks/detect_addon_versions.yml
new file mode 100644
index 0000000000..4b85d004e4
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/detect_addon_versions.yml
@@ -0,0 +1,149 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Detect the Calico, MetalLB and Helm versions currently in use and record them,
+# together with a per-addon status, through update_node_step.yml.
+#
+# Reads: upgrade_status and the calico/metallb/helm *_target_version variables.
+# Sets:  calico/metallb/helm *_current_version, *_from_version and *_status facts.
+#
+# Every probe runs on kube_vip; a probe that cannot read a version prints
+# "unknown" and is not reported as a failed task.
+#
+# ── Calico ──
+- name: Detect current Calico version
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    executable: /bin/bash
+    cmd: >-
+      set -o pipefail &&
+      kubectl get pods -n kube-system -l k8s-app=calico-node
+      -o jsonpath='{.items[0].spec.containers[0].image}' 2>/dev/null |
+      grep -oP 'v\d+\.\d+\.\d+' || echo "unknown"
+  register: calico_current_version_raw
+  changed_when: false
+  failed_when: false
+
+- name: Set calico_current_version fact
+  ansible.builtin.set_fact:
+    calico_current_version: >-
+      {{ 'unknown'
+      if calico_current_version_raw.stdout is not defined or calico_current_version_raw.stdout == 'unknown'
+      else calico_current_version_raw.stdout | trim | regex_replace('^v', '') }}
+
+# A from_version already recorded in upgrade_status is kept; otherwise the detected one is used.
+- name: Set calico_from_version fact (preserve existing if set)
+  ansible.builtin.set_fact:
+    calico_from_version: >-
+      {{ calico_current_version
+      if (upgrade_status.addon_upgrade.calico.from_version | default('unknown')) in ['unknown', '']
+      else upgrade_status.addon_upgrade.calico.from_version }}
+
+# ── MetalLB ──
+- name: Detect current MetalLB version
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    executable: /bin/bash
+    cmd: >-
+      set -o pipefail &&
+      kubectl get deployment -n metallb-system controller
+      -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null |
+      grep -oP 'v\d+\.\d+\.\d+' || echo "unknown"
+  register: metallb_current_version_raw
+  changed_when: false
+  failed_when: false
+
+- name: Set metallb_current_version fact
+  ansible.builtin.set_fact:
+    metallb_current_version: >-
+      {{ 'unknown'
+      if metallb_current_version_raw.stdout is not defined or metallb_current_version_raw.stdout == 'unknown'
+      else metallb_current_version_raw.stdout | trim | regex_replace('^v', '') }}
+
+- name: Set metallb_from_version fact (preserve existing if set)
+  ansible.builtin.set_fact:
+    metallb_from_version: >-
+      {{ metallb_current_version
+      if (upgrade_status.addon_upgrade.metallb.from_version | default('unknown')) in ['unknown', '']
+      else upgrade_status.addon_upgrade.metallb.from_version }}
+
+# ── Helm ──
+- name: Detect current Helm version
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    executable: /bin/bash
+    cmd: >-
+      set -o pipefail &&
+      helm version --short 2>/dev/null |
+      grep -oP 'v\d+\.\d+\.\d+' || echo "unknown"
+  register: helm_current_version_raw
+  changed_when: false
+  failed_when: false
+
+- name: Set helm_current_version fact
+  ansible.builtin.set_fact:
+    helm_current_version: >-
+      {{ 'unknown'
+      if helm_current_version_raw.stdout is not defined or helm_current_version_raw.stdout == 'unknown'
+      else helm_current_version_raw.stdout | trim | regex_replace('^v', '') }}
+
+- name: Set helm_from_version fact (preserve existing if set)
+  ansible.builtin.set_fact:
+    helm_from_version: >-
+      {{ helm_current_version
+      if (upgrade_status.addon_upgrade.helm.from_version | default('unknown')) in ['unknown', '']
+      else upgrade_status.addon_upgrade.helm.from_version }}
+
+# ── Status: completed only when the detected version equals the target ──
+- name: Determine Calico status
+  ansible.builtin.set_fact:
+    calico_status: "{{ (calico_current_version == calico_target_version) | ternary('completed', 'pending') }}"
+
+- name: Determine MetalLB status
+  ansible.builtin.set_fact:
+    metallb_status: "{{ (metallb_current_version == metallb_target_version) | ternary('completed', 'pending') }}"
+
+- name: Determine Helm status
+  ansible.builtin.set_fact:
+    helm_status: "{{ (helm_current_version == helm_target_version) | ternary('completed', 'pending') }}"
+
+# ── Persist and report ──
+- name: Update addon status with detected versions
+  ansible.builtin.include_tasks: update_node_step.yml
+  vars:
+    status_update:
+      addon_upgrade:
+        calico:
+          status: "{{ calico_status }}"
+          current_version: "{{ calico_current_version }}"
+          from_version: "{{ calico_from_version }}"
+          target_version: "{{ calico_target_version }}"
+        metallb:
+          status: "{{ metallb_status }}"
+          current_version: "{{ metallb_current_version }}"
+          from_version: "{{ metallb_from_version }}"
+          target_version: "{{ metallb_target_version }}"
+        helm:
+          status: "{{ helm_status }}"
+          current_version: "{{ helm_current_version }}"
+          from_version: "{{ helm_from_version }}"
+          target_version: "{{ helm_target_version }}"
+
+- name: Display detected addon versions
+  ansible.builtin.debug:
+    msg: |
+      Detected addon versions:
+      Calico: {{ calico_current_version }} (target: {{ calico_target_version }}) - {{ calico_status }}
+      MetalLB: {{ metallb_current_version }} (target: {{ metallb_target_version }}) - {{ metallb_status }}
+      Helm: {{ helm_current_version }} (target: {{ helm_target_version }}) - {{ helm_status }}
diff --git a/upgrade/roles/upgrade_k8s/tasks/detect_hop_chain_from_manifest.yml b/upgrade/roles/upgrade_k8s/tasks/detect_hop_chain_from_manifest.yml
new file mode 100644
index 0000000000..93c9409848
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/detect_hop_chain_from_manifest.yml
@@ -0,0 +1,117 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Detect the upgrade hop chain from upgrade_vars.yml
+# This replaces the separate upgrade_paths.yml file, giving a single source of truth
+#
+# Inputs:
+#   k8s_from_version — current cluster version (e.g. "1.34.1")
+#   k8s_from_minor   — current cluster minor (e.g. "1.34")
+#   components       — loaded from upgrade_vars.yml (top-level key)
+#
+# Outputs (set_fact):
+#   hop_chain          — list of dicts: [{from_minor, to_version, artifacts_dir}, ...]
+#   is_multi_hop       — boolean, true if chain has > 1 hop
+#   k8s_target_version — last entry of components.service_k8s.supported_versions
+
+# ── Extract target version from manifest ─────────────────────────────
+- name: Set k8s_target_version from manifest
+  ansible.builtin.set_fact:
+    k8s_target_version: "{{ components.service_k8s.supported_versions | last }}"
+
+# ── Extract supported versions from manifest ────────────────────────
+- name: Set supported versions list
+  ansible.builtin.set_fact:
+    _supported_versions: "{{ components.service_k8s.supported_versions }}"
+
+# ── Find current version index in supported versions ─────────────────
+- name: Find current version index
+  ansible.builtin.set_fact:
+    _current_version_index: "{{ _supported_versions.index(k8s_from_version) }}"
+  when: k8s_from_version in _supported_versions
+
+# ── Fail if current version not in supported versions ───────────────
+- name: Fail if current version not supported
+  ansible.builtin.fail:
+    msg: >-
+      Current K8s version {{ k8s_from_version }} not found in supported versions: {{ _supported_versions | join(', ') }}.
+      Cannot determine upgrade path.
+  when: k8s_from_version not in _supported_versions
+
+# ── Build hop chain from supported versions ───────────────────────────
+- name: Build hop chain from current version to target
+  ansible.builtin.set_fact:
+    _hop_chain_raw: "{{ _supported_versions[_current_version_index | int + 1 :] }}"
+  when: _current_version_index is defined
+
+# ── Convert version list to hop chain format ─────────────────────────
+# For multi-hop chains, each hop's from_minor must be the minor version
+# of the PREVIOUS version in _supported_versions, not the initial
+# k8s_from_minor. e.g. 1.34.1 → 1.35.1 → 1.36.1 produces:
+#   hop 1: from_minor=1.34, to_version=1.35.1
+#   hop 2: from_minor=1.35, to_version=1.36.1
+- name: Build from-version list for each hop
+  ansible.builtin.set_fact:
+    _hop_from_versions: >-
+      {{ _supported_versions[_current_version_index | int : _current_version_index | int + (_hop_chain_raw | length)] }}
+  when: _hop_chain_raw is defined and _hop_chain_raw | length > 0
+
+- name: Initialize hop chain
+  ansible.builtin.set_fact:
+    hop_chain: []
+  when: _hop_chain_raw is defined and _hop_chain_raw | length > 0
+
+- name: Build hop chain with correct from_minor per hop
+  ansible.builtin.set_fact:
+    hop_chain: >-
+      {{ hop_chain + [{
+      'from_minor': _hop_from_versions[hop_idx] | regex_replace('\.[0-9]+$', ''),
+      'to_version': _hop_chain_raw[hop_idx],
+      'artifacts_dir': 'v' + _hop_chain_raw[hop_idx]
+      }] }}
+  loop: "{{ range(_hop_chain_raw | length) | list }}"
+  loop_control:
+    loop_var: hop_idx
+  when: _hop_chain_raw is defined and _hop_chain_raw | length > 0
+
+# ── Handle case where already at target version ───────────────────────
+- name: Set empty hop chain if already at target
+  ansible.builtin.set_fact:
+    hop_chain: []
+  when: _hop_chain_raw is not defined or _hop_chain_raw | length == 0
+
+# ── Set is_multi_hop flag ───────────────────────────────────────────
+- name: Set is_multi_hop flag
+  ansible.builtin.set_fact:
+    is_multi_hop: "{{ hop_chain | length > 1 }}"
+
+# ── Display hop chain (plural for every count except exactly one) ────
+- name: Display upgrade hop chain
+  ansible.builtin.debug:
+    msg: >-
+      Upgrade hop chain ({{ hop_chain | length }} hop{{ '' if hop_chain | length == 1 else 's' }}):
+      {% for hop in hop_chain %}
+      {{ loop.index }}. {{ hop.from_minor }} → {{ hop.to_version }}
+      {% endfor %}
+      {% if hop_chain | length == 0 %}
+      Cluster already at target version {{ k8s_target_version }}.
+      {% endif %}
+
+# ── Display configuration summary ─────────────────────────────────────
+- name: Display configuration summary
+  ansible.builtin.debug:
+    msg: >-
+      Hop chain derived from upgrade_vars.yml.
+      Current: {{ k8s_from_version }}, Target: {{ k8s_target_version }}
+      Hops: {{ hop_chain | length }}
diff --git a/upgrade/roles/upgrade_k8s/tasks/execute_single_hop.yml b/upgrade/roles/upgrade_k8s/tasks/execute_single_hop.yml
new file mode 100644
index 0000000000..472442d8e0
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/execute_single_hop.yml
@@ -0,0 +1,639 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Execute a single upgrade hop (Phase 2).
+# Runs the full upgrade sequence: preflight → backup → CPs → addons → workers.
+# +# Inputs (set by caller): +# _current_hop — hop dict with from_minor, to_version, artifacts_dir +# _hop_idx — 0-based index of this hop in the chain + +# ── Load config artifacts for this hop ────────────────────────────── +- name: "Hop — Load version vars from input_project_dir" + ansible.builtin.include_tasks: load_version_vars.yml + vars: + software_config_file: "{{ input_project_dir }}/software_config.json" + # service_k8s_config_file is set dynamically in load_version_vars.yml based on k8s_target_version + +# ── Set upgrade paths (must be after load_version_vars) ────────────── +- name: "Hop — Set upgrade paths on client NFS mount" + ansible.builtin.set_fact: + upgrade_dir_client: "{{ k8s_client_mount_path }}/upgrade" + status_file: "{{ k8s_client_mount_path }}/upgrade/upgrade_status.yml" + lock_file: "{{ k8s_client_mount_path }}/upgrade/upgrade.lock" + backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup" + etcd_snapshot_file: "{{ k8s_client_mount_path }}/upgrade/backup/etcd-snapshot.db" + etcdctl_binary: "{{ k8s_client_mount_path }}/upgrade/backup/etcdctl" + etcd_members_file: "{{ k8s_client_mount_path }}/upgrade/backup/etcd-members.json" + k8s_config_backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup/configs" + +# ── Load status and build node inventory ───────────────────────────── +- name: "Hop — Load or create upgrade status" + ansible.builtin.include_tasks: load_status.yml + +# ── Check if this hop is already completed (re-run case) ──────────── +- name: "Hop — Check if hop already completed" + ansible.builtin.set_fact: + _hop_completed: >- + {{ (upgrade_status.multi_hop.hops | default([])) + | selectattr('to', 'equalto', _current_hop.to_version) + | selectattr('status', 'equalto', 'completed') + | list | length > 0 }} + +- name: "Hop — Skip completed hop" + ansible.builtin.debug: + msg: >- + Hop {{ _hop_idx + 1 }} ({{ _current_hop.from_minor }} → {{ _current_hop.to_version }}) + already completed — skipping. 
+ when: _hop_completed | bool + +# ── Execute this hop (only if not already completed) ───────────────── +- name: "Hop — Execute upgrade" + when: not (_hop_completed | bool) + block: + # ── Set per-hop backup directory ───────────────────────────────── + # Each hop gets its own backup subdir so rollback finds the right + # snapshot. Derived vars (etcd_snapshot_file, etc.) in vars/main.yml + # reference {{ backup_dir }} and pick up this override automatically. + - name: "Hop — Set per-hop backup directory" + ansible.builtin.set_fact: + backup_dir: "{{ upgrade_dir_client }}/backup/{{ _current_hop.from_minor }}" + backup_dir_client: "{{ upgrade_dir_client }}/backup/{{ _current_hop.from_minor }}" + + - name: "Hop — Determine if this is final hop" + ansible.builtin.set_fact: + _is_final_hop: "{{ _hop_idx == ((hop_chain | length) - 1) }}" + + # ── Reset status for this hop ──────────────────────────────────── + # After a previous hop completes, the status file has + # upgrade.status=completed, etcd_backup.status=completed, etc. + # We must reset these so tasks in this hop are not skipped. + # The reset is done when writing the status file below to ensure persistence. 
+ - name: "Hop — Write reset status file" + ansible.builtin.include_tasks: update_node_step.yml + vars: + _reset_nodes: >- + {%- set nodes_dict = {} -%} + {%- for node in all_upgrade_nodes -%} + {%- set _ = nodes_dict.update({node: { + 'status': 'pending', + 'version_before': k8s_from_version, + 'version_current': k8s_from_version, + 'steps': {} + }}) -%} + {%- endfor -%} + {{ nodes_dict }} + status_update: + upgrade: + from_version: "{{ k8s_from_version }}" + target_version: "{{ _current_hop.to_version }}" + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + completed_at: None + etcd_backup: + status: >- + {{ upgrade_status.etcd_backup.status | default('pending') + if (upgrade_status.etcd_backup.status | default('pending')) == 'completed' + else 'pending' }} + path: >- + {{ upgrade_status.etcd_backup.path | default(None) + if (upgrade_status.etcd_backup.status | default('pending')) == 'completed' + else None }} + timestamp: >- + {{ upgrade_status.etcd_backup.timestamp | default(None) + if (upgrade_status.etcd_backup.status | default('pending')) == 'completed' + else None }} + error: None + k8s_config_backup: + status: >- + {{ upgrade_status.k8s_config_backup.status | default('pending') + if (upgrade_status.k8s_config_backup.status | default('pending')) == 'completed' + else 'pending' }} + path: >- + {{ upgrade_status.k8s_config_backup.path | default(None) + if (upgrade_status.k8s_config_backup.status | default('pending')) == 'completed' + else None }} + timestamp: >- + {{ upgrade_status.k8s_config_backup.timestamp | default(None) + if (upgrade_status.k8s_config_backup.status | default('pending')) == 'completed' + else None }} + error: None + addon_upgrade: + status: pending + calico: + status: pending + metallb: + status: pending + helm: + status: pending + nodes: "{{ _reset_nodes }}" + + # ── Record hop in multi-hop tracking ───────────────────────────── + - name: "Hop — Record hop in status" + ansible.builtin.include_tasks: 
update_node_step.yml + vars: + status_update: + multi_hop: + current_hop: "{{ _hop_idx }}" + hops: >- + {{ (upgrade_status.multi_hop.hops | default([])) + | rejectattr('to', 'equalto', _current_hop.to_version) + | list + + [{ + 'from': k8s_from_version, + 'to': _current_hop.to_version, + 'status': 'in_progress', + 'started_at': ansible_date_time.iso8601 + }] }} + + # ── Preflight ──────────────────────────────────────────────────── + - name: "Hop — Run preflight checks (Pulp)" + ansible.builtin.include_tasks: preflight_checks_pulp.yml + when: upgrade_status.upgrade.status != 'completed' + + # ── Backup ─────────────────────────────────────────────────────── + # Etcd backup now runs in a separate play targeting kube_vip_group + # - name: "hop {{ _hop_idx + 1 }} [{{ _current_hop.to_version }}] — Backup etcd" + # ansible.builtin.include_tasks: backup_etcd.yml + # when: + # - upgrade_status.etcd_backup.status | default('pending') != 'completed' + # - upgrade_status.upgrade.status != 'completed' + + # K8s config backup now runs in separate plays targeting node groups + # - name: "hop {{ _hop_idx + 1 }} [{{ _current_hop.to_version }}] — Backup K8s config" + # ansible.builtin.include_tasks: backup_k8s_config.yml + # when: + # - upgrade_status.k8s_config_backup.status | default('pending') != 'completed' + # - upgrade_status.upgrade.status != 'completed' + + # ── Validate backup was successful (Engineering Spec §4.7.3 Gate 6) ── + - name: "Hop — Reload status after backup" + ansible.builtin.include_tasks: load_status.yml + + - name: "Hop — Validate backup was successful" + ansible.builtin.fail: + msg: >- + Backup failed. Upgrade cannot proceed. 
+ etcd_backup status: {{ upgrade_status.etcd_backup.status | default('unknown') }} + k8s_config_backup status: {{ upgrade_status.k8s_config_backup.status | default('unknown') }} + {{ upgrade_status.etcd_backup.error | default('') }} + {{ upgrade_status.k8s_config_backup.error | default('') }} + when: + - upgrade_status.etcd_backup.status | default('pending') == 'failed' or upgrade_status.k8s_config_backup.status | default('pending') == 'failed' + + # ══════════════════════════════════════════════════════════════════ + # IMPORTANT: Node Upgrade Architecture + # ══════════════════════════════════════════════════════════════════ + # Control plane and worker node upgrades require SSH connections to + # individual nodes to execute commands like: + # - dnf install kubeadm- + # - kubeadm upgrade apply/node + # - systemctl restart kubelet + # - kubectl drain/uncordon + # + # PROBLEM: This file (execute_single_hop.yml) is included from tasks + # (via include_tasks in main.yml), so it cannot use import_playbook + # to call plays that target different hosts. + # + # SOLUTION: Use ansible-playbook command to call sub-playbooks that + # target node groups via SSH. This is the only way to call plays + # from within tasks in Ansible. + # ══════════════════════════════════════════════════════════════════ + + # NOTE: Dynamic inventory file is created once in the main playbook + # at /tmp/k8s_upgrade_inventory.ini - no need to recreate per hop + + # ── Upgrade control planes ─────────────────────────────────────── + - name: "Hop — Starting first control plane upgrade" + ansible.builtin.debug: + msg: >- + Upgrading first control plane to {{ _current_hop.to_version }}. 
+ Log file: /tmp/upgrade_cp_first.log + when: upgrade_status.upgrade.status != 'completed' + + - name: "Hop — Upgrade first control plane" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/upgrade_cp_first.yml + -i {{ k8s_upgrade_inventory }} + -e k8s_target_version={{ _current_hop.to_version }} + -e status_file={{ status_file }} + -e kube_vip={{ kube_vip }} + -e drain_timeout={{ drain_timeout }} + -e kubelet_ready_delay={{ kubelet_ready_delay }} + -e kubelet_ready_retries={{ kubelet_ready_retries }} + -e etcd_health_delay={{ etcd_health_delay }} + -e etcd_health_retries={{ etcd_health_retries }} + -e current_node_role={{ current_node_role | default('control_plane_first') }} + -e upgrade_dir_client={{ upgrade_dir_client }} + -e k8s_from_version={{ k8s_from_version }} + -e cluster_os_version={{ cluster_os_version }} + 2>&1 | tee /tmp/upgrade_cp_first.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: first_cp_result + changed_when: true + failed_when: first_cp_result.rc != 0 + when: upgrade_status.upgrade.status != 'completed' + + # ── Update cloud-init and BSS for first control plane ───────────── + - name: "Hop — Set BSS status to in_progress for service_kube_control_plane_first" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_control_plane_first: + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_control_plane_first.status | default('pending') != 'completed' + + - name: "Hop — Update cloud-init and BSS for service_kube_control_plane_first_x86_64" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/update_k8s_cloud_init_bss.yml + -e 
functional_group_name=service_kube_control_plane_first_x86_64 + 2>&1 | tee /tmp/update_bss_cp_first.log + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: bss_cp_first_result + changed_when: true + failed_when: bss_cp_first_result.rc != 0 + when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_control_plane_first.status | default('pending') != 'completed' + + - name: "Hop — Set BSS status to completed for service_kube_control_plane_first" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_control_plane_first: + status: completed + completed_at: "{{ ansible_date_time.iso8601 }}" + when: + - upgrade_status.upgrade.status != 'completed' + - bss_cp_first_result is defined + - bss_cp_first_result is not skipped + - bss_cp_first_result.rc == 0 + + # ── Reboot first control plane and verify cloud-init ───────────── + - name: "Hop — Reboot first control plane and wait for cloud-init" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/reboot_and_verify_cloud_init.yml + -i {{ k8s_upgrade_inventory }} + -e target_host={{ groups_cp_first[0] }} + -e cloud_init_timeout={{ cloud_init_timeout | default(600) }} + 2>&1 | tee /tmp/reboot_cp_first.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: reboot_cloud_init_first_cp_result + changed_when: true + failed_when: reboot_cloud_init_first_cp_result.rc != 0 + when: + - upgrade_status.upgrade.status != 'completed' + - bss_cp_first_result is defined + - bss_cp_first_result is not skipped + - bss_cp_first_result.rc == 0 + - groups_cp_first | length > 0 + + - name: "Hop — Display cloud-init verification success for first control plane" + ansible.builtin.debug: + msg: "Cloud-init completed successfully on first 
control plane {{ groups_cp_first[0] }} after reboot" + when: + - upgrade_status.upgrade.status != 'completed' + - reboot_cloud_init_first_cp_result is defined + - reboot_cloud_init_first_cp_result is not skipped + - reboot_cloud_init_first_cp_result.rc == 0 + + - name: "Hop — Starting additional control planes upgrade" + ansible.builtin.debug: + msg: >- + Upgrading additional control planes to {{ _current_hop.to_version }}. + Log file: /tmp/upgrade_cp_additional.log + when: upgrade_status.upgrade.status != 'completed' + + - name: "Hop — Upgrade additional control planes" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/upgrade_cp.yml + -i {{ k8s_upgrade_inventory }} + -e k8s_target_version={{ _current_hop.to_version }} + -e status_file={{ status_file }} + -e kube_vip={{ kube_vip }} + -e drain_timeout={{ drain_timeout }} + -e kubelet_ready_delay={{ kubelet_ready_delay }} + -e kubelet_ready_retries={{ kubelet_ready_retries }} + -e etcd_health_delay={{ etcd_health_delay }} + -e etcd_health_retries={{ etcd_health_retries }} + -e current_node_role={{ current_node_role | default('control_plane') }} + -e upgrade_dir_client={{ upgrade_dir_client }} + -e k8s_from_version={{ k8s_from_version }} + -e cluster_os_version={{ cluster_os_version }} + 2>&1 | tee /tmp/upgrade_cp_additional.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: additional_cp_result + changed_when: true + failed_when: additional_cp_result.rc != 0 + when: upgrade_status.upgrade.status != 'completed' + + # ── Update cloud-init and BSS for additional control planes ─────── + - name: "Hop — Set BSS status to in_progress for service_kube_control_plane" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_control_plane: + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + 
when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_control_plane.status | default('pending') != 'completed' + + - name: "Hop — Update cloud-init and BSS for service_kube_control_plane_x86_64" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/update_k8s_cloud_init_bss.yml + -e functional_group_name=service_kube_control_plane_x86_64 + 2>&1 | tee /tmp/update_bss_cp_additional.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: bss_cp_additional_result + changed_when: true + failed_when: bss_cp_additional_result.rc != 0 + when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_control_plane.status | default('pending') != 'completed' + + - name: "Hop — Set BSS status to completed for service_kube_control_plane" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_control_plane: + status: completed + completed_at: "{{ ansible_date_time.iso8601 }}" + when: + - upgrade_status.upgrade.status != 'completed' + - bss_cp_additional_result is defined + - bss_cp_additional_result is not skipped + - bss_cp_additional_result.rc == 0 + + - name: "Hop — Control plane upgrades complete" + ansible.builtin.debug: + msg: | + ═══════════════════════════════════════════════════════════════════════════ + Control plane upgrade execution is complete for hop {{ _hop_idx + 1 }}. + Proceeding with addon pre-checks... 
+ ═══════════════════════════════════════════════════════════════════════════ + when: + - _is_final_hop | bool + - upgrade_status.upgrade.status != 'completed' + + # ── Verify all control planes upgraded before addons ──────────── + - name: "Hop — Reload status before addon upgrade" + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: _status_reload + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + + - name: "Hop — Parse reloaded status" + ansible.builtin.set_fact: + upgrade_status: "{{ _status_reload.content | b64decode | from_yaml }}" + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + - _status_reload is defined + - _status_reload.content is defined + + - name: "Hop — Check all control planes upgraded" + ansible.builtin.set_fact: + _cp_nodes_status: >- + {{ upgrade_status.nodes | dict2items + | selectattr('value.role', 'in', ['control_plane_first', 'control_plane']) + | list }} + _failed_cp_nodes: >- + {{ upgrade_status.nodes | dict2items + | selectattr('value.role', 'in', ['control_plane_first', 'control_plane']) + | selectattr('value.status', 'in', ['failed', 'in_progress', 'pending']) + | map(attribute='key') + | list }} + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + + - name: "Hop — Fail if control planes not upgraded" + ansible.builtin.fail: + msg: | + ═══════════════════════════════════════════════════════════════════════════ + CANNOT PROCEED TO ADDON UPGRADE - CONTROL PLANES NOT READY + ═══════════════════════════════════════════════════════════════════════════ + + The following control plane nodes have not completed upgrade: + {{ _failed_cp_nodes | to_nice_yaml }} + + All control plane 
nodes must be successfully upgraded before proceeding + with addon upgrades (Calico, MetalLB, Helm). + + Current control plane status: + {% for node in _cp_nodes_status %} + - {{ node.key }}: {{ node.value.status }} (version: {{ node.value.version_current }}) + {% endfor %} + + REQUIRED ACTIONS: + ┌─────────────────────────────────────────────────────────────────────┐ + │ 1. Review the upgrade status file at {{ status_file }} │ + │ 2. Check logs for failed control plane upgrades │ + │ 3. Fix any issues with control plane nodes │ + │ 4. Re-run the upgrade playbook to continue │ + └─────────────────────────────────────────────────────────────────────┘ + + UPGRADE STOPPED: All control planes must be upgraded before addons. + ═══════════════════════════════════════════════════════════════════════════ + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + - _failed_cp_nodes is defined + - _failed_cp_nodes | length > 0 + + # ── Starting addon upgrade ─────────────────────────────────────── + - name: "Hop — Starting addon upgrade" + ansible.builtin.debug: + msg: | + ═══════════════════════════════════════════════════════════════════════════ + All control planes have been upgraded to {{ _current_hop.to_version }}. + Proceeding with addon upgrades (Calico, MetalLB, Helm)... 
+ ═══════════════════════════════════════════════════════════════════════════ + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + - _failed_cp_nodes is defined + - _failed_cp_nodes | length == 0 + + # ── Upgrade addons ─────────────────────────────────────────────── + - name: "Hop — Upgrade addons" + ansible.builtin.include_tasks: upgrade_addons.yml + when: + - _is_final_hop | bool + - upgrade_status.addon_upgrade.status | default('pending') != 'completed' + - upgrade_status.upgrade.status != 'completed' + - _failed_cp_nodes is defined + - _failed_cp_nodes | length == 0 + + # ── Upgrade workers ────────────────────────────────────────────── + - name: "Hop — Starting worker upgrades" + ansible.builtin.debug: + msg: >- + Upgrading workers to {{ _current_hop.to_version }} (batch size: {{ worker_parallel_count }}). + Log file: /tmp/upgrade_workers.log + when: upgrade_status.upgrade.status != 'completed' + + - name: "Hop — Upgrade workers (batch size) {{ worker_parallel_count }}" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/upgrade_workers.yml + -i {{ k8s_upgrade_inventory }} + -e k8s_target_version={{ _current_hop.to_version }} + -e status_file={{ status_file }} + -e kube_vip={{ kube_vip }} + -e drain_timeout={{ drain_timeout }} + -e kubelet_ready_delay={{ kubelet_ready_delay }} + -e kubelet_ready_retries={{ kubelet_ready_retries }} + -e upgrade_dir_client={{ upgrade_dir_client }} + -e k8s_from_version={{ k8s_from_version }} + -e cluster_os_version={{ cluster_os_version }} + -e worker_parallel_count={{ worker_parallel_count }} + 2>&1 | tee /tmp/upgrade_workers.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: workers_result + changed_when: true + failed_when: workers_result.rc != 0 + when: 
upgrade_status.upgrade.status != 'completed' + + # ── Update cloud-init and BSS for workers ───────────────────────── + - name: "Hop — Set BSS status to in_progress for service_kube_node" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_node: + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_node.status | default('pending') != 'completed' + + - name: "Hop — Update cloud-init and BSS for service_kube_node_x86_64" + ansible.builtin.shell: + cmd: > + set -o pipefail && ansible-playbook + {{ playbook_dir }}/../playbooks/update_k8s_cloud_init_bss.yml + -e functional_group_name=service_kube_node_x86_64 + 2>&1 | tee /tmp/update_bss_workers.log /dev/tty 2>/dev/null; + exit ${PIPESTATUS[0]} + args: + executable: /bin/bash + environment: + ANSIBLE_CONFIG: "{{ playbook_dir }}/../ansible.cfg" + register: bss_workers_result + changed_when: true + failed_when: bss_workers_result.rc != 0 + when: + - upgrade_status.upgrade.status != 'completed' + - upgrade_status.bss_update.service_kube_node.status | default('pending') != 'completed' + + - name: "Hop — Set BSS status to completed for service_kube_node" + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + bss_update: + service_kube_node: + status: completed + completed_at: "{{ ansible_date_time.iso8601 }}" + when: + - upgrade_status.upgrade.status != 'completed' + - bss_workers_result is defined + - bss_workers_result is not skipped + - bss_workers_result.rc == 0 + + # ── Post-validation ────────────────────────────────────────────── + - name: "Hop — Run post-validation" + ansible.builtin.include_tasks: post_validation.yml + + # ── Check and mark hop as completed ────────────────────────────── + - name: "Hop — Check if all nodes upgraded and mark hop completed" + ansible.builtin.include_tasks: check_and_mark_hop_completed.yml + 
+    # ── Mark hop completed ── NOTE(review): runs without a `when:` guard and passes upgrade.status=completed, while earlier tasks in this hop are skipped once upgrade_status.upgrade.status is 'completed' — confirm the status is reset before the next hop of a multi-hop chain.
+    - name: "Hop — Mark upgrade completed"
+      ansible.builtin.include_tasks: update_node_step.yml
+      vars:
+        status_update:
+          upgrade:
+            status: completed
+            completed_at: "{{ ansible_date_time.iso8601 }}"
+          multi_hop:
+            current_hop: "{{ _hop_idx }}"
+            hops: >-
+              {{ (upgrade_status.multi_hop.hops | default([]))
+                 | rejectattr('to', 'equalto', _current_hop.to_version)
+                 | list
+                 + [{
+                     'from': k8s_from_version,
+                     'to': _current_hop.to_version,
+                     'status': 'completed',
+                     'completed_at': ansible_date_time.iso8601,
+                     'backup_dir': backup_dir
+                   }] }}
+
+    - name: "Hop — Hop complete"
+      ansible.builtin.debug:
+        msg: >-
+          Hop {{ _hop_idx + 1 }} complete: {{ _current_hop.from_minor }} → {{ _current_hop.to_version }}.
+          Backup at: {{ backup_dir }}
diff --git a/upgrade/roles/upgrade_k8s/tasks/load_status.yml b/upgrade/roles/upgrade_k8s/tasks/load_status.yml
new file mode 100644
index 0000000000..ab23d7f1e1
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/load_status.yml
@@ -0,0 +1,297 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# ── Read or create status file ──────────────────────────────────────
+# The commented-out tasks below are disabled (original note: status file operations are handled by a separate play targeting kube_vip_group); the "Try to load existing status file from kube_vip" block further down still reads the file when upgrade_status is undefined or empty.
+# - name: Check if status file exists
+#   ansible.builtin.stat:
+#     path: "{{ status_file }}"
+#   delegate_to: "{{ kube_vip }}"
+#   register: status_file_stat
+
+# - name: Read existing status file
+#   ansible.builtin.slurp:
+#     src: "{{ status_file }}"
+#   delegate_to: "{{ kube_vip }}"
+#   register: status_content
+#   when: status_file_stat.stat.exists
+
+# - name: Parse existing status
+#   ansible.builtin.set_fact:
+#     upgrade_status: "{{ status_content.content | b64decode | from_yaml }}"
+#   when: status_file_stat.stat.exists
+
+# ── Ensure kube_vip is in inventory for delegation ── NOTE(review): connects as root with SSH host key checking disabled (StrictHostKeyChecking=no, UserKnownHostsFile=/dev/null) — confirm this is acceptable for the target environment.
+- name: Ensure kube_vip host is present for delegation
+  ansible.builtin.add_host:
+    name: "{{ kube_vip }}"
+    ansible_host: "{{ kube_vip }}"
+    ansible_connection: ssh
+    ansible_user: root
+    ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
+    groups: kube_vip_group
+
+# ── Read nodes.yaml for inventory ──────────────────────────────────
+- name: Read nodes.yaml
+  ansible.builtin.slurp:
+    src: "{{ nodes_yaml_path }}"
+  register: nodes_slurp
+  changed_when: false
+
+- name: Parse nodes.yaml
+  ansible.builtin.set_fact:
+    parsed_nodes: "{{ nodes_slurp.content | b64decode | from_yaml }}"
+
+# ── Build node lists by role ────────────────────────────────────────
+- name: Build first control plane list
+  ansible.builtin.set_fact:
+    groups_cp_first: >-
+      {{ parsed_nodes.nodes
+         | selectattr('group', 'equalto', group_cp_first)
+         | map(attribute='name') | list }}
+
+- name: Build additional control plane list
+  ansible.builtin.set_fact:
+    groups_cp: >-
+      {{ parsed_nodes.nodes
+         | selectattr('group', 'equalto', group_cp)
+         | map(attribute='name') | list }}
+
+- name: Build worker list
+  ansible.builtin.set_fact:
+    groups_worker: >-
+      {{ parsed_nodes.nodes
+         | selectattr('group', 'equalto', group_worker)
+         | map(attribute='name') | list }}
+
+- name: Build all nodes list
+  ansible.builtin.set_fact:
+    all_upgrade_nodes: "{{ groups_cp_first + groups_cp + groups_worker }}"
+
+# ── Build node IP map ───────────────────────────────────────────────
+- name: Build node name-to-IP mapping
+  ansible.builtin.set_fact:
+    node_ips: >-
+      {{ node_ips | default({}) | combine({
+           item.name: (item.interfaces | first).ip_addrs
+             | selectattr('name', 'equalto', 'management')
+             | map(attribute='ip_addr') | first
+         }) }}
+  loop: "{{ parsed_nodes.nodes }}"
+  loop_control:
+    label: "{{ item.name }}"
+
+# ── Add all nodes to Ansible inventory ──────────────────────────────
+- name: Add K8s nodes to inventory
+  ansible.builtin.include_tasks: add_nodes_to_inventory.yml
+
+# ── Detect current cluster version ──────────────────────────────────
+# Cluster version is now obtained from playbook level (upgrade_k8s.yml)
+# which runs on kube_vip_group directly to avoid delegation issues
+
+# ── Initialize status file if new upgrade ───────────────────────────
+
+- name: Try to load existing status file from kube_vip
+  when: upgrade_status is not defined or (upgrade_status is defined and upgrade_status.keys() | length == 0)
+  block:
+    - name: Check if upgrade status file exists on kube_vip
+      ansible.builtin.stat:
+        path: "{{ status_file }}"
+      delegate_to: "{{ kube_vip }}"
+      register: _status_file_stat
+
+    - name: Read status file (only if it exists)
+      ansible.builtin.slurp:
+        src: "{{ status_file }}"
+      delegate_to: "{{ kube_vip }}"
+      register: _existing_status_slurp
+      when: _status_file_stat.stat.exists | default(false)
+
+    - name: Parse existing status (only if valid and has nodes)
+      ansible.builtin.set_fact:
+        upgrade_status: "{{ _parsed_status }}"
+      vars:
+        _parsed_status: "{{ (_existing_status_slurp.content | b64decode | from_yaml) | default({}, true) }}"
+      when:
+        - _status_file_stat.stat.exists | default(false)
+        - _existing_status_slurp is defined
+        - (_existing_status_slurp.content | default('') | length) > 0
+        - _parsed_status is mapping
+        - _parsed_status.nodes is defined
+
+    - name: Debug loaded status
+      ansible.builtin.debug:
+        msg: "Loaded existing status from {{ status_file }}"
+      when:
+        - upgrade_status is defined
+        - upgrade_status is mapping
+        - upgrade_status.nodes is defined
+
+- name: Initialize upgrade status for new upgrade
+  when:
+    - upgrade_status is not defined or (upgrade_status is defined and upgrade_status.keys() | length == 0) or (upgrade_status is defined and
+      (upgrade_status.nodes is not defined))
+  block:
+    - name: Build initial node status entries
+      ansible.builtin.set_fact:
+        initial_nodes: >-
+          {{ initial_nodes | default({}) | combine({
+            item.0: {
+              'role': item.1,
+              'ip': node_ips[item.0],
+              'version_before': k8s_from_version,
+              'version_current': k8s_from_version,
+              'status': 'pending',
+              'steps': (
+                {
+                  'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_upgrade_apply': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'drain': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'crio_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'uncordon': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'validation': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'etcd_health_check': {'status': 'pending', 'timestamp': None, 'error': None}
+                }
+                if item.1 == 'control_plane_first'
+                else {
+                  'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_upgrade_node': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'drain': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'crio_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'uncordon': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'validation': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'etcd_health_check': {'status': 'pending', 'timestamp': None, 'error': None}
+                }
+                if item.1 == 'control_plane'
+                else {
+                  'setup_repos': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubeadm_upgrade_node': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'drain': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'crio_install': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'kubelet_restart': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'uncordon': {'status': 'pending', 'timestamp': None, 'error': None},
+                  'validation': {'status': 'pending', 'timestamp': None, 'error': None}
+                }
+                if item.1 == 'worker'
+                else {}
+              )
+            }
+          }) }}
+      loop: >-
+        {{ (groups_cp_first | zip_longest([], fillvalue='control_plane_first'))
+        + (groups_cp | zip_longest([], fillvalue='control_plane'))
+        + (groups_worker | zip_longest([], fillvalue='worker')) }}
+      loop_control:
+        label: "{{ item.0 }}"
+
+    - name: Set initial upgrade_status
+      ansible.builtin.set_fact:
+        upgrade_status:
+          upgrade:
+            from_version: "{{ k8s_from_version }}"
+            target_version: "{{ k8s_target_version }}"
+            status: in_progress
+            started_at: "{{ ansible_date_time.iso8601 }}"
+            completed_at:
+          etcd_backup:
+            status: pending
+            path:
+            timestamp:
+            error:
+          k8s_config_backup:
+            status: pending
+            path:
+            timestamp:
+            error:
+          addon_upgrade:
+            status: pending
+            calico:
+              status: pending
+            metallb:
+              status: pending
+            helm:
+              status: pending
+          bss_update:
+            service_kube_control_plane_first:
+              status: pending
+            service_kube_control_plane:
+              status: pending
+            service_kube_node:
+              status: pending
+          nodes: "{{ initial_nodes }}"
+
+    - name: Write initial status file locally
+      ansible.builtin.copy:
+        content: "{{ upgrade_status | to_json }}"
+        dest: "{{ upgrade_status_temp_json }}"
+        mode: "0644"
+      changed_when: true
+
+    - name: Convert JSON to YAML
+      ansible.builtin.shell:
+        cmd: >-
+          python3 -c "import json, yaml;
+          f = open('{{ upgrade_status_temp_json }}');
+          data = json.load(f); f.close();
+          f = open('{{ upgrade_status_temp_yml }}', 'w');
+          yaml.dump(data, f, default_flow_style=False, sort_keys=False);
+          f.close()"
+      changed_when: true
+
+    - name: Ensure status file directory exists on kube_vip
+      ansible.builtin.file:
+        path: "{{ status_file | dirname }}"
+        state: directory
+        mode: "0755"
+      delegate_to: "{{ kube_vip }}"
+
+    - name: Copy status file to kube_vip
+      ansible.builtin.copy:
+        src: "{{ upgrade_status_temp_yml }}"
+        dest: "{{ status_file }}"
+        mode: "0644"
+      delegate_to: "{{ kube_vip }}"
+
+    - name: Verify status file was created on kube_vip
+      ansible.builtin.stat:
+        path: "{{ status_file }}"
+      delegate_to: "{{ kube_vip }}"
+      register: _status_file_verify
+
+    - name: Fail if status file was not created on kube_vip
+      ansible.builtin.fail:
+        msg: "Failed to create status file on kube_vip: {{ status_file }}"
+      when: not (_status_file_verify.stat.exists | default(false))
+
+    - name: Clean up temp files
+      ansible.builtin.file:
+        path: "{{ item }}"
+        state: absent
+      loop:
+        - "{{ upgrade_status_temp_json }}"
+        - "{{ upgrade_status_temp_yml }}"
+      changed_when: false
+
+# ── Detect current addon versions and update status ────────────────
+- name: Detect current addon versions
+  ansible.builtin.include_tasks: detect_addon_versions.yml
+  when: upgrade_status is defined
diff --git a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml
new file mode 100644
index 0000000000..d930690682
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml
@@ -0,0 +1,133 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Reusable task file: loads configuration files and extracts version variables.
+# Can be re-included per hop in multi-hop upgrades with overridden
+# software_config_file / service_k8s_config_file paths.
+
+# ── Load configuration ──────────────────────────────────────────────
+- name: Load software_config.json
+  ansible.builtin.include_vars:
+    file: "{{ software_config_file }}"
+    name: software_config
+
+# Reuse the configs cached as localhost facts (cached_storage_config / cached_ha_config); this file does not load them itself (original note: loaded once at playbook start for performance).
+- name: Use cached storage_config
+  ansible.builtin.set_fact:
+    storage_config: "{{ hostvars['localhost']['cached_storage_config'] }}"
+
+- name: Use cached ha_config
+  ansible.builtin.set_fact:
+    ha_config: "{{ hostvars['localhost']['cached_ha_config'] }}"
+
+# ── Extract NFS paths ──────────────────────────────────────────────
+- name: Set NFS mount info from storage_config
+  ansible.builtin.set_fact:
+    _nfs_mount: >-
+      {{ storage_config.mounts
+         | selectattr('name', 'equalto', nfs_storage_name)
+         | first }}
+
+- name: Set NFS paths from mount info
+  ansible.builtin.set_fact:
+    k8s_server_share_path: "{{ _nfs_mount.source.split(':')[1] }}"
+    k8s_client_mount_path: "{{ _nfs_mount.mount_point }}"
+    k8s_nfs_server_ip: "{{ _nfs_mount.source.split(':')[0] }}"
+
+# ── Extract kube_vip ───────────────────────────────────────────────
+- name: Set kube_vip from high_availability_config
+  ansible.builtin.set_fact:
+    kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address }}"
+
+# ── Extract version variables ──────────────────────────────────────
+- name: Set k8s_target_version from software_config
+  ansible.builtin.set_fact:
+    k8s_target_version: >-
+      {{ software_config.softwares
+         | selectattr('name', 'equalto', 'service_k8s')
+         | map(attribute='version') | first }}
+
+- name: Extract cluster_os_version from software_config
+  ansible.builtin.set_fact:
+    cluster_os_version: "{{ software_config.cluster_os_version }}"
+
+- name: Set version-specific service_k8s config path
+  ansible.builtin.set_fact:
+    service_k8s_config_file: "{{ input_project_dir }}/config/x86_64/rhel/{{ cluster_os_version }}/service_k8s_v{{ k8s_target_version }}.json"
+
+- name: Verify version-specific service_k8s config exists
+  ansible.builtin.stat:
+    path: "{{ service_k8s_config_file }}"
+  register: service_k8s_config_stat
+
+- name: Fail if version-specific service_k8s config is missing
+  ansible.builtin.fail:
+    msg: "Required version-specific config not found: {{ service_k8s_config_file }}"
+  when: not service_k8s_config_stat.stat.exists
+
+- name: Load version-specific service_k8s config
+  ansible.builtin.include_vars:
+    file: "{{ service_k8s_config_file }}"
+    name: service_k8s_config
+
+- name: Derive k8s_target_minor
+  ansible.builtin.set_fact:
+    k8s_target_minor: "{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') }}"
+
+- name: Extract calico_target_version from service_k8s.json
+  ansible.builtin.set_fact:
+    calico_target_version: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('package', 'search', 'calico/node')
+         | map(attribute='tag') | first | regex_replace('^v', '') }}
+
+- name: Extract metallb_target_version from service_k8s.json
+  ansible.builtin.set_fact:
+    metallb_target_version: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('package', 'search', 'metallb/speaker')
+         | map(attribute='tag') | first | regex_replace('^v', '') }}
+
+- name: Extract helm_target_version from service_k8s.json
+  ansible.builtin.set_fact:
+    helm_target_version: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('package', 'search', 'helm-v')
+         | map(attribute='package') | first
+         | regex_replace('^helm-v', '')
+         | regex_replace('-.*$', '') }}
+
+# ── Extract addon package names (for manifest file names) ─────────────
+- name: Extract addon package names from service_k8s.json
+  ansible.builtin.set_fact:
+    calico_package: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('type', 'equalto', 'manifest')
+         | selectattr('package', 'search', 'calico')
+         | map(attribute='package') | join }}
+    metallb_package: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('type', 'equalto', 'manifest')
+         | selectattr('package', 'search', 'metallb-native')
+         | map(attribute='package') | join }}
+    helm_package: >-
+      {{ service_k8s_config.service_kube_control_plane_first.cluster
+         | selectattr('type', 'equalto', 'tarball')
+         | selectattr('package', 'search', 'helm')
+         | map(attribute='package') | join }}
+
+# ── Set OIM host ───────────────────────────────────────────────────
+- name: Set oim_host to NFS server IP
+  ansible.builtin.set_fact:
+    oim_host: "{{ k8s_nfs_server_ip }}"
diff --git a/upgrade/roles/upgrade_k8s/tasks/main.yml b/upgrade/roles/upgrade_k8s/tasks/main.yml
new file mode 100644
index 0000000000..fed6782bb2
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/main.yml
@@ -0,0 +1,184 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# ── COMP-K8S-001: Load configuration and extract version variables ───
+- name: Load config and version variables
+  ansible.builtin.include_tasks: load_version_vars.yml
+
+- name: Load local_repo_access.yml to get pulp protocol
+  ansible.builtin.include_vars:
+    file: "{{ local_repo_access_file }}"
+  when: inventory_hostname == "localhost"
+
+- name: Extract pulp protocol from local_repo_access
+  ansible.builtin.set_fact:
+    pulp_protocol: "{{ offline_tarball_path | regex_replace('^(https?)://.*', '\\1') }}"
+  when: inventory_hostname == "localhost"
+
+# ── COMP-K8S-001: Validate component dependencies ─────────────────────
+- name: Validate upgrade_manifest.yml dependencies
+  block:
+    - name: Read upgrade_manifest.yml
+      ansible.builtin.slurp:
+        src: "{{ upgrade_manifest_path }}"
+      register: manifest_raw
+      delegate_to: localhost
+
+    - name: Parse upgrade_manifest.yml
+      ansible.builtin.set_fact:
+        upgrade_manifest: "{{ manifest_raw.content | b64decode | from_yaml }}"
+      delegate_to: localhost
+
+    - name: Check OIM component status
+      ansible.builtin.fail:
+        msg: >-
+          K8s upgrade requires OIM to be completed first.
+          Current OIM status: {{ upgrade_manifest.component_status.oim | default('not found') }}
+      when:
+        - upgrade_manifest.component_status.oim | default('pending') != 'completed'
+
+    - name: Check local_repo component status
+      ansible.builtin.fail:
+        msg: >-
+          K8s upgrade requires local_repo to be completed first.
+          Current local_repo status: {{ upgrade_manifest.component_status.local_repo | default('not found') }}
+      when:
+        - upgrade_manifest.component_status.local_repo | default('pending') != 'completed'
+
+    - name: Check build_image component status
+      ansible.builtin.fail:
+        msg: >-
+          K8s upgrade requires build_image to be completed first.
+          Current build_image status: {{ upgrade_manifest.component_status.build_image | default('not found') }}
+      when:
+        - upgrade_manifest.component_status.build_image | default('pending') != 'completed'
+
+    - name: Display dependency validation success
+      ansible.builtin.debug:
+        msg: >-
+          Component dependency validation passed:
+          - OIM: {{ upgrade_manifest.component_status.oim }}
+          - local_repo: {{ upgrade_manifest.component_status.local_repo }}
+          - build_image: {{ upgrade_manifest.component_status.build_image }}
+
+# ── COMP-K8S-001: Set upgrade paths ───────────────────────────────────
+- name: Set upgrade paths on client NFS mount
+  ansible.builtin.set_fact:
+    upgrade_dir_client: "{{ k8s_client_mount_path }}/upgrade"
+    status_file: "{{ k8s_client_mount_path }}/upgrade/upgrade_status.yml"
+    lock_file: "{{ k8s_client_mount_path }}/upgrade/upgrade.lock"
+    backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup"
+    etcd_snapshot_file: "{{ k8s_client_mount_path }}/upgrade/backup/etcd-snapshot.db"
+    etcdctl_binary: "{{ k8s_client_mount_path }}/upgrade/backup/etcdctl"
+    etcd_members_file: "{{ k8s_client_mount_path }}/upgrade/backup/etcd-members.json"
+    k8s_config_backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup/configs"
+
+# ── COMP-K8S-011: Detect hop chain ────────────────────────────────────
+# Note: hop_chain is not computed in this file; per the original note the detection runs in a separate play targeting kube_vip_group
+# to avoid delegation issues (see upgrade_k8s.yml). The Phase 1 and Phase 2 tasks in this file read hop_chain, so it must already be set.
+
+# ══════════════════════════════════════════════════════════════════════
+# Phase 1 (Prep) — zero cluster impact
+# ══════════════════════════════════════════════════════════════════════
+- name: "Phase 1 — Prep all target versions"
+  ansible.builtin.include_tasks: prep_phase.yml
+  when: hop_chain | length > 0
+
+# ══════════════════════════════════════════════════════════════════════
+# Phase 2 (Upgrade) — cluster impact
+# ══════════════════════════════════════════════════════════════════════
+- name: "Phase 2 — Execute upgrade"
+  block:
+    # ── COMP-K8S-002: Acquire distributed lock ──────────────────────
+    - name: Acquire upgrade lock
+      ansible.builtin.include_tasks: acquire_lock.yml
+
+    # ── COMP-K8S-003: Load or create upgrade status ──────────────────
+    - name: Load or create upgrade status
+      ansible.builtin.include_tasks: load_status.yml
+
+    # ── COMP-K8S-003: Update Omnia lifecycle manifest (in-progress) ── NOTE(review): lineinfile appends `line` at end of file when the regexp matches nothing, and the whitespace inside `line` fixes the key's indentation — confirm both fit upgrade_manifest.yml.
+    - name: Update upgrade_manifest.yml (K8s in-progress)
+      ansible.builtin.lineinfile:
+        path: "{{ upgrade_manifest_path }}"
+        regexp: "^\\s+k8s:\\s*.*$"
+        line: " k8s: in-progress"
+        state: present
+      delegate_to: localhost
+
+    # ── COMP-K8S-011: Record multi-hop chain ─────────────────────────
+    - name: Record hop chain in status file
+      ansible.builtin.include_tasks: update_node_step.yml
+      vars:
+        status_update:
+          multi_hop:
+            chain: >-
+              {{ hop_chain | map(attribute='from_minor')
+                 | zip(hop_chain | map(attribute='to_version'))
+                 | map('join', '→') | list }}
+            total_hops: "{{ hop_chain | length }}"
+      when: hop_chain | length > 1
+
+    # ── COMP-K8S-011: Execute each hop ───────────────────────────────
+    - name: Execute hop
+      ansible.builtin.include_tasks: execute_single_hop.yml
+      loop: "{{ hop_chain }}"
+      loop_control:
+        loop_var: _current_hop
+        index_var: _hop_idx
+        label: "{{ _current_hop.from_minor }} → {{ _current_hop.to_version }}"
+
+    # ── COMP-K8S-003: Update Omnia lifecycle manifest (completed) ──────────────
+    - name: Update upgrade_manifest.yml (K8s completed)
+      ansible.builtin.lineinfile:
+        path: "{{ upgrade_manifest_path }}"
+        regexp: "^\\s+k8s:\\s*.*$"
+        line: " k8s: completed"
+        state: present
+      delegate_to: localhost
+
+    - name: Display final upgrade success
+      ansible.builtin.debug:
+        msg: >-
+          {% if hop_chain | length > 1 %}
+          Multi-hop upgrade complete ({{ hop_chain | length }} hops).
+          {% endif %}
+          {{ msg_upgrade_complete }}
+
+  rescue:
+    # ── COMP-K8S-012: Failure Scenario Handler ───────────────────────
+    - name: Update upgrade_manifest.yml (K8s failed)
+      ansible.builtin.lineinfile:
+        path: "{{ upgrade_manifest_path }}"
+        regexp: "^\\s+k8s:\\s*.*$"
+        line: " k8s: failed"
+        state: present
+      delegate_to: localhost
+
+    - name: Display failure message
+      ansible.builtin.debug:
+        msg: |
+          Kubernetes upgrade failed!
+          Status file: {{ status_file }}
+          Backup location: {{ backup_dir }}
+          For rollback, run: ansible-playbook rollback/rollback.yml --tags k8s
+
+    - name: Fail the playbook
+      ansible.builtin.fail:
+        msg: "Kubernetes upgrade failed"
+
+  always:
+    # ── COMP-K8S-002: Release distributed lock ── NOTE(review): `always` also runs when acquire_lock.yml itself failed — confirm release_lock.yml cannot remove a lock held by another run.
+    - name: Release upgrade lock
+      ansible.builtin.include_tasks: release_lock.yml
diff --git a/upgrade/roles/upgrade_k8s/tasks/post_validation.yml b/upgrade/roles/upgrade_k8s/tasks/post_validation.yml
new file mode 100644
index 0000000000..0148e5752b
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/post_validation.yml
@@ -0,0 +1,190 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Post-upgrade validation, run through kubectl on the kube_vip host.
+# Every check either fails the play or is reported honestly in the summary.
+- name: Get all node statuses
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: kubectl get nodes --no-headers
+  register: post_nodes
+  changed_when: false
+
+- name: Verify all nodes are Ready
+  ansible.builtin.fail:
+    msg: "Post-validation failed: Some nodes are not Ready.\n{{ post_nodes.stdout }}"
+  when: "'NotReady' in post_nodes.stdout"
+
+- name: Verify all nodes at target version
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}={.status.nodeInfo.kubeletVersion}{"\n"}{end}'
+  register: post_versions
+  changed_when: false
+
+# Each output line is "<node>=<kubeletVersion>". The version must be followed by
+# a non-version character or end of line, so that a node at v1.35.10 is not
+# accepted when the target is v1.35.1 (a plain substring test would accept it).
+- name: Check version output for mismatches
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: Not all nodes at v{{ k8s_target_version }}.
+      {{ post_versions.stdout }}
+  when: >-
+    item is not search('=v' ~ (k8s_target_version | regex_escape) ~ '([^0-9.]|$)')
+  loop: "{{ post_versions.stdout_lines }}"
+  loop_control:
+    label: "{{ item }}"
+
+# The query lists pods that are neither Running nor Succeeded. stderr is
+# discarded and the task never fails on its own, so the follow-up task must
+# look at rc as well as stdout: an empty stdout with a non-zero rc means the
+# query failed, not that every pod is healthy.
+- name: Verify all kube-system pods are Running
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    cmd: >-
+      set -o pipefail &&
+      kubectl get pods -n kube-system --no-headers
+      --field-selector status.phase!=Running,status.phase!=Succeeded
+      2>/dev/null | head -20
+  args:
+    executable: /bin/bash
+  register: unhealthy_pods
+  changed_when: false
+  failed_when: false
+
+- name: Fail if kube-system pods are not Running
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: Some kube-system pods are not Running
+      (or the pod query itself failed, rc={{ unhealthy_pods.rc }}).
+      {{ unhealthy_pods.stdout }}
+  when: unhealthy_pods.rc != 0 or unhealthy_pods.stdout | length > 0
+
+# ── etcd health check (Engineering Spec §4.7.5 Check 4) ───────────
+# Best effort: when no etcd pod can be looked up the health check is skipped,
+# and the final summary reports it as "not checked".
+- name: Get first control plane pod name for etcd
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: kubectl get pods -n kube-system -l component=etcd -o jsonpath='{.items[0].metadata.name}'
+  register: etcd_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Verify etcd cluster health
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl exec -n kube-system {{ etcd_pod_name.stdout }} --
+      etcdctl --endpoints=https://127.0.0.1:2379
+      --cacert=/etc/kubernetes/pki/etcd/ca.crt
+      --cert=/etc/kubernetes/pki/etcd/server.crt
+      --key=/etc/kubernetes/pki/etcd/server.key
+      endpoint health --cluster
+  register: etcd_health
+  changed_when: false
+  when: etcd_pod_name.rc == 0
+
+- name: Fail if etcd is unhealthy
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: etcd cluster is unhealthy.
+      {{ etcd_health.stdout }}
+      {{ etcd_health.stderr }}
+  when:
+    - etcd_pod_name.rc == 0
+    - "'is unhealthy' in etcd_health.stdout or etcd_health.rc != 0"
+
+# ── Calico pods check (Engineering Spec §4.7.5 Check 5) ───────────
+- name: Check Calico pods
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    cmd: >-
+      set -o pipefail &&
+      kubectl get pods -n calico-system --no-headers
+      --field-selector status.phase!=Running,status.phase!=Succeeded
+      2>/dev/null | head -20
+  args:
+    executable: /bin/bash
+  register: unhealthy_calico_pods
+  changed_when: false
+  failed_when: false
+
+- name: Fail if Calico pods are not Running
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: Some Calico pods are not Running
+      (or the pod query itself failed, rc={{ unhealthy_calico_pods.rc }}).
+      {{ unhealthy_calico_pods.stdout }}
+  when: unhealthy_calico_pods.rc != 0 or unhealthy_calico_pods.stdout | length > 0
+
+# ── MetalLB pods check (Engineering Spec §4.7.5 Check 6) ─────────
+- name: Check MetalLB pods
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    cmd: >-
+      set -o pipefail &&
+      kubectl get pods -n metallb-system --no-headers
+      --field-selector status.phase!=Running,status.phase!=Succeeded
+      2>/dev/null | head -20
+  args:
+    executable: /bin/bash
+  register: unhealthy_metallb_pods
+  changed_when: false
+  failed_when: false
+
+- name: Fail if MetalLB pods are not Running
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: Some MetalLB pods are not Running
+      (or the pod query itself failed, rc={{ unhealthy_metallb_pods.rc }}).
+      {{ unhealthy_metallb_pods.stdout }}
+  when: unhealthy_metallb_pods.rc != 0 or unhealthy_metallb_pods.stdout | length > 0
+
+# ── API server reachability check (Engineering Spec §4.7.5 Check 7) ─
+- name: Verify API server reachability
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: kubectl cluster-info
+  register: cluster_info
+  changed_when: false
+  failed_when: false
+
+- name: Fail if API server is not reachable
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: API server is not reachable.
+      {{ cluster_info.stdout }}
+      {{ cluster_info.stderr }}
+  when: "'is running at' not in cluster_info.stdout"
+
+# ── DNS resolution check (Engineering Spec §4.7.5 Check 8) ────────
+# Runs a throw-away busybox pod; the epoch suffix keeps the pod name unique.
+- name: Test DNS resolution
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl run k8s-dns-test-{{ ansible_date_time.epoch }} --image=busybox:1.36
+      --rm -i --restart=Never -- nslookup kubernetes.default.svc.cluster.local
+  register: dns_test
+  changed_when: false
+  failed_when: false
+
+- name: Fail if DNS resolution fails
+  ansible.builtin.fail:
+    msg: >-
+      Post-validation failed: DNS resolution is not working.
+      {{ dns_test.stdout }}
+      {{ dns_test.stderr }}
+  when: "'Server:' not in dns_test.stdout or dns_test.rc != 0"
+
+- name: Display post-validation summary
+  ansible.builtin.debug:
+    msg: >-
+      Post-validation complete.
+      All {{ all_upgrade_nodes | length }} nodes at v{{ k8s_target_version }} and Ready.
+      kube-system pods: Running
+      etcd: {{ 'healthy' if etcd_pod_name.rc == 0 else 'not checked (no etcd pod found)' }}
+      Calico pods: Running
+      MetalLB pods: Running
+      API server: reachable
+      DNS: working
+      Cluster upgrade from {{ k8s_from_version }} to {{ k8s_target_version }} successful.
diff --git a/upgrade/roles/upgrade_k8s/tasks/preflight_checks_pulp.yml b/upgrade/roles/upgrade_k8s/tasks/preflight_checks_pulp.yml
new file mode 100644
index 0000000000..b0d33c4c00
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/preflight_checks_pulp.yml
@@ -0,0 +1,92 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# ── Pulp repo checks (on omnia_core) ───────────────────────────────
+# Each lookup is non-fatal; the task right after it turns a non-zero rc into
+# a failure with remediation hints.
+- name: Verify Pulp kubernetes distribution exists
+  ansible.builtin.command:
+    cmd: >-
+      /usr/local/bin/pulp rpm distribution show
+      --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_minor | replace('.', '-') }}
+  register: pulp_k8s_dist
+  changed_when: false
+  failed_when: false
+
+- name: Fail if kubernetes distribution missing
+  ansible.builtin.fail:
+    msg: >-
+      Required Pulp distribution 'kubernetes-v{{ k8s_target_minor | replace('.', '-') }}' not found.
+      The prep phase (Phase 1) should have synced this.
+      Check prep phase output or run 'ansible-playbook local_repo/local_repo.yml' manually.
+  when: pulp_k8s_dist.rc != 0
+
+- name: Verify Pulp cri-o distribution exists
+  ansible.builtin.command:
+    cmd: >-
+      /usr/local/bin/pulp rpm distribution show
+      --name x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_minor | replace('.', '-') }}
+  register: pulp_crio_dist
+  changed_when: false
+  failed_when: false
+
+- name: Fail if cri-o distribution missing
+  ansible.builtin.fail:
+    msg: >-
+      Required Pulp distribution 'cri-o-v{{ k8s_target_minor | replace('.', '-') }}' not found.
+      The prep phase (Phase 1) should have synced this.
+      Check prep phase output or run 'ansible-playbook local_repo/local_repo.yml' manually.
+  when: pulp_crio_dist.rc != 0
+
+# ── Verify container images in Pulp (on omnia_core) ────────────────
+# Images are stored in Pulp without registry prefix (e.g., kube-apiserver not registry.k8s.io/kube-apiserver)
+# A reachable registry with a missing tag is fatal; an unreachable registry
+# (status other than 200) only produces a warning.
+- name: Check required container images in Pulp
+  ansible.builtin.uri:
+    url: "https://{{ admin_nic_ip }}:2225/v2/{{ item.name }}/tags/list"
+    method: GET
+    validate_certs: false
+    status_code: [200]
+  loop:
+    - { name: "kube-apiserver", tag: "v{{ k8s_target_version }}" }
+    - { name: "kube-controller-manager", tag: "v{{ k8s_target_version }}" }
+    - { name: "kube-scheduler", tag: "v{{ k8s_target_version }}" }
+    - { name: "kube-proxy", tag: "v{{ k8s_target_version }}" }
+  register: image_checks
+  changed_when: false
+  failed_when: false
+
+- name: Verify required tags exist for each image
+  ansible.builtin.fail:
+    msg: "Required image {{ item.item.name }}:{{ item.item.tag }} not found in Pulp registry. Available tags: {{ item.json.tags | default([]) | join(', ') }}"
+  loop: "{{ image_checks.results }}"
+  loop_control:
+    label: "{{ item.item.name }}:{{ item.item.tag }}"
+  when:
+    - item.status == 200
+    - item.item.tag not in (item.json.tags | default([]))
+
+- name: Warn if image check failed
+  ansible.builtin.debug:
+    msg: >-
+      WARNING: Could not verify image {{ item.item.name }} in Pulp registry
+      (status: {{ item.status | default('unknown') }}). Upgrade may fail if image is not available.
+  loop: "{{ image_checks.results }}"
+  loop_control:
+    label: "{{ item.item.name }}"
+  when: item.status != 200
+
+- name: Display verified images
+  ansible.builtin.debug:
+    msg: >-
+      Verified: All required K8s images (kube-apiserver, kube-controller-manager,
+      kube-scheduler, kube-proxy) with tag v{{ k8s_target_version }} are present in Pulp registry.
+  when: image_checks.results | selectattr('status', 'equalto', 200) | list | length == 4
diff --git a/upgrade/roles/upgrade_k8s/tasks/prep_merge_configs.yml b/upgrade/roles/upgrade_k8s/tasks/prep_merge_configs.yml
new file mode 100644
index 0000000000..a8d7f3a41d
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/prep_merge_configs.yml
@@ -0,0 +1,134 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Merge local_repo_config.yml and service_k8s.json from ALL hop artifacts
+# into combined files in a staging directory, so local_repo.yml can be called once.
+#
+# Inputs:
+#   hop_chain    — list of hops (from detect_hop_chain_from_manifest.yml)
+#   _staging_dir — pre-created temp directory to write merged files into
+#
+# Outputs (files written to _staging_dir):
+#   local_repo_config.yml — merged repo URLs
+#   software_config.json — from final hop
+#   config/x86_64/rhel/<cluster_os_version>/service_k8s.json — merged image+RPM entries
+
+# The merge runs as inline Python through the command module. argv is used so
+# the script is passed to python3 verbatim (newlines and quotes intact, no
+# shell involved). ansible.builtin.script is not suitable here: it expects the
+# first word to be a local script file to transfer, not an interpreter.
+# The script receives: staging dir, artifacts base dir, comma-joined hop dirs.
+# It prints a one-line JSON summary on stdout.
+- name: "Merge_configs — Merge local_repo_config.yml across all hops"
+  ansible.builtin.command:
+    argv:
+      - python3
+      - "-c"
+      - |
+        import json, yaml, sys, os
+
+        staging = sys.argv[1]
+        artifacts_base = sys.argv[2]
+        hop_dirs = sys.argv[3].split(',')
+
+        # --- Merge local_repo_config.yml ---
+        # Start from the final hop's config (most complete), then union repo entries from earlier hops
+        merged_repo_x86 = []
+        merged_repo_aarch64 = []
+        seen_names_x86 = set()
+        seen_names_aarch64 = set()
+        base_config = None
+
+        for hop_dir in hop_dirs:
+            path = os.path.join(artifacts_base, hop_dir, 'local_repo_config.yml')
+            with open(path) as f:
+                # An empty YAML file loads as None; treat it as an empty mapping.
+                cfg = yaml.safe_load(f) or {}
+            base_config = cfg  # last one wins for non-list fields
+
+            for entry in (cfg.get('omnia_repo_url_rhel_x86_64') or []):
+                key = entry.get('name', '')
+                if key not in seen_names_x86:
+                    seen_names_x86.add(key)
+                    merged_repo_x86.append(entry)
+
+            for entry in (cfg.get('omnia_repo_url_rhel_aarch64') or []):
+                key = entry.get('name', '')
+                if key not in seen_names_aarch64:
+                    seen_names_aarch64.add(key)
+                    merged_repo_aarch64.append(entry)
+
+        base_config['omnia_repo_url_rhel_x86_64'] = merged_repo_x86
+        base_config['omnia_repo_url_rhel_aarch64'] = merged_repo_aarch64
+
+        with open(os.path.join(staging, 'local_repo_config.yml'), 'w') as f:
+            yaml.dump(base_config, f, default_flow_style=False, sort_keys=False)
+
+        # --- Merge service_k8s.json ---
+        # Union cluster entries across all hops, dedup by (package, tag, type)
+        merged_svc = {}
+        for hop_dir in hop_dirs:
+            path = os.path.join(artifacts_base, hop_dir, 'service_k8s.json')
+            with open(path) as f:
+                svc = json.load(f)
+            for section_name, section_data in svc.items():
+                if section_name not in merged_svc:
+                    merged_svc[section_name] = {'cluster': []}
+                existing = merged_svc[section_name]['cluster']
+                seen = set()
+                for e in existing:
+                    seen.add((e.get('package',''), e.get('tag',''), e.get('type','')))
+                for entry in section_data.get('cluster', []):
+                    key = (entry.get('package',''), entry.get('tag',''), entry.get('type',''))
+                    if key not in seen:
+                        seen.add(key)
+                        existing.append(entry)
+
+        os.makedirs(os.path.join(staging, 'config', 'x86_64', 'rhel'), exist_ok=True)
+        # We write to a temp location; the caller sets the os_version subdir
+        svc_out = os.path.join(staging, 'merged_service_k8s.json')
+        with open(svc_out, 'w') as f:
+            json.dump(merged_svc, f, indent=2)
+
+        print(json.dumps({'repo_count_x86': len(merged_repo_x86),
+                          'repo_count_aarch64': len(merged_repo_aarch64),
+                          'service_k8s_sections': list(merged_svc.keys())}))
+      - "{{ _staging_dir }}"
+      - "{{ role_path }}/../../artifacts"
+      - "{{ hop_chain | map(attribute='artifacts_dir') | join(',') }}"
+  register: _merge_result
+  changed_when: false
+
+- name: "Merge_configs — Display merge summary"
+  ansible.builtin.debug:
+    msg: >-
+      Merged configs from {{ hop_chain | length }} hops:
+      {{ _merge_result.stdout | trim }}
+
+- name: "Merge_configs — Copy final hop's software_config.json to staging"
+  ansible.builtin.copy:
+    src: "{{ role_path }}/../../artifacts/{{ hop_chain[-1].artifacts_dir }}/software_config.json"
+    dest: "{{ _staging_dir }}/software_config.json"
+    mode: "{{ file_perm_644 }}"
+    remote_src: true
+
+- name: "Merge_configs — Create config subdirectory for service_k8s.json"
+  ansible.builtin.file:
+    path: "{{ _staging_dir }}/config/x86_64/rhel/{{ cluster_os_version }}"
+    state: directory
+    mode: "{{ dir_perm_755 }}"
+
+# The script wrote merged_service_k8s.json at the staging root; place it under
+# the OS-version directory and then drop the temporary copy.
+- name: "Merge_configs — Move merged service_k8s.json to expected path"
+  ansible.builtin.copy:
+    src: "{{ _staging_dir }}/merged_service_k8s.json"
+    dest: "{{ _staging_dir }}/config/x86_64/rhel/{{ cluster_os_version }}/service_k8s.json"
+    mode: "{{ file_perm_644 }}"
+    remote_src: true
+
+- name: "Merge_configs — Remove temp merged file"
+  ansible.builtin.file:
+    path: "{{ _staging_dir }}/merged_service_k8s.json"
+    state: absent
diff --git a/upgrade/roles/upgrade_k8s/tasks/prep_phase.yml b/upgrade/roles/upgrade_k8s/tasks/prep_phase.yml
new file mode 100644
index 0000000000..b49adda983
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/prep_phase.yml
@@ -0,0 +1,217 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Phase 1 (Prep) — zero cluster impact.
+#
+# 1. Sync local repos for ALL hops in a single merged local_repo call.
+#    (local_repo_config.yml and service_k8s.json merged across hops.)
+# 2. Per hop: build squashfs image, verify all prep artifacts.
+#
+# If any step fails here, the cluster is untouched and the playbook aborts.
+#
+# Inputs:
+#   hop_chain — list of hops (from detect_hop_chain_from_manifest.yml)
+
+- name: Phase 1 — Prep all hop target versions
+  ansible.builtin.debug:
+    msg: >-
+      Starting Phase 1 (Prep). Will sync repos (merged), build squashfs, and verify
+      images for {{ hop_chain | length }} target version(s).
+      Cluster will NOT be touched during this phase.
+
+# ── Step 1: Sync ALL repos in one local_repo call ────────────────────
+- name: "Phase 1 — Sync Pulp repos for all target versions"
+  ansible.builtin.include_tasks: prep_sync_all_repos.yml
+
+# ── Step 1.5: Download addon manifests to NFS (once for all hops) ─────
+# NOTE: These tasks delegate to kube_vip because the NFS is mounted there,
+# not on localhost (Ansible controller). This ensures manifests are available
+# to all nodes via the shared NFS mount.
+#
+# Download tasks use ignore_errors (not failed_when: false) so the registered
+# result keeps its real failed state; failed_when: false would force it to
+# false and the dedicated "Fail if ... download failed" tasks, which carry the
+# URL and the manual staging hint, could never run.
+- name: "Phase 1 — Download addon manifests for target versions"
+  block:
+    - name: Include local repo access variable file
+      ansible.builtin.include_vars: "{{ local_repo_access_file }}"
+
+    - name: Validate offline_manifest_path and offline_tarball_path
+      ansible.builtin.assert:
+        that:
+          - offline_manifest_path is defined
+          - offline_manifest_path | length > 0
+          - offline_tarball_path is defined
+          - offline_tarball_path | length > 0
+        fail_msg: >-
+          offline_manifest_path and/or offline_tarball_path not defined in
+          {{ local_repo_access_file }}. These are required for downloading
+          addon manifests and tarballs from Pulp.
+        quiet: true
+
+    - name: Create addon directories on NFS
+      ansible.builtin.file:
+        path: "{{ k8s_client_mount_path }}/{{ item }}"
+        state: directory
+        mode: "0755"
+      delegate_to: "{{ kube_vip }}"
+      loop:
+        - calico
+        - metallb
+        - helm
+
+    - name: Load service_k8s config for target version
+      ansible.builtin.set_fact:
+        service_k8s_config: "{{ lookup('file', service_k8s_config_file) | from_json }}"
+
+    # Package names are taken from the service_kube_control_plane_first section,
+    # filtered by entry type and a name search.
+    - name: Extract addon package names from service_k8s config
+      ansible.builtin.set_fact:
+        calico_package: >-
+          {{ service_k8s_config['service_kube_control_plane_first']['cluster']
+          | selectattr('type', 'equalto', 'manifest')
+          | selectattr('package', 'search', 'calico')
+          | map(attribute='package') | join }}
+        metallb_package: >-
+          {{ service_k8s_config['service_kube_control_plane_first']['cluster']
+          | selectattr('type', 'equalto', 'manifest')
+          | selectattr('package', 'search', 'metallb-native')
+          | map(attribute='package') | join }}
+        helm_package: >-
+          {{ service_k8s_config['service_kube_control_plane_first']['cluster']
+          | selectattr('type', 'equalto', 'tarball')
+          | selectattr('package', 'search', 'helm')
+          | map(attribute='package') | join }}
+
+    - name: Extract helm_target_version from service_k8s config
+      ansible.builtin.set_fact:
+        helm_target_version: >-
+          {{ helm_package | regex_replace('^helm-v([0-9.]+).*', '\1') }}
+
+    - name: Set addon manifest URLs (following k8s_config pattern)
+      ansible.builtin.set_fact:
+        calico_manifest_yaml_url: >-
+          {{ offline_manifest_path }}/{{ calico_package }}/{{ calico_package }}.yml
+        metallb_manifest_yaml_url: >-
+          {{ offline_manifest_path }}/{{ metallb_package }}/{{ metallb_package }}.yml
+        helm_tarball_url: >-
+          {{ offline_tarball_path }}/{{ helm_package }}/{{ helm_package }}.tar.gz
+
+    - name: Download Calico manifest from Pulp to NFS
+      ansible.builtin.get_url:
+        url: "{{ calico_manifest_yaml_url }}"
+        dest: "{{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml"
+        mode: "0644"
+        timeout: 300
+      delegate_to: "{{ kube_vip }}"
+      register: calico_download
+      ignore_errors: true
+
+    - name: Fail if Calico manifest download failed
+      ansible.builtin.fail:
+        msg: >-
+          Failed to download Calico manifest from Pulp.
+          URL: {{ calico_manifest_yaml_url }}.
+          Please verify Pulp has this artifact or manually stage at
+          {{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml
+      when: calico_download is failed
+
+    - name: Download MetalLB manifest from Pulp to NFS
+      ansible.builtin.get_url:
+        url: "{{ metallb_manifest_yaml_url }}"
+        dest: "{{ k8s_client_mount_path }}/metallb/{{ metallb_package }}.yml"
+        mode: "0644"
+        timeout: 300
+      delegate_to: "{{ kube_vip }}"
+      register: metallb_download
+      ignore_errors: true
+
+    - name: Fail if MetalLB manifest download failed
+      ansible.builtin.fail:
+        msg: >-
+          Failed to download MetalLB manifest from Pulp.
+          URL: {{ metallb_manifest_yaml_url }}.
+          Please verify Pulp has this artifact or manually stage at
+          {{ k8s_client_mount_path }}/metallb/{{ metallb_package }}.yml
+      when: metallb_download is failed
+
+    - name: Download Helm tarball from Pulp to NFS (consistent with fresh install)
+      ansible.builtin.get_url:
+        url: "{{ helm_tarball_url }}"
+        dest: "{{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz"
+        mode: "0644"
+        timeout: 300
+      delegate_to: "{{ kube_vip }}"
+      register: helm_download
+      ignore_errors: true
+
+    - name: Fail if Helm tarball download failed
+      ansible.builtin.fail:
+        msg: >-
+          Failed to download Helm tarball from Pulp.
+          URL: {{ helm_tarball_url }}.
+          Please verify Pulp has this artifact or manually stage at
+          {{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz
+      when: helm_download is failed
+
+    # --transform renames the archive's top-level linux-amd64 directory so each
+    # Helm version is extracted into its own directory.
+    - name: Extract Helm tarball to version-specific directory (consistent with fresh install)
+      ansible.builtin.unarchive:
+        src: "{{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz"
+        dest: "{{ k8s_client_mount_path }}/helm/"
+        remote_src: true
+        extra_opts:
+          - "--transform"
+          - "s|^linux-amd64|linux-amd64-helm-v{{ helm_target_version }}|"
+      delegate_to: "{{ kube_vip }}"
+      when: helm_download is not failed
+
+    - name: Verify staged addon artifacts on NFS
+      ansible.builtin.stat:
+        path: "{{ item.path }}"
+      delegate_to: "{{ kube_vip }}"
+      register: addon_artifacts
+      loop:
+        - path: "{{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml"
+        - path: "{{ k8s_client_mount_path }}/metallb/{{ metallb_package }}.yml"
+        - path: "{{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz"
+        - path: "{{ k8s_client_mount_path }}/helm/linux-amd64-helm-v{{ helm_target_version }}/helm"
+
+    - name: Fail if any addon artifact is missing
+      ansible.builtin.fail:
+        msg: "Required addon artifact not found: {{ item.item.path }}"
+      loop: "{{ addon_artifacts.results }}"
+      when: not item.stat.exists
+
+    - name: Display addon staging summary
+      ansible.builtin.debug:
+        msg: >-
+          Addon manifests staged successfully from Pulp to NFS ({{ kube_vip }}):
+          - Calico: {{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml
+          - MetalLB: {{ k8s_client_mount_path }}/metallb/{{ metallb_package }}.yml
+          - Helm tarball: {{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz
+          - Helm binary: {{ k8s_client_mount_path }}/helm/linux-amd64-helm-v{{ helm_target_version }}/helm
+
+# ── Step 2: Per-hop build + verify ───────────────────────────────────
+# The task name is static: the loop variable is not available when the name is
+# templated. The per-hop versions are shown through loop_control.label.
+- name: "Phase 1 — Build & verify each target version"
+  ansible.builtin.include_tasks: prep_phase_single_hop.yml
+  loop: "{{ hop_chain }}"
+  loop_control:
+    loop_var: _prep_hop
+    label: "{{ _prep_hop.from_minor }} → {{ _prep_hop.to_version }}"
+  vars:
+    _hop_target_version: "{{ _prep_hop.to_version }}"
+    _hop_target_minor: "{{ _prep_hop.to_version | regex_replace('\\.[0-9]+$', '') }}"
+    _hop_artifacts_dir: "{{ _prep_hop.artifacts_dir }}"
+
+- name: "Phase 1 — Prep complete"
+  ansible.builtin.debug:
+    msg: >-
+      Phase 1 (Prep) complete. All repos synced, squashfs images built, and
+      artifacts verified for {{ hop_chain | length }} target version(s).
+      Proceeding to Phase 2 (Upgrade).
diff --git a/upgrade/roles/upgrade_k8s/tasks/prep_phase_single_hop.yml b/upgrade/roles/upgrade_k8s/tasks/prep_phase_single_hop.yml
new file mode 100644
index 0000000000..02daea9bfd
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/prep_phase_single_hop.yml
@@ -0,0 +1,28 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Prep a single hop's target version: build squashfs and verify artifacts.
+# Repo sync is handled by prep_sync_all_repos.yml (called once for all hops).
+# NOTE: both steps are currently commented out below, so this file runs no tasks.
+#
+# Inputs (set by caller):
+#   _hop_target_version — e.g. "1.35.1"
+#   _hop_target_minor   — e.g. "1.35"
+#   _hop_artifacts_dir  — e.g. "v1.35.1"
+
+# Commented out to skip squashfs build and proceed directly with k8s upgrade
+# - name: "prep [{{ _hop_target_version }}] — Build squashfs image"
+#   ansible.builtin.include_tasks: build_squashfs.yml
+
+# - name: "prep [{{ _hop_target_version }}] — Verify all artifacts"
+#   ansible.builtin.include_tasks: verify_images.yml
diff --git a/upgrade/roles/upgrade_k8s/tasks/prep_sync_all_repos.yml b/upgrade/roles/upgrade_k8s/tasks/prep_sync_all_repos.yml
new file mode 100644
index 0000000000..da2c031fed
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/prep_sync_all_repos.yml
@@ -0,0 +1,130 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Sync local Pulp repos for ALL hop target versions in a single local_repo call.
+#
+# Merges local_repo_config.yml and service_k8s.json from every hop's artifacts
+# directory into a combined staging directory, then invokes local_repo.yml once.
+# This avoids N× overhead of credential loading, Pulp validation, and SELinux setup.
+#
+# Inputs:
+#   hop_chain         — list of hops (from detect_hop_chain_from_manifest.yml)
+#   input_project_dir — real input directory (shared files symlinked from here)
+
+# A non-zero rc from `pulp rpm distribution show` is taken to mean the
+# distribution for that hop's minor version does not exist yet.
+- name: "Prep_sync_repos — Check which hops already have Pulp repos"
+  ansible.builtin.command:
+    cmd: >-
+      pulp rpm distribution show
+      --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ item.to_version
+      | regex_replace('\.[0-9]+$', '')
+      | replace('.', '-') }}
+  loop: "{{ hop_chain }}"
+  loop_control:
+    label: "v{{ item.to_version }}"
+  register: _pulp_checks
+  changed_when: false
+  failed_when: false
+
+- name: "Prep_sync_repos — Build list of hops needing sync"
+  ansible.builtin.set_fact:
+    _hops_needing_sync: >-
+      {{ _pulp_checks.results
+      | selectattr('rc', 'ne', 0)
+      | map(attribute='item')
+      | list }}
+
+- name: "Prep_sync_repos — Skip if all repos already synced"
+  ansible.builtin.debug:
+    msg: >-
+      All {{ hop_chain | length }} hop(s) already have Pulp repos synced — skipping.
+  when: _hops_needing_sync | length == 0
+
+- name: Prep sync repos — Sync repos
+  when: _hops_needing_sync | length > 0
+  block:
+    - name: "Prep_sync_repos — Create staging directory"
+      ansible.builtin.tempfile:
+        state: directory
+        prefix: "upgrade_repo_sync_"
+      register: _repo_staging
+
+    - name: "Prep_sync_repos — Merge configs from all hops"
+      ansible.builtin.include_tasks: prep_merge_configs.yml
+      vars:
+        _staging_dir: "{{ _repo_staging.path }}"
+
+    # Symlink shared (non-version-specific) files from real input dir.
+    # local_repo.yml / validate_config may need these.
+    # Best effort: failed_when: false lets a missing file be skipped.
+    - name: "Prep_sync_repos — Symlink shared files from input_project_dir"
+      ansible.builtin.file:
+        src: "{{ input_project_dir }}/{{ item }}"
+        dest: "{{ _repo_staging.path }}/{{ item }}"
+        state: link
+        force: false
+      loop:
+        - network_spec.yml
+        - omnia_config_credentials.yml
+        - build_stream_config.yml
+        - storage_config.yml
+        - high_availability_config.yml
+      failed_when: false
+
+    # Symlink package manifests (used by parse_and_download for RPM list).
+    - name: "Prep_sync_repos — Create config subdirectories for shared package manifests"
+      ansible.builtin.file:
+        path: "{{ _repo_staging.path }}/config/x86_64/rhel/{{ cluster_os_version }}"
+        state: directory
+        mode: "{{ dir_perm_755 }}"
+
+    - name: "Prep_sync_repos — Symlink shared package manifests"
+      ansible.builtin.file:
+        src: "{{ input_project_dir }}/config/x86_64/rhel/{{ cluster_os_version }}/{{ item }}"
+        dest: "{{ _repo_staging.path }}/config/x86_64/rhel/{{ cluster_os_version }}/{{ item }}"
+        state: link
+        force: false
+      loop:
+        - default_packages.json
+        - additional_packages.json
+        - admin_debug_packages.json
+      failed_when: false
+
+    # The play recap always contains "changed=<n>", so only a non-zero count
+    # is reported as a change.
+    - name: "Prep_sync_repos — Run local_repo playbook with merged staging dir"
+      ansible.builtin.command:
+        cmd: >-
+          ansible-playbook
+          {{ role_path }}/../../local_repo/local_repo.yml
+          --extra-vars "input_project_dir={{ _repo_staging.path }}"
+          --extra-vars "project_dir_status=true"
+          --extra-vars "softwares=service_k8s"
+      register: _sync_result
+      changed_when: "_sync_result.stdout is search('changed=[1-9]')"
+
+    # No failed_when override here: a still-missing distribution fails the play.
+    - name: "Prep_sync_repos — Verify repos exist for all hops"
+      ansible.builtin.command:
+        cmd: >-
+          pulp rpm distribution show
+          --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ item.to_version
+          | regex_replace('\.[0-9]+$', '')
+          | replace('.', '-') }}
+      loop: "{{ _hops_needing_sync }}"
+      loop_control:
+        label: "v{{ item.to_version }}"
+      changed_when: false
+
+  always:
+    # The staging directory is removed whether or not the sync succeeded.
+    - name: "Prep_sync_repos — Clean up staging directory"
+      ansible.builtin.file:
+        path: "{{ _repo_staging.path }}"
+        state: absent
+      when: _repo_staging.path is defined
diff --git a/upgrade/roles/upgrade_k8s/tasks/release_lock.yml b/upgrade/roles/upgrade_k8s/tasks/release_lock.yml
new file mode 100644
index 0000000000..dc5235b9eb
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/release_lock.yml
@@ -0,0 +1,20 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Report-only addon checks after the upgrade: every kubectl probe below sets failed_when: false.
+- name: Verify calico-node pods are Running
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get pods -n kube-system -l k8s-app=calico-node
+      --no-headers -o custom-columns=':status.phase'
+  register: calico_pods  # stdout carries one pod phase per line
+  changed_when: false
+  failed_when: false
+
+- name: Warn if any calico-node pod is not Running
+  ansible.builtin.debug:
+    msg: "WARNING: Some calico-node pods are not Running: {{ calico_pods.stdout }}"
+  when: "calico_pods.rc != 0 or calico_pods.stdout | trim | length == 0 or calico_pods.stdout_lines | reject('search', 'Running') | list | length > 0"  # per-line check: one bad pod is enough
+
+- name: Verify MetalLB pods are Running
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get pods -n metallb-system --no-headers
+      -o custom-columns=':metadata.name,:status.phase'
+  register: metallb_pods  # stdout carries "<pod name> <phase>" per line
+  changed_when: false
+  failed_when: false
+
+- name: Warn if any MetalLB pod is not Running
+  ansible.builtin.debug:
+    msg: "WARNING: Some MetalLB pods not Running: {{ metallb_pods.stdout }}"
+  when: "metallb_pods.rc != 0 or metallb_pods.stdout | trim | length == 0 or metallb_pods.stdout_lines | reject('search', 'Running') | list | length > 0"  # per-line check: one bad pod is enough
+
+- name: Verify LoadBalancer services have EXTERNAL-IP
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get svc -A --field-selector spec.type=LoadBalancer
+      -o jsonpath='{range .items[*]}{.metadata.name}: {.status.loadBalancer.ingress[0].ip}{"\n"}{end}'
+  register: lb_services  # only displayed in the summary; no task evaluates it
+  changed_when: false
+  failed_when: false
+
+- name: Display addon validation summary
+  ansible.builtin.debug:  # OK/WARN use the same criteria as the warning tasks above, not just kubectl's rc
+    msg: >-
+      Addon validation complete.
+      Calico pods: {{ 'OK' if (calico_pods.rc == 0 and calico_pods.stdout | trim | length > 0 and calico_pods.stdout_lines | reject('search', 'Running') | list | length == 0) else 'WARN' }}
+      MetalLB pods: {{ 'OK' if (metallb_pods.rc == 0 and metallb_pods.stdout | trim | length > 0 and metallb_pods.stdout_lines | reject('search', 'Running') | list | length == 0) else 'WARN' }}
+      LoadBalancer IPs: {{ lb_services.stdout | default('none', true) }}
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_calico_upgrade.yml b/upgrade/roles/upgrade_k8s/tasks/step_calico_upgrade.yml
new file mode 100644
index 0000000000..034d83a832
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_calico_upgrade.yml
@@ -0,0 +1,81 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Upgrades Calico: back up live objects, patch a scratch copy of the manifest, apply it, wait for rollouts.
+- name: Set Calico manifest paths
+  ansible.builtin.set_fact:
+    calico_source: "{{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml"
+    calico_work: "/tmp/calico-upgrade-{{ calico_target_version }}.yaml"  # scratch copy; all patches and the apply use this path
+
+- name: Back up current Calico resources
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    cmd: |
+      /usr/bin/kubectl get daemonset calico-node -n kube-system -o yaml \
+        > {{ k8s_client_mount_path }}/upgrade/backup/calico-node-ds-backup.yaml
+      /usr/bin/kubectl get deployment calico-kube-controllers -n kube-system -o yaml \
+        > {{ k8s_client_mount_path }}/upgrade/backup/calico-kube-controllers-backup.yaml
+  changed_when: true
+  failed_when: false  # best-effort: a failed backup does not stop the upgrade
+
+- name: Copy manifest to working location
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.copy:
+    src: "{{ calico_source }}"
+    dest: "{{ calico_work }}"
+    remote_src: true  # source path is read on kube_vip, not on the controller
+    mode: "0644"
+
+- name: Patch image registry (quay.io to docker.io)
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.replace:
+    path: "{{ calico_work }}"
+    regexp: "quay\\.io/calico/"
+    replace: "docker.io/calico/"
+
+- name: Check if IP_AUTODETECTION_METHOD already set
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.shell:
+    cmd: "grep -c 'IP_AUTODETECTION_METHOD' {{ calico_work }} || true"  # "|| true" keeps rc 0 when grep counts zero matches
+  register: autodetect_check
+  changed_when: false
+
+- name: Patch IP_AUTODETECTION_METHOD
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.replace:
+    path: "{{ calico_work }}"
+    regexp: '^( *)  value: "autodetect"[ \t]*$'  # single-quoted YAML keeps backslashes literal, so no doubling; \1 = indent of the "- name:" line
+    replace: "\\1  value: \"autodetect\"\n\\1- name: IP_AUTODETECTION_METHOD\n\\1  value: \"cidr={{ admin_nic_cidr }}\""  # new env entry reuses the captured indent
+  when: autodetect_check.stdout | int == 0  # skip when the manifest already declares the variable
+  changed_when: true
+
+- name: Apply Calico manifest
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "/usr/bin/kubectl apply --server-side --force-conflicts -f {{ calico_work }}"
+  register: calico_apply
+  changed_when: true
+
+- name: Wait for calico-node DaemonSet rollout
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "/usr/bin/kubectl rollout status daemonset/calico-node -n kube-system --timeout={{ addon_rollout_timeout }}s"
+  register: calico_node_rollout
+  changed_when: false
+
+- name: Wait for calico-kube-controllers Deployment rollout
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "/usr/bin/kubectl rollout status deployment/calico-kube-controllers -n kube-system --timeout=120s"  # fixed 120s, unlike the DaemonSet wait above
+  register: calico_kc_rollout
+  changed_when: false
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_crio_install.yml b/upgrade/roles/upgrade_k8s/tasks/step_crio_install.yml
new file mode 100644
index 0000000000..f16aa469d5
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_crio_install.yml
@@ -0,0 +1,32 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Installs the target-version CRI-O RPM on the node being upgraded, then reloads systemd and restarts crio.
+- name: Install CRI-O package
+  ansible.builtin.dnf:
+    name: "cri-o-{{ k8s_target_version }}"
+    state: present
+    disablerepo: "*"  # only the repo enabled below is consulted; its id embeds major-minor (1.35.1 -> 1-35)
+    enablerepo: "x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}"
+  delegate_to: "{{ current_node_name }}"
+
+- name: Reload systemd daemon after CRI-O install
+  ansible.builtin.systemd:
+    daemon_reload: true
+  delegate_to: "{{ current_node_name }}"
+
+- name: Restart CRI-O service after install
+  ansible.builtin.systemd:
+    name: crio
+    state: restarted
+  delegate_to: "{{ current_node_name }}"
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_drain.yml b/upgrade/roles/upgrade_k8s/tasks/step_drain.yml
new file mode 100644
index 0000000000..16ddafe110
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_drain.yml
@@ -0,0 +1,26 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Production-safe approach: cordon only (no eviction).
+# Instead of a drain (which evicts pods and can be blocked by PDBs),
+# we cordon the node to prevent new pods, then let existing pods
+# restart in place after the kubelet upgrade. This is safer for stateful
+# workloads like Kafka that have strict PodDisruptionBudgets.
+
+- name: Cordon node (prevent new pod scheduling) - {{ current_node_name }}
+  ansible.builtin.command:
+    cmd: kubectl cordon {{ node_ip }}
+  delegate_to: "{{ kube_vip }}"  # kubectl runs on kube_vip; the node is addressed by node_ip
+  register: drain_result  # despite the name, this holds the `kubectl cordon` result
+  changed_when: true
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_etcd_health_check.yml b/upgrade/roles/upgrade_k8s/tasks/step_etcd_health_check.yml
new file mode 100644
index 0000000000..0e8f04c71c
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_etcd_health_check.yml
@@ -0,0 +1,89 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # etcd health gate: try etcdctl via kubectl exec, fall back to a local etcdctl, abort the play if still unhealthy.
+- name: Get etcd pod name on {{ current_node_name }}
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get pods -n kube-system
+      -l component=etcd
+      --field-selector spec.nodeName={{ node_ip }}
+      -o jsonpath="{.items[0].metadata.name}"
+  delegate_to: "{{ kube_vip }}"
+  register: etcd_pod  # stdout is the pod name passed to `kubectl exec` below
+  changed_when: false
+
+- name: Check etcd cluster health after upgrade (kubectl exec) - {{ current_node_name }}
+  ansible.builtin.command:
+    cmd: >-
+      kubectl exec -n kube-system {{ etcd_pod.stdout }} --
+      etcdctl
+      --endpoints=https://127.0.0.1:2379
+      --cacert=/etc/kubernetes/pki/etcd/ca.crt
+      --cert=/etc/kubernetes/pki/etcd/server.crt
+      --key=/etc/kubernetes/pki/etcd/server.key
+      endpoint health --cluster
+  delegate_to: "{{ kube_vip }}"
+  register: etcd_health_result
+  changed_when: false
+  retries: "{{ etcd_health_retries }}"
+  delay: "{{ etcd_health_delay }}"
+  until: etcd_health_result.rc == 0
+  ignore_errors: true  # a failure here only switches to the fallback path; the final verdict is the last task
+
+- name: Find etcdctl in CRI-O overlay storage (fallback)
+  ansible.builtin.shell: set -o pipefail && find /tmp/crio-storage/overlay -name etcdctl -type f 2>/dev/null | head -1
+  args:
+    executable: /bin/bash  # NOTE(review): with pipefail a non-zero `find` (e.g. missing directory) fails this task - confirm intended
+  register: etcdctl_overlay_path
+  changed_when: false
+  when: etcd_health_result.rc != 0  # fallback tasks carry no delegate_to, so they run on the play's current host
+
+- name: Copy etcdctl from overlay to /usr/local/bin (fallback)
+  ansible.builtin.copy:
+    src: "{{ etcdctl_overlay_path.stdout }}"
+    dest: /usr/local/bin/etcdctl
+    mode: "0755"
+    remote_src: true
+  when:
+    - etcd_health_result.rc != 0
+    - etcdctl_overlay_path.stdout | length > 0  # nothing to copy when find printed no path
+
+- name: Check etcd cluster health after upgrade (fallback to local etcdctl) - {{ current_node_name }}
+  ansible.builtin.command:
+    cmd: >-
+      etcdctl
+      --endpoints=https://127.0.0.1:2379
+      --cacert=/etc/kubernetes/pki/etcd/ca.crt
+      --cert=/etc/kubernetes/pki/etcd/server.crt
+      --key=/etc/kubernetes/pki/etcd/server.key
+      endpoint health --cluster
+  register: etcd_health_result_fallback
+  changed_when: false
+  retries: "{{ etcd_health_retries }}"
+  delay: "{{ etcd_health_delay }}"
+  until: etcd_health_result_fallback.rc == 0
+  when: etcd_health_result.rc != 0
+
+- name: Set final etcd health result
+  ansible.builtin.set_fact:
+    etcd_health_result: "{{ etcd_health_result_fallback }}"  # the fallback outcome replaces the kubectl-exec outcome
+  when:
+    - etcd_health_result_fallback is defined
+    - etcd_health_result_fallback is not skipped
+    - etcd_health_result_fallback.stdout is defined
+
+- name: ABORT if etcd quorum lost
+  ansible.builtin.fail:
+    msg: "{{ msg_etcd_quorum_lost }}\n{{ etcd_health_result.stdout | default('') }}\n{{ etcd_health_result.stderr | default('') }}"
+  when: "'is unhealthy' in (etcd_health_result.stdout | default('')) or (etcd_health_result.rc | default(0)) != 0"  # unhealthy text or non-zero rc
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_helm_upgrade.yml b/upgrade/roles/upgrade_k8s/tasks/step_helm_upgrade.yml
new file mode 100644
index 0000000000..dce11bb507
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_helm_upgrade.yml
@@ -0,0 +1,44 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Replaces /usr/local/bin/helm on kube_vip with the target-version binary and prints the resulting version.
+- name: Set Helm tarball and binary paths (consistent with fresh install)
+  ansible.builtin.set_fact:
+    helm_tarball: "{{ k8s_client_mount_path }}/helm/{{ helm_package }}.tar.gz"  # not referenced by the tasks in this file
+    helm_binary_path: "{{ k8s_client_mount_path }}/helm/linux-amd64-helm-v{{ helm_target_version }}/helm"
+
+- name: Back up current Helm binary
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "cp /usr/local/bin/helm /usr/local/bin/helm.bak"
+  changed_when: true
+  failed_when: false  # best-effort: a failed copy (e.g. no existing binary) does not stop the upgrade
+
+- name: Copy Helm binary from NFS to kube_vip
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.copy:
+    src: "{{ helm_binary_path }}"
+    dest: /usr/local/bin/helm
+    mode: "0755"
+    remote_src: true  # source path is read on kube_vip, not on the controller
+
+- name: Verify Helm version
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: helm version --short
+  register: helm_version_check  # only displayed below; not compared against helm_target_version
+  changed_when: false
+
+- name: Display Helm version
+  ansible.builtin.debug:
+    msg: "Helm upgraded to: {{ helm_version_check.stdout }}"
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_kubeadm_install.yml b/upgrade/roles/upgrade_k8s/tasks/step_kubeadm_install.yml
new file mode 100644
index 0000000000..dbe827f0c2
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_kubeadm_install.yml
@@ -0,0 +1,22 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Installs the target-version kubeadm RPM on the node being upgraded.
+- name: Install kubeadm package
+  ansible.builtin.dnf:
+    name: "kubeadm-{{ k8s_target_version }}"
+    state: present
+    disablerepo: "*"  # only the repo enabled below is consulted; its id embeds major-minor (1.35.1 -> 1-35)
+    enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}"
+  delegate_to: "{{ current_node_name }}"
+  register: kubeadm_install_result
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_kubelet_install.yml b/upgrade/roles/upgrade_k8s/tasks/step_kubelet_install.yml
new file mode 100644
index 0000000000..4b42a991fd
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_kubelet_install.yml
@@ -0,0 +1,24 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Installs the target-version kubelet and kubectl RPMs on the node being upgraded.
+- name: Install kubelet and kubectl packages
+  ansible.builtin.dnf:
+    name:
+      - "kubelet-{{ k8s_target_version }}"
+      - "kubectl-{{ k8s_target_version }}"
+    state: present
+    disablerepo: "*"  # same single-repo restriction as the kubeadm install
+    enablerepo: "x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_version | regex_replace('\\.[0-9]+$', '') | replace('.', '-') }}"
+  delegate_to: "{{ current_node_name }}"
+  register: kubelet_install_result
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_kubelet_restart.yml b/upgrade/roles/upgrade_k8s/tasks/step_kubelet_restart.yml
new file mode 100644
index 0000000000..f88d250e20
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_kubelet_restart.yml
@@ -0,0 +1,47 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Part 1: Reload systemd on the node
+- name: Reload systemd on {{ current_node_name }}
+  ansible.builtin.systemd:
+    daemon_reload: true
+  delegate_to: "{{ current_node_name }}"
+
+# Part 2: Restart the crio service
+- name: Restart crio on {{ current_node_name }}
+  ansible.builtin.systemd:
+    name: crio
+    state: restarted
+  delegate_to: "{{ current_node_name }}"
+
+# Part 3: Restart the kubelet service
+- name: Restart kubelet on {{ current_node_name }}
+  ansible.builtin.systemd:
+    name: kubelet
+    state: restarted
+  delegate_to: "{{ current_node_name }}"
+
+# Part 4: Poll until the node reports the target kubelet version and Ready=True
+- name: Wait for node to become Ready
+  ansible.builtin.command: >-
+    kubectl get node {{ node_ip }}
+    -o jsonpath="{.status.nodeInfo.kubeletVersion}:{range .status.conditions[?(@.type==\"Ready\")]}{.status}{end}"
+  delegate_to: "{{ kube_vip }}"
+  register: node_ready_check  # stdout has the form "<kubeletVersion>:<Ready status>"
+  changed_when: false
+  retries: "{{ kubelet_ready_retries }}"
+  delay: "{{ kubelet_ready_delay }}"
+  until:
+    - node_ready_check.rc == 0
+    - "'v' + k8s_target_version + ':True' in node_ready_check.stdout"
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_metallb_upgrade.yml b/upgrade/roles/upgrade_k8s/tasks/step_metallb_upgrade.yml
new file mode 100644
index 0000000000..c0c9cb8a18
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_metallb_upgrade.yml
@@ -0,0 +1,47 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Upgrades MetalLB: copy the manifest to a scratch path on kube_vip, apply it, wait for both rollouts.
+- name: Set MetalLB manifest path
+  ansible.builtin.set_fact:
+    metallb_source: "{{ k8s_client_mount_path }}/metallb/{{ metallb_package }}.yml"
+    metallb_work: "/tmp/metallb-upgrade-{{ metallb_target_version }}.yaml"  # scratch copy that gets applied
+
+- name: Copy MetalLB manifest to working location
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.copy:
+    src: "{{ metallb_source }}"
+    dest: "{{ metallb_work }}"
+    remote_src: true  # source path is read on kube_vip, not on the controller
+    mode: "0644"
+
+- name: Apply MetalLB manifest
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "kubectl apply --server-side --force-conflicts -f {{ metallb_work }}"
+  register: metallb_apply
+  changed_when: true
+
+- name: Wait for MetalLB speaker DaemonSet rollout
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "kubectl rollout status daemonset/speaker -n metallb-system --timeout={{ addon_rollout_timeout }}s"
+  register: metallb_speaker_rollout
+  changed_when: false
+
+- name: Wait for MetalLB controller Deployment rollout
+  delegate_to: "{{ kube_vip }}"
+  ansible.builtin.command:
+    cmd: "kubectl rollout status deployment/controller -n metallb-system --timeout=120s"  # fixed 120s, unlike the speaker wait above
+  register: metallb_controller_rollout
+  changed_when: false
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml b/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml
new file mode 100644
index 0000000000..4291804071
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml
@@ -0,0 +1,19 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Makes the node schedulable again; kubectl runs on kube_vip and addresses the node by node_ip.
+- name: Uncordon node {{ current_node_name }}
+  ansible.builtin.command: kubectl uncordon {{ node_ip }}
+  delegate_to: "{{ kube_vip }}"
+  register: uncordon_result
+  changed_when: true
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_upgrade_apply.yml b/upgrade/roles/upgrade_k8s/tasks/step_upgrade_apply.yml
new file mode 100644
index 0000000000..ee49cd0829
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_upgrade_apply.yml
@@ -0,0 +1,28 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Runs `kubeadm upgrade plan` then `kubeadm upgrade apply` for the target version; no delegate_to, so on the current host.
+- name: Run kubeadm upgrade plan
+  ansible.builtin.command: kubeadm upgrade plan v{{ k8s_target_version }}
+  register: upgrade_plan
+  changed_when: false  # the plan step is treated as read-only
+
+- name: Display upgrade plan output
+  ansible.builtin.debug:
+    msg: "{{ upgrade_plan.stdout_lines }}"
+    verbosity: 1  # printed only when the play runs with -v or higher
+
+- name: Run kubeadm upgrade apply
+  ansible.builtin.command: kubeadm upgrade apply v{{ k8s_target_version }} --yes
+  register: upgrade_apply_result
+  changed_when: true
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_upgrade_node.yml b/upgrade/roles/upgrade_k8s/tasks/step_upgrade_node.yml
new file mode 100644
index 0000000000..3abde668e7
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_upgrade_node.yml
@@ -0,0 +1,19 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Runs `kubeadm upgrade node` on the node being upgraded.
+- name: Run kubeadm upgrade node on {{ current_node_name }}
+  ansible.builtin.command: kubeadm upgrade node
+  delegate_to: "{{ current_node_name }}"
+  register: upgrade_node_result
+  changed_when: true
diff --git a/upgrade/roles/upgrade_k8s/tasks/step_validate_node.yml b/upgrade/roles/upgrade_k8s/tasks/step_validate_node.yml
new file mode 100644
index 0000000000..9a97910bb2
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/step_validate_node.yml
@@ -0,0 +1,32 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # One-shot post-upgrade check: the node must report the target kubelet version and Ready=True.
+- name: Validate node version and status - {{ current_node_name }}
+  ansible.builtin.command:
+    cmd: >-
+      kubectl get node {{ node_ip }}
+      -o jsonpath="{.status.nodeInfo.kubeletVersion} {range .status.conditions[?(@.type==\"Ready\")]}{.status}{end}"
+  delegate_to: "{{ kube_vip }}"
+  register: validate_result  # stdout has the form "<kubeletVersion> <Ready status>"
+  changed_when: false
+
+- name: Verify node is at target version and Ready
+  ansible.builtin.assert:
+    that:  # both are substring checks against validate_result.stdout
+      - "'v' + k8s_target_version in validate_result.stdout"
+      - "'True' in validate_result.stdout"
+    fail_msg: >-
+      Node {{ current_node_name }} validation failed.
+      Expected v{{ k8s_target_version }} Ready, got: {{ validate_result.stdout }}
+    success_msg: "Node {{ current_node_name }} upgraded to v{{ k8s_target_version }} and Ready."
diff --git a/upgrade/roles/upgrade_k8s/tasks/sync_local_repo.yml b/upgrade/roles/upgrade_k8s/tasks/sync_local_repo.yml
new file mode 100644
index 0000000000..71d53bdaad
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/sync_local_repo.yml
@@ -0,0 +1,120 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# DEPRECATED: Superseded by prep_sync_all_repos.yml + prep_merge_configs.yml
+# which merge configs across ALL hops and call local_repo.yml once.
+# Kept for reference only; not included by any task file.
+#
+# Sync local Pulp repos for a specific target version.
+#
+# Creates a temporary staging directory with symlinks to the hop's artifact
+# configs and passes it to local_repo.yml as input_project_dir via extra-vars.
+# The user's real input directory is NEVER modified.
+#
+# Inputs:
+#   _hop_target_version — full target version (e.g. "1.35.1")
+#   _hop_target_minor — minor version (e.g. "1.35")
+#   _hop_artifacts_dir — artifacts directory name (e.g. "v1.35.1")
+#   admin_nic_ip — Pulp server address
+
+- name: "Sync local repo — Check if kubernetes repo already synced"
+  ansible.builtin.command:
+    cmd: >-
+      pulp rpm distribution show
+      --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ _hop_target_minor | replace('.', '-') }}
+  register: _pulp_k8s_check  # rc 0 is read below as "distribution exists"
+  changed_when: false
+  failed_when: false
+
+- name: "Sync local repo — Check if cri-o repo already synced"
+  ansible.builtin.command:
+    cmd: >-
+      pulp rpm distribution show
+      --name x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ _hop_target_minor | replace('.', '-') }}
+  register: _pulp_crio_check  # rc 0 is read below as "distribution exists"
+  changed_when: false
+  failed_when: false
+
+- name: "Sync local repo — Skip if repos already present"
+  ansible.builtin.debug:  # informational only; the real skip is the `when` on the block below
+    msg: >-
+      Pulp repos for v{{ _hop_target_version }} already synced — skipping sync.
+      kubernetes: {{ 'present' if _pulp_k8s_check.rc == 0 else 'MISSING' }},
+      cri-o: {{ 'present' if _pulp_crio_check.rc == 0 else 'MISSING' }}
+  when:
+    - _pulp_k8s_check.rc == 0
+    - _pulp_crio_check.rc == 0
+
+- name: "Sync local repo — Run local_repo sync"
+  when: _pulp_k8s_check.rc != 0 or _pulp_crio_check.rc != 0  # sync when either distribution is missing
+  vars:
+    _artifacts_abs: "{{ role_path }}/../../artifacts/{{ _hop_artifacts_dir }}"
+  block:
+    # Build a temp staging dir that mirrors the layout local_repo expects.
+    # Only the 3 version-specific configs are symlinked from artifacts;
+    # everything else falls through from the real input dir.
+    - name: "Sync local repo — Create temp staging directory"
+      ansible.builtin.tempfile:
+        state: directory
+        prefix: "upgrade_prep_{{ _hop_target_version }}_"
+      register: _staging_dir
+
+    - name: "Sync local repo — Symlink software_config.json into staging"
+      ansible.builtin.file:
+        src: "{{ _artifacts_abs }}/software_config.json"
+        dest: "{{ _staging_dir.path }}/software_config.json"
+        state: link
+
+    - name: "Sync local repo — Symlink local_repo_config.yml into staging"
+      ansible.builtin.file:
+        src: "{{ _artifacts_abs }}/local_repo_config.yml"
+        dest: "{{ _staging_dir.path }}/local_repo_config.yml"
+        state: link
+
+    - name: "Sync local repo — Create config subdirectory in staging"
+      ansible.builtin.file:
+        path: "{{ _staging_dir.path }}/config/x86_64/rhel/{{ cluster_os_version }}"
+        state: directory
+        mode: "{{ dir_perm_755 }}"
+
+    - name: "Sync local repo — Symlink service_k8s.json into staging"
+      ansible.builtin.file:
+        src: "{{ _artifacts_abs }}/service_k8s.json"
+        dest: "{{ _staging_dir.path }}/config/x86_64/rhel/{{ cluster_os_version }}/service_k8s.json"
+        state: link
+
+    - name: "Sync local repo — Run local_repo playbook with staging dir"
+      ansible.builtin.command:
+        cmd: >-
+          ansible-playbook
+          {{ role_path }}/../../local_repo/local_repo.yml
+          --extra-vars "input_project_dir={{ _staging_dir.path }}"
+          --extra-vars "softwares=service_k8s"
+          --extra-vars "target_k8s_version={{ _hop_target_version }}"
+      register: _sync_result
+      changed_when: "_sync_result.stdout is search('changed=[1-9]')"  # the recap always prints changed=N; count only non-zero N
+
+    - name: "Sync local repo — Verify repos now exist"
+      ansible.builtin.command:  # NOTE(review): re-checks only the kubernetes distribution, not cri-o
+        cmd: >-
+          pulp rpm distribution show
+          --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ _hop_target_minor | replace('.', '-') }}
+      changed_when: false
+
+  always:
+    - name: "Sync local repo — Clean up staging directory"
+      ansible.builtin.file:
+        path: "{{ _staging_dir.path }}"
+        state: absent
+      when: _staging_dir.path is defined  # nothing to remove when tempfile creation itself failed
diff --git a/upgrade/roles/upgrade_k8s/tasks/update_addon_step.yml b/upgrade/roles/upgrade_k8s/tasks/update_addon_step.yml
new file mode 100644
index 0000000000..d6007d06b6
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/update_addon_step.yml
@@ -0,0 +1,24 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Records one addon's status: wraps addon_status_update under addon_upgrade.<addon_name>, then calls update_node_step.yml.
+- name: Build resolved addon status entry for {{ addon_name }}
+  ansible.builtin.set_fact:
+    _addon_status_entry: >-
+      {{ {addon_name: addon_status_update} }}
+
+- name: Update addon status
+  ansible.builtin.include_tasks: update_node_step.yml
+  vars:
+    status_update:
+      addon_upgrade: "{{ _addon_status_entry }}"  # single-key dict built above, keyed by the value of addon_name
diff --git a/upgrade/roles/upgrade_k8s/tasks/update_node_status.yml b/upgrade/roles/upgrade_k8s/tasks/update_node_status.yml
new file mode 100644
index 0000000000..cac0b51919
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s/tasks/update_node_status.yml
@@ -0,0 +1,47 @@
+---
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# Node-specific status updates using custom k8s_upgrade_status module +# This module provides optimized performance with file locking and atomic operations +# +# Required variables: +# - node_name: The name of the node to update +# - node_status_update: The status update dictionary for this node +# - status_file: path to status file on kube_vip +# - kube_vip: target host where status file is stored +# +# Example usage: +# - ansible.builtin.include_tasks: update_node_status.yml +# vars: +# node_name: "{{ current_node_name }}" +# node_status_update: +# status: in_progress +# steps: +# setup_repos: +# status: completed +# timestamp: "{{ now(utc=true).isoformat() }}" + +- name: Update node status using custom module + k8s_upgrade_status: + status_file: "{{ status_file }}" + kube_vip: "{{ kube_vip }}" + node_name: "{{ node_name }}" + node_status_update: "{{ node_status_update }}" + delegate_to: "{{ kube_vip }}" + register: _status_update_result + +- name: Update in-memory upgrade_status + ansible.builtin.set_fact: + upgrade_status: "{{ _status_update_result.merged_status }}" diff --git a/upgrade/roles/upgrade_k8s/tasks/update_node_step.yml b/upgrade/roles/upgrade_k8s/tasks/update_node_step.yml new file mode 100644 index 0000000000..13b3757d65 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/update_node_step.yml @@ -0,0 +1,43 @@ +--- +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Wrapper for general status updates (etcd_backup, k8s_config_backup, addon_upgrade, etc.) +# Calls the consolidated update_status.yml task +# +# Required variables: +# - status_update: The status update dictionary to merge +# - status_file: path to status file on kube_vip (or will be resolved from defaults) +# - kube_vip: target host where status file is stored (or will be resolved from defaults) +# +# Example usage: +# - ansible.builtin.include_tasks: update_node_step.yml +# vars: +# status_update: +# etcd_backup: +# status: completed +# timestamp: "{{ now(utc=true).isoformat() }}" + +- name: Fail if status_file is not defined + ansible.builtin.fail: + msg: "Required variable 'status_file' is not defined. Ensure it is set before calling update_node_step.yml" + when: status_file is not defined + +- name: Fail if kube_vip is not defined + ansible.builtin.fail: + msg: "Required variable 'kube_vip' is not defined. Ensure it is set before calling update_node_step.yml" + when: kube_vip is not defined + +- name: Update general status via consolidated task + ansible.builtin.include_tasks: update_status.yml diff --git a/upgrade/roles/upgrade_k8s/tasks/update_status.yml b/upgrade/roles/upgrade_k8s/tasks/update_status.yml new file mode 100644 index 0000000000..d4f3375d74 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/update_status.yml @@ -0,0 +1,69 @@ +--- +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Consolidated task for updating upgrade_status.yml using custom k8s_upgrade_status module +# This module provides optimized performance with file locking and atomic operations +# +# Required variables: +# - status_file: path to status file on kube_vip +# - kube_vip: target host where status file is stored +# +# For general updates (etcd_backup, k8s_config_backup, addon_upgrade, etc.): +# - status_update: dict to merge into status +# +# For node-specific updates: +# - node_name: name of the node to update +# - node_status_update: dict to merge into that node's status +# +# Example usage (general): +# - ansible.builtin.include_tasks: update_status.yml +# vars: +# status_update: +# etcd_backup: +# status: completed +# timestamp: "{{ now(utc=true).isoformat() }}" +# +# Example usage (node-specific): +# - ansible.builtin.include_tasks: update_status.yml +# vars: +# node_name: kcp1 +# node_status_update: +# status: in_progress +# steps: +# drain: +# status: completed + +- name: Update status using custom module (node-specific) + k8s_upgrade_status: + status_file: "{{ status_file }}" + kube_vip: "{{ kube_vip }}" + node_name: "{{ node_name }}" + node_status_update: "{{ node_status_update }}" + delegate_to: "{{ kube_vip }}" + register: _status_update_result + when: node_name is defined and node_status_update is defined + +- name: Update status using custom module (general) + k8s_upgrade_status: + status_file: "{{ status_file }}" + kube_vip: "{{ kube_vip }}" + status_update: "{{ status_update }}" + delegate_to: "{{ kube_vip }}" + register: _status_update_result + when: status_update is defined and node_name is not defined + +- name: Update in-memory upgrade_status + ansible.builtin.set_fact: + upgrade_status: "{{ _status_update_result.merged_status }}" diff --git a/upgrade/roles/upgrade_k8s/tasks/upgrade_addons.yml b/upgrade/roles/upgrade_k8s/tasks/upgrade_addons.yml 
new file mode 100644 index 0000000000..ed0b8811e2 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/upgrade_addons.yml @@ -0,0 +1,171 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Detect current addon versions + ansible.builtin.include_tasks: detect_addon_versions.yml + +- name: Mark addon upgrade in_progress + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + addon_upgrade: + status: in_progress + started_at: "{{ ansible_date_time.iso8601 }}" + +# ── Calico upgrade (ABORT on failure) ────────────────────────────── +- name: Starting Calico upgrade + ansible.builtin.debug: + msg: "Starting Calico addon upgrade from {{ calico_from_version | default('unknown') }} to {{ calico_target_version }}..." 
+ when: (upgrade_status.addon_upgrade.calico.status | default('pending')) != 'completed' + +- name: Mark Calico in_progress + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: calico + addon_status_update: + status: in_progress + from_version: "{{ calico_from_version | default('unknown') }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + when: (upgrade_status.addon_upgrade.calico.status | default('pending')) != 'completed' + +- name: Upgrade Calico + when: (upgrade_status.addon_upgrade.calico.status | default('pending')) != 'completed' + block: + - name: Execute Calico upgrade + ansible.builtin.include_tasks: step_calico_upgrade.yml + + - name: Mark Calico completed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: calico + addon_status_update: + status: completed + from_version: "{{ calico_from_version | default('unknown') }}" + to_version: "{{ calico_target_version }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark Calico failed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: calico + addon_status_update: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: ABORT — Calico upgrade failed + ansible.builtin.fail: + msg: "{{ msg_addon_calico_failed }}" + +# ── MetalLB upgrade (ABORT on failure) ─────────────────────────── +- name: Starting MetalLB upgrade + ansible.builtin.debug: + msg: "Starting MetalLB addon upgrade from {{ metallb_from_version | default('unknown') }} to {{ metallb_target_version }}..." 
+ when: (upgrade_status.addon_upgrade.metallb.status | default('pending')) != 'completed' + +- name: Mark MetalLB in_progress + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: metallb + addon_status_update: + status: in_progress + from_version: "{{ metallb_from_version | default('unknown') }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + when: (upgrade_status.addon_upgrade.metallb.status | default('pending')) != 'completed' + +- name: Upgrade MetalLB + when: (upgrade_status.addon_upgrade.metallb.status | default('pending')) != 'completed' + block: + - name: Execute MetalLB upgrade + ansible.builtin.include_tasks: step_metallb_upgrade.yml + + - name: Mark MetalLB completed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: metallb + addon_status_update: + status: completed + from_version: "{{ metallb_from_version | default('unknown') }}" + to_version: "{{ metallb_target_version }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark MetalLB failed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: metallb + addon_status_update: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: ABORT — MetalLB upgrade failed + ansible.builtin.fail: + msg: "{{ msg_addon_metallb_failed }}" + +# ── Helm upgrade (ABORT on failure) ───────────────────────────── +- name: Starting Helm upgrade + ansible.builtin.debug: + msg: "Starting Helm addon upgrade from {{ helm_from_version | default('unknown') }} to {{ helm_target_version }}..." 
+ when: (upgrade_status.addon_upgrade.helm.status | default('pending')) != 'completed' + +- name: Mark Helm in_progress + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: helm + addon_status_update: + status: in_progress + from_version: "{{ helm_from_version | default('unknown') }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + when: (upgrade_status.addon_upgrade.helm.status | default('pending')) != 'completed' + +- name: Upgrade Helm + when: (upgrade_status.addon_upgrade.helm.status | default('pending')) != 'completed' + block: + - name: Execute Helm upgrade + ansible.builtin.include_tasks: step_helm_upgrade.yml + + - name: Mark Helm completed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: helm + addon_status_update: + status: completed + from_version: "{{ helm_from_version | default('unknown') }}" + to_version: "{{ helm_target_version }}" + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + rescue: + - name: Mark Helm failed + ansible.builtin.include_tasks: update_addon_step.yml + vars: + addon_name: helm + addon_status_update: + status: failed + timestamp: "{{ ansible_date_time.iso8601 }}" + + - name: ABORT — Helm upgrade failed + ansible.builtin.fail: + msg: "{{ msg_addon_helm_failed }}" + +# ── Run addon validation ────────────────────────────────────────── +- name: Validate addons + ansible.builtin.include_tasks: step_addon_validation.yml + +- name: Mark addon upgrade completed + ansible.builtin.include_tasks: update_node_step.yml + vars: + status_update: + addon_upgrade: + status: completed + completed_at: "{{ ansible_date_time.iso8601 }}" diff --git a/upgrade/roles/upgrade_k8s/tasks/upgrade_cp_node.yml b/upgrade/roles/upgrade_k8s/tasks/upgrade_cp_node.yml new file mode 100644 index 0000000000..b73a556966 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/upgrade_cp_node.yml @@ -0,0 +1,324 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This file contains the tasks to upgrade a single additional control plane node. +# It is meant to be included from a play that targets k8s_control_plane hosts. +# This is the task-only version of upgrade_cp.yml (without the hosts: directive). + +- name: Check if upgrade status file exists on kube_vip + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_check + run_once: true + +- name: Load upgrade status + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + run_once: true + when: status_file_check.stat.exists | default(false) + +- name: Parse upgrade status + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + run_once: true + when: status_file_check.stat.exists | default(false) + +- name: Abort if upgrade status file is missing on kube_vip + ansible.builtin.fail: + msg: >- + Upgrade status file is missing on kube_vip ({{ kube_vip }}). + Expected: {{ status_file }} + This file should be created during the orchestration phase (load_status.yml). 
+ run_once: true + when: not (status_file_check.stat.exists | default(false)) + +- name: Set current node name + ansible.builtin.set_fact: + current_node_name: "{{ inventory_hostname }}" + +- name: Set node IP from upgrade status + ansible.builtin.set_fact: + node_ip: "{{ upgrade_status.nodes[inventory_hostname].ip }}" + when: upgrade_status.nodes[inventory_hostname].ip is defined + +- name: Skip node if already completed - {{ current_node_name }} + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." + when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + +- name: Upgrade control plane {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm install + ansible.builtin.include_tasks: step_kubeadm_install.yml + - 
name: Mark kubeadm_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_upgrade_node on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm upgrade node + ansible.builtin.include_tasks: step_upgrade_node.yml + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run drain on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute drain + ansible.builtin.include_tasks: step_drain.yml + - name: Mark drain completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 
'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubelet install + ansible.builtin.include_tasks: step_kubelet_install.yml + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run crio_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute crio install + ansible.builtin.include_tasks: step_crio_install.yml + - name: Mark crio_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_restart on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubelet restart + ansible.builtin.include_tasks: step_kubelet_restart.yml + - name: Mark 
kubelet_restart completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run uncordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute uncordon + ansible.builtin.include_tasks: step_uncordon.yml + - name: Mark uncordon completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run validation on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Execute node validation + ansible.builtin.include_tasks: step_validate_node.yml + - name: Mark validation completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run etcd health check after {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.etcd_health_check.status | default('pending')) != 'completed' + block: + - name: Execute etcd health check + ansible.builtin.include_tasks: step_etcd_health_check.yml + - name: Mark etcd_health_check completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + 
etcd_health_check: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Mark node completed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + - name: Display upgrade completion for {{ current_node_name }} + ansible.builtin.debug: + msg: | + ════════════════════════════════════════════════════════════════════ + NODE UPGRADE COMPLETE: {{ current_node_name }} + ════════════════════════════════════════════════════════════════════ + Role: Control Plane + From version: {{ upgrade_status.nodes[current_node_name].version_before | default('unknown') }} + To version: {{ k8s_target_version }} + Status: COMPLETED + ════════════════════════════════════════════════════════════════════ + + rescue: + - name: Mark node as failed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + + - name: Warn about CP failure (continues to next CP) + ansible.builtin.debug: + msg: "WARNING: Control plane {{ current_node_name }} upgrade failed. Will retry on next run." diff --git a/upgrade/roles/upgrade_k8s/tasks/upgrade_worker.yml b/upgrade/roles/upgrade_k8s/tasks/upgrade_worker.yml new file mode 100644 index 0000000000..3a83dd3ad2 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/upgrade_worker.yml @@ -0,0 +1,253 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Skip node if already completed - {{ current_node_name }} + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." + when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + +- name: Upgrade worker {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm install + ansible.builtin.include_tasks: 
step_kubeadm_install.yml + - name: Mark kubeadm_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_upgrade_node on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm upgrade node + ansible.builtin.include_tasks: step_upgrade_node.yml + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run drain on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute drain + ansible.builtin.include_tasks: step_drain.yml + - name: Mark drain completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_install on {{ current_node_name }} + when: 
(upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubelet install + ansible.builtin.include_tasks: step_kubelet_install.yml + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run crio_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute crio install + ansible.builtin.include_tasks: step_crio_install.yml + - name: Mark crio_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_restart on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute 
kubelet restart + ansible.builtin.include_tasks: step_kubelet_restart.yml + - name: Mark kubelet_restart completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run uncordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute uncordon + ansible.builtin.include_tasks: step_uncordon.yml + - name: Mark uncordon completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run validation on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Execute node validation + ansible.builtin.include_tasks: step_validate_node.yml + - name: Mark validation completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Mark node completed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + rescue: + - name: Mark node as failed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + 
vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed + + - name: Warn about worker failure (continues to next worker) + ansible.builtin.debug: + msg: "WARNING: Worker {{ current_node_name }} upgrade failed. Will retry on next run." diff --git a/upgrade/roles/upgrade_k8s/tasks/upgrade_worker_node.yml b/upgrade/roles/upgrade_k8s/tasks/upgrade_worker_node.yml new file mode 100644 index 0000000000..c37ed5cd52 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/upgrade_worker_node.yml @@ -0,0 +1,308 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This file contains the tasks to upgrade a single worker node. +# It is meant to be included from a play that targets k8s_workers hosts. +# This is the task-only version of upgrade_worker.yml (without the hosts: directive). 
+ +- name: Check if upgrade status file exists on kube_vip + ansible.builtin.stat: + path: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_file_check + run_once: true + +- name: Load upgrade status + ansible.builtin.slurp: + src: "{{ status_file }}" + delegate_to: "{{ kube_vip }}" + register: status_slurp + run_once: true + when: status_file_check.stat.exists | default(false) + +- name: Parse upgrade status + ansible.builtin.set_fact: + upgrade_status: "{{ status_slurp.content | b64decode | from_yaml }}" + run_once: true + when: status_file_check.stat.exists | default(false) + +- name: Abort if upgrade status file is missing on kube_vip + ansible.builtin.fail: + msg: >- + Upgrade status file is missing on kube_vip ({{ kube_vip }}). + Expected: {{ status_file }} + This file should be created during the orchestration phase (load_status.yml). + run_once: true + when: not (status_file_check.stat.exists | default(false)) + +- name: Set current node name + ansible.builtin.set_fact: + current_node_name: "{{ inventory_hostname }}" + +- name: Set node IP from upgrade status + ansible.builtin.set_fact: + node_ip: "{{ upgrade_status.nodes[inventory_hostname].ip }}" + when: upgrade_status.nodes[inventory_hostname].ip is defined + +- name: Skip node if already completed - {{ current_node_name }} + ansible.builtin.debug: + msg: "Node {{ current_node_name }} already completed — skipping." 
+ when: (upgrade_status.nodes[current_node_name].status | default('pending')) == 'completed' + +- name: Upgrade worker {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].status | default('pending')) != 'completed' + block: + - name: Mark node in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: in_progress + + # ── setup_repos ────────────────────────────────────────────────── + # NOTE: setup_repos is now done globally in upgrade_k8s.yml before Execute play + # Mark as completed here for status tracking + - name: Mark setup_repos completed (done globally) + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + setup_repos: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_install.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm install + ansible.builtin.include_tasks: step_kubeadm_install.yml + - name: Mark kubeadm_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubeadm_upgrade_node on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubeadm_upgrade_node.status | default('pending')) != 'completed' + block: + - name: Mark kubeadm_upgrade_node in_progress + ansible.builtin.include_tasks: 
update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubeadm upgrade node + ansible.builtin.include_tasks: step_upgrade_node.yml + - name: Mark kubeadm_upgrade_node completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubeadm_upgrade_node: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run drain on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.drain.status | default('pending')) != 'completed' + block: + - name: Mark drain in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute drain + ansible.builtin.include_tasks: step_drain.yml + - name: Mark drain completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + drain: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_install.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubelet install + ansible.builtin.include_tasks: step_kubelet_install.yml + - name: Mark kubelet_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + 
node_status_update: + steps: + kubelet_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run crio_install on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.crio_install.status | default('pending')) != 'completed' + block: + - name: Mark crio_install in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute crio install + ansible.builtin.include_tasks: step_crio_install.yml + - name: Mark crio_install completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + crio_install: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run kubelet_restart on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.kubelet_restart.status | default('pending')) != 'completed' + block: + - name: Mark kubelet_restart in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute kubelet restart + ansible.builtin.include_tasks: step_kubelet_restart.yml + - name: Mark kubelet_restart completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + kubelet_restart: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run uncordon on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.uncordon.status | default('pending')) != 'completed' + block: + - name: Mark uncordon in_progress + ansible.builtin.include_tasks: update_node_status.yml + vars: + 
node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: in_progress + timestamp: "{{ ansible_date_time.iso8601 }}" + - name: Execute uncordon + ansible.builtin.include_tasks: step_uncordon.yml + - name: Mark uncordon completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + uncordon: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Run validation on {{ current_node_name }} + when: (upgrade_status.nodes[current_node_name].steps.validation.status | default('pending')) != 'completed' + block: + - name: Execute node validation + ansible.builtin.include_tasks: step_validate_node.yml + - name: Mark validation completed + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + steps: + validation: + status: completed + timestamp: "{{ ansible_date_time.iso8601 }}" + error: + + - name: Mark node completed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: completed + version_current: "{{ k8s_target_version }}" + + - name: Display upgrade completion for {{ current_node_name }} + ansible.builtin.debug: + msg: | + ════════════════════════════════════════════════════════════════════ + NODE UPGRADE COMPLETE: {{ current_node_name }} + ════════════════════════════════════════════════════════════════════ + Role: Worker + From version: {{ upgrade_status.nodes[current_node_name].version_before | default('unknown') }} + To version: {{ k8s_target_version }} + Status: COMPLETED + ════════════════════════════════════════════════════════════════════ + + rescue: + - name: Mark node as failed - {{ current_node_name }} + ansible.builtin.include_tasks: update_node_status.yml + vars: + node_name: "{{ current_node_name }}" + node_status_update: + status: failed 
+ + - name: Warn about worker failure (continues to next worker) + ansible.builtin.debug: + msg: "WARNING: Worker {{ current_node_name }} upgrade failed. Will retry on next run." diff --git a/upgrade/roles/upgrade_k8s/tasks/upgrade_workers_batch_inner.yml b/upgrade/roles/upgrade_k8s/tasks/upgrade_workers_batch_inner.yml new file mode 100644 index 0000000000..26ffb9b5b1 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/upgrade_workers_batch_inner.yml @@ -0,0 +1,24 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Inner loop: upgrade each worker in the current batch sequentially. + +- name: Upgrade worker (batched) - {{ current_node_name }} + ansible.builtin.include_tasks: upgrade_worker.yml + loop: "{{ _current_batch }}" + loop_control: + loop_var: current_node_name + vars: + current_node_ip: "{{ node_ips[current_node_name] }}" + current_node_role: worker diff --git a/upgrade/roles/upgrade_k8s/tasks/validate_worker_ready.yml b/upgrade/roles/upgrade_k8s/tasks/validate_worker_ready.yml new file mode 100644 index 0000000000..c303a96496 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/validate_worker_ready.yml @@ -0,0 +1,52 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Validate a specific worker node is ready after upgrade (Component Spec PHASE 6). +# This validates that worker-1 is ready after its individual upgrade before +# proceeding to upgrade the remaining workers. + +- name: Get target node status + delegate_to: "{{ kube_vip }}" + ansible.builtin.command: + cmd: "kubectl get node {{ target_node_ip }} --no-headers" + register: worker_node_status + changed_when: false + +- name: Verify target node is Ready + ansible.builtin.fail: + msg: >- + Worker-1 validation failed: Node {{ target_node }} is not Ready. + {{ worker_node_status.stdout }} + when: "'NotReady' in worker_node_status.stdout" + +- name: Get target node version + delegate_to: "{{ kube_vip }}" + ansible.builtin.command: + cmd: >- + kubectl get node {{ target_node_ip }} + -o jsonpath='{.status.nodeInfo.kubeletVersion}' + register: worker_node_version + changed_when: false + +- name: Verify target node is at target version + ansible.builtin.fail: + msg: >- + Worker-1 validation failed: Node {{ target_node }} is not at v{{ k8s_target_version }}. + Current version: {{ worker_node_version.stdout }} + when: "'v' + k8s_target_version not in worker_node_version.stdout" + +- name: Display worker-1 validation success + ansible.builtin.debug: + msg: >- + Worker-1 validation successful: {{ target_node }} is Ready at v{{ k8s_target_version }}. 
diff --git a/upgrade/roles/upgrade_k8s/tasks/verify_images.yml b/upgrade/roles/upgrade_k8s/tasks/verify_images.yml new file mode 100644 index 0000000000..b71c1fe751 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/tasks/verify_images.yml @@ -0,0 +1,78 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Verify all prep artifacts exist for a specific target version. +# Checks: Pulp repos (RPMs), container images, and squashfs in MinIO. +# If ANY verification fails → caller should ABORT (cluster untouched). +# +# Inputs: +# _hop_target_version — full target version (e.g. "1.35.1") +# _hop_target_minor — minor version (e.g. 
"1.35") +# admin_nic_ip — Pulp/registry server address + +- name: "Verify images — Check kubernetes Pulp repo" + ansible.builtin.command: + cmd: >- + pulp rpm distribution show + --name x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ _hop_target_minor | replace('.', '-') }} + changed_when: false + +- name: "Verify images — Check cri-o Pulp repo" + ansible.builtin.command: + cmd: >- + pulp rpm distribution show + --name x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ _hop_target_minor | replace('.', '-') }} + changed_when: false + +- name: "Verify images — Check squashfs image in MinIO" + ansible.builtin.command: + cmd: >- + mc stat minio/boot-images/k8s-{{ _hop_target_version }}/squashfs.img + changed_when: false + +- name: Verify images — Check core container images in Pulp registry + ansible.builtin.uri: + url: >- + http://{{ admin_nic_ip }}:2225/v2/{{ item.name }}/manifests/{{ item.tag }} + method: GET + status_code: [200] + validate_certs: false + loop: + - { name: "kube-apiserver", tag: "v{{ _hop_target_version }}" } + - { name: "kube-controller-manager", tag: "v{{ _hop_target_version }}" } + - { name: "kube-scheduler", tag: "v{{ _hop_target_version }}" } + - { name: "kube-proxy", tag: "v{{ _hop_target_version }}" } + register: core_image_checks + loop_control: + label: "{{ item.name }}:{{ item.tag }}" + changed_when: false + failed_when: false + +- name: Warn if core image check failed + ansible.builtin.debug: + msg: >- + WARNING: Could not verify image {{ item.item.name }}:{{ item.item.tag }} + in Pulp registry (status: {{ item.status | default('unknown') }}). + Upgrade may fail if image is not available. 
+ loop: "{{ core_image_checks.results }}" + loop_control: + label: "{{ item.item.name }}:{{ item.item.tag }}" + when: item.status | default(0) != 200 + changed_when: false + +- name: "Verify images — All prep artifacts verified for target version {{ _hop_target_version }}" + ansible.builtin.debug: + msg: >- + All prep artifacts verified for v{{ _hop_target_version }}: + Pulp repos (kubernetes, cri-o), squashfs image, core container images. diff --git a/upgrade/roles/upgrade_k8s/templates/upgrade_repo.j2 b/upgrade/roles/upgrade_k8s/templates/upgrade_repo.j2 new file mode 100644 index 0000000000..6f13ffbc75 --- /dev/null +++ b/upgrade/roles/upgrade_k8s/templates/upgrade_repo.j2 @@ -0,0 +1,15 @@ +# Omnia K8s Upgrade Repository Configuration +# Generated by upgrade playbook for K8s {{ k8s_target_version }} +# This file configures access to Pulp repositories for upgrade packages + +[x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_minor | replace('.', '-') }}] +name=Kubernetes {{ k8s_target_version }} Repository +baseurl={{ pulp_repo_base }}/x86_64_rhel_{{ cluster_os_version }}_kubernetes-v{{ k8s_target_minor | replace('.', '-') }}/ +enabled=1 +gpgcheck=0 + +[x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_minor | replace('.', '-') }}] +name=CRI-O {{ k8s_target_version }} Repository +baseurl={{ pulp_repo_base }}/x86_64_rhel_{{ cluster_os_version }}_cri-o-v{{ k8s_target_minor | replace('.', '-') }}/ +enabled=1 +gpgcheck=0 diff --git a/upgrade/roles/upgrade_k8s/vars/main.yml b/upgrade/roles/upgrade_k8s/vars/main.yml new file mode 100644 index 0000000000..565d43585b --- /dev/null +++ b/upgrade/roles/upgrade_k8s/vars/main.yml @@ -0,0 +1,210 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +# NOTE: These paths are set dynamically in main.yml after load_version_vars.yml +# runs, so that k8s_client_mount_path is available. +# See: roles/upgrade_k8s/tasks/main.yml - "Set upgrade paths on client NFS mount" +# +# upgrade_dir_client: {{ k8s_client_mount_path }}/upgrade +# status_file: {{ k8s_client_mount_path }}/upgrade/upgrade_status.yml +# lock_file: {{ k8s_client_mount_path }}/upgrade/upgrade.lock +# backup_dir: {{ k8s_client_mount_path }}/upgrade/backup +# etcd_snapshot_file: {{ k8s_client_mount_path }}/upgrade/backup/etcd-snapshot.db +# etcdctl_binary: {{ k8s_client_mount_path }}/upgrade/backup/etcdctl +# etcd_members_file: {{ k8s_client_mount_path }}/upgrade/backup/etcd-members.json +# k8s_config_backup_dir: {{ k8s_client_mount_path }}/upgrade/backup/configs + +repo_file_path: "/etc/yum.repos.d/omnia-upgrade.repo" + +nfs_storage_name: "nfs_k8s" + +# nodes.yaml location (created by discovery) +# Use NFS mount path for omnia_core container context +nodes_yaml_path: "/opt/omnia/openchami/workdir/nodes/nodes.yaml" + +# Input config paths +software_config_file: "{{ input_project_dir }}/software_config.json" +storage_config_file: "{{ input_project_dir }}/storage_config.yml" +ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" +# Note: service_k8s_config_file is set dynamically in load_version_vars.yml + +# Omnia system paths +local_repo_access_file: "{{ 
oim_provision_path }}/local_repo_access.yml" +upgrade_manifest_path: "{{ oim_data_path }}/upgrade_manifest.yml" + +# Temp file paths +upgrade_status_temp_file: "{{ tmp_path }}/upgrade_status.tmp" +upgrade_status_temp_json: "{{ tmp_path }}/upgrade_status.tmp.json" +upgrade_status_temp_yml: "{{ tmp_path }}/upgrade_status_temp.yml" +detect_hop_chain_script: "{{ tmp_path }}/detect_hop_chain.py" +k8s_upgrade_inventory: "{{ tmp_path }}/k8s_upgrade_inventory.ini" + +# Pulp base URL +pulp_content_base: "{{ pulp_protocol | default('http') }}://{{ admin_nic_ip }}:2225/pulp/content" +pulp_repo_base: "{{ pulp_content_base }}/opt/omnia/offline_repo/cluster/x86_64/rhel/{{ cluster_os_version }}/rpms" + +# --------------------------------------------------------------------------- +# Timeouts and retries +# --------------------------------------------------------------------------- +kubelet_ready_timeout: 120 +kubelet_ready_delay: 5 +kubelet_ready_retries: 24 +etcd_health_retries: 12 +etcd_health_delay: 10 +drain_timeout: 300 +addon_rollout_timeout: 300 +apiserver_wait_timeout: 300 +apiserver_wait_delay: 10 +apiserver_wait_retries: 30 +cloud_init_timeout: 600 +reboot_timeout: 600 +reboot_connect_timeout: 300 + +# --------------------------------------------------------------------------- +# Worker rolling-upgrade parallelism +# --------------------------------------------------------------------------- +# Number of workers to upgrade concurrently per batch. +# Default 1 = fully serial (safest). Set higher for faster upgrades on +# large clusters. Workers within a batch are processed sequentially from +# localhost; true async parallelism requires refactoring to a multi-host +# play with serial: N. Override via --extra-vars worker_parallel_count=3. 
+worker_parallel_count: 1 + +# --------------------------------------------------------------------------- +# Step definitions per role +# --------------------------------------------------------------------------- +cp_first_steps: + - kubeadm_install + - kubeadm_upgrade_apply + - drain + - kubelet_install + - crio_install + - kubelet_restart + - uncordon + - validation + - etcd_health_check + +cp_steps: + - kubeadm_install + - kubeadm_upgrade_node + - drain + - kubelet_install + - crio_install + - kubelet_restart + - uncordon + - validation + - etcd_health_check + +worker_steps: + - kubeadm_install + - kubeadm_upgrade_node + - drain + - kubelet_install + - crio_install + - kubelet_restart + - uncordon + - validation + +# --------------------------------------------------------------------------- +# BSS groups +# --------------------------------------------------------------------------- +bss_cp_groups: + - service_kube_control_plane_first + - service_kube_control_plane + +bss_worker_groups: + - service_kube_node + +bss_all_groups: + - service_kube_control_plane_first + - service_kube_control_plane + - service_kube_node + +# --------------------------------------------------------------------------- +# Node group suffixes (from nodes.yaml) +# --------------------------------------------------------------------------- +group_cp_first: "service_kube_control_plane_first_x86_64" +group_cp: "service_kube_control_plane_x86_64" +group_worker: "service_kube_node_x86_64" + +# --------------------------------------------------------------------------- +# File permissions +# --------------------------------------------------------------------------- +file_perm_644: "0644" +file_perm_600: "0600" +file_perm_755: "0755" +dir_perm_755: "0755" + +# --------------------------------------------------------------------------- +# Messages +# --------------------------------------------------------------------------- +msg_lock_held_same_host: >- + Another K8s upgrade is running (PID {{ 
existing_lock.pid }} since + {{ existing_lock.started_at }}). Aborting. +msg_lock_held_other_host: >- + Lock held by {{ existing_lock.host }} (PID {{ existing_lock.pid }} since + {{ existing_lock.started_at }}). If stale, delete manually: + rm {{ lock_file }} +msg_stale_lock_removed: >- + Removing stale lock from PID {{ existing_lock.pid }} + (process no longer running on this host). +msg_lock_verification_failed: >- + Lock file verification failed — possible race condition. Aborting. +msg_preflight_pulp_missing: >- + Required Pulp distribution '{{ item }}' not found. + The prep phase (Phase 1) should have synced this. + Check prep phase output or run 'ansible-playbook local_repo/local_repo.yml' manually. +msg_preflight_version_mismatch: >- + Cluster is at {{ k8s_from_version }}, not at expected version. + Verify the cluster state before proceeding. +msg_etcd_quorum_lost: >- + etcd quorum lost! Do NOT proceed. Manual intervention required. +msg_backup_missing: >- + Backup file {{ item }} not found. Cannot proceed with upgrade. +msg_node_upgrade_failed: >- + Node {{ current_node_name }} failed at step {{ current_step }}. + Error: {{ step_result.stderr | default(step_result.msg | default('unknown')) }} +msg_addon_calico_failed: >- + Calico upgrade failed. Networking may be degraded. + Do NOT proceed to worker upgrades. +msg_addon_metallb_warning: >- + MetalLB upgrade failed. Continuing — LoadBalancer services may be + affected but cluster is functional. +msg_addon_metallb_failed: >- + MetalLB upgrade failed. LoadBalancer services may be degraded. + Do NOT proceed to worker upgrades. +msg_addon_helm_warning: >- + Helm upgrade failed. Continuing — Helm is not required for cluster + operation. +msg_addon_helm_failed: >- + Helm upgrade failed. Helm-based operations may not work. + Do NOT proceed to worker upgrades. +msg_upgrade_complete: >- + K8s upgrade from {{ k8s_from_version }} to {{ k8s_target_version }} + completed successfully. All nodes Ready. 
+msg_bss_update_failed: >- + BSS update for group {{ bss_group }} failed. Nodes will boot old + image on reboot until BSS is updated manually. +msg_cloud_init_timeout: >- + Cloud-init did not complete within {{ cloud_init_timeout }} seconds. + Check /var/log/cloud-init-output.log on the node for details. +msg_cloud_init_failed: >- + Cloud-init did not complete successfully on node {{ current_node_name }}. + Check /var/log/cloud-init-output.log for details. +msg_rollback_etcd_binary_failed: >- + Saved etcdctl binary failed. Falling back to podman-based restore. diff --git a/upgrade/upgrade.yml b/upgrade/upgrade.yml index 84d6631c4f..7d34253e5e 100644 --- a/upgrade/upgrade.yml +++ b/upgrade/upgrade.yml @@ -64,7 +64,7 @@ build_stream: [oim] build_image: [oim] provision: [oim, build_image] - k8s: [oim] + k8s: [oim, provision, local_repo, build_image] telemetry: [oim, k8s] slurm: [oim, k8s] tasks: @@ -136,7 +136,7 @@ ansible.builtin.file: path: /opt/omnia/.data state: directory - mode: '0755' + mode: "0755" # ─── Create upgrade lock only if it doesn't already exist ─── # (omnia.sh --upgrade may have already created it; that's expected.) 
@@ -154,7 +154,7 @@ host: "{{ inventory_hostname }}" created_by: ansible-playbook dest: "{{ upgrade_lock_path }}" - mode: '0644' + mode: "0644" when: not upgrade_lock_stat.stat.exists # ───────────────────────────────────────────────────────────── @@ -188,7 +188,7 @@ telemetry: "pending" slurm: "pending" dest: "{{ manifest_path }}" - mode: '0644' + mode: "0644" when: not manifest_stat.stat.exists - name: Read upgrade_manifest.yml (final canonical version) @@ -256,11 +256,8 @@ when: - item in tag_dependencies - tag_dependencies[item] | difference(requested_tags) | length > 0 - - tag_dependencies[item] | reject('in', - (manifest.component_status | default({})) - | dict2items | selectattr('value', 'equalto', 'completed') - | map(attribute='key') | list - ) | list | length > 0 + - tag_dependencies[item] | reject('in', (manifest.component_status | default({})) | dict2items | selectattr('value', 'equalto', 'completed') | + map(attribute='key') | list ) | list | length > 0 - name: Report already-upgraded components (will be skipped) ansible.builtin.debug: @@ -414,7 +411,7 @@ ansible.builtin.copy: content: "{{ manifest | combine(manifest_updates) | to_nice_yaml }}" dest: "{{ manifest_path }}" - mode: '0644' + mode: "0644" vars: manifest_updates: component_status: "{{ cleaned_component_status }}" diff --git a/utils/roles/update_cloud_init_bss/tasks/main.yml b/utils/roles/update_cloud_init_bss/tasks/main.yml index 91cdddec07..3d7314cfe5 100644 --- a/utils/roles/update_cloud_init_bss/tasks/main.yml +++ b/utils/roles/update_cloud_init_bss/tasks/main.yml @@ -41,7 +41,8 @@ - (bss_file_path is defined and bss_file_path | length > 0) or (cloud_init_file_path is defined and cloud_init_file_path | length > 0) or (ci_defaults_file_path is defined and ci_defaults_file_path | length > 0) or - (ci_common_file_path is defined and ci_common_file_path | length > 0) + (ci_common_file_path is defined and ci_common_file_path | length > 0) or + (hostname_file_path is defined and 
hostname_file_path | length > 0) fail_msg: "{{ no_input_files_msg }}" - name: Update ci-defaults configuration @@ -71,3 +72,10 @@ - update_cloud_init | bool - cloud_init_file_path is defined - cloud_init_file_path | length > 0 + +- name: Update hostname configuration - {{ functional_group_name }} + ansible.builtin.include_tasks: update_hostname.yml + when: + - update_hostname | bool + - hostname_file_path is defined + - hostname_file_path | length > 0 diff --git a/utils/roles/update_cloud_init_bss/tasks/update_ci_common.yml b/utils/roles/update_cloud_init_bss/tasks/update_ci_common.yml index 039d9a1c7b..afc967bfe7 100644 --- a/utils/roles/update_cloud_init_bss/tasks/update_ci_common.yml +++ b/utils/roles/update_cloud_init_bss/tasks/update_ci_common.yml @@ -51,13 +51,3 @@ ansible.builtin.fail: msg: "{{ ci_common_update_fail_msg }}" when: ci_common_set_result.rc != 0 - -- name: Verify ci-group-common config - ansible.builtin.command: /usr/bin/ochami cloud-init group get config common - changed_when: false - register: ci_common_verify_output - -- name: Display ci-common verification - ansible.builtin.debug: - msg: "{{ ci_common_verify_output.stdout_lines }}" - verbosity: 1 diff --git a/utils/roles/update_cloud_init_bss/tasks/update_hostname.yml b/utils/roles/update_cloud_init_bss/tasks/update_hostname.yml new file mode 100644 index 0000000000..fad6e9d69f --- /dev/null +++ b/utils/roles/update_cloud_init_bss/tasks/update_hostname.yml @@ -0,0 +1,46 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# Update hostname configuration for nodes +# ============================================================================ +# Uses the ochami CLI to set hostname from the provided hostname YAML file. +# +# Required variables: +# hostname_file_path — Path to pre-rendered hostname YAML file (on target host) +# functional_group_name — Name of the functional group +# ============================================================================ + +- name: Verify hostname file exists - {{ functional_group_name }} + ansible.builtin.stat: + path: "{{ hostname_file_path }}" + register: hostname_file_stat + +- name: Fail if hostname file not found - {{ functional_group_name }} + ansible.builtin.fail: + msg: "{{ hostname_file_missing_msg }}" + when: not hostname_file_stat.stat.exists + +- name: Set hostname configuration - {{ functional_group_name }} + ansible.builtin.command: > + /usr/bin/ochami cloud-init node set -f yaml -d @{{ hostname_file_path }} + changed_when: true + register: hostname_set_result + # Do not fail here on a non-zero rc; the next task checks rc and fails with hostname_update_fail_msg. + failed_when: false + +- name: Fail if hostname update failed - {{ functional_group_name }} + ansible.builtin.fail: + msg: "{{ hostname_update_fail_msg }}" + when: hostname_set_result.rc != 0 diff --git a/utils/roles/update_cloud_init_bss/vars/main.yml b/utils/roles/update_cloud_init_bss/vars/main.yml index 8725170132..0413124789 100644 --- a/utils/roles/update_cloud_init_bss/vars/main.yml +++ b/utils/roles/update_cloud_init_bss/vars/main.yml @@ -18,6 +18,7 @@ update_bss: true update_cloud_init: true update_ci_defaults: false update_ci_common: false +update_hostname: false # Messages bss_update_fail_msg: > @@ -47,8 +48,14 @@ ci_common_update_fail_msg: > ci_common_file_missing_msg: > Cloud-init common file not found at '{{ ci_common_file_path | default('undefined') }}'.
Ensure the file exists on the target host before calling this utility. +hostname_update_fail_msg: > + Failed to update hostname configuration for '{{ functional_group_name }}'. + Check ochami cloud-init service status. +hostname_file_missing_msg: > + Hostname file not found at '{{ hostname_file_path | default('undefined') }}'. + Ensure the file exists on the target host before calling this utility. no_input_files_msg: > At least one of bss_file_path, cloud_init_file_path, ci_defaults_file_path, - or ci_common_file_path must be provided. + ci_common_file_path, or hostname_file_path must be provided. Pass them as extra variables: -e bss_file_path=/path/to/bss.yaml -e cloud_init_file_path=/path/to/ci-group.yaml