From 1fc78cb33bd14d4100183dfa5c1dfba3b9a63bf7 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Sun, 3 May 2026 12:06:31 +0530 Subject: [PATCH 1/6] feat: add multi-subnet support for CoreDHCP/coresmd Add optional additional_subnets configuration under admin_network in network_spec.yml to support multi-RAC / multi-subnet PXE deployments with CoreDHCP relay (giaddr-based routing). Changes: - network_spec.yml: add additional_subnets field with documentation - network_spec.json: JSON schema validation for subnet entries - en_us_validation_msg.py: error messages for subnet validation - provision_validation.py: validate CIDRs, routers, ranges, overlaps - configs.yaml.j2: emit coredhcp_subnets/coredhcp_subnet_pools vars - coredhcp.yaml.j2: dual-mode template (positional args for v0.4.x, key=value format with subnet=/subnet_pool= for multi-subnet) - deploy_openchami.yml: overlay coredhcp template after clone - vars/main.yml: add template path variables - test_additional_subnets_validation.py: 17 unit tests Single-subnet (flat) deployments continue to use the original positional-argument config format compatible with coresmd v0.4.x. Multi-subnet requires coresmd with multi-subnet support (PR #61). 
Signed-off-by: sujit-jadhav --- .../deploy_containers/openchami/tasks/deploy_openchami.yml | 6 ++++++ prepare_oim/roles/deploy_containers/openchami/vars/main.yml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml index cbb21df631..9f6210c397 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml @@ -93,6 +93,12 @@ delegate_to: localhost connection: local +- name: Deploy coredhcp template with multi-subnet support + ansible.builtin.copy: + src: "{{ openchami_coredhcp_template }}" + dest: "{{ openchami_coredhcp_target }}" + mode: "{{ file_permissions_644 }}" + - name: Load the openchami configs vars ansible.builtin.template: src: "{{ openchami_config_vars_template }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2b88daffe8..e9273797ab 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -35,6 +35,12 @@ network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml provision_config: "{{ hostvars['localhost']['input_project_dir'] }}/provision_config.yml" provision_config_syntax_fail_msg: "Failed. Syntax errors present in provision_config.yml. Fix errors and re-run playbook again." 
+# vars passed to openchami installation +openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" +data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" +data_s3_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/s3" +s3_work_dir: "{{ oim_shared_path }}/omnia/openchami/s3" + # Usage: deploy_openchami.yml - pull openchami images pull_image_retries: 5 pull_image_delay: 10 From f338ff5569cae7b53db0211aa3524a63c79d0b82 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Sun, 3 May 2026 13:45:06 +0530 Subject: [PATCH 2/6] feat: Authoritative CoreDNS for Slurm/MPI hostname resolution Implement CoreDNS as the authoritative DNS server for cluster-internal hostname resolution, replacing /etc/hosts-based management. New input configuration: - input/dns_config.yml: dns_enabled, dns_domain, dns_ttl, dns_cache_ttl, dns_fabric_suffixes, dns_soa, dns_reverse_enabled Validation: - JSON schema (dns_config.json) and validation logic (validate_dns_config) - RFC 1035 domain validation, TTL range checks, SOA positive-int checks, fabric suffix format validation, reserved domain detection - 33 unit tests covering all validation paths CoreDNS deployment (OIM): - Corefile.j2 template: file plugin for forward/reverse zones, cache, reload (10s), forward to upstream DNS - Systemd quadlet (coredns.container.j2) for podman-managed container - deploy_coredns.yml task: image pull, config generation, service start DNS zone rendering pipeline: - forward_zone.j2: SOA + NS + A records from ip_name_map - reverse_zone.j2: SOA + NS + PTR records - generate_dns_zones.yml: reads SMD inventory, renders zones - generate_reverse_zone_additional.yml: per-additional-subnet reverse zones - update_dns_zones.yml: lifecycle hook for node add/remove Cloud-init templates (7 files): - Conditional: resolv.conf pointing to OIM CoreDNS when dns_enabled, otherwise legacy /etc/hosts append Slurm /etc/hosts management: - update_hosts_munge.yml: skip /etc/hosts edits when dns_enabled - 
update_hosts.yml: skip bulk /etc/hosts updates when dns_enabled K8s CoreDNS integration: - Forward dns_domain queries to OIM CoreDNS from K8s CoreDNS ConfigMap Multi-subnet DHCP compatibility (PR #4352): - Reverse zones generated for admin + additional subnets - All variable names compatible with multi-subnet PR Backward compatible: dns_enabled defaults to false, preserving existing /etc/hosts behavior for users who do not opt in. Signed-off-by: sujit-jadhav --- .../input_validation/common_utils/config.py | 4 +- .../common_utils/en_us_validation_msg.py | 34 +++ .../input_validation/schema/dns_config.json | 53 ++++ .../validation_flows/provision_validation.py | 115 +++++++++ .../tests/test_dns_config_validation.py | 236 ++++++++++++++++++ input/dns_config.yml | 64 +++++ .../openchami/tasks/deploy_coredns.yml | 115 +++++++++ .../openchami/tasks/main.yml | 3 + .../openchami/templates/Corefile.j2 | 31 +++ .../openchami/templates/coredns.container.j2 | 24 ++ .../deploy_containers/openchami/vars/main.yml | 5 + .../tasks/generate_dns_zones.yml | 106 ++++++++ .../generate_reverse_zone_additional.yml | 31 +++ .../roles/configure_ochami/tasks/main.yml | 3 + .../tasks/update_dns_zones.yml | 34 +++ ...-group-login_compiler_node_aarch64.yaml.j2 | 10 + ...i-group-login_compiler_node_x86_64.yaml.j2 | 10 + .../ci-group-login_node_aarch64.yaml.j2 | 10 + .../ci-group-login_node_x86_64.yaml.j2 | 10 + ...ce_kube_control_plane_first_x86_64.yaml.j2 | 26 ++ ...ci-group-slurm_control_node_x86_64.yaml.j2 | 10 + .../ci-group-slurm_node_aarch64.yaml.j2 | 10 + .../ci-group-slurm_node_x86_64.yaml.j2 | 10 + .../templates/dns/forward_zone.j2 | 27 ++ .../templates/dns/reverse_zone.j2 | 16 ++ .../tasks/include_software_config.yml | 18 ++ .../tasks/update_hosts.yml | 33 +-- .../roles/provision_validations/vars/main.yml | 1 + .../slurm_config/tasks/update_hosts_munge.yml | 3 +- 29 files changed, 1035 insertions(+), 17 deletions(-) create mode 100644 
common/library/module_utils/input_validation/schema/dns_config.json create mode 100644 common/library/modules/tests/test_dns_config_validation.py create mode 100644 input/dns_config.yml create mode 100644 prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml create mode 100644 prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 create mode 100644 prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 create mode 100644 provision/roles/configure_ochami/tasks/generate_dns_zones.yml create mode 100644 provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml create mode 100644 provision/roles/configure_ochami/tasks/update_dns_zones.yml create mode 100644 provision/roles/configure_ochami/templates/dns/forward_zone.j2 create mode 100644 provision/roles/configure_ochami/templates/dns/reverse_zone.j2 diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 667da006ea..9c65c96aec 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -59,7 +59,8 @@ "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", "gitlab_config": "gitlab_config.yml", - "discovery_config": "discovery_config.yml" + "discovery_config": "discovery_config.yml", + "dns_config": "dns_config.yml" # "additional_software": "additional_software.json" } @@ -78,6 +79,7 @@ files["provision_config"], files["network_spec"], files["software_config"], + files["dns_config"], # files["high_availability_config"] ], "security": [ diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index f5b7e557b1..fbfb9d164b 100644 --- 
a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -570,6 +570,40 @@ def json_file_mandatory(file_path): "another additional subnet's dynamic_range." ) +# dns_config +DNS_DOMAIN_INVALID_MSG = ( + "dns_domain must be a valid DNS domain name (RFC 1035). " + "Use lowercase alphanumeric characters, hyphens, and dots only. " + "Example: hpc.cluster" +) +DNS_DOMAIN_RESERVED_MSG = ( + "dns_domain must not use a reserved domain. " + "The following are not permitted: cluster.local, localhost, " + "com, net, org, edu, gov, io." +) +DNS_TTL_RANGE_MSG = ( + "dns_ttl must be an integer between 60 and 86400 (seconds)." +) +DNS_CACHE_TTL_RANGE_MSG = ( + "dns_cache_ttl must be an integer between 10 and 3600 (seconds)." +) +DNS_CACHE_TTL_EXCEEDS_TTL_MSG = ( + "dns_cache_ttl must be less than or equal to dns_ttl. " + "Cache TTL cannot exceed the record TTL." +) +DNS_FABRIC_SUFFIX_FORMAT_MSG = ( + "each dns_fabric_suffix must begin with a hyphen and contain " + "only lowercase alphanumeric characters and hyphens. " + "Example: -ib, -stor" +) +DNS_SOA_POSITIVE_INT_MSG = ( + "dns_soa values (refresh, retry, expire) must be positive integers." +) +DNS_REVERSE_DISABLED_WARNING_MSG = ( + "dns_reverse_enabled is false. MPI and Slurm may require " + "reverse DNS (PTR records) for security validation." 
+) + # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" diff --git a/common/library/module_utils/input_validation/schema/dns_config.json b/common/library/module_utils/input_validation/schema/dns_config.json new file mode 100644 index 0000000000..f76a912fd2 --- /dev/null +++ b/common/library/module_utils/input_validation/schema/dns_config.json @@ -0,0 +1,53 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["dns_config"], + "properties": { + "dns_config": { + "type": "object", + "required": [ + "dns_enabled", + "dns_domain", + "dns_ttl", + "dns_reverse_enabled", + "dns_cache_ttl" + ], + "properties": { + "dns_enabled": { "type": "boolean" }, + "dns_domain": { + "type": "string", + "minLength": 1, + "pattern": "^[a-z0-9]([a-z0-9\\-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9\\-]*[a-z0-9])?)*$" + }, + "dns_ttl": { + "type": "integer", + "minimum": 60, + "maximum": 86400 + }, + "dns_reverse_enabled": { "type": "boolean" }, + "dns_fabric_suffixes": { + "type": "array", + "items": { + "type": "string", + "pattern": "^-[a-z0-9][a-z0-9\\-]*$" + } + }, + "dns_cache_ttl": { + "type": "integer", + "minimum": 10, + "maximum": 3600 + }, + "dns_soa": { + "type": "object", + "properties": { + "refresh": { "type": "integer", "minimum": 1 }, + "retry": { "type": "integer", "minimum": 1 }, + "expire": { "type": "integer", "minimum": 1 } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + } +} diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 98efc3637f..5c10bd177b 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -1442,3 +1442,118 @@ def _ranges_overlap(range_a, range_b): return a_start <= b_end and b_start <= a_end except 
(ValueError, TypeError):
         return False
+
+
+# Reserved domains (and public TLD suffixes) that must not be used as dns_domain
+_RESERVED_DOMAINS = frozenset([
+    "cluster.local", "localhost",
+    "com", "net", "org", "edu", "gov", "io",
+])
+
+# Regex for a single DNS label (RFC 1035, max 63 chars); applied with fullmatch()
+_DNS_LABEL_RE = re.compile(r'^[a-z0-9]([a-z0-9\-]{0,61}[a-z0-9])?$')
+
+
+def validate_dns_config(data):
+    """
+    Validates dns_config input parameters (no checks run unless dns_enabled is truthy).
+
+    Checks:
+    - dns_domain is a non-empty string, a valid RFC 1035 name (<= 253 chars), not reserved.
+    - dns_ttl is an int in range 60-86400.
+    - dns_cache_ttl is an int in range 10-3600 and <= dns_ttl.
+    - dns_fabric_suffixes entries are hyphen-prefixed lowercase alphanumeric strings.
+    - dns_soa refresh/retry/expire, when present, are positive ints (bool rejected).
+
+    Args:
+        data (dict): Parsed dns_config.yml content; settings live under "dns_config".
+
+    Returns:
+        list: Validation error messages built by create_error_msg (empty if valid).
+    """
+    errors = []
+    cfg = data.get("dns_config", {})
+    if not cfg or not cfg.get("dns_enabled", False):
+        return errors
+
+    # --- dns_domain (non-string or empty values fall through to the invalid branch) ---
+    domain = cfg.get("dns_domain", "")
+    if isinstance(domain, str) and domain:
+        labels = domain.split(".")
+        valid_domain = all(_DNS_LABEL_RE.fullmatch(label) for label in labels) and len(domain) <= 253
+        if not valid_domain:
+            errors.append(
+                create_error_msg(
+                    "dns_config.dns_domain", domain,
+                    en_us_validation_msg.DNS_DOMAIN_INVALID_MSG,
+                )
+            )
+        if domain in _RESERVED_DOMAINS or any(
+            domain.endswith(f".{rd}") for rd in _RESERVED_DOMAINS
+        ):
+            errors.append(
+                create_error_msg(
+                    "dns_config.dns_domain", domain,
+                    en_us_validation_msg.DNS_DOMAIN_RESERVED_MSG,
+                )
+            )
+    else:
+        errors.append(
+            create_error_msg(
+                "dns_config.dns_domain", domain,
+                en_us_validation_msg.DNS_DOMAIN_INVALID_MSG,
+            )
+        )
+
+    # --- dns_ttl ---
+    ttl = cfg.get("dns_ttl", 300)
+    if not isinstance(ttl, int) or ttl < 60 or ttl > 86400:
+        errors.append(
+            create_error_msg(
+                "dns_config.dns_ttl", str(ttl),
+                en_us_validation_msg.DNS_TTL_RANGE_MSG,
+            )
+        )
+
+    # --- dns_cache_ttl (compared against dns_ttl only when dns_ttl is itself an int) ---
+    cache_ttl = cfg.get("dns_cache_ttl", 60)
+    if not isinstance(cache_ttl, int) or cache_ttl < 10 or cache_ttl > 3600:
+        errors.append(
+            create_error_msg(
+                "dns_config.dns_cache_ttl", str(cache_ttl),
+                en_us_validation_msg.DNS_CACHE_TTL_RANGE_MSG,
+            )
+        )
+    elif isinstance(ttl, int) and cache_ttl > ttl:
+        errors.append(
+            create_error_msg(
+                "dns_config.dns_cache_ttl", str(cache_ttl),
+                en_us_validation_msg.DNS_CACHE_TTL_EXCEEDS_TTL_MSG,
+            )
+        )
+
+    # --- dns_fabric_suffixes ("or []" tolerates a key that is present but null) ---
+    suffix_re = re.compile(r'^-[a-z0-9][a-z0-9\-]*$')
+    for suffix in cfg.get("dns_fabric_suffixes") or []:
+        if not isinstance(suffix, str) or not suffix_re.fullmatch(suffix):
+            errors.append(
+                create_error_msg(
+                    "dns_config.dns_fabric_suffixes", str(suffix),
+                    en_us_validation_msg.DNS_FABRIC_SUFFIX_FORMAT_MSG,
+                )
+            )
+
+    # --- dns_soa (bool is a subclass of int, so it is rejected explicitly) ---
+    soa = cfg.get("dns_soa") or {}
+    if soa:
+        for field in ("refresh", "retry", "expire"):
+            val = soa.get(field)
+            if val is not None and (isinstance(val, bool) or not isinstance(val, int) or val < 1):
+                errors.append(
+                    create_error_msg(
+                        f"dns_config.dns_soa.{field}", str(val),
+                        en_us_validation_msg.DNS_SOA_POSITIVE_INT_MSG,
+                    )
+                )
+
+    return errors
diff --git a/common/library/modules/tests/test_dns_config_validation.py b/common/library/modules/tests/test_dns_config_validation.py
new file mode 100644
index 0000000000..8eafcbb246
--- /dev/null
+++ b/common/library/modules/tests/test_dns_config_validation.py
@@ -0,0 +1,236 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Unit tests for dns_config validation in provision_validation.py.""" + +import sys +import os +import unittest + +# --------------------------------------------------------------------------- +# Bootstrap: make the validation code importable without a full Ansible install +# --------------------------------------------------------------------------- +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +MODULE_UTILS = os.path.join(REPO_ROOT, "common", "library", "module_utils") +sys.path.insert(0, MODULE_UTILS) + +import types +import importlib + +for _name in ( + "ansible", + "ansible.module_utils", + "ansible.module_utils.input_validation", + "ansible.module_utils.input_validation.common_utils", + "ansible.module_utils.input_validation.validation_flows", +): + sys.modules.setdefault(_name, types.ModuleType(_name)) + +_cu_ns = "ansible.module_utils.input_validation.common_utils" +for _sub in ("config", "en_us_validation_msg", "validation_utils"): + _mod = importlib.import_module(f"input_validation.common_utils.{_sub}") + sys.modules[f"{_cu_ns}.{_sub}"] = _mod + setattr(sys.modules[_cu_ns], _sub, _mod) + +sys.modules["ansible.module_utils.input_validation.validation_flows.common_validation"] = ( + types.ModuleType("ansible.module_utils.input_validation.validation_flows.common_validation") +) + +from input_validation.validation_flows.provision_validation import ( # noqa: E402 + validate_dns_config, +) + + +def _has_error(errors, key_substr): + """Check if any error dict has error_key containing the substring.""" + return any(key_substr in e.get("error_key", "") for e in errors) + + +def _has_error_msg(errors, msg_substr): + """Check if any error dict has error_msg containing the substring.""" + return any(msg_substr in e.get("error_msg", "") for e in errors) + + +def _make_config(**overrides): + """Build a valid dns_config dict, then apply overrides.""" + base = { + "dns_config": { + "dns_enabled": True, + "dns_domain": 
"hpc.cluster", + "dns_ttl": 300, + "dns_reverse_enabled": True, + "dns_fabric_suffixes": [], + "dns_cache_ttl": 60, + "dns_soa": { + "refresh": 3600, + "retry": 600, + "expire": 86400, + }, + } + } + cfg = base["dns_config"] + for k, v in overrides.items(): + if k.startswith("soa_"): + cfg["dns_soa"][k[4:]] = v + else: + cfg[k] = v + return base + + +class TestDnsConfigValidationDisabled(unittest.TestCase): + """When dns_enabled is false, no validation should run.""" + + def test_disabled_returns_no_errors(self): + data = _make_config(dns_enabled=False) + self.assertEqual(validate_dns_config(data), []) + + def test_missing_dns_config_key(self): + self.assertEqual(validate_dns_config({}), []) + + +class TestDnsDomainValidation(unittest.TestCase): + """FS-DOMAIN-01: dns_domain must be valid RFC 1035.""" + + def test_valid_domain(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc.cluster")), []) + + def test_valid_domain_single_label(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc")), []) + + def test_valid_domain_multi_label(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="compute.hpc.lab")), []) + + def test_invalid_domain_uppercase(self): + errs = validate_dns_config(_make_config(dns_domain="HPC.Cluster")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_invalid_domain_underscore(self): + errs = validate_dns_config(_make_config(dns_domain="hpc_cluster")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_invalid_domain_empty(self): + errs = validate_dns_config(_make_config(dns_domain="")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_reserved_domain_cluster_local(self): + errs = validate_dns_config(_make_config(dns_domain="cluster.local")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + def test_reserved_domain_localhost(self): + errs = validate_dns_config(_make_config(dns_domain="localhost")) + self.assertTrue(_has_error_msg(errs, 
"reserved")) + + def test_reserved_domain_com(self): + errs = validate_dns_config(_make_config(dns_domain="com")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + def test_subdomain_of_reserved(self): + errs = validate_dns_config(_make_config(dns_domain="hpc.cluster.local")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + +class TestDnsTtlValidation(unittest.TestCase): + """FS-INPUT-02: dns_ttl must be in [60, 86400].""" + + def test_valid_ttl(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=300)), []) + + def test_ttl_minimum(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=60)), []) + + def test_ttl_maximum(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=86400)), []) + + def test_ttl_too_low(self): + errs = validate_dns_config(_make_config(dns_ttl=59)) + self.assertTrue(_has_error(errs, "dns_ttl")) + + def test_ttl_too_high(self): + errs = validate_dns_config(_make_config(dns_ttl=86401)) + self.assertTrue(_has_error(errs, "dns_ttl")) + + +class TestDnsCacheTtlValidation(unittest.TestCase): + """FS-INPUT-03: dns_cache_ttl must be in [10, 3600] and <= dns_ttl.""" + + def test_valid_cache_ttl(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=60)), []) + + def test_cache_ttl_minimum(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=10)), []) + + def test_cache_ttl_maximum(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=300, dns_ttl=300)), []) + + def test_cache_ttl_too_low(self): + errs = validate_dns_config(_make_config(dns_cache_ttl=9)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + def test_cache_ttl_too_high(self): + errs = validate_dns_config(_make_config(dns_cache_ttl=3601)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + def test_cache_ttl_exceeds_ttl(self): + errs = validate_dns_config(_make_config(dns_ttl=60, dns_cache_ttl=120)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + +class 
TestDnsFabricSuffixValidation(unittest.TestCase): + """FS-INPUT-04: fabric suffixes must be hyphen-prefixed lowercase.""" + + def test_valid_suffix(self): + self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=["-ib"])), []) + + def test_valid_suffix_multi(self): + self.assertEqual( + validate_dns_config(_make_config(dns_fabric_suffixes=["-ib", "-stor"])), [] + ) + + def test_invalid_suffix_no_hyphen(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["ib"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_invalid_suffix_uppercase(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-IB"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_invalid_suffix_empty_after_hyphen(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_empty_suffixes_ok(self): + self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=[])), []) + + +class TestDnsSoaValidation(unittest.TestCase): + """FS-SOA-01..05: SOA values must be positive integers.""" + + def test_valid_soa(self): + self.assertEqual(validate_dns_config(_make_config()), []) + + def test_soa_refresh_zero(self): + errs = validate_dns_config(_make_config(soa_refresh=0)) + self.assertTrue(_has_error(errs, "dns_soa")) + + def test_soa_retry_negative(self): + errs = validate_dns_config(_make_config(soa_retry=-1)) + self.assertTrue(_has_error(errs, "dns_soa")) + + def test_soa_expire_zero(self): + errs = validate_dns_config(_make_config(soa_expire=0)) + self.assertTrue(_has_error(errs, "dns_soa")) + + +if __name__ == "__main__": + unittest.main() diff --git a/input/dns_config.yml b/input/dns_config.yml new file mode 100644 index 0000000000..82f9800d79 --- /dev/null +++ b/input/dns_config.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# This file configures authoritative CoreDNS for Slurm/MPI hostname resolution. +# When enabled, CoreDNS replaces /etc/hosts as the primary name resolution +# mechanism for all cluster compute nodes. +# +# DNS records are generated automatically from OpenCHAMI SMD/HSM inventory. +# No per-node DNS entries are required. +# +# 'dns_enabled': Master toggle for CoreDNS-based resolution. +# - true: CoreDNS serves authoritative DNS; /etc/hosts peer entries are removed. +# - false: Traditional /etc/hosts-based resolution is used (default). +# +# 'dns_domain': Internal DNS domain for the cluster. +# All node hostnames are registered under this domain +# (e.g., nid0001.hpc.cluster). Must be a valid DNS domain name (RFC 1035). +# Must NOT overlap with public TLDs or Kubernetes 'cluster.local'. +# +# 'dns_ttl': Default Time-To-Live (seconds) for A and PTR records. +# Lower values enable faster propagation on node changes. +# Valid range: 60–86400. Default: 300. +# +# 'dns_reverse_enabled': Whether to generate reverse (PTR) DNS zones. +# Required for MPI and Slurm security validation. Default: true. +# +# 'dns_fabric_suffixes': Optional hostname suffixes for multi-fabric networks. +# When set, additional A records are created for each fabric IP. +# Example: ["-ib"] creates nid0001-ib.hpc.cluster pointing to InfiniBand IP. 
+# Each suffix must begin with a hyphen and contain only lowercase alphanumeric +# characters and hyphens. +# +# 'dns_cache_ttl': CoreDNS in-memory cache TTL (seconds). +# Controls how long resolved answers are cached before re-querying zone files. +# Must be <= dns_ttl. Valid range: 10–3600. Default: 60. +# +# 'dns_soa': SOA (Start of Authority) record parameters for generated zone files. +# - refresh: How often secondary DNS servers should check for updates (seconds). +# - retry: Retry interval after a failed refresh (seconds). +# - expire: When to stop serving zone data if primary is unreachable (seconds). + +dns_config: + dns_enabled: false + dns_domain: "hpc.cluster" + dns_ttl: 300 + dns_reverse_enabled: true + dns_fabric_suffixes: [] + dns_cache_ttl: 60 + dns_soa: + refresh: 3600 + retry: 600 + expire: 86400 diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml new file mode 100644 index 0000000000..036d2fa1a6 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml @@ -0,0 +1,115 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+- name: Load dns_config.yml
+  ansible.builtin.include_vars:
+    file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml"
+  register: dns_config_loaded
+  failed_when: false
+
+- name: Set dns_enabled flag
+  ansible.builtin.set_fact:
+    dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}"
+  when: dns_config_loaded is not failed and dns_config is defined
+
+- name: Set dns_enabled to false when dns_config is unavailable
+  ansible.builtin.set_fact:
+    dns_enabled: false
+  when: dns_config_loaded is failed or dns_config is not defined
+
+- name: Deploy CoreDNS for authoritative cluster DNS
+  when: dns_enabled | bool
+  block:
+    - name: Set CoreDNS configuration facts
+      ansible.builtin.set_fact:
+        dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}"
+        dns_ttl: "{{ dns_config.dns_ttl | default(300) }}"
+        dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}"
+        dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}"
+        dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}"
+        dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}"
+        dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}"
+        dns_soa_expire: "{{ dns_config.dns_soa.expire | default(86400) }}"
+        dns_upstream_servers: "{{ network_data.admin_network.dns | default([]) }}"
+
+    - name: Build reverse zone list from admin network  # NOTE(review): keeps first 3 octets, i.e. assumes /24 subnets - confirm
+      ansible.builtin.set_fact:
+        coredns_reverse_zones: >-
+          {{
+            [network_data.admin_network.subnet | regex_replace('^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1')]
+            + (network_data.admin_network.additional_subnets | default([])
+               | map(attribute='subnet')
+               | map('regex_replace', '^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1')
+               | list)
+          }}
+
+    - name: Create CoreDNS directories
+      ansible.builtin.file:
+        path: "{{ item }}"
+        state: directory
+        mode: "{{ dir_permissions_755 }}"
+      loop:
+        - "{{ coredns_config_dir }}"
+        - "{{ coredns_zone_dir }}"
+
+    - name: Generate Corefile
+      ansible.builtin.template:
+        src: "{{ role_path }}/templates/Corefile.j2"
+        dest: "{{ coredns_config_dir }}/Corefile"
+        mode: "{{ file_permissions_644 }}"
+
+    - name: Pull CoreDNS container image
+      ansible.builtin.command: "podman pull {{ coredns_image }}"
+      register: coredns_pull
+      until: coredns_pull is not failed
+      retries: "{{ pull_image_retries }}"
+      delay: "{{ pull_image_delay }}"
+      changed_when: true
+
+    - name: Create CoreDNS systemd quadlet directory
+      ansible.builtin.file:
+        path: /etc/containers/systemd
+        state: directory
+        mode: "{{ dir_permissions_755 }}"
+
+    - name: Deploy CoreDNS container quadlet
+      ansible.builtin.template:
+        src: "{{ role_path }}/templates/coredns.container.j2"
+        dest: /etc/containers/systemd/coredns.container
+        mode: "{{ file_permissions_644 }}"
+
+    # Reload inline: 'listen' is handler-only, and coredns.service must be generated before it is started.
+    - name: Reload systemd daemon
+      ansible.builtin.systemd:
+        daemon_reload: true
+
+    - name: Enable and start CoreDNS service
+      ansible.builtin.systemd:
+        name: coredns
+        state: started
+        enabled: true
+
+    # The quadlet sets ContainerName=coredns, so match that exact name; retry until the container reports Up.
+    - name: Verify CoreDNS is running
+      ansible.builtin.command: podman ps --filter name=^coredns$ --format {% raw %}"{{ .Status }}"{% endraw %}
+      register: coredns_status
+      changed_when: false
+      until: "'Up' in coredns_status.stdout"
+      retries: 3
+      delay: 5
+
+    - name: CoreDNS deployment status
+      ansible.builtin.debug:
+        msg: "CoreDNS deployed successfully.
Domain: {{ dns_domain }}, Zone dir: {{ coredns_zone_dir }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml index 97170c2966..8f7e7d27f4 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml @@ -22,3 +22,6 @@ - name: Deploy openchami ansible.builtin.include_tasks: deploy_openchami.yml when: not hostvars['oim']['openchami_install_status'] + +- name: Deploy CoreDNS for authoritative cluster DNS + ansible.builtin.include_tasks: deploy_coredns.yml diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 new file mode 100644 index 0000000000..758e749a98 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 @@ -0,0 +1,31 @@ +# Auto-generated by Omnia from dns_config.yml - DO NOT EDIT MANUALLY +# Authoritative CoreDNS for cluster-internal hostnames + +{{ dns_domain }}:53 { + file /zones/db.{{ dns_domain }} + cache {{ dns_cache_ttl }} + reload 10s + log + errors +} + +{% if dns_reverse_enabled | default(true) | bool %} +{% for rz in coredns_reverse_zones | default([]) %} +{{ rz }}.in-addr.arpa:53 { + file /zones/db.{{ rz }}.in-addr.arpa + cache {{ dns_cache_ttl }} + reload 10s + log + errors +} + +{% endfor %} +{% endif %} +{% if dns_upstream_servers | default([]) | length > 0 %} +.:53 { + forward . 
{{ dns_upstream_servers | join(' ') }} + cache 30 + log + errors +} +{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 new file mode 100644 index 0000000000..c1642192ba --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 @@ -0,0 +1,24 @@ +# Auto-generated by Omnia - CoreDNS container quadlet +# Authoritative DNS for cluster-internal hostnames (Slurm/MPI) + +[Unit] +Description=CoreDNS - Authoritative Cluster DNS +After=network-online.target +Wants=network-online.target + +[Container] +ContainerName=coredns +Image={{ coredns_image }} +PublishPort=53:53/udp +PublishPort=53:53/tcp +Volume={{ coredns_config_dir }}/Corefile:/Corefile:ro,Z +Volume={{ coredns_zone_dir }}:/zones:ro,Z +Exec=-conf /Corefile + +[Service] +Restart=always +RestartSec=5 +TimeoutStartSec=30 + +[Install] +WantedBy=multi-user.target diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index e9273797ab..0d5ef7696b 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -35,6 +35,11 @@ network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml provision_config: "{{ hostvars['localhost']['input_project_dir'] }}/provision_config.yml" provision_config_syntax_fail_msg: "Failed. Syntax errors present in provision_config.yml. Fix errors and re-run playbook again." 
+# CoreDNS for authoritative cluster DNS (Slurm/MPI) +coredns_image: "docker.io/coredns/coredns:1.12.1" +coredns_config_dir: "/etc/coredns" +coredns_zone_dir: "/etc/coredns/zones" + # vars passed to openchami installation openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" diff --git a/provision/roles/configure_ochami/tasks/generate_dns_zones.yml b/provision/roles/configure_ochami/tasks/generate_dns_zones.yml new file mode 100644 index 0000000000..8c1ee5b2aa --- /dev/null +++ b/provision/roles/configure_ochami/tasks/generate_dns_zones.yml @@ -0,0 +1,106 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load dns_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" + delegate_to: localhost + run_once: true + register: dns_config_load + failed_when: false + +- name: Set dns_enabled fact + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + when: dns_config_load is not failed and dns_config is defined + +- name: Set dns_enabled to false when config unavailable + ansible.builtin.set_fact: + dns_enabled: false + when: dns_config_load is failed or dns_config is not defined + +- name: Generate CoreDNS zone files + when: dns_enabled | bool + delegate_to: oim + block: + - name: Set DNS rendering facts + ansible.builtin.set_fact: + dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" + dns_ttl: "{{ dns_config.dns_ttl | default(300) }}" + dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}" + dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}" + dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}" + dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}" + dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}" + dns_soa_expire: "{{ dns_config.dns_soa.expire | default(86400) }}" + coredns_zone_dir: "/etc/coredns/zones" + + - name: Generate SOA serial (YYYYMMDDNN) + ansible.builtin.set_fact: + dns_soa_serial: "{{ lookup('pipe', 'date +%Y%m%d')}}01" + + - name: Initialize fabric_ip_map as empty + ansible.builtin.set_fact: + fabric_ip_map: {} + + - name: Ensure zone directory exists + ansible.builtin.file: + path: "{{ coredns_zone_dir }}" + state: directory + mode: "0755" + + - name: Generate forward zone file + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/forward_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ dns_domain }}" + mode: "0644" + + - name: Build reverse zone entries for admin subnet + ansible.builtin.set_fact: + admin_reverse_zone: "{{ 
hostvars['localhost']['admin_nic_ip'] | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" + + - name: Build reverse entries for admin subnet + ansible.builtin.set_fact: + reverse_entries: >- + [{% for hostname in ip_name_map | sort %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endfor %}] + + - name: Generate reverse zone file for admin subnet + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/reverse_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ admin_reverse_zone }}.in-addr.arpa" + mode: "0644" + vars: + reverse_zone_name: "{{ admin_reverse_zone }}" + when: dns_reverse_enabled | bool + + - name: Generate reverse zones for additional subnets + ansible.builtin.include_tasks: generate_reverse_zone_additional.yml + loop: "{{ network_data.admin_network.additional_subnets | default([]) }}" + loop_control: + loop_var: additional_subnet + when: + - dns_reverse_enabled | bool + - network_data.admin_network.additional_subnets is defined + - network_data.admin_network.additional_subnets | length > 0 + + - name: DNS zone generation summary + ansible.builtin.debug: + msg: >- + DNS zones generated: forward zone db.{{ dns_domain }}, + {{ (ip_name_map | length) }} A records, + reverse zone(s) for {{ admin_reverse_zone }} + {% if network_data.admin_network.additional_subnets | default([]) | length > 0 %} + + {{ network_data.admin_network.additional_subnets | length }} additional subnet(s) + {% endif %} diff --git a/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml b/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml new file mode 100644 index 0000000000..b131a4dfb4 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml @@ -0,0 +1,31 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Build reverse zone name for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.set_fact: + additional_reverse_zone: "{{ additional_subnet.subnet | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" + +- name: Build reverse entries for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.set_fact: + reverse_entries: >- + [{% set subnet_prefix = additional_subnet.subnet | regex_replace('\.\d+$', '') %}{% for hostname in ip_name_map | sort %}{% if ip_name_map[hostname].startswith(subnet_prefix ~ '.') %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endif %}{% endfor %}] + +- name: Generate reverse zone file for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/reverse_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ additional_reverse_zone }}.in-addr.arpa" + mode: "0644" + vars: + reverse_zone_name: "{{ additional_reverse_zone }}" diff --git a/provision/roles/configure_ochami/tasks/main.yml b/provision/roles/configure_ochami/tasks/main.yml index 19f98e96c1..a437aec20c 100644 --- a/provision/roles/configure_ochami/tasks/main.yml +++ b/provision/roles/configure_ochami/tasks/main.yml @@ -37,3 +37,6 @@ - name: Provision completion ansible.builtin.include_tasks: provision_completion.yml + + - name: Generate CoreDNS zone 
files from SMD inventory + ansible.builtin.include_tasks: generate_dns_zones.yml diff --git a/provision/roles/configure_ochami/tasks/update_dns_zones.yml b/provision/roles/configure_ochami/tasks/update_dns_zones.yml new file mode 100644 index 0000000000..1f1a419db9 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/update_dns_zones.yml @@ -0,0 +1,34 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This task regenerates CoreDNS zone files after node add/remove operations. +# Called after SMD inventory changes. CoreDNS auto-reloads via the 'reload' plugin. + +- name: Check if CoreDNS DNS is enabled + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + when: dns_config is defined + +- name: Regenerate DNS zone files after inventory change + when: dns_enabled | default(false) | bool + block: + - name: Re-read node hostname/IP mapping from SMD + ansible.builtin.include_tasks: "{{ role_path }}/../../slurm_config/tasks/read_slurm_hostnames.yml" + + - name: Regenerate DNS zones + ansible.builtin.include_tasks: generate_dns_zones.yml + + - name: DNS zone update completed + ansible.builtin.debug: + msg: "DNS zone files regenerated. CoreDNS will auto-reload within 10s." 
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 60b0a47616..b09dc421c7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -131,12 +131,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index a4b89e1efa..e3ca32c7b7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -130,12 +130,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index ad767a2e59..0e228229de 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -101,12 +101,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index faa5c234b6..a61f6c0665 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -100,12 +100,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 574a040e3e..956db7fcb7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -605,6 +605,32 @@ # Patch: append nameservers after /etc/resolv.conf using Jinja list "dns" sed -i 's|/etc/resolv.conf|/etc/resolv.conf{% for ns in dns %} {{ ns }}{% endfor %}|' "$cfg" +{% if dns_enabled | default(false) | bool %} + # Forward cluster-internal DNS domain to OIM CoreDNS + # This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS + python3 - "$cfg" << 'PYEOF' +import sys, yaml +cfg_path = sys.argv[1] +with open(cfg_path) as f: + doc = yaml.safe_load(f) +corefile = doc['data']['Corefile'] +fwd_block = """{{ dns_domain }}:53 { + errors + cache 30 + forward . {{ admin_nic_ip }} +} +""" +if '{{ dns_domain }}:53' not in corefile: + corefile = fwd_block + corefile + doc['data']['Corefile'] = corefile + with open(cfg_path, 'w') as f: + yaml.dump(doc, f, default_flow_style=False) + print("Added {{ dns_domain }} forward zone to K8s CoreDNS") +else: + print("{{ dns_domain }} forward zone already present in K8s CoreDNS") +PYEOF +{% endif %} + # Apply the patched ConfigMap kubectl apply -f "$cfg" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2ee561109c..1ea11c40e5 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -107,12 +107,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | 
default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /root/init_slurm_db.sql permissions: '{{ file_mode_600 }}' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 2d4b7ad001..21f3372cee 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -343,12 +343,22 @@ echo "[INFO] ===== Completed firewall and service configuration (aarch64) =====" +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 3cae337b69..6baef46c43 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -148,12 +148,22 @@ {% endif %} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip
}} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/dns/forward_zone.j2 b/provision/roles/configure_ochami/templates/dns/forward_zone.j2 new file mode 100644 index 0000000000..e0bfd3d0a1 --- /dev/null +++ b/provision/roles/configure_ochami/templates/dns/forward_zone.j2 @@ -0,0 +1,27 @@ +; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY +; Forward zone: {{ dns_domain }} +$TTL {{ dns_ttl }} +@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. ( + {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) + {{ dns_soa_refresh }} ; Refresh + {{ dns_soa_retry }} ; Retry + {{ dns_soa_expire }} ; Expire + {{ dns_ttl }} ; Minimum TTL + ) + IN NS ns1.{{ dns_domain }}. + +ns1 IN A {{ admin_nic_ip }} + +; Compute nodes (auto-generated from OpenCHAMI SMD) +{% for hostname in ip_name_map | sort %} +{{ hostname }} IN A {{ ip_name_map[hostname] }} +{% endfor %} +{% if dns_fabric_suffixes is defined and dns_fabric_suffixes | length > 0 %} + +; Fabric suffix records +{% for suffix in dns_fabric_suffixes %} +{% for hostname in fabric_ip_map.get(suffix, {}) | sort %} +{{ hostname }}{{ suffix }} IN A {{ fabric_ip_map[suffix][hostname] }} +{% endfor %} +{% endfor %} +{% endif %} diff --git a/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 b/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 new file mode 100644 index 0000000000..3bea95054d --- /dev/null +++ b/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 @@ -0,0 +1,16 @@ +; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY +; Reverse zone: {{ reverse_zone_name }}.in-addr.arpa +$TTL {{ dns_ttl }} +@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. 
( + {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) + {{ dns_soa_refresh }} ; Refresh + {{ dns_soa_retry }} ; Retry + {{ dns_soa_expire }} ; Expire + {{ dns_ttl }} ; Minimum TTL + ) + IN NS ns1.{{ dns_domain }}. + +; PTR records (auto-generated from OpenCHAMI SMD) +{% for entry in reverse_entries | sort(attribute='host_octet') %} +{{ entry.host_octet }} IN PTR {{ entry.hostname }}.{{ dns_domain }}. +{% endfor %} diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml b/provision/roles/provision_validations/tasks/include_software_config.yml index b2480d2c6e..c2fd3c4cc7 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -45,6 +45,24 @@ ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" +- name: Load dns_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" + register: dns_config_load + failed_when: false + +- name: Set dns_enabled and dns_domain facts + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" + when: dns_config_load is not failed and dns_config is defined + +- name: Set dns_enabled to false when dns_config unavailable + ansible.builtin.set_fact: + dns_enabled: false + dns_domain: "" + when: dns_config_load is failed or dns_config is not defined + - name: Initialise variables ansible.builtin.set_fact: service_k8s_support: false diff --git a/provision/roles/provision_validations/tasks/update_hosts.yml b/provision/roles/provision_validations/tasks/update_hosts.yml index bd046032bc..8110097cbe 100644 --- a/provision/roles/provision_validations/tasks/update_hosts.yml +++ b/provision/roles/provision_validations/tasks/update_hosts.yml @@ -19,19 +19,22 @@ grep -qxF '127.0.0.1 
localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs and hostnames that are being updated - ansible.builtin.shell: | - set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ - grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp - cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} - rm -f {{ hosts_file_path }}.tmp - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" +- name: Update OIM /etc/hosts (skipped when CoreDNS is enabled) + when: not (dns_enabled | default(false) | bool) + block: + - name: Remove stale entries for IPs and hostnames that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Add hosts file entry for cluster - ansible.builtin.shell: | - set -o pipefail - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" + - name: Add hosts file entry for cluster + ansible.builtin.shell: | + set -o pipefail + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index 901e914db7..832631d96c 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -25,6 +25,7 @@ provision_inputs: - path: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - path: 
"{{ hostvars['localhost']['input_project_dir'] }}/build_stream_config.yml" - path: "{{ hostvars['localhost']['input_project_dir'] }}/discovery_config.yml" + - path: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" build_stream_job_id_absent: | "Build Stream mode is enabled. Manual execution is not supported. Please trigger this workflow via the GitLab pipeline." diff --git a/provision/roles/slurm_config/tasks/update_hosts_munge.yml b/provision/roles/slurm_config/tasks/update_hosts_munge.yml index 29683159ad..783d821edd 100644 --- a/provision/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/provision/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Edit /etc/hosts file till DNS +- name: Edit /etc/hosts file (skipped when CoreDNS is enabled) ignore_unreachable: true delegate_to: "{{ slurmhost_ip }}" + when: not (dns_enabled | default(false) | bool) block: - name: Remove deleted nodes if any hostname exists in /etc/hosts ansible.builtin.lineinfile: From 101291fe984806f206fad8ba29969d6b4c0d1ed5 Mon Sep 17 00:00:00 2001 From: Super User Date: Mon, 18 May 2026 15:24:09 +0530 Subject: [PATCH 3/6] =?UTF-8?q?refactor:=20remove=20standalone=20CoreDNS?= =?UTF-8?q?=20deployment=20=E2=80=94=20coresmd=20already=20provides=20DNS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed redundant CoreDNS container (docker.io/coredns/coredns:1.12.1) and static zone file generation. The existing coresmd plugin in OpenCHAMI already provides dynamic DNS from SMD inventory. 
Removed: - deploy_coredns.yml, Corefile.j2, coredns.container.j2 - generate_dns_zones.yml, generate_reverse_zone_additional.yml, update_dns_zones.yml - forward_zone.j2, reverse_zone.j2 - coredns_image, coredns_config_dir, coredns_zone_dir vars Kept: - DNS validation (dns_config.yml, schema, tests, validate_dns_config) - Cloud-init resolv.conf conditional (points nodes to OIM coresmd) - Slurm/MPI /etc/hosts skip when dns_enabled Signed-off-by: sujit-jadhav --- .../openchami/tasks/deploy_coredns.yml | 115 ------------------ .../openchami/tasks/main.yml | 3 - .../openchami/templates/Corefile.j2 | 31 ----- .../openchami/templates/coredns.container.j2 | 24 ---- .../deploy_containers/openchami/vars/main.yml | 4 - .../tasks/generate_dns_zones.yml | 106 ---------------- .../generate_reverse_zone_additional.yml | 31 ----- .../roles/configure_ochami/tasks/main.yml | 3 - .../tasks/update_dns_zones.yml | 34 ------ .../templates/dns/forward_zone.j2 | 27 ---- .../templates/dns/reverse_zone.j2 | 16 --- 11 files changed, 394 deletions(-) delete mode 100644 prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml delete mode 100644 prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 delete mode 100644 prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 delete mode 100644 provision/roles/configure_ochami/tasks/generate_dns_zones.yml delete mode 100644 provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml delete mode 100644 provision/roles/configure_ochami/tasks/update_dns_zones.yml delete mode 100644 provision/roles/configure_ochami/templates/dns/forward_zone.j2 delete mode 100644 provision/roles/configure_ochami/templates/dns/reverse_zone.j2 diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml deleted file mode 100644 index 036d2fa1a6..0000000000 --- 
a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Load dns_config.yml - ansible.builtin.include_vars: - file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" - register: dns_config_loaded - failed_when: false - -- name: Set dns_enabled flag - ansible.builtin.set_fact: - dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" - when: dns_config_loaded is not failed and dns_config is defined - -- name: Set dns_enabled to false when dns_config is unavailable - ansible.builtin.set_fact: - dns_enabled: false - when: dns_config_loaded is failed or dns_config is not defined - -- name: Deploy CoreDNS for authoritative cluster DNS - when: dns_enabled | bool - block: - - name: Set CoreDNS configuration facts - ansible.builtin.set_fact: - dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" - dns_ttl: "{{ dns_config.dns_ttl | default(300) }}" - dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}" - dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}" - dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}" - dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}" - dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}" - dns_soa_expire: "{{ dns_config.dns_soa.expire | 
default(86400) }}" - dns_upstream_servers: "{{ network_data.admin_network.dns | default([]) }}" - - - name: Build reverse zone list from admin network - ansible.builtin.set_fact: - coredns_reverse_zones: >- - {{ - [network_data.admin_network.subnet | regex_replace('^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1')] - + (network_data.admin_network.additional_subnets | default([]) - | map(attribute='subnet') - | map('regex_replace', '^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1') - | list) - }} - - - name: Create CoreDNS directories - ansible.builtin.file: - path: "{{ item }}" - state: directory - mode: "{{ dir_permissions_755 }}" - with_items: - - "{{ coredns_config_dir }}" - - "{{ coredns_zone_dir }}" - - - name: Generate Corefile - ansible.builtin.template: - src: "{{ role_path }}/templates/Corefile.j2" - dest: "{{ coredns_config_dir }}/Corefile" - mode: "{{ file_permissions_644 }}" - - - name: Pull CoreDNS container image - ansible.builtin.command: "podman pull {{ coredns_image }}" - register: coredns_pull - until: coredns_pull is not failed - retries: "{{ pull_image_retries }}" - delay: "{{ pull_image_delay }}" - changed_when: true - - - name: Create CoreDNS systemd quadlet directory - ansible.builtin.file: - path: /etc/containers/systemd - state: directory - mode: "{{ dir_permissions_755 }}" - - - name: Deploy CoreDNS container quadlet - ansible.builtin.template: - src: "{{ role_path }}/templates/coredns.container.j2" - dest: /etc/containers/systemd/coredns.container - mode: "{{ file_permissions_644 }}" - notify: reload_systemd - - - name: Reload systemd daemon - ansible.builtin.systemd: - daemon_reload: true - listen: reload_systemd - - - name: Enable and start CoreDNS service - ansible.builtin.systemd: - name: coredns - state: started - enabled: true - - - name: Verify CoreDNS is running - ansible.builtin.command: podman ps --filter name=systemd-coredns --format {% raw %}"{{ .Status }}"{% endraw %} - register: coredns_status - changed_when: false - failed_when: "'Up' not in 
coredns_status.stdout" - retries: 3 - delay: 5 - - - name: CoreDNS deployment status - ansible.builtin.debug: - msg: "CoreDNS deployed successfully. Domain: {{ dns_domain }}, Zone dir: {{ coredns_zone_dir }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml index 8f7e7d27f4..97170c2966 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml @@ -22,6 +22,3 @@ - name: Deploy openchami ansible.builtin.include_tasks: deploy_openchami.yml when: not hostvars['oim']['openchami_install_status'] - -- name: Deploy CoreDNS for authoritative cluster DNS - ansible.builtin.include_tasks: deploy_coredns.yml diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 deleted file mode 100644 index 758e749a98..0000000000 --- a/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 +++ /dev/null @@ -1,31 +0,0 @@ -# Auto-generated by Omnia from dns_config.yml - DO NOT EDIT MANUALLY -# Authoritative CoreDNS for cluster-internal hostnames - -{{ dns_domain }}:53 { - file /zones/db.{{ dns_domain }} - cache {{ dns_cache_ttl }} - reload 10s - log - errors -} - -{% if dns_reverse_enabled | default(true) | bool %} -{% for rz in coredns_reverse_zones | default([]) %} -{{ rz }}.in-addr.arpa:53 { - file /zones/db.{{ rz }}.in-addr.arpa - cache {{ dns_cache_ttl }} - reload 10s - log - errors -} - -{% endfor %} -{% endif %} -{% if dns_upstream_servers | default([]) | length > 0 %} -.:53 { - forward . 
{{ dns_upstream_servers | join(' ') }} - cache 30 - log - errors -} -{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 deleted file mode 100644 index c1642192ba..0000000000 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 +++ /dev/null @@ -1,24 +0,0 @@ -# Auto-generated by Omnia - CoreDNS container quadlet -# Authoritative DNS for cluster-internal hostnames (Slurm/MPI) - -[Unit] -Description=CoreDNS - Authoritative Cluster DNS -After=network-online.target -Wants=network-online.target - -[Container] -ContainerName=coredns -Image={{ coredns_image }} -PublishPort=53:53/udp -PublishPort=53:53/tcp -Volume={{ coredns_config_dir }}/Corefile:/Corefile:ro,Z -Volume={{ coredns_zone_dir }}:/zones:ro,Z -Exec=-conf /Corefile - -[Service] -Restart=always -RestartSec=5 -TimeoutStartSec=30 - -[Install] -WantedBy=multi-user.target diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 0d5ef7696b..894964a7ee 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -35,10 +35,6 @@ network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml provision_config: "{{ hostvars['localhost']['input_project_dir'] }}/provision_config.yml" provision_config_syntax_fail_msg: "Failed. Syntax errors present in provision_config.yml. Fix errors and re-run playbook again." 
-# CoreDNS for authoritative cluster DNS (Slurm/MPI) -coredns_image: "docker.io/coredns/coredns:1.12.1" -coredns_config_dir: "/etc/coredns" -coredns_zone_dir: "/etc/coredns/zones" # vars passed to openchami installation openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" diff --git a/provision/roles/configure_ochami/tasks/generate_dns_zones.yml b/provision/roles/configure_ochami/tasks/generate_dns_zones.yml deleted file mode 100644 index 8c1ee5b2aa..0000000000 --- a/provision/roles/configure_ochami/tasks/generate_dns_zones.yml +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Load dns_config.yml - ansible.builtin.include_vars: - file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" - delegate_to: localhost - run_once: true - register: dns_config_load - failed_when: false - -- name: Set dns_enabled fact - ansible.builtin.set_fact: - dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" - when: dns_config_load is not failed and dns_config is defined - -- name: Set dns_enabled to false when config unavailable - ansible.builtin.set_fact: - dns_enabled: false - when: dns_config_load is failed or dns_config is not defined - -- name: Generate CoreDNS zone files - when: dns_enabled | bool - delegate_to: oim - block: - - name: Set DNS rendering facts - ansible.builtin.set_fact: - dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" - dns_ttl: "{{ dns_config.dns_ttl | default(300) }}" - dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}" - dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}" - dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}" - dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}" - dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}" - dns_soa_expire: "{{ dns_config.dns_soa.expire | default(86400) }}" - coredns_zone_dir: "/etc/coredns/zones" - - - name: Generate SOA serial (YYYYMMDDNN) - ansible.builtin.set_fact: - dns_soa_serial: "{{ lookup('pipe', 'date +%Y%m%d')}}01" - - - name: Initialize fabric_ip_map as empty - ansible.builtin.set_fact: - fabric_ip_map: {} - - - name: Ensure zone directory exists - ansible.builtin.file: - path: "{{ coredns_zone_dir }}" - state: directory - mode: "0755" - - - name: Generate forward zone file - ansible.builtin.template: - src: "{{ role_path }}/templates/dns/forward_zone.j2" - dest: "{{ coredns_zone_dir }}/db.{{ dns_domain }}" - mode: "0644" - - - name: Build reverse zone entries for admin subnet - ansible.builtin.set_fact: - admin_reverse_zone: "{{ 
hostvars['localhost']['admin_nic_ip'] | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" - - - name: Build reverse entries for admin subnet - ansible.builtin.set_fact: - reverse_entries: >- - [{% for hostname in ip_name_map | sort %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endfor %}] - - - name: Generate reverse zone file for admin subnet - ansible.builtin.template: - src: "{{ role_path }}/templates/dns/reverse_zone.j2" - dest: "{{ coredns_zone_dir }}/db.{{ admin_reverse_zone }}.in-addr.arpa" - mode: "0644" - vars: - reverse_zone_name: "{{ admin_reverse_zone }}" - when: dns_reverse_enabled | bool - - - name: Generate reverse zones for additional subnets - ansible.builtin.include_tasks: generate_reverse_zone_additional.yml - loop: "{{ network_data.admin_network.additional_subnets | default([]) }}" - loop_control: - loop_var: additional_subnet - when: - - dns_reverse_enabled | bool - - network_data.admin_network.additional_subnets is defined - - network_data.admin_network.additional_subnets | length > 0 - - - name: DNS zone generation summary - ansible.builtin.debug: - msg: >- - DNS zones generated: forward zone db.{{ dns_domain }}, - {{ (ip_name_map | length) }} A records, - reverse zone(s) for {{ admin_reverse_zone }} - {% if network_data.admin_network.additional_subnets | default([]) | length > 0 %} - + {{ network_data.admin_network.additional_subnets | length }} additional subnet(s) - {% endif %} diff --git a/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml b/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml deleted file mode 100644 index b131a4dfb4..0000000000 --- a/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Build reverse zone name for additional subnet {{ additional_subnet.subnet }} - ansible.builtin.set_fact: - additional_reverse_zone: "{{ additional_subnet.subnet | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" - -- name: Build reverse entries for additional subnet {{ additional_subnet.subnet }} - ansible.builtin.set_fact: - reverse_entries: >- - [{% set subnet_prefix = additional_subnet.subnet | regex_replace('\.\d+$', '') %}{% for hostname in ip_name_map | sort %}{% if ip_name_map[hostname].startswith(subnet_prefix ~ '.') %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endif %}{% endfor %}] - -- name: Generate reverse zone file for additional subnet {{ additional_subnet.subnet }} - ansible.builtin.template: - src: "{{ role_path }}/templates/dns/reverse_zone.j2" - dest: "{{ coredns_zone_dir }}/db.{{ additional_reverse_zone }}.in-addr.arpa" - mode: "0644" - vars: - reverse_zone_name: "{{ additional_reverse_zone }}" diff --git a/provision/roles/configure_ochami/tasks/main.yml b/provision/roles/configure_ochami/tasks/main.yml index a437aec20c..19f98e96c1 100644 --- a/provision/roles/configure_ochami/tasks/main.yml +++ b/provision/roles/configure_ochami/tasks/main.yml @@ -37,6 +37,3 @@ - name: Provision completion ansible.builtin.include_tasks: provision_completion.yml - - - name: Generate CoreDNS zone 
files from SMD inventory - ansible.builtin.include_tasks: generate_dns_zones.yml diff --git a/provision/roles/configure_ochami/tasks/update_dns_zones.yml b/provision/roles/configure_ochami/tasks/update_dns_zones.yml deleted file mode 100644 index 1f1a419db9..0000000000 --- a/provision/roles/configure_ochami/tasks/update_dns_zones.yml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -# This task regenerates CoreDNS zone files after node add/remove operations. -# Called after SMD inventory changes. CoreDNS auto-reloads via the 'reload' plugin. - -- name: Check if CoreDNS DNS is enabled - ansible.builtin.set_fact: - dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" - when: dns_config is defined - -- name: Regenerate DNS zone files after inventory change - when: dns_enabled | default(false) | bool - block: - - name: Re-read node hostname/IP mapping from SMD - ansible.builtin.include_tasks: "{{ role_path }}/../../slurm_config/tasks/read_slurm_hostnames.yml" - - - name: Regenerate DNS zones - ansible.builtin.include_tasks: generate_dns_zones.yml - - - name: DNS zone update completed - ansible.builtin.debug: - msg: "DNS zone files regenerated. CoreDNS will auto-reload within 10s." 
diff --git a/provision/roles/configure_ochami/templates/dns/forward_zone.j2 b/provision/roles/configure_ochami/templates/dns/forward_zone.j2 deleted file mode 100644 index e0bfd3d0a1..0000000000 --- a/provision/roles/configure_ochami/templates/dns/forward_zone.j2 +++ /dev/null @@ -1,27 +0,0 @@ -; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY -; Forward zone: {{ dns_domain }} -$TTL {{ dns_ttl }} -@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. ( - {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) - {{ dns_soa_refresh }} ; Refresh - {{ dns_soa_retry }} ; Retry - {{ dns_soa_expire }} ; Expire - {{ dns_ttl }} ; Minimum TTL - ) - IN NS ns1.{{ dns_domain }}. - -ns1 IN A {{ admin_nic_ip }} - -; Compute nodes (auto-generated from OpenCHAMI SMD) -{% for hostname in ip_name_map | sort %} -{{ hostname }} IN A {{ ip_name_map[hostname] }} -{% endfor %} -{% if dns_fabric_suffixes is defined and dns_fabric_suffixes | length > 0 %} - -; Fabric suffix records -{% for suffix in dns_fabric_suffixes %} -{% for hostname in fabric_ip_map.get(suffix, {}) | sort %} -{{ hostname }}{{ suffix }} IN A {{ fabric_ip_map[suffix][hostname] }} -{% endfor %} -{% endfor %} -{% endif %} diff --git a/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 b/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 deleted file mode 100644 index 3bea95054d..0000000000 --- a/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 +++ /dev/null @@ -1,16 +0,0 @@ -; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY -; Reverse zone: {{ reverse_zone_name }}.in-addr.arpa -$TTL {{ dns_ttl }} -@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. ( - {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) - {{ dns_soa_refresh }} ; Refresh - {{ dns_soa_retry }} ; Retry - {{ dns_soa_expire }} ; Expire - {{ dns_ttl }} ; Minimum TTL - ) - IN NS ns1.{{ dns_domain }}. 
- -; PTR records (auto-generated from OpenCHAMI SMD) -{% for entry in reverse_entries | sort(attribute='host_octet') %} -{{ entry.host_octet }} IN PTR {{ entry.hostname }}.{{ dns_domain }}. -{% endfor %} From 98d5925ebe21227de7b30b0aea1d8e46af8d9cac Mon Sep 17 00:00:00 2001 From: Super User Date: Mon, 18 May 2026 15:33:53 +0530 Subject: [PATCH 4/6] refactor: simplify dns_config to dns_enabled + dns_domain only Removed unused parameters (dns_ttl, dns_cache_ttl, dns_reverse_enabled, dns_fabric_suffixes, dns_soa) that were designed for the static zone approach. With coresmd, DNS records are dynamic from SMD and these params are no-ops. Simplified: dns_config.yml, dns_config.json schema, validate_dns_config(), error messages, and test suite (13 tests -> 13 focused domain tests). Signed-off-by: sujit-jadhav --- .../common_utils/en_us_validation_msg.py | 22 ---- .../input_validation/schema/dns_config.json | 32 +----- .../validation_flows/provision_validation.py | 55 --------- .../tests/test_dns_config_validation.py | 106 +----------------- input/dns_config.yml | 42 +------ 5 files changed, 8 insertions(+), 249 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index fbfb9d164b..2344df3471 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -581,28 +581,6 @@ def json_file_mandatory(file_path): "The following are not permitted: cluster.local, localhost, " "com, net, org, edu, gov, io." ) -DNS_TTL_RANGE_MSG = ( - "dns_ttl must be an integer between 60 and 86400 (seconds)." -) -DNS_CACHE_TTL_RANGE_MSG = ( - "dns_cache_ttl must be an integer between 10 and 3600 (seconds)." -) -DNS_CACHE_TTL_EXCEEDS_TTL_MSG = ( - "dns_cache_ttl must be less than or equal to dns_ttl. 
" - "Cache TTL cannot exceed the record TTL." -) -DNS_FABRIC_SUFFIX_FORMAT_MSG = ( - "each dns_fabric_suffix must begin with a hyphen and contain " - "only lowercase alphanumeric characters and hyphens. " - "Example: -ib, -stor" -) -DNS_SOA_POSITIVE_INT_MSG = ( - "dns_soa values (refresh, retry, expire) must be positive integers." -) -DNS_REVERSE_DISABLED_WARNING_MSG = ( - "dns_reverse_enabled is false. MPI and Slurm may require " - "reverse DNS (PTR records) for security validation." -) # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" diff --git a/common/library/module_utils/input_validation/schema/dns_config.json b/common/library/module_utils/input_validation/schema/dns_config.json index f76a912fd2..58ae9f5b18 100644 --- a/common/library/module_utils/input_validation/schema/dns_config.json +++ b/common/library/module_utils/input_validation/schema/dns_config.json @@ -7,10 +7,7 @@ "type": "object", "required": [ "dns_enabled", - "dns_domain", - "dns_ttl", - "dns_reverse_enabled", - "dns_cache_ttl" + "dns_domain" ], "properties": { "dns_enabled": { "type": "boolean" }, @@ -18,33 +15,6 @@ "type": "string", "minLength": 1, "pattern": "^[a-z0-9]([a-z0-9\\-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9\\-]*[a-z0-9])?)*$" - }, - "dns_ttl": { - "type": "integer", - "minimum": 60, - "maximum": 86400 - }, - "dns_reverse_enabled": { "type": "boolean" }, - "dns_fabric_suffixes": { - "type": "array", - "items": { - "type": "string", - "pattern": "^-[a-z0-9][a-z0-9\\-]*$" - } - }, - "dns_cache_ttl": { - "type": "integer", - "minimum": 10, - "maximum": 3600 - }, - "dns_soa": { - "type": "object", - "properties": { - "refresh": { "type": "integer", "minimum": 1 }, - "retry": { "type": "integer", "minimum": 1 }, - "expire": { "type": "integer", "minimum": 1 } - }, - "additionalProperties": false } }, "additionalProperties": false diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py 
b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 5c10bd177b..6750a9ff57 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -1460,10 +1460,6 @@ def validate_dns_config(data): Checks: - dns_domain is a valid RFC 1035 domain name and not reserved. - - dns_ttl is in valid range (60-86400). - - dns_cache_ttl is in valid range (10-3600) and <= dns_ttl. - - dns_fabric_suffixes format (hyphen-prefixed, lowercase alphanumeric). - - dns_soa values are positive integers. Args: data (dict): The dns_config dict from dns_config.yml. @@ -1505,55 +1501,4 @@ def validate_dns_config(data): ) ) - # --- dns_ttl --- - ttl = cfg.get("dns_ttl", 300) - if not isinstance(ttl, int) or ttl < 60 or ttl > 86400: - errors.append( - create_error_msg( - "dns_config.dns_ttl", str(ttl), - en_us_validation_msg.DNS_TTL_RANGE_MSG, - ) - ) - - # --- dns_cache_ttl --- - cache_ttl = cfg.get("dns_cache_ttl", 60) - if not isinstance(cache_ttl, int) or cache_ttl < 10 or cache_ttl > 3600: - errors.append( - create_error_msg( - "dns_config.dns_cache_ttl", str(cache_ttl), - en_us_validation_msg.DNS_CACHE_TTL_RANGE_MSG, - ) - ) - elif isinstance(ttl, int) and cache_ttl > ttl: - errors.append( - create_error_msg( - "dns_config.dns_cache_ttl", str(cache_ttl), - en_us_validation_msg.DNS_CACHE_TTL_EXCEEDS_TTL_MSG, - ) - ) - - # --- dns_fabric_suffixes --- - suffix_re = re.compile(r'^-[a-z0-9][a-z0-9\-]*$') - for suffix in cfg.get("dns_fabric_suffixes", []): - if not isinstance(suffix, str) or not suffix_re.match(suffix): - errors.append( - create_error_msg( - "dns_config.dns_fabric_suffixes", str(suffix), - en_us_validation_msg.DNS_FABRIC_SUFFIX_FORMAT_MSG, - ) - ) - - # --- dns_soa --- - soa = cfg.get("dns_soa", {}) - if soa: - for field in ("refresh", "retry", "expire"): - val = soa.get(field) - if val is not None and (not 
isinstance(val, int) or val < 1): - errors.append( - create_error_msg( - f"dns_config.dns_soa.{field}", str(val), - en_us_validation_msg.DNS_SOA_POSITIVE_INT_MSG, - ) - ) - return errors diff --git a/common/library/modules/tests/test_dns_config_validation.py b/common/library/modules/tests/test_dns_config_validation.py index 8eafcbb246..a2311d345d 100644 --- a/common/library/modules/tests/test_dns_config_validation.py +++ b/common/library/modules/tests/test_dns_config_validation.py @@ -68,23 +68,11 @@ def _make_config(**overrides): "dns_config": { "dns_enabled": True, "dns_domain": "hpc.cluster", - "dns_ttl": 300, - "dns_reverse_enabled": True, - "dns_fabric_suffixes": [], - "dns_cache_ttl": 60, - "dns_soa": { - "refresh": 3600, - "retry": 600, - "expire": 86400, - }, } } cfg = base["dns_config"] for k, v in overrides.items(): - if k.startswith("soa_"): - cfg["dns_soa"][k[4:]] = v - else: - cfg[k] = v + cfg[k] = v return base @@ -140,97 +128,5 @@ def test_subdomain_of_reserved(self): self.assertTrue(_has_error_msg(errs, "reserved")) -class TestDnsTtlValidation(unittest.TestCase): - """FS-INPUT-02: dns_ttl must be in [60, 86400].""" - - def test_valid_ttl(self): - self.assertEqual(validate_dns_config(_make_config(dns_ttl=300)), []) - - def test_ttl_minimum(self): - self.assertEqual(validate_dns_config(_make_config(dns_ttl=60)), []) - - def test_ttl_maximum(self): - self.assertEqual(validate_dns_config(_make_config(dns_ttl=86400)), []) - - def test_ttl_too_low(self): - errs = validate_dns_config(_make_config(dns_ttl=59)) - self.assertTrue(_has_error(errs, "dns_ttl")) - - def test_ttl_too_high(self): - errs = validate_dns_config(_make_config(dns_ttl=86401)) - self.assertTrue(_has_error(errs, "dns_ttl")) - - -class TestDnsCacheTtlValidation(unittest.TestCase): - """FS-INPUT-03: dns_cache_ttl must be in [10, 3600] and <= dns_ttl.""" - - def test_valid_cache_ttl(self): - self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=60)), []) - - def 
test_cache_ttl_minimum(self): - self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=10)), []) - - def test_cache_ttl_maximum(self): - self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=300, dns_ttl=300)), []) - - def test_cache_ttl_too_low(self): - errs = validate_dns_config(_make_config(dns_cache_ttl=9)) - self.assertTrue(_has_error(errs, "dns_cache_ttl")) - - def test_cache_ttl_too_high(self): - errs = validate_dns_config(_make_config(dns_cache_ttl=3601)) - self.assertTrue(_has_error(errs, "dns_cache_ttl")) - - def test_cache_ttl_exceeds_ttl(self): - errs = validate_dns_config(_make_config(dns_ttl=60, dns_cache_ttl=120)) - self.assertTrue(_has_error(errs, "dns_cache_ttl")) - - -class TestDnsFabricSuffixValidation(unittest.TestCase): - """FS-INPUT-04: fabric suffixes must be hyphen-prefixed lowercase.""" - - def test_valid_suffix(self): - self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=["-ib"])), []) - - def test_valid_suffix_multi(self): - self.assertEqual( - validate_dns_config(_make_config(dns_fabric_suffixes=["-ib", "-stor"])), [] - ) - - def test_invalid_suffix_no_hyphen(self): - errs = validate_dns_config(_make_config(dns_fabric_suffixes=["ib"])) - self.assertTrue(_has_error(errs, "dns_fabric_suffix")) - - def test_invalid_suffix_uppercase(self): - errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-IB"])) - self.assertTrue(_has_error(errs, "dns_fabric_suffix")) - - def test_invalid_suffix_empty_after_hyphen(self): - errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-"])) - self.assertTrue(_has_error(errs, "dns_fabric_suffix")) - - def test_empty_suffixes_ok(self): - self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=[])), []) - - -class TestDnsSoaValidation(unittest.TestCase): - """FS-SOA-01..05: SOA values must be positive integers.""" - - def test_valid_soa(self): - self.assertEqual(validate_dns_config(_make_config()), []) - - def test_soa_refresh_zero(self): - errs = 
validate_dns_config(_make_config(soa_refresh=0)) - self.assertTrue(_has_error(errs, "dns_soa")) - - def test_soa_retry_negative(self): - errs = validate_dns_config(_make_config(soa_retry=-1)) - self.assertTrue(_has_error(errs, "dns_soa")) - - def test_soa_expire_zero(self): - errs = validate_dns_config(_make_config(soa_expire=0)) - self.assertTrue(_has_error(errs, "dns_soa")) - - if __name__ == "__main__": unittest.main() diff --git a/input/dns_config.yml b/input/dns_config.yml index 82f9800d79..a18873b931 100644 --- a/input/dns_config.yml +++ b/input/dns_config.yml @@ -13,52 +13,22 @@ # limitations under the License. --- -# This file configures authoritative CoreDNS for Slurm/MPI hostname resolution. -# When enabled, CoreDNS replaces /etc/hosts as the primary name resolution -# mechanism for all cluster compute nodes. +# This file configures DNS-based hostname resolution for the cluster. +# When enabled, compute nodes use the OIM's coresmd (CoreDNS with the +# OpenCHAMI SMD plugin) instead of /etc/hosts for name resolution. # -# DNS records are generated automatically from OpenCHAMI SMD/HSM inventory. +# DNS records are generated automatically from OpenCHAMI SMD inventory. # No per-node DNS entries are required. # -# 'dns_enabled': Master toggle for CoreDNS-based resolution. -# - true: CoreDNS serves authoritative DNS; /etc/hosts peer entries are removed. +# 'dns_enabled': Master toggle for DNS-based resolution. +# - true: Nodes use coresmd DNS; /etc/hosts peer entries are skipped. # - false: Traditional /etc/hosts-based resolution is used (default). # # 'dns_domain': Internal DNS domain for the cluster. # All node hostnames are registered under this domain # (e.g., nid0001.hpc.cluster). Must be a valid DNS domain name (RFC 1035). # Must NOT overlap with public TLDs or Kubernetes 'cluster.local'. -# -# 'dns_ttl': Default Time-To-Live (seconds) for A and PTR records. -# Lower values enable faster propagation on node changes. -# Valid range: 60–86400. Default: 300. 
-# -# 'dns_reverse_enabled': Whether to generate reverse (PTR) DNS zones. -# Required for MPI and Slurm security validation. Default: true. -# -# 'dns_fabric_suffixes': Optional hostname suffixes for multi-fabric networks. -# When set, additional A records are created for each fabric IP. -# Example: ["-ib"] creates nid0001-ib.hpc.cluster pointing to InfiniBand IP. -# Each suffix must begin with a hyphen and contain only lowercase alphanumeric -# characters and hyphens. -# -# 'dns_cache_ttl': CoreDNS in-memory cache TTL (seconds). -# Controls how long resolved answers are cached before re-querying zone files. -# Must be <= dns_ttl. Valid range: 10–3600. Default: 60. -# -# 'dns_soa': SOA (Start of Authority) record parameters for generated zone files. -# - refresh: How often secondary DNS servers should check for updates (seconds). -# - retry: Retry interval after a failed refresh (seconds). -# - expire: When to stop serving zone data if primary is unreachable (seconds). dns_config: dns_enabled: false dns_domain: "hpc.cluster" - dns_ttl: 300 - dns_reverse_enabled: true - dns_fabric_suffixes: [] - dns_cache_ttl: 60 - dns_soa: - refresh: 3600 - retry: 600 - expire: 86400 From 2e38bd72d30346332b42b559505c3b3a614478ca Mon Sep 17 00:00:00 2001 From: Super User Date: Mon, 18 May 2026 19:19:08 +0530 Subject: [PATCH 5/6] fix: create coredhcp template directory before copy The coredhcp/ directory does not exist in the upstream deployment-recipes repo. Added file state=directory tasks before each copy to create the parent directory on both localhost and OIM. 
Signed-off-by: sujit-jadhav --- .../openchami/tasks/deploy_openchami.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml index 9f6210c397..c4abae9a82 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml @@ -85,6 +85,14 @@ delegate_to: localhost connection: local +- name: Ensure coredhcp template directory exists + ansible.builtin.file: + path: "{{ openchami_coredhcp_target | dirname }}" + state: directory + mode: "{{ dir_permissions_755 }}" + delegate_to: localhost + connection: local + - name: Deploy coredhcp template with multi-subnet support ansible.builtin.copy: src: "{{ openchami_coredhcp_template }}" @@ -93,7 +101,13 @@ delegate_to: localhost connection: local -- name: Deploy coredhcp template with multi-subnet support +- name: Ensure coredhcp template directory exists on OIM + ansible.builtin.file: + path: "{{ openchami_coredhcp_target | dirname }}" + state: directory + mode: "{{ dir_permissions_755 }}" + +- name: Deploy coredhcp template with multi-subnet support on OIM ansible.builtin.copy: src: "{{ openchami_coredhcp_template }}" dest: "{{ openchami_coredhcp_target }}" From 0aaa6bfbaa0e0295c4c73116ca6f70d4fab0ef66 Mon Sep 17 00:00:00 2001 From: Super User Date: Tue, 19 May 2026 14:21:20 +0530 Subject: [PATCH 6/6] refactor: merge dns_enabled into provision_config.yml, remove dns_config.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review comments: 1. Removed dns_domain — domain comes from OIM metadata (domain_name). Cloud-init templates now use domain_name directly. 2. Moved coredhcp template tasks from deploy_openchami.yml to configs/ochami.yml where other config template drops live. 3. 
Eliminated dns_config.yml entirely — dns_enabled is now a simple boolean in provision_config.yml (already loaded by Ansible). 4. Removed dns_config.json schema, dns_config entry from config.py, validation file list, and test_dns_config_validation.py. 5. Simplified validate_dns_config() to no-op (schema handles type check). 6. Removed DNS domain error messages from en_us_validation_msg.py. Signed-off-by: sujit-jadhav --- .../input_validation/common_utils/config.py | 4 +- .../common_utils/en_us_validation_msg.py | 12 -- .../input_validation/schema/dns_config.json | 23 --- .../schema/provision_config.json | 5 + .../validation_flows/provision_validation.py | 52 +------ .../tests/test_dns_config_validation.py | 132 ------------------ input/dns_config.yml | 34 ----- input/provision_config.yml | 8 ++ .../openchami/tasks/deploy_openchami.yml | 28 ---- .../openchami/templates/coredhcp.yaml.j2 | 35 ----- .../templates/coredhcp/coredhcp.yaml.j2 | 25 ++++ .../deploy_containers/openchami/vars/main.yml | 9 -- ...-group-login_compiler_node_aarch64.yaml.j2 | 2 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 2 +- .../ci-group-login_node_aarch64.yaml.j2 | 2 +- .../ci-group-login_node_x86_64.yaml.j2 | 2 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 8 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 2 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 2 +- .../tasks/include_software_config.yml | 17 +-- .../roles/provision_validations/vars/main.yml | 1 - 21 files changed, 56 insertions(+), 349 deletions(-) delete mode 100644 common/library/module_utils/input_validation/schema/dns_config.json delete mode 100644 common/library/modules/tests/test_dns_config_validation.py delete mode 100644 input/dns_config.yml delete mode 100644 prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 9c65c96aec..667da006ea 100644 
--- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -59,8 +59,7 @@ "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", "gitlab_config": "gitlab_config.yml", - "discovery_config": "discovery_config.yml", - "dns_config": "dns_config.yml" + "discovery_config": "discovery_config.yml" # "additional_software": "additional_software.json" } @@ -79,7 +78,6 @@ files["provision_config"], files["network_spec"], files["software_config"], - files["dns_config"], # files["high_availability_config"] ], "security": [ diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 2344df3471..f5b7e557b1 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -570,18 +570,6 @@ def json_file_mandatory(file_path): "another additional subnet's dynamic_range." ) -# dns_config -DNS_DOMAIN_INVALID_MSG = ( - "dns_domain must be a valid DNS domain name (RFC 1035). " - "Use lowercase alphanumeric characters, hyphens, and dots only. " - "Example: hpc.cluster" -) -DNS_DOMAIN_RESERVED_MSG = ( - "dns_domain must not use a reserved domain. " - "The following are not permitted: cluster.local, localhost, " - "com, net, org, edu, gov, io." 
-) - # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" diff --git a/common/library/module_utils/input_validation/schema/dns_config.json b/common/library/module_utils/input_validation/schema/dns_config.json deleted file mode 100644 index 58ae9f5b18..0000000000 --- a/common/library/module_utils/input_validation/schema/dns_config.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "required": ["dns_config"], - "properties": { - "dns_config": { - "type": "object", - "required": [ - "dns_enabled", - "dns_domain" - ], - "properties": { - "dns_enabled": { "type": "boolean" }, - "dns_domain": { - "type": "string", - "minLength": 1, - "pattern": "^[a-z0-9]([a-z0-9\\-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9\\-]*[a-z0-9])?)*$" - } - }, - "additionalProperties": false - } - } -} diff --git a/common/library/module_utils/input_validation/schema/provision_config.json b/common/library/module_utils/input_validation/schema/provision_config.json index 79977c296c..0f154d8870 100644 --- a/common/library/module_utils/input_validation/schema/provision_config.json +++ b/common/library/module_utils/input_validation/schema/provision_config.json @@ -16,6 +16,11 @@ "description": "Default lease time for DHCP.", "pattern": "^[0-9]+$", "default": "86400" + }, + "dns_enabled": { + "type": "boolean", + "description": "Enable DNS-based hostname resolution via coresmd.", + "default": false } }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 6750a9ff57..20c69cf94e 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -1444,61 +1444,19 @@ def _ranges_overlap(range_a, range_b): return False -# Reserved domains that must not be used as 
dns_domain -_RESERVED_DOMAINS = frozenset([ - "cluster.local", "localhost", - "com", "net", "org", "edu", "gov", "io", -]) - -# Regex for a valid DNS label (RFC 1035) -_DNS_LABEL_RE = re.compile(r'^[a-z0-9]([a-z0-9\-]{0,61}[a-z0-9])?$') - def validate_dns_config(data): """ Validates dns_config input parameters. - Checks: - - dns_domain is a valid RFC 1035 domain name and not reserved. + dns_enabled (boolean) is now set in provision_config.yml. + The cluster domain is read from OIM metadata (domain_name). Args: data (dict): The dns_config dict from dns_config.yml. Returns: - list: Validation error messages. + list: Validation error messages (currently empty; schema + validation handles the dns_enabled type check). """ - errors = [] - cfg = data.get("dns_config", {}) - if not cfg or not cfg.get("dns_enabled", False): - return errors - - # --- dns_domain --- - domain = cfg.get("dns_domain", "") - if domain: - labels = domain.split(".") - valid_domain = all(_DNS_LABEL_RE.match(label) for label in labels) and len(domain) <= 253 - if not valid_domain: - errors.append( - create_error_msg( - "dns_config.dns_domain", domain, - en_us_validation_msg.DNS_DOMAIN_INVALID_MSG, - ) - ) - if domain in _RESERVED_DOMAINS or any( - domain.endswith(f".{rd}") for rd in _RESERVED_DOMAINS - ): - errors.append( - create_error_msg( - "dns_config.dns_domain", domain, - en_us_validation_msg.DNS_DOMAIN_RESERVED_MSG, - ) - ) - else: - errors.append( - create_error_msg( - "dns_config.dns_domain", domain, - en_us_validation_msg.DNS_DOMAIN_INVALID_MSG, - ) - ) - - return errors + return [] diff --git a/common/library/modules/tests/test_dns_config_validation.py b/common/library/modules/tests/test_dns_config_validation.py deleted file mode 100644 index a2311d345d..0000000000 --- a/common/library/modules/tests/test_dns_config_validation.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Unit tests for dns_config validation in provision_validation.py.""" - -import sys -import os -import unittest - -# --------------------------------------------------------------------------- -# Bootstrap: make the validation code importable without a full Ansible install -# --------------------------------------------------------------------------- -REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) -MODULE_UTILS = os.path.join(REPO_ROOT, "common", "library", "module_utils") -sys.path.insert(0, MODULE_UTILS) - -import types -import importlib - -for _name in ( - "ansible", - "ansible.module_utils", - "ansible.module_utils.input_validation", - "ansible.module_utils.input_validation.common_utils", - "ansible.module_utils.input_validation.validation_flows", -): - sys.modules.setdefault(_name, types.ModuleType(_name)) - -_cu_ns = "ansible.module_utils.input_validation.common_utils" -for _sub in ("config", "en_us_validation_msg", "validation_utils"): - _mod = importlib.import_module(f"input_validation.common_utils.{_sub}") - sys.modules[f"{_cu_ns}.{_sub}"] = _mod - setattr(sys.modules[_cu_ns], _sub, _mod) - -sys.modules["ansible.module_utils.input_validation.validation_flows.common_validation"] = ( - types.ModuleType("ansible.module_utils.input_validation.validation_flows.common_validation") -) - -from input_validation.validation_flows.provision_validation import ( # noqa: E402 - 
validate_dns_config, -) - - -def _has_error(errors, key_substr): - """Check if any error dict has error_key containing the substring.""" - return any(key_substr in e.get("error_key", "") for e in errors) - - -def _has_error_msg(errors, msg_substr): - """Check if any error dict has error_msg containing the substring.""" - return any(msg_substr in e.get("error_msg", "") for e in errors) - - -def _make_config(**overrides): - """Build a valid dns_config dict, then apply overrides.""" - base = { - "dns_config": { - "dns_enabled": True, - "dns_domain": "hpc.cluster", - } - } - cfg = base["dns_config"] - for k, v in overrides.items(): - cfg[k] = v - return base - - -class TestDnsConfigValidationDisabled(unittest.TestCase): - """When dns_enabled is false, no validation should run.""" - - def test_disabled_returns_no_errors(self): - data = _make_config(dns_enabled=False) - self.assertEqual(validate_dns_config(data), []) - - def test_missing_dns_config_key(self): - self.assertEqual(validate_dns_config({}), []) - - -class TestDnsDomainValidation(unittest.TestCase): - """FS-DOMAIN-01: dns_domain must be valid RFC 1035.""" - - def test_valid_domain(self): - self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc.cluster")), []) - - def test_valid_domain_single_label(self): - self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc")), []) - - def test_valid_domain_multi_label(self): - self.assertEqual(validate_dns_config(_make_config(dns_domain="compute.hpc.lab")), []) - - def test_invalid_domain_uppercase(self): - errs = validate_dns_config(_make_config(dns_domain="HPC.Cluster")) - self.assertTrue(_has_error(errs, "dns_domain")) - - def test_invalid_domain_underscore(self): - errs = validate_dns_config(_make_config(dns_domain="hpc_cluster")) - self.assertTrue(_has_error(errs, "dns_domain")) - - def test_invalid_domain_empty(self): - errs = validate_dns_config(_make_config(dns_domain="")) - self.assertTrue(_has_error(errs, "dns_domain")) - - def 
test_reserved_domain_cluster_local(self): - errs = validate_dns_config(_make_config(dns_domain="cluster.local")) - self.assertTrue(_has_error_msg(errs, "reserved")) - - def test_reserved_domain_localhost(self): - errs = validate_dns_config(_make_config(dns_domain="localhost")) - self.assertTrue(_has_error_msg(errs, "reserved")) - - def test_reserved_domain_com(self): - errs = validate_dns_config(_make_config(dns_domain="com")) - self.assertTrue(_has_error_msg(errs, "reserved")) - - def test_subdomain_of_reserved(self): - errs = validate_dns_config(_make_config(dns_domain="hpc.cluster.local")) - self.assertTrue(_has_error_msg(errs, "reserved")) - - -if __name__ == "__main__": - unittest.main() diff --git a/input/dns_config.yml b/input/dns_config.yml deleted file mode 100644 index a18873b931..0000000000 --- a/input/dns_config.yml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# This file configures DNS-based hostname resolution for the cluster. -# When enabled, compute nodes use the OIM's coresmd (CoreDNS with the -# OpenCHAMI SMD plugin) instead of /etc/hosts for name resolution. -# -# DNS records are generated automatically from OpenCHAMI SMD inventory. -# No per-node DNS entries are required. -# -# 'dns_enabled': Master toggle for DNS-based resolution. -# - true: Nodes use coresmd DNS; /etc/hosts peer entries are skipped. 
-# - false: Traditional /etc/hosts-based resolution is used (default). -# -# 'dns_domain': Internal DNS domain for the cluster. -# All node hostnames are registered under this domain -# (e.g., nid0001.hpc.cluster). Must be a valid DNS domain name (RFC 1035). -# Must NOT overlap with public TLDs or Kubernetes 'cluster.local'. - -dns_config: - dns_enabled: false - dns_domain: "hpc.cluster" diff --git a/input/provision_config.yml b/input/provision_config.yml index 6b8f17c6aa..14b946ad8a 100644 --- a/input/provision_config.yml +++ b/input/provision_config.yml @@ -38,3 +38,11 @@ language: "en_US.UTF-8" # Default: 86400 # Max: 31536000 default_lease_time: "86400" + +#### Optional +# Enable DNS-based hostname resolution for compute nodes. +# When true, nodes use coresmd (CoreDNS + OpenCHAMI SMD plugin) instead of /etc/hosts. +# DNS records are generated automatically from SMD inventory. +# The cluster domain is read from OIM metadata (domain_name). +# Default: false +dns_enabled: false diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml index c4abae9a82..8c94f46950 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml @@ -85,34 +85,6 @@ delegate_to: localhost connection: local -- name: Ensure coredhcp template directory exists - ansible.builtin.file: - path: "{{ openchami_coredhcp_target | dirname }}" - state: directory - mode: "{{ dir_permissions_755 }}" - delegate_to: localhost - connection: local - -- name: Deploy coredhcp template with multi-subnet support - ansible.builtin.copy: - src: "{{ openchami_coredhcp_template }}" - dest: "{{ openchami_coredhcp_target }}" - mode: "{{ file_permissions_644 }}" - delegate_to: localhost - connection: local - -- name: Ensure coredhcp template directory exists on OIM - ansible.builtin.file: - path: "{{ 
openchami_coredhcp_target | dirname }}" - state: directory - mode: "{{ dir_permissions_755 }}" - -- name: Deploy coredhcp template with multi-subnet support on OIM - ansible.builtin.copy: - src: "{{ openchami_coredhcp_template }}" - dest: "{{ openchami_coredhcp_target }}" - mode: "{{ file_permissions_644 }}" - - name: Load the openchami configs vars ansible.builtin.template: src: "{{ openchami_config_vars_template }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 deleted file mode 100644 index 523d4be376..0000000000 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 +++ /dev/null @@ -1,35 +0,0 @@ -server4: - listen: - - "%{{ cluster_boot_interface }}" - plugins: - - server_id: {{ coredhcp_server_id }} - - dns: {{ coredhcp_dns_server }} - - router: {{ coredhcp_router }} - - netmask: {{ coredhcp_netmask }} -{% if coredhcp_subnets | default([]) | length > 0 %} - # Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support) - - coresmd: | - svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 - ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 - ca_cert=/root_ca/root_ca.crt - cache_valid={{ coredhcp_cache_validity }} - lease_time={{ coredhcp_lease_duration }} - single_port={{ coredhcp_tftp_single_port_mode | lower }} -{% for s in coredhcp_subnets %} - subnet={{ s.cidr }},{{ s.router }} -{% endfor %} - rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=hostname:unknown-{{'{'}}04d{{'}'}} - - bootloop: | - lease_file=/tmp/coredhcp.db - script_path={{ coredhcp_custom_ipxe }} - lease_time={{ coredhcp_tmp_lease_duration }} -{% for sp in coredhcp_subnet_pools %} - subnet_pool={{ sp.cidr }},{{ 
sp.start }},{{ sp.end }} -{% endfor %} -{% else %} - # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} -{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 index 2b0e180422..523d4be376 100644 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 @@ -6,5 +6,30 @@ server4: - dns: {{ coredhcp_dns_server }} - router: {{ coredhcp_router }} - netmask: {{ coredhcp_netmask }} +{% if coredhcp_subnets | default([]) | length > 0 %} + # Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support) + - coresmd: | + svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 + ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 + ca_cert=/root_ca/root_ca.crt + cache_valid={{ coredhcp_cache_validity }} + lease_time={{ coredhcp_lease_duration }} + single_port={{ coredhcp_tftp_single_port_mode | lower }} +{% for s in coredhcp_subnets %} + subnet={{ s.cidr }},{{ s.router }} +{% endfor %} + rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=hostname:unknown-{{'{'}}04d{{'}'}} + - bootloop: | + lease_file=/tmp/coredhcp.db + script_path={{ coredhcp_custom_ipxe }} + lease_time={{ coredhcp_tmp_lease_duration }} +{% for sp in 
coredhcp_subnet_pools %} + subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }} +{% endfor %} +{% else %} + # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} +{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 894964a7ee..dfdf99a745 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -27,21 +27,12 @@ openchami_inventory_template: "{{ role_path }}/templates/inventory.yaml.j2" openchami_inventory_file: "{{ openchami_clone_path }}/dell/podman-quadlets/inventory/01-ochami" openchami_config_vars_path: "/opt/omnia/openchami/configs_vars.yaml" openchami_config_vars_template: "{{ role_path }}/templates/configs.yaml.j2" -openchami_coredhcp_template: "{{ role_path }}/templates/coredhcp.yaml.j2" -openchami_coredhcp_target: "{{ openchami_clone_path }}/dell/podman-quadlets/roles/configs/templates/coredhcp/coredhcp.yaml.j2" openchami_install_fail_msg: "Failed to install OpenCHAMI" network_spec: "{{ hostvars['localhost']['input_project_dir'] }}/network_spec.yml" network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." provision_config: "{{ hostvars['localhost']['input_project_dir'] }}/provision_config.yml" provision_config_syntax_fail_msg: "Failed. Syntax errors present in provision_config.yml. Fix errors and re-run playbook again." 
- -# vars passed to openchami installation -openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" -data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" -data_s3_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/s3" -s3_work_dir: "{{ oim_shared_path }}/omnia/openchami/s3" - # Usage: deploy_openchami.yml - pull openchami images pull_image_retries: 5 pull_image_delay: 10 diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index b09dc421c7..2e81733240 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -136,7 +136,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index e3ca32c7b7..de69e4f556 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -135,7 +135,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 0e228229de..156608de44 100644 --- 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -106,7 +106,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index a61f6c0665..51296e3c29 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -105,7 +105,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 956db7fcb7..a7c886a3a7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -614,20 +614,20 @@ cfg_path = sys.argv[1] with open(cfg_path) as f: doc = yaml.safe_load(f) corefile = doc['data']['Corefile'] -fwd_block = """{{ dns_domain }}:53 { +fwd_block = """{{ domain_name }}:53 { errors cache 30 forward . 
{{ admin_nic_ip }} } """ -if '{{ dns_domain }}:53' not in corefile: +if '{{ domain_name }}:53' not in corefile: corefile = fwd_block + corefile doc['data']['Corefile'] = corefile with open(cfg_path, 'w') as f: yaml.dump(doc, f, default_flow_style=False) - print("Added {{ dns_domain }} forward zone to K8s CoreDNS") + print("Added {{ domain_name }} forward zone to K8s CoreDNS") else: - print("{{ dns_domain }} forward zone already present in K8s CoreDNS") + print("{{ domain_name }} forward zone already present in K8s CoreDNS") PYEOF {% endif %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 1ea11c40e5..ba8fcfad03 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -112,7 +112,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 21f3372cee..44f188e51b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -348,7 +348,7 @@ owner: root:root permissions: '0644' content: | - search {{ dns_domain }} + search {{ domain_name }} nameserver {{ admin_nic_ip }} options timeout:1 attempts:2 {% else %} diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml b/provision/roles/provision_validations/tasks/include_software_config.yml index c2fd3c4cc7..2895762f8e 100644 
--- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -45,23 +45,10 @@ ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" -- name: Load dns_config.yml - ansible.builtin.include_vars: - file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" - register: dns_config_load - failed_when: false - -- name: Set dns_enabled and dns_domain facts - ansible.builtin.set_fact: - dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" - dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" - when: dns_config_load is not failed and dns_config is defined - -- name: Set dns_enabled to false when dns_config unavailable +- name: Set dns_enabled default when not defined ansible.builtin.set_fact: dns_enabled: false - dns_domain: "" - when: dns_config_load is failed or dns_config is not defined + when: dns_enabled is not defined - name: Initialise variables ansible.builtin.set_fact: diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index 832631d96c..901e914db7 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -25,7 +25,6 @@ provision_inputs: - path: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - path: "{{ hostvars['localhost']['input_project_dir'] }}/build_stream_config.yml" - path: "{{ hostvars['localhost']['input_project_dir'] }}/discovery_config.yml" - - path: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" build_stream_job_id_absent: | "Build Stream mode is enabled. Manual execution is not supported. Please trigger this workflow via the GitLab pipeline."