diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 47990cafdc..3da28cca5e 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -59,7 +59,8 @@ "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", "gitlab_config": "gitlab_config.yml", - "discovery_config": "discovery_config.yml" + "discovery_config": "discovery_config.yml", + "dns_config": "dns_config.yml" # "additional_software": "additional_software.json" } @@ -78,6 +79,7 @@ files["provision_config"], files["network_spec"], files["software_config"], + files["dns_config"], # files["high_availability_config"] ], "security": [ diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 62c6ab9e2e..31b464f149 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -455,6 +455,40 @@ def json_file_mandatory(file_path): "another additional subnet's dynamic_range." ) +# dns_config +DNS_DOMAIN_INVALID_MSG = ( + "dns_domain must be a valid DNS domain name (RFC 1035). " + "Use lowercase alphanumeric characters, hyphens, and dots only. " + "Example: hpc.cluster" +) +DNS_DOMAIN_RESERVED_MSG = ( + "dns_domain must not use a reserved domain. " + "The following are not permitted: cluster.local, localhost, " + "com, net, org, edu, gov, io." +) +DNS_TTL_RANGE_MSG = ( + "dns_ttl must be an integer between 60 and 86400 (seconds)." +) +DNS_CACHE_TTL_RANGE_MSG = ( + "dns_cache_ttl must be an integer between 10 and 3600 (seconds)." +) +DNS_CACHE_TTL_EXCEEDS_TTL_MSG = ( + "dns_cache_ttl must be less than or equal to dns_ttl. 
" + "Cache TTL cannot exceed the record TTL." +) +DNS_FABRIC_SUFFIX_FORMAT_MSG = ( + "each dns_fabric_suffix must begin with a hyphen and contain " + "only lowercase alphanumeric characters and hyphens. " + "Example: -ib, -stor" +) +DNS_SOA_POSITIVE_INT_MSG = ( + "dns_soa values (refresh, retry, expire) must be positive integers." +) +DNS_REVERSE_DISABLED_WARNING_MSG = ( + "dns_reverse_enabled is false. MPI and Slurm may require " + "reverse DNS (PTR records) for security validation." +) + # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" MYSQLDB_USER_FAIL_MSG = "username should not be kept 'root'." diff --git a/common/library/module_utils/input_validation/schema/dns_config.json b/common/library/module_utils/input_validation/schema/dns_config.json new file mode 100644 index 0000000000..f76a912fd2 --- /dev/null +++ b/common/library/module_utils/input_validation/schema/dns_config.json @@ -0,0 +1,53 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["dns_config"], + "properties": { + "dns_config": { + "type": "object", + "required": [ + "dns_enabled", + "dns_domain", + "dns_ttl", + "dns_reverse_enabled", + "dns_cache_ttl" + ], + "properties": { + "dns_enabled": { "type": "boolean" }, + "dns_domain": { + "type": "string", + "minLength": 1, + "pattern": "^[a-z0-9]([a-z0-9\\-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9\\-]*[a-z0-9])?)*$" + }, + "dns_ttl": { + "type": "integer", + "minimum": 60, + "maximum": 86400 + }, + "dns_reverse_enabled": { "type": "boolean" }, + "dns_fabric_suffixes": { + "type": "array", + "items": { + "type": "string", + "pattern": "^-[a-z0-9][a-z0-9\\-]*$" + } + }, + "dns_cache_ttl": { + "type": "integer", + "minimum": 10, + "maximum": 3600 + }, + "dns_soa": { + "type": "object", + "properties": { + "refresh": { "type": "integer", "minimum": 1 }, + "retry": { "type": "integer", "minimum": 1 }, + "expire": { "type": "integer", "minimum": 1 } + }, + "additionalProperties": false + } + }, + 
# Reserved domains that must not be used as dns_domain, either directly or
# as a parent domain (e.g. "hpc.cluster.local" is rejected as well).
_RESERVED_DOMAINS = frozenset([
    "cluster.local", "localhost",
    "com", "net", "org", "edu", "gov", "io",
])

# Pre-built ".<reserved>" suffixes so one str.endswith() call covers them all.
_RESERVED_DOMAIN_SUFFIXES = tuple(f".{rd}" for rd in sorted(_RESERVED_DOMAINS))

# Regex for a valid DNS label (RFC 1035): 1-63 characters, lowercase
# alphanumerics and hyphens, never starting or ending with a hyphen.
_DNS_LABEL_RE = re.compile(r'^[a-z0-9]([a-z0-9\-]{0,61}[a-z0-9])?$')

# Regex for a fabric suffix: a leading hyphen followed by lowercase
# alphanumerics/hyphens. Compiled once instead of on every validation call.
_DNS_FABRIC_SUFFIX_RE = re.compile(r'^-[a-z0-9][a-z0-9\-]*$')

# Maximum total length of a domain name accepted by the validator.
_DNS_DOMAIN_MAX_LENGTH = 253


def _is_strict_int(value):
    """Return True only for real integers; bool is rejected although it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool)


def validate_dns_config(data):
    """
    Validates dns_config input parameters.

    Validation only runs when ``dns_config.dns_enabled`` is truthy; a missing,
    empty or disabled ``dns_config`` section yields no errors.

    Checks:
    - dns_domain is a valid RFC 1035 domain name and not reserved.
    - dns_ttl is in valid range (60-86400).
    - dns_cache_ttl is in valid range (10-3600) and <= dns_ttl.
    - dns_fabric_suffixes format (hyphen-prefixed, lowercase alphanumeric).
    - dns_soa values are positive integers.

    Booleans are never accepted where an integer is required, and malformed
    container types (null suffix list, non-mapping dns_soa, non-string domain)
    are reported or tolerated instead of raising.

    Args:
        data (dict): The parsed content of dns_config.yml (top-level mapping
            holding the ``dns_config`` key).

    Returns:
        list: Validation errors, one ``create_error_msg`` entry per problem.
    """
    errors = []
    cfg = (data or {}).get("dns_config") or {}
    if not isinstance(cfg, dict) or not cfg.get("dns_enabled", False):
        return errors

    # --- dns_domain ---
    domain = cfg.get("dns_domain", "")
    domain_is_str = isinstance(domain, str)
    domain_value = domain if domain_is_str else str(domain)
    # fullmatch() is used because '$' alone would also accept a trailing
    # newline (e.g. a YAML block scalar "hpc.cluster\n").
    valid_domain = (
        domain_is_str
        and 0 < len(domain) <= _DNS_DOMAIN_MAX_LENGTH
        and all(_DNS_LABEL_RE.fullmatch(label) for label in domain.split("."))
    )
    if not valid_domain:
        errors.append(
            create_error_msg(
                "dns_config.dns_domain", domain_value,
                en_us_validation_msg.DNS_DOMAIN_INVALID_MSG,
            )
        )
    if domain_is_str and (
        domain in _RESERVED_DOMAINS or domain.endswith(_RESERVED_DOMAIN_SUFFIXES)
    ):
        errors.append(
            create_error_msg(
                "dns_config.dns_domain", domain_value,
                en_us_validation_msg.DNS_DOMAIN_RESERVED_MSG,
            )
        )

    # --- dns_ttl ---
    ttl = cfg.get("dns_ttl", 300)
    ttl_is_int = _is_strict_int(ttl)
    if not ttl_is_int or not 60 <= ttl <= 86400:
        errors.append(
            create_error_msg(
                "dns_config.dns_ttl", str(ttl),
                en_us_validation_msg.DNS_TTL_RANGE_MSG,
            )
        )

    # --- dns_cache_ttl ---
    cache_ttl = cfg.get("dns_cache_ttl", 60)
    if not _is_strict_int(cache_ttl) or not 10 <= cache_ttl <= 3600:
        errors.append(
            create_error_msg(
                "dns_config.dns_cache_ttl", str(cache_ttl),
                en_us_validation_msg.DNS_CACHE_TTL_RANGE_MSG,
            )
        )
    elif ttl_is_int and cache_ttl > ttl:
        # Cross-field check only makes sense when dns_ttl is itself an integer.
        errors.append(
            create_error_msg(
                "dns_config.dns_cache_ttl", str(cache_ttl),
                en_us_validation_msg.DNS_CACHE_TTL_EXCEEDS_TTL_MSG,
            )
        )

    # --- dns_fabric_suffixes ---
    suffixes = cfg.get("dns_fabric_suffixes")
    if suffixes is None:
        # Key absent or explicitly null ("dns_fabric_suffixes:") means no suffixes.
        suffixes = []
    elif not isinstance(suffixes, (list, tuple)):
        # A scalar would otherwise be iterated character by character.
        errors.append(
            create_error_msg(
                "dns_config.dns_fabric_suffixes", str(suffixes),
                en_us_validation_msg.DNS_FABRIC_SUFFIX_FORMAT_MSG,
            )
        )
        suffixes = []
    for suffix in suffixes:
        if not isinstance(suffix, str) or not _DNS_FABRIC_SUFFIX_RE.fullmatch(suffix):
            errors.append(
                create_error_msg(
                    "dns_config.dns_fabric_suffixes", str(suffix),
                    en_us_validation_msg.DNS_FABRIC_SUFFIX_FORMAT_MSG,
                )
            )

    # --- dns_soa ---
    soa = cfg.get("dns_soa") or {}
    if not isinstance(soa, dict):
        errors.append(
            create_error_msg(
                "dns_config.dns_soa", str(soa),
                en_us_validation_msg.DNS_SOA_POSITIVE_INT_MSG,
            )
        )
    else:
        for field in ("refresh", "retry", "expire"):
            val = soa.get(field)
            # Each SOA field is optional; only values that are present are checked.
            if val is not None and (not _is_strict_int(val) or val < 1):
                errors.append(
                    create_error_msg(
                        f"dns_config.dns_soa.{field}", str(val),
                        en_us_validation_msg.DNS_SOA_POSITIVE_INT_MSG,
                    )
                )

    return errors
+ +"""Unit tests for dns_config validation in provision_validation.py.""" + +import sys +import os +import unittest + +# --------------------------------------------------------------------------- +# Bootstrap: make the validation code importable without a full Ansible install +# --------------------------------------------------------------------------- +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +MODULE_UTILS = os.path.join(REPO_ROOT, "common", "library", "module_utils") +sys.path.insert(0, MODULE_UTILS) + +import types +import importlib + +for _name in ( + "ansible", + "ansible.module_utils", + "ansible.module_utils.input_validation", + "ansible.module_utils.input_validation.common_utils", + "ansible.module_utils.input_validation.validation_flows", +): + sys.modules.setdefault(_name, types.ModuleType(_name)) + +_cu_ns = "ansible.module_utils.input_validation.common_utils" +for _sub in ("config", "en_us_validation_msg", "validation_utils"): + _mod = importlib.import_module(f"input_validation.common_utils.{_sub}") + sys.modules[f"{_cu_ns}.{_sub}"] = _mod + setattr(sys.modules[_cu_ns], _sub, _mod) + +sys.modules["ansible.module_utils.input_validation.validation_flows.common_validation"] = ( + types.ModuleType("ansible.module_utils.input_validation.validation_flows.common_validation") +) + +from input_validation.validation_flows.provision_validation import ( # noqa: E402 + validate_dns_config, +) + + +def _has_error(errors, key_substr): + """Check if any error dict has error_key containing the substring.""" + return any(key_substr in e.get("error_key", "") for e in errors) + + +def _has_error_msg(errors, msg_substr): + """Check if any error dict has error_msg containing the substring.""" + return any(msg_substr in e.get("error_msg", "") for e in errors) + + +def _make_config(**overrides): + """Build a valid dns_config dict, then apply overrides.""" + base = { + "dns_config": { + "dns_enabled": True, + "dns_domain": 
"hpc.cluster", + "dns_ttl": 300, + "dns_reverse_enabled": True, + "dns_fabric_suffixes": [], + "dns_cache_ttl": 60, + "dns_soa": { + "refresh": 3600, + "retry": 600, + "expire": 86400, + }, + } + } + cfg = base["dns_config"] + for k, v in overrides.items(): + if k.startswith("soa_"): + cfg["dns_soa"][k[4:]] = v + else: + cfg[k] = v + return base + + +class TestDnsConfigValidationDisabled(unittest.TestCase): + """When dns_enabled is false, no validation should run.""" + + def test_disabled_returns_no_errors(self): + data = _make_config(dns_enabled=False) + self.assertEqual(validate_dns_config(data), []) + + def test_missing_dns_config_key(self): + self.assertEqual(validate_dns_config({}), []) + + +class TestDnsDomainValidation(unittest.TestCase): + """FS-DOMAIN-01: dns_domain must be valid RFC 1035.""" + + def test_valid_domain(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc.cluster")), []) + + def test_valid_domain_single_label(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="hpc")), []) + + def test_valid_domain_multi_label(self): + self.assertEqual(validate_dns_config(_make_config(dns_domain="compute.hpc.lab")), []) + + def test_invalid_domain_uppercase(self): + errs = validate_dns_config(_make_config(dns_domain="HPC.Cluster")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_invalid_domain_underscore(self): + errs = validate_dns_config(_make_config(dns_domain="hpc_cluster")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_invalid_domain_empty(self): + errs = validate_dns_config(_make_config(dns_domain="")) + self.assertTrue(_has_error(errs, "dns_domain")) + + def test_reserved_domain_cluster_local(self): + errs = validate_dns_config(_make_config(dns_domain="cluster.local")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + def test_reserved_domain_localhost(self): + errs = validate_dns_config(_make_config(dns_domain="localhost")) + self.assertTrue(_has_error_msg(errs, 
"reserved")) + + def test_reserved_domain_com(self): + errs = validate_dns_config(_make_config(dns_domain="com")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + def test_subdomain_of_reserved(self): + errs = validate_dns_config(_make_config(dns_domain="hpc.cluster.local")) + self.assertTrue(_has_error_msg(errs, "reserved")) + + +class TestDnsTtlValidation(unittest.TestCase): + """FS-INPUT-02: dns_ttl must be in [60, 86400].""" + + def test_valid_ttl(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=300)), []) + + def test_ttl_minimum(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=60)), []) + + def test_ttl_maximum(self): + self.assertEqual(validate_dns_config(_make_config(dns_ttl=86400)), []) + + def test_ttl_too_low(self): + errs = validate_dns_config(_make_config(dns_ttl=59)) + self.assertTrue(_has_error(errs, "dns_ttl")) + + def test_ttl_too_high(self): + errs = validate_dns_config(_make_config(dns_ttl=86401)) + self.assertTrue(_has_error(errs, "dns_ttl")) + + +class TestDnsCacheTtlValidation(unittest.TestCase): + """FS-INPUT-03: dns_cache_ttl must be in [10, 3600] and <= dns_ttl.""" + + def test_valid_cache_ttl(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=60)), []) + + def test_cache_ttl_minimum(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=10)), []) + + def test_cache_ttl_maximum(self): + self.assertEqual(validate_dns_config(_make_config(dns_cache_ttl=300, dns_ttl=300)), []) + + def test_cache_ttl_too_low(self): + errs = validate_dns_config(_make_config(dns_cache_ttl=9)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + def test_cache_ttl_too_high(self): + errs = validate_dns_config(_make_config(dns_cache_ttl=3601)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + def test_cache_ttl_exceeds_ttl(self): + errs = validate_dns_config(_make_config(dns_ttl=60, dns_cache_ttl=120)) + self.assertTrue(_has_error(errs, "dns_cache_ttl")) + + +class 
TestDnsFabricSuffixValidation(unittest.TestCase): + """FS-INPUT-04: fabric suffixes must be hyphen-prefixed lowercase.""" + + def test_valid_suffix(self): + self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=["-ib"])), []) + + def test_valid_suffix_multi(self): + self.assertEqual( + validate_dns_config(_make_config(dns_fabric_suffixes=["-ib", "-stor"])), [] + ) + + def test_invalid_suffix_no_hyphen(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["ib"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_invalid_suffix_uppercase(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-IB"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_invalid_suffix_empty_after_hyphen(self): + errs = validate_dns_config(_make_config(dns_fabric_suffixes=["-"])) + self.assertTrue(_has_error(errs, "dns_fabric_suffix")) + + def test_empty_suffixes_ok(self): + self.assertEqual(validate_dns_config(_make_config(dns_fabric_suffixes=[])), []) + + +class TestDnsSoaValidation(unittest.TestCase): + """FS-SOA-01..05: SOA values must be positive integers.""" + + def test_valid_soa(self): + self.assertEqual(validate_dns_config(_make_config()), []) + + def test_soa_refresh_zero(self): + errs = validate_dns_config(_make_config(soa_refresh=0)) + self.assertTrue(_has_error(errs, "dns_soa")) + + def test_soa_retry_negative(self): + errs = validate_dns_config(_make_config(soa_retry=-1)) + self.assertTrue(_has_error(errs, "dns_soa")) + + def test_soa_expire_zero(self): + errs = validate_dns_config(_make_config(soa_expire=0)) + self.assertTrue(_has_error(errs, "dns_soa")) + + +if __name__ == "__main__": + unittest.main() diff --git a/input/dns_config.yml b/input/dns_config.yml new file mode 100644 index 0000000000..82f9800d79 --- /dev/null +++ b/input/dns_config.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# This file configures authoritative CoreDNS for Slurm/MPI hostname resolution. +# When enabled, CoreDNS replaces /etc/hosts as the primary name resolution +# mechanism for all cluster compute nodes. +# +# DNS records are generated automatically from OpenCHAMI SMD/HSM inventory. +# No per-node DNS entries are required. +# +# 'dns_enabled': Master toggle for CoreDNS-based resolution. +# - true: CoreDNS serves authoritative DNS; /etc/hosts peer entries are removed. +# - false: Traditional /etc/hosts-based resolution is used (default). +# +# 'dns_domain': Internal DNS domain for the cluster. +# All node hostnames are registered under this domain +# (e.g., nid0001.hpc.cluster). Must be a valid DNS domain name (RFC 1035). +# Must NOT overlap with public TLDs or Kubernetes 'cluster.local'. +# +# 'dns_ttl': Default Time-To-Live (seconds) for A and PTR records. +# Lower values enable faster propagation on node changes. +# Valid range: 60–86400. Default: 300. +# +# 'dns_reverse_enabled': Whether to generate reverse (PTR) DNS zones. +# Required for MPI and Slurm security validation. Default: true. +# +# 'dns_fabric_suffixes': Optional hostname suffixes for multi-fabric networks. +# When set, additional A records are created for each fabric IP. +# Example: ["-ib"] creates nid0001-ib.hpc.cluster pointing to InfiniBand IP. 
+# Each suffix must begin with a hyphen and contain only lowercase alphanumeric +# characters and hyphens. +# +# 'dns_cache_ttl': CoreDNS in-memory cache TTL (seconds). +# Controls how long resolved answers are cached before re-querying zone files. +# Must be <= dns_ttl. Valid range: 10–3600. Default: 60. +# +# 'dns_soa': SOA (Start of Authority) record parameters for generated zone files. +# - refresh: How often secondary DNS servers should check for updates (seconds). +# - retry: Retry interval after a failed refresh (seconds). +# - expire: When to stop serving zone data if primary is unreachable (seconds). + +dns_config: + dns_enabled: false + dns_domain: "hpc.cluster" + dns_ttl: 300 + dns_reverse_enabled: true + dns_fabric_suffixes: [] + dns_cache_ttl: 60 + dns_soa: + refresh: 3600 + retry: 600 + expire: 86400 diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml new file mode 100644 index 0000000000..036d2fa1a6 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_coredns.yml @@ -0,0 +1,115 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load dns_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" + register: dns_config_loaded + failed_when: false + +- name: Set dns_enabled flag + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + when: dns_config_loaded is not failed and dns_config is defined + +- name: Set dns_enabled to false when dns_config is unavailable + ansible.builtin.set_fact: + dns_enabled: false + when: dns_config_loaded is failed or dns_config is not defined + +- name: Deploy CoreDNS for authoritative cluster DNS + when: dns_enabled | bool + block: + - name: Set CoreDNS configuration facts + ansible.builtin.set_fact: + dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" + dns_ttl: "{{ dns_config.dns_ttl | default(300) }}" + dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}" + dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}" + dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}" + dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}" + dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}" + dns_soa_expire: "{{ dns_config.dns_soa.expire | default(86400) }}" + dns_upstream_servers: "{{ network_data.admin_network.dns | default([]) }}" + + - name: Build reverse zone list from admin network + ansible.builtin.set_fact: + coredns_reverse_zones: >- + {{ + [network_data.admin_network.subnet | regex_replace('^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1')] + + (network_data.admin_network.additional_subnets | default([]) + | map(attribute='subnet') + | map('regex_replace', '^(\d+)\.(\d+)\.(\d+)\.\d+$', '\3.\2.\1') + | list) + }} + + - name: Create CoreDNS directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "{{ dir_permissions_755 }}" + with_items: + - "{{ coredns_config_dir }}" + - "{{ coredns_zone_dir }}" + + - name: Generate Corefile + 
ansible.builtin.template: + src: "{{ role_path }}/templates/Corefile.j2" + dest: "{{ coredns_config_dir }}/Corefile" + mode: "{{ file_permissions_644 }}" + + - name: Pull CoreDNS container image + ansible.builtin.command: "podman pull {{ coredns_image }}" + register: coredns_pull + until: coredns_pull is not failed + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + changed_when: true + + - name: Create CoreDNS systemd quadlet directory + ansible.builtin.file: + path: /etc/containers/systemd + state: directory + mode: "{{ dir_permissions_755 }}" + + - name: Deploy CoreDNS container quadlet + ansible.builtin.template: + src: "{{ role_path }}/templates/coredns.container.j2" + dest: /etc/containers/systemd/coredns.container + mode: "{{ file_permissions_644 }}" + notify: reload_systemd + + - name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + listen: reload_systemd + + - name: Enable and start CoreDNS service + ansible.builtin.systemd: + name: coredns + state: started + enabled: true + + - name: Verify CoreDNS is running + ansible.builtin.command: podman ps --filter name=systemd-coredns --format {% raw %}"{{ .Status }}"{% endraw %} + register: coredns_status + changed_when: false + failed_when: "'Up' not in coredns_status.stdout" + retries: 3 + delay: 5 + + - name: CoreDNS deployment status + ansible.builtin.debug: + msg: "CoreDNS deployed successfully. 
Domain: {{ dns_domain }}, Zone dir: {{ coredns_zone_dir }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml index 11eca7c9e6..b952b13362 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/main.yml @@ -16,3 +16,6 @@ - name: Deploy openchami ansible.builtin.include_tasks: deploy_openchami.yml when: not hostvars['oim']['openchami_install_status'] + +- name: Deploy CoreDNS for authoritative cluster DNS + ansible.builtin.include_tasks: deploy_coredns.yml diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 new file mode 100644 index 0000000000..758e749a98 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/templates/Corefile.j2 @@ -0,0 +1,31 @@ +# Auto-generated by Omnia from dns_config.yml - DO NOT EDIT MANUALLY +# Authoritative CoreDNS for cluster-internal hostnames + +{{ dns_domain }}:53 { + file /zones/db.{{ dns_domain }} + cache {{ dns_cache_ttl }} + reload 10s + log + errors +} + +{% if dns_reverse_enabled | default(true) | bool %} +{% for rz in coredns_reverse_zones | default([]) %} +{{ rz }}.in-addr.arpa:53 { + file /zones/db.{{ rz }}.in-addr.arpa + cache {{ dns_cache_ttl }} + reload 10s + log + errors +} + +{% endfor %} +{% endif %} +{% if dns_upstream_servers | default([]) | length > 0 %} +.:53 { + forward . 
{{ dns_upstream_servers | join(' ') }} + cache 30 + log + errors +} +{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 new file mode 100644 index 0000000000..c1642192ba --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredns.container.j2 @@ -0,0 +1,24 @@ +# Auto-generated by Omnia - CoreDNS container quadlet +# Authoritative DNS for cluster-internal hostnames (Slurm/MPI) + +[Unit] +Description=CoreDNS - Authoritative Cluster DNS +After=network-online.target +Wants=network-online.target + +[Container] +ContainerName=coredns +Image={{ coredns_image }} +PublishPort=53:53/udp +PublishPort=53:53/tcp +Volume={{ coredns_config_dir }}/Corefile:/Corefile:ro,Z +Volume={{ coredns_zone_dir }}:/zones:ro,Z +Exec=-conf /Corefile + +[Service] +Restart=always +RestartSec=5 +TimeoutStartSec=30 + +[Install] +WantedBy=multi-user.target diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 06c73c7692..890a0a0b14 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -32,6 +32,11 @@ openchami_coredhcp_template: "{{ role_path }}/templates/coredhcp.yaml.j2" openchami_coredhcp_target: "{{ openchami_clone_path }}/dell/podman-quadlets/roles/configs/templates/coredhcp/coredhcp.yaml.j2" openchami_install_fail_msg: "Failed to install OpenCHAMI. 
Please check the logs at {{ openchami_configs_log_path }}" +# CoreDNS for authoritative cluster DNS (Slurm/MPI) +coredns_image: "docker.io/coredns/coredns:1.12.1" +coredns_config_dir: "/etc/coredns" +coredns_zone_dir: "/etc/coredns/zones" + # vars passed to openchami installation openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" diff --git a/provision/roles/configure_ochami/tasks/generate_dns_zones.yml b/provision/roles/configure_ochami/tasks/generate_dns_zones.yml new file mode 100644 index 0000000000..8c1ee5b2aa --- /dev/null +++ b/provision/roles/configure_ochami/tasks/generate_dns_zones.yml @@ -0,0 +1,106 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load dns_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" + delegate_to: localhost + run_once: true + register: dns_config_load + failed_when: false + +- name: Set dns_enabled fact + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + when: dns_config_load is not failed and dns_config is defined + +- name: Set dns_enabled to false when config unavailable + ansible.builtin.set_fact: + dns_enabled: false + when: dns_config_load is failed or dns_config is not defined + +- name: Generate CoreDNS zone files + when: dns_enabled | bool + delegate_to: oim + block: + - name: Set DNS rendering facts + ansible.builtin.set_fact: + dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" + dns_ttl: "{{ dns_config.dns_ttl | default(300) }}" + dns_reverse_enabled: "{{ dns_config.dns_reverse_enabled | default(true) }}" + dns_cache_ttl: "{{ dns_config.dns_cache_ttl | default(60) }}" + dns_fabric_suffixes: "{{ dns_config.dns_fabric_suffixes | default([]) }}" + dns_soa_refresh: "{{ dns_config.dns_soa.refresh | default(3600) }}" + dns_soa_retry: "{{ dns_config.dns_soa.retry | default(600) }}" + dns_soa_expire: "{{ dns_config.dns_soa.expire | default(86400) }}" + coredns_zone_dir: "/etc/coredns/zones" + + - name: Generate SOA serial (YYYYMMDDNN) + ansible.builtin.set_fact: + dns_soa_serial: "{{ lookup('pipe', 'date +%Y%m%d')}}01" + + - name: Initialize fabric_ip_map as empty + ansible.builtin.set_fact: + fabric_ip_map: {} + + - name: Ensure zone directory exists + ansible.builtin.file: + path: "{{ coredns_zone_dir }}" + state: directory + mode: "0755" + + - name: Generate forward zone file + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/forward_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ dns_domain }}" + mode: "0644" + + - name: Build reverse zone entries for admin subnet + ansible.builtin.set_fact: + admin_reverse_zone: "{{ 
hostvars['localhost']['admin_nic_ip'] | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" + + - name: Build reverse entries for admin subnet + ansible.builtin.set_fact: + reverse_entries: >- + [{% for hostname in ip_name_map | sort %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endfor %}] + + - name: Generate reverse zone file for admin subnet + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/reverse_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ admin_reverse_zone }}.in-addr.arpa" + mode: "0644" + vars: + reverse_zone_name: "{{ admin_reverse_zone }}" + when: dns_reverse_enabled | bool + + - name: Generate reverse zones for additional subnets + ansible.builtin.include_tasks: generate_reverse_zone_additional.yml + loop: "{{ network_data.admin_network.additional_subnets | default([]) }}" + loop_control: + loop_var: additional_subnet + when: + - dns_reverse_enabled | bool + - network_data.admin_network.additional_subnets is defined + - network_data.admin_network.additional_subnets | length > 0 + + - name: DNS zone generation summary + ansible.builtin.debug: + msg: >- + DNS zones generated: forward zone db.{{ dns_domain }}, + {{ (ip_name_map | length) }} A records, + reverse zone(s) for {{ admin_reverse_zone }} + {% if network_data.admin_network.additional_subnets | default([]) | length > 0 %} + + {{ network_data.admin_network.additional_subnets | length }} additional subnet(s) + {% endif %} diff --git a/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml b/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml new file mode 100644 index 0000000000..b131a4dfb4 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/generate_reverse_zone_additional.yml @@ -0,0 +1,31 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Build reverse zone name for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.set_fact: + additional_reverse_zone: "{{ additional_subnet.subnet | regex_replace('^(\\d+)\\.(\\d+)\\.(\\d+)\\.\\d+$', '\\3.\\2.\\1') }}" + +- name: Build reverse entries for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.set_fact: + reverse_entries: >- + [{% set subnet_prefix = additional_subnet.subnet | regex_replace('\.\d+$', '') %}{% for hostname in ip_name_map | sort %}{% if ip_name_map[hostname].startswith(subnet_prefix ~ '.') %}{"host_octet": "{{ ip_name_map[hostname].split('.')[-1] }}", "hostname": "{{ hostname }}"}{% if not loop.last %}, {% endif %}{% endif %}{% endfor %}] + +- name: Generate reverse zone file for additional subnet {{ additional_subnet.subnet }} + ansible.builtin.template: + src: "{{ role_path }}/templates/dns/reverse_zone.j2" + dest: "{{ coredns_zone_dir }}/db.{{ additional_reverse_zone }}.in-addr.arpa" + mode: "0644" + vars: + reverse_zone_name: "{{ additional_reverse_zone }}" diff --git a/provision/roles/configure_ochami/tasks/main.yml b/provision/roles/configure_ochami/tasks/main.yml index 19f98e96c1..a437aec20c 100644 --- a/provision/roles/configure_ochami/tasks/main.yml +++ b/provision/roles/configure_ochami/tasks/main.yml @@ -37,3 +37,6 @@ - name: Provision completion ansible.builtin.include_tasks: provision_completion.yml + + - name: Generate CoreDNS zone 
files from SMD inventory + ansible.builtin.include_tasks: generate_dns_zones.yml diff --git a/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml b/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml index 24314209aa..07138ac46b 100644 --- a/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml +++ b/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml @@ -111,3 +111,6 @@ - name: Configure the hostname ansible.builtin.command: /usr/bin/ochami cloud-init node set -f yaml -d @"{{ openchami_hostname_vars_path }}" changed_when: true + + - name: Update DNS zones after node provisioning + ansible.builtin.include_tasks: update_dns_zones.yml diff --git a/provision/roles/configure_ochami/tasks/update_dns_zones.yml b/provision/roles/configure_ochami/tasks/update_dns_zones.yml new file mode 100644 index 0000000000..1f1a419db9 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/update_dns_zones.yml @@ -0,0 +1,34 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This task regenerates CoreDNS zone files after node add/remove operations. +# Called after SMD inventory changes. CoreDNS auto-reloads via the 'reload' plugin. 
+ +- name: Check if CoreDNS DNS is enabled + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + when: dns_config is defined + +- name: Regenerate DNS zone files after inventory change + when: dns_enabled | default(false) | bool + block: + - name: Re-read node hostname/IP mapping from SMD + ansible.builtin.include_tasks: "{{ role_path }}/../slurm_config/tasks/read_slurm_hostnames.yml" # sibling role: roles/ is one level above role_path + + - name: Regenerate DNS zones + ansible.builtin.include_tasks: generate_dns_zones.yml + + - name: DNS zone update completed + ansible.builtin.debug: + msg: "DNS zone files regenerated. CoreDNS will auto-reload within 10s." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 710edfc39c..a0ac94e16e 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -202,12 +202,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 904f7f5da2..9284bc1e2d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -202,12 +202,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 0db88c90f9..9a3a448358 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -102,12 +102,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 593cef9d00..b9293d88db 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ 
-101,12 +101,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 0d01edee47..f5a4043ae0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -591,6 +591,32 @@ # Patch: append nameservers after /etc/resolv.conf using Jinja list "dns" sed -i 's|/etc/resolv.conf|/etc/resolv.conf{% for ns in dns %} {{ ns }}{% endfor %}|' "$cfg" +{% if dns_enabled | default(false) | bool %} + # Forward cluster-internal DNS domain to OIM CoreDNS + # This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS + python3 - "$cfg" << 'PYEOF' +import sys, yaml +cfg_path = sys.argv[1] +with open(cfg_path) as f: + doc = yaml.safe_load(f) +corefile = doc['data']['Corefile'] +fwd_block = """{{ dns_domain }}:53 { + errors + cache 30 + forward . 
{{ admin_nic_ip }} +} +""" +if '{{ dns_domain }}:53' not in corefile: + corefile = fwd_block + corefile + doc['data']['Corefile'] = corefile + with open(cfg_path, 'w') as f: + yaml.dump(doc, f, default_flow_style=False) + print("Added {{ dns_domain }} forward zone to K8s CoreDNS") +else: + print("{{ dns_domain }} forward zone already present in K8s CoreDNS") +PYEOF +{% endif %} + # Apply the patched ConfigMap kubectl apply -f "$cfg" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2f0c16b577..f122263e1a 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -295,12 +295,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /root/init_slurm_db.sql permissions: '{{ file_mode_600 }}' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 30a388d7ef..200d04bbf9 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -513,12 +513,22 @@ echo "[INFO] ===== Completed firewall and service configuration (aarch64) =====" +{% if dns_enabled | default(false) | bool 
%} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4756e8f1d3..582572c3f4 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -308,12 +308,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ dns_domain }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/dns/forward_zone.j2 b/provision/roles/configure_ochami/templates/dns/forward_zone.j2 new file mode 100644 index 0000000000..e0bfd3d0a1 --- /dev/null +++ b/provision/roles/configure_ochami/templates/dns/forward_zone.j2 @@ -0,0 +1,27 @@ +; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY +; Forward zone: {{ dns_domain }} +$TTL {{ dns_ttl }} +@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. 
( + {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) + {{ dns_soa_refresh }} ; Refresh + {{ dns_soa_retry }} ; Retry + {{ dns_soa_expire }} ; Expire + {{ dns_ttl }} ; Minimum TTL + ) + IN NS ns1.{{ dns_domain }}. + +ns1 IN A {{ admin_nic_ip }} + +; Compute nodes (auto-generated from OpenCHAMI SMD) +{% for hostname in ip_name_map | sort %} +{{ hostname }} IN A {{ ip_name_map[hostname] }} +{% endfor %} +{% if dns_fabric_suffixes is defined and dns_fabric_suffixes | length > 0 %} + +; Fabric suffix records +{% for suffix in dns_fabric_suffixes %} +{% for hostname in fabric_ip_map.get(suffix, {}) | sort %} +{{ hostname }}{{ suffix }} IN A {{ fabric_ip_map[suffix][hostname] }} +{% endfor %} +{% endfor %} +{% endif %} diff --git a/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 b/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 new file mode 100644 index 0000000000..3bea95054d --- /dev/null +++ b/provision/roles/configure_ochami/templates/dns/reverse_zone.j2 @@ -0,0 +1,16 @@ +; Auto-generated by Omnia from OpenCHAMI SMD - DO NOT EDIT MANUALLY +; Reverse zone: {{ reverse_zone_name }}.in-addr.arpa +$TTL {{ dns_ttl }} +@ IN SOA ns1.{{ dns_domain }}. admin.{{ dns_domain }}. ( + {{ dns_soa_serial }} ; Serial (YYYYMMDDNN) + {{ dns_soa_refresh }} ; Refresh + {{ dns_soa_retry }} ; Retry + {{ dns_soa_expire }} ; Expire + {{ dns_ttl }} ; Minimum TTL + ) + IN NS ns1.{{ dns_domain }}. + +; PTR records (auto-generated from OpenCHAMI SMD) +{% for entry in reverse_entries | sort(attribute='host_octet') %} +{{ entry.host_octet }} IN PTR {{ entry.hostname }}.{{ dns_domain }}. 
+{% endfor %} diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml b/provision/roles/provision_validations/tasks/include_software_config.yml index b2480d2c6e..c2fd3c4cc7 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -45,6 +45,24 @@ ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" +- name: Load dns_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" + register: dns_config_load + failed_when: false + +- name: Set dns_enabled and dns_domain facts + ansible.builtin.set_fact: + dns_enabled: "{{ dns_config.dns_enabled | default(false) | bool }}" + dns_domain: "{{ dns_config.dns_domain | default('hpc.cluster') }}" + when: dns_config_load is not failed and dns_config is defined + +- name: Set dns_enabled to false when dns_config unavailable + ansible.builtin.set_fact: + dns_enabled: false + dns_domain: "" + when: dns_config_load is failed or dns_config is not defined + - name: Initialise variables ansible.builtin.set_fact: service_k8s_support: false diff --git a/provision/roles/provision_validations/tasks/update_hosts.yml b/provision/roles/provision_validations/tasks/update_hosts.yml index bd046032bc..8110097cbe 100644 --- a/provision/roles/provision_validations/tasks/update_hosts.yml +++ b/provision/roles/provision_validations/tasks/update_hosts.yml @@ -19,19 +19,22 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs and hostnames that are being updated - ansible.builtin.shell: | - set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ - grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp - cat {{ 
hosts_file_path }}.tmp > {{ hosts_file_path }} - rm -f {{ hosts_file_path }}.tmp - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" +- name: Update OIM /etc/hosts (skipped when CoreDNS is enabled) + when: not (dns_enabled | default(false) | bool) + block: + - name: Remove stale entries for IPs and hostnames that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Add hosts file entry for cluster - ansible.builtin.shell: | - set -o pipefail - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" + - name: Add hosts file entry for cluster + ansible.builtin.shell: | + set -o pipefail + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index ceee665ce2..3b8220fa18 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -25,6 +25,7 @@ provision_inputs: - path: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - path: "{{ hostvars['localhost']['input_project_dir'] }}/build_stream_config.yml" - path: "{{ hostvars['localhost']['input_project_dir'] }}/discovery_config.yml" + - path: "{{ hostvars['localhost']['input_project_dir'] }}/dns_config.yml" build_stream_job_id_absent: | "Build Stream mode is enabled. Manual execution is not supported. Please trigger this workflow via the GitLab pipeline." 
diff --git a/provision/roles/slurm_config/tasks/update_hosts_munge.yml b/provision/roles/slurm_config/tasks/update_hosts_munge.yml index 29683159ad..783d821edd 100644 --- a/provision/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/provision/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Edit /etc/hosts file till DNS +- name: Edit /etc/hosts file (skipped when CoreDNS is enabled) ignore_unreachable: true delegate_to: "{{ slurmhost_ip }}" + when: not (dns_enabled | default(false) | bool) block: - name: Remove deleted nodes if any hostname exists in /etc/hosts ansible.builtin.lineinfile: