diff --git a/common/library/module_utils/input_validation/schema/provision_config.json b/common/library/module_utils/input_validation/schema/provision_config.json index 79977c296c..0f154d8870 100644 --- a/common/library/module_utils/input_validation/schema/provision_config.json +++ b/common/library/module_utils/input_validation/schema/provision_config.json @@ -16,6 +16,11 @@ "description": "Default lease time for DHCP.", "pattern": "^[0-9]+$", "default": "86400" + }, + "dns_enabled": { + "type": "boolean", + "description": "Enable DNS-based hostname resolution via coresmd.", + "default": false } }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 98efc3637f..20c69cf94e 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -1442,3 +1442,21 @@ def _ranges_overlap(range_a, range_b): return a_start <= b_end and b_start <= a_end except (ValueError, TypeError): return False + + + +def validate_dns_config(data): + """ + Validates the DNS-related input parameters. + + provision_config.yml only contains dns_enabled (boolean) for DNS. + The cluster domain is read from OIM metadata (domain_name). + + Args: + data (dict): Parsed DNS settings (dns_enabled) from provision_config.yml. + + Returns: + list: Validation error messages (currently empty; schema + validation handles the dns_enabled type check). + """ + return [] diff --git a/input/provision_config.yml b/input/provision_config.yml index 6b8f17c6aa..14b946ad8a 100644 --- a/input/provision_config.yml +++ b/input/provision_config.yml @@ -38,3 +38,11 @@ language: "en_US.UTF-8" # Default: 86400 # Max: 31536000 default_lease_time: "86400" + +#### Optional +# Enable DNS-based hostname resolution for compute nodes. 
+# When true, nodes use coresmd (CoreDNS + OpenCHAMI SMD plugin) instead of /etc/hosts. +# DNS records are generated automatically from SMD inventory. +# The cluster domain is read from OIM metadata (domain_name). +# Default: false +dns_enabled: false diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml index cbb21df631..8c94f46950 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deploy_openchami.yml @@ -85,14 +85,6 @@ delegate_to: localhost connection: local -- name: Deploy coredhcp template with multi-subnet support - ansible.builtin.copy: - src: "{{ openchami_coredhcp_template }}" - dest: "{{ openchami_coredhcp_target }}" - mode: "{{ file_permissions_644 }}" - delegate_to: localhost - connection: local - - name: Load the openchami configs vars ansible.builtin.template: src: "{{ openchami_config_vars_template }}" diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 deleted file mode 100644 index 523d4be376..0000000000 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp.yaml.j2 +++ /dev/null @@ -1,35 +0,0 @@ -server4: - listen: - - "%{{ cluster_boot_interface }}" - plugins: - - server_id: {{ coredhcp_server_id }} - - dns: {{ coredhcp_dns_server }} - - router: {{ coredhcp_router }} - - netmask: {{ coredhcp_netmask }} -{% if coredhcp_subnets | default([]) | length > 0 %} - # Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support) - - coresmd: | - svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 - ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 - ca_cert=/root_ca/root_ca.crt - cache_valid={{ coredhcp_cache_validity }} - lease_time={{ coredhcp_lease_duration }} - single_port={{ 
coredhcp_tftp_single_port_mode | lower }} -{% for s in coredhcp_subnets %} - subnet={{ s.cidr }},{{ s.router }} -{% endfor %} - rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} - rule=hostname:unknown-{{'{'}}04d{{'}'}} - - bootloop: | - lease_file=/tmp/coredhcp.db - script_path={{ coredhcp_custom_ipxe }} - lease_time={{ coredhcp_tmp_lease_duration }} -{% for sp in coredhcp_subnet_pools %} - subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }} -{% endfor %} -{% else %} - # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} -{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 index 2b0e180422..523d4be376 100644 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 @@ -6,5 +6,30 @@ server4: - dns: {{ coredhcp_dns_server }} - router: {{ coredhcp_router }} - netmask: {{ coredhcp_netmask }} +{% if coredhcp_subnets | default([]) | length > 0 %} + # Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support) + - coresmd: | + svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443 + ipxe_base_uri=http://{{ cluster_boot_ip }}:8081 + ca_cert=/root_ca/root_ca.crt + cache_valid={{ coredhcp_cache_validity }} + lease_time={{ 
coredhcp_lease_duration }} + single_port={{ coredhcp_tftp_single_port_mode | lower }} +{% for s in coredhcp_subnets %} + subnet={{ s.cidr }},{{ s.router }} +{% endfor %} + rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}} + rule=hostname:unknown-{{'{'}}04d{{'}'}} + - bootloop: | + lease_file=/tmp/coredhcp.db + script_path={{ coredhcp_custom_ipxe }} + lease_time={{ coredhcp_tmp_lease_duration }} +{% for sp in coredhcp_subnet_pools %} + subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }} +{% endfor %} +{% else %} + # Single-subnet mode: positional argument format compatible with coresmd v0.4.x - coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }} - bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }} +{% endif %} diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2b88daffe8..dfdf99a745 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -27,8 +27,6 @@ openchami_inventory_template: "{{ role_path }}/templates/inventory.yaml.j2" openchami_inventory_file: "{{ openchami_clone_path }}/dell/podman-quadlets/inventory/01-ochami" openchami_config_vars_path: "/opt/omnia/openchami/configs_vars.yaml" openchami_config_vars_template: "{{ role_path }}/templates/configs.yaml.j2" -openchami_coredhcp_template: "{{ role_path }}/templates/coredhcp.yaml.j2" -openchami_coredhcp_target: "{{ openchami_clone_path }}/dell/podman-quadlets/roles/configs/templates/coredhcp/coredhcp.yaml.j2" 
openchami_install_fail_msg: "Failed to install OpenCHAMI" network_spec: "{{ hostvars['localhost']['input_project_dir'] }}/network_spec.yml" network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 60b0a47616..2e81733240 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -131,12 +131,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index a4b89e1efa..de69e4f556 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -130,12 +130,22 @@ content: | {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} 
+ options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index ad767a2e59..156608de44 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -101,12 +101,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index faa5c234b6..51296e3c29 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -100,12 +100,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map 
| sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 574a040e3e..a7c886a3a7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -605,6 +605,32 @@ # Patch: append nameservers after /etc/resolv.conf using Jinja list "dns" sed -i 's|/etc/resolv.conf|/etc/resolv.conf{% for ns in dns %} {{ ns }}{% endfor %}|' "$cfg" +{% if dns_enabled | default(false) | bool %} + # Forward cluster-internal DNS domain to OIM CoreDNS + # This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS + python3 - "$cfg" << 'PYEOF' +import sys, yaml +cfg_path = sys.argv[1] +with open(cfg_path) as f: + doc = yaml.safe_load(f) +corefile = doc['data']['Corefile'] +fwd_block = """{{ domain_name }}:53 { + errors + cache 30 + forward . 
{{ admin_nic_ip }} +} +""" +if '{{ domain_name }}:53' not in corefile: + corefile = fwd_block + corefile + doc['data']['Corefile'] = corefile + with open(cfg_path, 'w') as f: + yaml.dump(doc, f, default_flow_style=False) + print("Added {{ domain_name }} forward zone to K8s CoreDNS") +else: + print("{{ domain_name }} forward zone already present in K8s CoreDNS") +PYEOF +{% endif %} + # Apply the patched ConfigMap kubectl apply -f "$cfg" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2ee561109c..ba8fcfad03 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -107,12 +107,22 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /root/init_slurm_db.sql permissions: '{{ file_mode_600 }}' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 2d4b7ad001..44f188e51b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -343,12 +343,22 @@ echo "[INFO] ===== Completed firewall and service configuration (aarch64) =====" +{% if dns_enabled | default(false) | 
bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 3cae337b69..6baef46c43 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -148,12 +148,22 @@ {% endif %} {% endif %} +{% if dns_enabled | default(false) | bool %} + - path: /etc/resolv.conf + owner: root:root + permissions: '0644' + content: | + search {{ domain_name }} + nameserver {{ admin_nic_ip }} + options timeout:1 attempts:2 +{% else %} - path: /etc/hosts append: true content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} +{% endif %} - path: /etc/sysconfig/slurmd owner: root:root permissions: '0644' diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml b/provision/roles/provision_validations/tasks/include_software_config.yml index b2480d2c6e..2895762f8e 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -45,6 +45,11 @@ ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" +- name: Set dns_enabled default when not defined + ansible.builtin.set_fact: + dns_enabled: false + when: dns_enabled is not defined + - name: Initialise variables ansible.builtin.set_fact: service_k8s_support: false diff --git 
a/provision/roles/provision_validations/tasks/update_hosts.yml b/provision/roles/provision_validations/tasks/update_hosts.yml index bd046032bc..8110097cbe 100644 --- a/provision/roles/provision_validations/tasks/update_hosts.yml +++ b/provision/roles/provision_validations/tasks/update_hosts.yml @@ -19,19 +19,22 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs and hostnames that are being updated - ansible.builtin.shell: | - set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ - grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp - cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} - rm -f {{ hosts_file_path }}.tmp - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" +- name: Update OIM /etc/hosts (skipped when CoreDNS is enabled) + when: not (dns_enabled | default(false) | bool) + block: + - name: Remove stale entries for IPs and hostnames that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Add hosts file entry for cluster - ansible.builtin.shell: | - set -o pipefail - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} - changed_when: true - loop: "{{ read_mapping_file.dict | dict2items }}" + - name: Add hosts file entry for cluster + ansible.builtin.shell: | + set -o pipefail + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" diff --git 
a/provision/roles/slurm_config/tasks/update_hosts_munge.yml b/provision/roles/slurm_config/tasks/update_hosts_munge.yml index 29683159ad..783d821edd 100644 --- a/provision/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/provision/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Edit /etc/hosts file till DNS +- name: Edit /etc/hosts file (skipped when CoreDNS is enabled) ignore_unreachable: true delegate_to: "{{ slurmhost_ip }}" + when: not (dns_enabled | default(false) | bool) block: - name: Remove deleted nodes if any hostname exists in /etc/hosts ansible.builtin.lineinfile: