Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"description": "Default lease time for DHCP.",
"pattern": "^[0-9]+$",
"default": "86400"
},
"dns_enabled": {
"type": "boolean",
"description": "Enable DNS-based hostname resolution via coresmd.",
"default": false
}
},
"required": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1442,3 +1442,21 @@ def _ranges_overlap(range_a, range_b):
return a_start <= b_end and b_start <= a_end
except (ValueError, TypeError):
return False



def validate_dns_config(data):
    """
    Collect validation errors for the DNS configuration input.

    No checks are performed here: the only setting, dns_enabled, is a
    boolean whose type is enforced by schema validation, and the cluster
    domain is taken from OIM metadata (domain_name) rather than from this
    input. The function is therefore a placeholder that always reports
    success.

    NOTE(review): the input is described as coming from dns_config.yml;
    confirm that this is the file that actually carries dns_enabled.

    Args:
        data (dict): Parsed DNS configuration. It is not inspected.

    Returns:
        list: Always an empty list, i.e. no validation error messages.
    """
    # Nothing beyond the schema-level type check is enforced yet.
    errors = []
    return errors
8 changes: 8 additions & 0 deletions input/provision_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,11 @@ language: "en_US.UTF-8"
# Default: 86400
# Max: 31536000
default_lease_time: "86400"

#### Optional
# Enable DNS-based hostname resolution for compute nodes.
# When true, nodes use coresmd (CoreDNS + OpenCHAMI SMD plugin) instead of /etc/hosts.
# DNS records are generated automatically from SMD inventory.
# The cluster domain is read from OIM metadata (domain_name).
# Default: false
dns_enabled: false
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,6 @@
delegate_to: localhost
connection: local

- name: Deploy coredhcp template with multi-subnet support
ansible.builtin.copy:
src: "{{ openchami_coredhcp_template }}"
dest: "{{ openchami_coredhcp_target }}"
mode: "{{ file_permissions_644 }}"
delegate_to: localhost
connection: local

- name: Load the openchami configs vars
ansible.builtin.template:
src: "{{ openchami_config_vars_template }}"
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,30 @@ server4:
- dns: {{ coredhcp_dns_server }}
- router: {{ coredhcp_router }}
- netmask: {{ coredhcp_netmask }}
{% if coredhcp_subnets | default([]) | length > 0 %}
# Multi-subnet mode: uses key=value config format (requires coresmd with multi-subnet support)
- coresmd: |
svc_base_uri=https://{{ cluster_name }}.{{ cluster_domain }}:8443
ipxe_base_uri=http://{{ cluster_boot_ip }}:8081
ca_cert=/root_ca/root_ca.crt
cache_valid={{ coredhcp_cache_validity }}
lease_time={{ coredhcp_lease_duration }}
single_port={{ coredhcp_tftp_single_port_mode | lower }}
{% for s in coredhcp_subnets %}
subnet={{ s.cidr }},{{ s.router }}
{% endfor %}
rule=type:Node,hostname:{{ cluster_shortname }}{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}}
rule=type:NodeBMC,hostname:bmc{{'{'}}0{{ coredhcp_nidlength | default(cluster_nidlength | default(3)) }}d{{'}'}}
rule=hostname:unknown-{{'{'}}04d{{'}'}}
- bootloop: |
lease_file=/tmp/coredhcp.db
script_path={{ coredhcp_custom_ipxe }}
lease_time={{ coredhcp_tmp_lease_duration }}
{% for sp in coredhcp_subnet_pools %}
subnet_pool={{ sp.cidr }},{{ sp.start }},{{ sp.end }}
{% endfor %}
{% else %}
# Single-subnet mode: positional argument format compatible with coresmd v0.4.x
- coresmd: https://{{ cluster_name }}.{{ cluster_domain }}:8443 http://{{ cluster_boot_ip }}:8081 /root_ca/root_ca.crt {{ coredhcp_cache_validity }} {{ coredhcp_lease_duration }} {{ coredhcp_tftp_single_port_mode | lower }}
- bootloop: /tmp/coredhcp.db {{ coredhcp_custom_ipxe }} {{ coredhcp_tmp_lease_duration }} {{ coredhcp_dhcp_pool }}
{% endif %}
2 changes: 0 additions & 2 deletions prepare_oim/roles/deploy_containers/openchami/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ openchami_inventory_template: "{{ role_path }}/templates/inventory.yaml.j2"
openchami_inventory_file: "{{ openchami_clone_path }}/dell/podman-quadlets/inventory/01-ochami"
openchami_config_vars_path: "/opt/omnia/openchami/configs_vars.yaml"
openchami_config_vars_template: "{{ role_path }}/templates/configs.yaml.j2"
openchami_coredhcp_template: "{{ role_path }}/templates/coredhcp.yaml.j2"
openchami_coredhcp_target: "{{ openchami_clone_path }}/dell/podman-quadlets/roles/configs/templates/coredhcp/coredhcp.yaml.j2"
openchami_install_fail_msg: "Failed to install OpenCHAMI"
network_spec: "{{ hostvars['localhost']['input_project_dir'] }}/network_spec.yml"
network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,22 @@
content: |
{{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /etc/sysconfig/slurmd
owner: root:root
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,22 @@
content: |
{{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /etc/sysconfig/slurmd
owner: root:root
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,22 @@
{{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
{% endif %}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /etc/sysconfig/slurmd
owner: root:root
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,22 @@
{{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
{% endif %}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /etc/sysconfig/slurmd
owner: root:root
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,32 @@
# Patch: append nameservers after /etc/resolv.conf using Jinja list "dns"
sed -i 's|/etc/resolv.conf|/etc/resolv.conf{% for ns in dns %} {{ ns }}{% endfor %}|' "$cfg"

{% if dns_enabled | default(false) | bool %}
# Forward cluster-internal DNS domain to OIM CoreDNS
# This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS
python3 - "$cfg" << 'PYEOF'
import sys, yaml
cfg_path = sys.argv[1]
with open(cfg_path) as f:
doc = yaml.safe_load(f)
corefile = doc['data']['Corefile']
fwd_block = """{{ domain_name }}:53 {
errors
cache 30
forward . {{ admin_nic_ip }}
}
"""
if '{{ domain_name }}:53' not in corefile:
corefile = fwd_block + corefile
doc['data']['Corefile'] = corefile
with open(cfg_path, 'w') as f:
yaml.dump(doc, f, default_flow_style=False)
print("Added {{ domain_name }} forward zone to K8s CoreDNS")
else:
print("{{ domain_name }} forward zone already present in K8s CoreDNS")
PYEOF
{% endif %}

# Apply the patched ConfigMap
kubectl apply -f "$cfg"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,22 @@
{{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
{% endif %}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /root/init_slurm_db.sql
permissions: '{{ file_mode_600 }}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,22 @@

echo "[INFO] ===== Completed firewall and service configuration (aarch64) ====="

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}

- path: /etc/sysconfig/slurmd
owner: root:root
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,22 @@
{% endif %}
{% endif %}

{% if dns_enabled | default(false) | bool %}
- path: /etc/resolv.conf
owner: root:root
permissions: '0644'
content: |
search {{ domain_name }}
nameserver {{ admin_nic_ip }}
options timeout:1 attempts:2
{% else %}
- path: /etc/hosts
append: true
content: |
{% for key in ip_name_map | sort %}
{{ ip_name_map[key] }} {{ key }}
{% endfor %}
{% endif %}
- path: /etc/sysconfig/slurmd
owner: root:root
permissions: '0644'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@
ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}"
dns: "{{ network_data.admin_network.dns }}"

# Pin dns_enabled to an explicit false only when nothing has defined it yet;
# a value that is already defined is left untouched by the guard below.
- name: Set dns_enabled default when not defined
  when: dns_enabled is not defined
  ansible.builtin.set_fact:
    dns_enabled: false

- name: Initialise variables
ansible.builtin.set_fact:
service_k8s_support: false
Expand Down
33 changes: 18 additions & 15 deletions provision/roles/provision_validations/tasks/update_hosts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
changed_when: true

- name: Remove stale entries for IPs and hostnames that are being updated
ansible.builtin.shell: |
set -o pipefail
grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \
grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp
cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }}
rm -f {{ hosts_file_path }}.tmp
changed_when: true
loop: "{{ read_mapping_file.dict | dict2items }}"
# Keep the hosts file in sync with the node mapping. The whole block is
# bypassed when dns_enabled is true, so this block writes no per-node
# entries in that mode.
- name: Update OIM /etc/hosts (skipped when CoreDNS is enabled)
  when: not (dns_enabled | default(false) | bool)
  block:
    # For every mapping entry, drop lines that start with its ADMIN_IP or end
    # with its HOSTNAME so the append task below cannot create duplicates.
    #
    # errexit (-e) is needed in addition to pipefail: without it the status of
    # a failed grep pipeline is masked by the later cat/rm commands, and the
    # empty or partial temp file would still be copied over the hosts file.
    #
    # grep exits 1 when it selects no lines, which is not an error here, so
    # only exit codes other than 0 and 1 are allowed to abort the script.
    - name: Remove stale entries for IPs and hostnames that are being updated
      ansible.builtin.shell:
        cmd: |
          set -eo pipefail
          { grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} || [ "$?" -eq 1 ]; } | \
            { grep -v '\s{{ item.value.HOSTNAME }}$' || [ "$?" -eq 1 ]; } > {{ hosts_file_path }}.tmp
          cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }}
          rm -f {{ hosts_file_path }}.tmp
        # pipefail is a bash option; do not rely on /bin/sh providing it.
        executable: /bin/bash
      changed_when: true
      loop: "{{ read_mapping_file.dict | dict2items }}"

    # Plain append; the removal task above has already cleared older lines for
    # the same IP or hostname. There is no pipeline here, so pipefail is not
    # needed.
    - name: Add hosts file entry for cluster
      ansible.builtin.shell: |
        echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }}
      changed_when: true
      loop: "{{ read_mapping_file.dict | dict2items }}"
3 changes: 2 additions & 1 deletion provision/roles/slurm_config/tasks/update_hosts_munge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
- name: Edit /etc/hosts file till DNS
- name: Edit /etc/hosts file (skipped when CoreDNS is enabled)
ignore_unreachable: true
delegate_to: "{{ slurmhost_ip }}"
when: not (dns_enabled | default(false) | bool)
block:
- name: Remove deleted nodes if any hostname exists in /etc/hosts
ansible.builtin.lineinfile:
Expand Down
Loading