diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index de29f815fe..b3e950b4f3 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -24,11 +24,6 @@ {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"}, {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"}, {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"}, - {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"}, - {"package": "cuda-run", - "type": "iso", - "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run" - }, { "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0", "type": "tarball", diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index dc0c23452f..852944cb70 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -28,11 +28,6 @@ {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"}, {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"}, {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"}, - {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"}, - {"package": "cuda-run", - "type": "iso", - "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run" - }, { "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", "type": "tarball", diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 1a7c33b55b..1750e50be0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -77,96 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null - exit 1 - fi - - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." 
- - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE - - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." - fi - else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" - fi - - echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." - fi - - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Cleaning up temporary mounts..." 
- umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -246,7 +175,6 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools @@ -265,7 +193,14 @@ - /usr/local/bin/configure_vast_installation.sh - mount -a + +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh + - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} + {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} + # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -397,4 +332,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index fa9cfbee2e..7ee7580733 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -77,96 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null - exit 1 - fi - - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." 
- if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." - - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE - - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." - fi - else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" - fi - - echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." - fi - - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." 
- # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - echo "[INFO] Cleaning up temporary mounts..." - umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -246,7 +175,7 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh @@ -262,10 +191,15 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - /usr/local/bin/configure_vast_installation.sh - - mount -a + +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh + - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount @@ -368,6 +302,7 @@ - systemctl restart sshd - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - mkdir -p /etc/containers/registries.conf.d - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 30a388d7ef..c884c40dc9 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -95,37 +95,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." # Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." else - echo "[INFO] Mounting NFS runfile directory for driver installation..." - mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? 
-ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." - nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -214,7 +198,6 @@ echo "===== NVIDIA GPU setup completed =====" -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh permissions: '0755' content: | @@ -236,12 +219,50 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." + + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." + # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' 
-f1) + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi + else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" + fi - # Check if datacenter-gpu-manager package is installed - if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." + + # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." + fi # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." 
@@ -274,16 +295,17 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." fi echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -305,6 +327,53 @@ content: | {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + 
permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} +{% endif %} + - path: /usr/local/bin/configure_vast_installation.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -321,7 +390,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts /hpc_tools/cuda echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -332,6 +401,7 @@ echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ 
cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -558,10 +628,7 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh @@ -621,6 +688,16 @@ - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh +{% if dcgm_support %} + /usr/local/bin/install_dcgm.sh +{% endif %} + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4756e8f1d3..401108acae 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -96,37 +96,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." # Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." 
else - echo "[INFO] Mounting NFS runfile directory for driver installation..." - mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." - nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -143,12 +127,12 @@ if [ $? -eq 0 ]; then echo "[SUCCESS] CUDA toolkit NFS mount successful" - + # Add to fstab for persistence grep -q "$cuda_nfs_share" /etc/fstab || echo "$cuda_nfs_share /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab - + echo "[INFO] Configuring persistent CUDA environment..." 
- + # System-wide profile for login shells cat > /etc/profile.d/cuda.sh << 'EOF' export PATH=/usr/local/cuda/bin:$PATH @@ -156,7 +140,7 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/profile.d/cuda.sh - + # Bashrc for non-login shells cat > /etc/bashrc.cuda << 'EOF' if [ -d "/usr/local/cuda/bin" ]; then @@ -166,7 +150,7 @@ fi EOF grep -q "bashrc.cuda" /etc/bashrc || echo "source /etc/bashrc.cuda" >> /etc/bashrc - + # Slurm prolog for job environment mkdir -p /etc/slurm/prolog.d cat > /etc/slurm/prolog.d/cuda.sh << 'EOF' @@ -176,12 +160,12 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/slurm/prolog.d/cuda.sh - + # Apply immediately for current session export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export CUDA_HOME=/usr/local/cuda - + echo "[SUCCESS] Persistent CUDA environment configured" else echo "[ERROR] Failed to mount CUDA toolkit NFS share" @@ -215,8 +199,6 @@ echo "===== NVIDIA GPU setup completed =====" - -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh permissions: '0755' content: | @@ -238,12 +220,50 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." + + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." + # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' 
-f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi + else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" + fi - # Check if datacenter-gpu-manager package is installed - if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." + + # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." + fi # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." @@ -276,16 +296,17 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." 
+ echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." fi echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -308,6 +329,52 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: 
'{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} +{% endif %} + - path: /etc/hosts append: true content: | @@ -349,6 +416,7 @@ echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/ /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -562,10 +630,7 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh @@ -628,6 +693,16 @@ {% endif %} - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh +{% if dcgm_support %} + /usr/local/bin/install_dcgm.sh +{% endif %} + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." 
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2
new file mode 100644
index 0000000000..c037204a28
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Distributed lock manager for CUDA toolkit install on shared NFS.
+# Backed by atomic mkdir on /hpc_tools/cuda/.nfs_lock_cuda.
+# Exposes: acquire | release | wait | is_stale
+set -euo pipefail
+
+LOCK_ROOT="/hpc_tools/cuda"
+LOCK_DIR="$LOCK_ROOT/.nfs_lock_cuda"
+OWNER_FILE="$LOCK_DIR/owner.txt"
+DONE_FILE="$LOCK_ROOT/.done_cuda"
+STATUS_LOG="$LOCK_ROOT/.cuda_install_status.log"
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+INSTALL_TIMEOUT="${INSTALL_TIMEOUT:-1800}"
+POLL_INTERVAL="${POLL_INTERVAL:-5}"
+TAKEOVER_MIN="${TAKEOVER_MIN:-5}"
+TAKEOVER_MAX="${TAKEOVER_MAX:-15}"
+GLOBAL_WAIT_TIMEOUT="${GLOBAL_WAIT_TIMEOUT:-$((INSTALL_TIMEOUT * 2))}"
+
+log_status() {
+    # Record layout: ts host hostname role result (ts is "date time", i.e. two fields)
+    printf '%s %s %s %s %s\n' \
+        "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+        "$(cat "$HOSTNAME_FILE" 2>/dev/null || echo UNKNOWN)" \
+        "$1" "$2" >> "$STATUS_LOG"
+}
+
+acquire() {
+    # Fast path: already done
+    [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 2; }
+    if mkdir "$LOCK_DIR" 2>/dev/null; then
+        # Fall back to `hostname` so the lock never ends up without an owner record.
+        { cat "$HOSTNAME_FILE" 2>/dev/null || hostname; } > "$OWNER_FILE"
+        log_status installer lock_acquired
+        return 0 # we are installer
+    fi
+    return 1 # we are waiter
+}
+
+release() { rm -rf "$LOCK_DIR"; }
+
+is_stale() {
+    # Map the lock owner to a pingable host through the status log, then ping it.
+    # The timestamp contains a space, so awk sees: $1 date, $2 time, $3 short host,
+    # $4 owner identity (HOSTNAME_FILE content), $5 role, $6 result.
+    local owner_hostname host
+    owner_hostname=$(cat "$OWNER_FILE" 2>/dev/null || echo "")
+    [ -z "$owner_hostname" ] && return 1
+    host=$(awk -v h="$owner_hostname" '$4==h {print $3; exit}' "$STATUS_LOG" 2>/dev/null || true)
+    [ -z "$host" ] && return 1
+    ping -c1 -W2 "$host" >/dev/null 2>&1 && return 1
+    return 0 # host unreachable → stale
+}
+
+wait_for_done_or_takeover() {
+    local started; started=$(date +%s)
+    while true; do
+        [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 0; }
+        if [ ! -d "$LOCK_DIR" ]; then
+            sleep $(( RANDOM % (TAKEOVER_MAX - TAKEOVER_MIN + 1) + TAKEOVER_MIN ))
+            return 10 # caller should retry acquire
+        fi
+        if is_stale; then
+            log_status waiter crash_detected
+            release
+            continue
+        fi
+        (( $(date +%s) - started > GLOBAL_WAIT_TIMEOUT )) && {
+            log_status timeout_waiter fail; return 1;
+        }
+        sleep "$POLL_INTERVAL"
+    done
+}
+
+case "${1:-}" in
+    acquire) acquire ;;
+    release) release ;;
+    wait) wait_for_done_or_takeover ;;
+    is_stale) is_stale ;;
+    *) echo "usage: $0 {acquire|release|wait|is_stale}" >&2; exit 64 ;;
+esac
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2
new file mode 100644
index 0000000000..be8fb867b3
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Generate hostname for lock ownership identity.
+# Idempotent: uses hostname directly.
+set -euo pipefail
+
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+hostname > "$HOSTNAME_FILE"
+
+echo "[INFO] CUDA install hostname for this node: $(cat "$HOSTNAME_FILE")"
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2
new file mode 100644
index 0000000000..ba2cde8f3d
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Local NVIDIA driver install. Always runs on Slurm nodes. Idempotent.
+# Never touches NFS lock artifacts. Never touches /hpc_tools/cuda contents.
+set -euo pipefail
+
+LOGFILE="/var/log/nvidia_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== NVIDIA driver install ====="
+
+if ! 
lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected. Exiting."
+    exit 0
+fi
+
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "[INFO] NVIDIA driver already installed. Skipping."
+else
+    echo "[INFO] Installing NVIDIA driver via dnf..."
+    dnf install -y cuda-drivers
+    command -v nvidia-smi >/dev/null 2>&1 || { echo "[ERROR] Driver install failed."; exit 1; }
+fi
+
+nvidia-smi -pm 1 || true
+
+# Mount shared toolkit at /usr/local/cuda (harmless if already mounted)
+mkdir -p /usr/local/cuda
+CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+if ! mountpoint -q /usr/local/cuda; then
+    mount -t nfs "$CUDA_NFS" /usr/local/cuda || true
+fi
+grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+    echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+
+cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/usr/local/cuda
+EOF
+chmod +x /etc/profile.d/cuda.sh
+
+echo "===== NVIDIA driver install completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
new file mode 100644
index 0000000000..471c3be291
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
@@ -0,0 +1,168 @@
+#!/bin/bash
+# Lock-aware CUDA toolkit installer. Publishes to /hpc_tools/cuda on NFS.
+# Exits 0 if toolkit is already present (.done_cuda), if this node installed it,
+# or if another node holds the lock (this node does not wait). Exits 1 on failure.
+set -euo pipefail
+
+LOGFILE="/var/log/cuda_toolkit_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+LOCK_ROOT="/hpc_tools/cuda"
+DONE_FILE="$LOCK_ROOT/.done_cuda"
+LOCK_MGR="/usr/local/bin/cuda_lock_manager.sh"
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+# Function to set up CUDA environment variables
+setup_cuda_env() {
+    echo "[INFO] Setting up CUDA environment variables for shared location..."
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/hpc_tools/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/hpc_tools/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/hpc_tools/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured successfully"
+}
+
+# Generate hostname for lock ownership (idempotent)
+/usr/local/bin/generate_install_uuid.sh
+
+# Fast-path: already done
+[ -f "$DONE_FILE" ] && {
+    echo "[INFO] CUDA toolkit already installed on shared storage by another node."
+    echo "[INFO] This node will use the existing CUDA installation."
+    setup_cuda_env
+    echo "[INFO] CUDA environment configured successfully."
+    exit 0
+}
+
+# Check if running in manual mode (not cloud-init)
+MANUAL_MODE="${CUDA_INSTALL_MANUAL:-false}"
+if [ "$MANUAL_MODE" = "true" ]; then
+    echo "[INFO] Running in manual mode - will force acquire lock if held"
+    FORCE_LOCK=true
+else
+    echo "[INFO] Running in cloud-init mode - will proceed without waiting if lock held"
+    FORCE_LOCK=false
+fi
+
+# Attempt lock acquisition
+set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+
+# In manual mode, if lock is held, release it explicitly then acquire again
+if [ "$FORCE_LOCK" = "true" ] && [ "$rc" = "1" ]; then
+    echo "[WARN] Lock is held by another node. In manual mode, releasing lock first..."
+    "$LOCK_MGR" release
+    echo "[INFO] Lock released. Now acquiring lock..."
+    set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+fi
+case $rc in
+    0) # installer
+        echo "[INFO] Acquired lock. Installing toolkit..."
+        mkdir -p /shared-cuda-toolkit
+        # Test mount in the `if` itself: under `set -e` a bare failing mount would
+        # abort the script before the lock could be released.
+        if ! mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit; then
+            echo "[ERROR] Failed to mount NFS cuda share."
+            echo "[ERROR] CUDA toolkit installation failed on this node."
+            echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+            "$LOCK_MGR" release
+            exit 1
+        fi
+
+        # Check if CUDA toolkit is already installed on NFS
+        if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then
+            echo "[INFO] CUDA toolkit already installed on NFS. Exiting."
+            "$LOCK_MGR" release
+            exit 0
+        fi
+
+        # Install CUDA toolkit to local location using dnf
+        echo "[INFO] Installing CUDA toolkit to local location using dnf..."
+        mkdir -p /cuda
+        # Copy host repository configuration to installroot (only for manual mode)
+        if [ "$MANUAL_MODE" = "true" ]; then
+            mkdir -p /cuda/etc/yum.repos.d
+            cp -r /etc/yum.repos.d/* /cuda/etc/yum.repos.d/ 2>/dev/null || true
+        fi
+        if timeout "${INSTALL_TIMEOUT:-1800}" dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit; then
+            echo "[SUCCESS] CUDA toolkit installed successfully."
+
+            # Clean up repository configuration from installroot (if copied for manual mode)
+            if [ "$MANUAL_MODE" = "true" ]; then
+                rm -rf /cuda/etc/yum.repos.d
+            fi
+
+            # Copy CUDA toolkit to shared location
+            echo "[INFO] Copying CUDA toolkit to shared location..."
+            CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1)
+            if [ -z "$CUDA_SRC_DIR" ]; then
+                echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/"
+                echo "[ERROR] CUDA toolkit installation failed on this node."
+                echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+                "$LOCK_MGR" release
+                exit 1
+            fi
+
+            echo "[INFO] Found CUDA at: $CUDA_SRC_DIR"
+            echo "[INFO] Copying contents directly to /shared-cuda-toolkit..."
+            cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true
+
+            # Verify CUDA toolkit installation
+            echo "[INFO] Verifying CUDA toolkit installation..."
+            if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then
+                echo "[SUCCESS] CUDA toolkit verified."
+            else
+                echo "[ERROR] CUDA toolkit (nvcc) not found after installation."
+                echo "[ERROR] CUDA toolkit installation failed on this node."
+                echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+                "$LOCK_MGR" release
+                exit 1
+            fi
+
+            # Atomic publish of .done_cuda (see §4.4). Never use `touch`.
+            TMP="$LOCK_ROOT/.done_cuda.tmp.$(cat $HOSTNAME_FILE)"
+            printf 'installed_by=%s\nts=%s\n' \
+                "$(hostname -s)" "$(date -Iseconds)" > "$TMP"
+            sync -f "$TMP" 2>/dev/null || sync
+            mv -f -- "$TMP" "$DONE_FILE"
+            "$LOCK_MGR" release
+            # log pass
+            printf '%s %s %s installer pass\n' \
+                "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+                "$(cat $HOSTNAME_FILE)" \
+                >> "$LOCK_ROOT/.cuda_install_status.log"
+
+            setup_cuda_env
+
+            # Best effort: a failed unmount must not fail an otherwise successful install.
+            umount /shared-cuda-toolkit 2>/dev/null || true
+            exit 0
+        else
+            result=$?
+            "$LOCK_MGR" release
+            [ "$result" = "124" ] && st="timeout_killed" || st="fail"
+            printf '%s %s %s installer %s\n' \
+                "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+                "$(cat $HOSTNAME_FILE)" "$st" \
+                >> "$LOCK_ROOT/.cuda_install_status.log"
+            echo "[ERROR] CUDA toolkit installation failed on this node."
+            echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+            exit 1
+        fi
+        ;;
+    1) # waiter - another node is installing
+        echo "[INFO] Another node is installing CUDA toolkit. Proceeding with cloud-init without waiting."
+        echo "[INFO] This node will use the shared CUDA toolkit once installation completes."
+        setup_cuda_env
+        echo "[INFO] CUDA environment configured (will work once installation completes)"
+
+        exit 0
+        ;;
+    2) # already done
+        echo "[INFO] CUDA toolkit already installed on shared storage."
+        setup_cuda_env
+        exit 0
+        ;;
+    *) echo "[ERROR] acquire rc=$rc"; exit 1 ;;
+esac
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2
new file mode 100644
index 0000000000..158e089805
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2
@@ -0,0 +1,98 @@
+#!/bin/bash
+LOGFILE="/var/log/dcgm_setup.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA DCGM setup ====="
+
+# GPU detection gate - DCGM requires NVIDIA GPU hardware
+if ! lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping DCGM setup."
+    exit 0
+fi
+
+# CUDA prerequisite gate
+echo "[INFO] Validating NVIDIA driver prerequisite..."
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping DCGM setup."
+    exit 0
+fi
+
+if ! nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping DCGM setup."
+    exit 0
+fi
+echo "[INFO] NVIDIA driver prerequisite satisfied."
+
+# Display nvidia-smi output for verification
+echo "========== NVIDIA Driver & GPU Information =========="
+nvidia-smi 2>&1
+echo "====================================================="
+
+# Detect CUDA major version for DCGM package selection
+echo "[INFO] Detecting CUDA version for DCGM package compatibility..."
+# Try to get CUDA version from nvidia-smi
+CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1)
+
+# Fallback: Try to get CUDA version from nvcc if available
+if [ -z "$CUDA_VERSION" ]; then
+    if command -v nvcc &>/dev/null; then
+        CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1)
+        echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION"
+    else
+        echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc."
+        echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup."
+        exit 1
+    fi
+else
+    echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION"
+fi
+
+# Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies
+echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..."
+if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then
+    echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup."
+    exit 1
+fi
+echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully."
+
+# Enable and start DCGM daemon (SB-003)
+echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..."
+systemctl enable {{ dcgm_service_name }}
+
+RETRIES={{ dcgm_health_check_retries }}
+ATTEMPT=0
+DCGM_STARTED=false
+
+while [ $ATTEMPT -lt $RETRIES ]; do
+    ATTEMPT=$((ATTEMPT + 1))
+    echo "[INFO] Starting {{ dcgm_service_name }} (attempt $ATTEMPT/$RETRIES)..."
+    systemctl start {{ dcgm_service_name }}
+    sleep 3
+
+    if systemctl is-active --quiet {{ dcgm_service_name }}; then
+        DCGM_STARTED=true
+        echo "[SUCCESS] {{ dcgm_service_name }}.service is active."
+        break
+    else
+        echo "[WARN] {{ dcgm_service_name }} failed to start on attempt $ATTEMPT."
+    fi
+done
+
+if [ "$DCGM_STARTED" != "true" ]; then
+    echo "[ERROR] {{ dcgm_service_name }} failed to start after $RETRIES attempts. Service will stay down (BL-002)."
+    exit 1
+fi
+
+# GPU discovery (SB-004): take the count from the "N GPU(s) found." summary line, not from every line mentioning "GPU"
+echo "[INFO] Enumerating GPUs via dcgmi discovery..."
+if command -v dcgmi &>/dev/null; then
+    echo "========== GPU Discovery Output =========="
+    dcgmi discovery -l 2>&1
+    GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | awk '/GPUs? found/ {print $1; found=1; exit} END {if (!found) print 0}')
+    echo "=========================================="
+    echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)."
+else
+    echo "[WARN] dcgmi command not found. Skipping GPU enumeration."
+fi
+
+echo "===== NVIDIA DCGM setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
new file mode 100644
index 0000000000..4a51c179ae
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
@@ -0,0 +1,141 @@
+#!/bin/bash
+# NVIDIA Peer Memory (nvidia-peermem) DKMS installation for GPUDirect RDMA support.
+# SHALL be installed on all compute nodes where GPU hardware is detected.
+# Required on RDMA-capable GPU nodes only.
+# Idempotent: skips installation if module is already loaded.
+set -euo pipefail
+
+LOGFILE="/var/log/nvidia_peermem_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA Peer Memory (nvidia-peermem) setup ====="
+
+# GPU detection gate - only proceed if NVIDIA GPU is present
+echo "[INFO] Checking for NVIDIA GPU hardware..."
+if ! lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping nvidia-peermem installation."
+    exit 0
+fi
+
+# NVIDIA driver prerequisite gate
+echo "[INFO] Validating NVIDIA driver prerequisite..."
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping nvidia-peermem."
+    exit 0
+fi
+
+if ! nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping nvidia-peermem."
+    exit 0
+fi
+echo "[INFO] NVIDIA driver prerequisite satisfied."
+
+# Check if nvidia-peermem module is already loaded
+echo "[INFO] Checking if nvidia-peermem module is already loaded..."
+if lsmod | grep -qE 'nv_peer_mem|nvidia_peermem'; then
+    echo "[INFO] nvidia-peermem module is already loaded. Skipping installation."
+    # Verify module metadata
+    if modinfo nvidia-peermem &>/dev/null; then
+        echo "[INFO] nvidia-peermem module metadata verified."
+    else
+        echo "[WARN] nvidia-peermem module loaded but modinfo failed. This may indicate a corrupted module."
+    fi
+    exit 0
+fi
+
+# Check running kernel
+KERNEL_VERSION=$(uname -r)
+echo "[INFO] Running kernel version: $KERNEL_VERSION"
+
+# Check if kernel headers are available (required for DKMS)
+if [ ! -d "/lib/modules/$KERNEL_VERSION/build" ]; then
+    echo "[ERROR] Kernel headers not found for kernel $KERNEL_VERSION."
+    echo "[ERROR] Required for DKMS build. Please install kernel-devel package."
+    exit 1
+fi
+
+# Get NVIDIA driver version from nvidia-smi
+NVIDIA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | tr -d ' ')
+if [ -z "$NVIDIA_VERSION" ]; then
+    echo "[ERROR] Could not determine NVIDIA driver version from nvidia-smi."
+    exit 1
+fi
+echo "[INFO] NVIDIA driver version: $NVIDIA_VERSION"
+
+# Check current DKMS status
+echo "[INFO] Checking current DKMS status..."
+dkms status || true
+
+# Add NVIDIA driver to DKMS if not already added
+if ! dkms status | grep -q "nvidia/$NVIDIA_VERSION"; then
+    echo "[INFO] Adding NVIDIA driver $NVIDIA_VERSION to DKMS..."
+    if ! dkms add -m nvidia -v "$NVIDIA_VERSION"; then
+        echo "[ERROR] Failed to add NVIDIA driver to DKMS."
+        exit 1
+    fi
+    echo "[INFO] NVIDIA driver added to DKMS successfully."
+else
+    echo "[INFO] NVIDIA driver $NVIDIA_VERSION already in DKMS."
+fi
+
+# Build NVIDIA module for the running kernel
+echo "[INFO] Building NVIDIA module for kernel $KERNEL_VERSION..."
+if ! dkms build -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then
+    echo "[ERROR] Failed to build NVIDIA module for kernel $KERNEL_VERSION."
+    echo "[ERROR] Check kernel logs for build errors."
+    exit 1
+fi
+echo "[INFO] NVIDIA module built successfully."
+
+# Install the built module
+echo "[INFO] Installing NVIDIA module for kernel $KERNEL_VERSION..."
+if ! dkms install -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then
+    echo "[ERROR] Failed to install NVIDIA module for kernel $KERNEL_VERSION."
+    exit 1
+fi
+echo "[INFO] NVIDIA module installed successfully."
+
+# Verify nvidia-peermem module metadata
+echo "[INFO] Verifying nvidia-peermem module metadata..."
+if modinfo nvidia-peermem &>/dev/null; then
+    echo "[INFO] nvidia-peermem module metadata verified."
+    modinfo nvidia-peermem
+else
+    echo "[ERROR] nvidia-peermem module metadata not found after DKMS install."
+    echo "[ERROR] This may indicate the module was not built or installed correctly."
+    exit 1
+fi
+
+# Ensure base NVIDIA modules are loaded first
+echo "Loading base NVIDIA modules..."
+modprobe nvidia 2>/dev/null || echo "nvidia module not available or failed to load"
+modprobe nvidia-uvm 2>/dev/null || echo "nvidia-uvm module not available or failed to load"
+modprobe nvidia-modeset 2>/dev/null || echo "nvidia-modeset module not available or failed to load"
+modprobe nvidia-drm 2>/dev/null || echo "nvidia-drm module not available or failed to load"
+
+# Load the nvidia-peermem module
+echo "[INFO] Loading nvidia-peermem module..."
+if modprobe nvidia-peermem; then
+    echo "[SUCCESS] nvidia-peermem module loaded successfully."
+else
+    echo "[WARN] Failed to load nvidia-peermem module with modprobe."
+    echo "[WARN] This may not be critical if RDMA is not required on this node."
+    echo "[WARN] Check kernel logs for detailed error information."
+    dmesg | grep -i peermem || true
+    # Continue with warning unless RDMA dependency exists
+    # (RDMA dependency check would be environment-specific)
+fi
+
+# Confirm module is loaded
+if lsmod | grep -q nvidia_peermem; then
+    echo "[SUCCESS] nvidia_peermem is loaded in kernel."
+else
+    echo "[WARN] nvidia_peermem not found in lsmod output."
+    echo "[WARN] Module may have failed to load or may not be required for this configuration."
+fi
+
+# Check kernel logs for peer memory messages or errors
+echo "[INFO] Checking kernel logs for peer memory messages..."
+dmesg | grep -i peermem || echo "[INFO] No peermem messages found in recent kernel logs."
+
+echo "===== NVIDIA Peer Memory (nvidia-peermem) setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
new file mode 100644
index 0000000000..79d72db10b
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Slurm-node entry point. Decides toolkit path based on login_compiler_node_present.
+# GPU detection gate ensures CUDA operations only run on nodes with NVIDIA hardware.
+
+set -euo pipefail
+
+LOGIN_COMPILER_PRESENT="{{ login_compiler_node_present | lower }}"
+SLURM_NODE_PRESENT="{{ slurm_node_present | lower }}"
+
+[ "$SLURM_NODE_PRESENT" = "true" ] || { echo "[INFO] Not a Slurm node."; exit 0; }
+
+# GPU detection gate - if no GPU present, skip CUDA toolkit and driver installation
+if ! lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping CUDA toolkit and driver installation."
+    exit 0
+fi
+
+if [ "$LOGIN_COMPILER_PRESENT" = "true" ]; then
+    echo "[INFO] Login/compiler nodes present → mounting shared toolkit from NFS."
+    # Mount shared toolkit at /usr/local/cuda
+    mkdir -p /usr/local/cuda
+    CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+    if ! mountpoint -q /usr/local/cuda; then
+        mount -t nfs "$CUDA_NFS" /usr/local/cuda || true
+    fi
+    grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+        echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+    # Export CUDA environment variables
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/usr/local/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured from shared NFS toolkit."
+else
+    echo "[INFO] No login/compiler nodes → participating in lock."
+ # install_cuda_toolkit.sh is lock-aware: + # - if this node wins the lock, it runs the install and publishes .done_cuda + # - if this node loses the lock, it returns immediately without waiting + if ! /usr/local/bin/install_cuda_toolkit.sh; then + echo "[ERROR] install_cuda_toolkit.sh returned non-zero." + exit 1 + fi + echo "[INFO] CUDA toolkit installation handled by another node or completed by this node." + echo "[INFO] Proceeding with driver, DCGM, and nvidia-peermem installation." +fi + +echo "[SUCCESS] CUDA coordinator completed." +exit 0 diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 9be62ddcbe..8de6ceb69a 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -102,15 +102,14 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" -# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role) -cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}" -cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}" - +# Login/compiler node presence flag (set by slurm_config role) +login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}" +slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}" # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 +dcgm_support: "{{ hostvars['localhost'].get('telemetry_sources', {}).get('dcgm', {}).get('metrics_enabled', true) | bool }}" # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 
-dcgm_support: "{{ telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) }}" # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" diff --git a/provision/roles/slurm_config/tasks/create_slurm_dir.yml b/provision/roles/slurm_config/tasks/create_slurm_dir.yml index b68bcbbded..a89b33aeb3 100644 --- a/provision/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/provision/roles/slurm_config/tasks/create_slurm_dir.yml @@ -30,22 +30,6 @@ name: slurm_custom_aarch64 failed_when: false -- name: Extract CUDA runfile name for x86_64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_x86_64 is defined - - slurm_custom_x86_64.slurm_node is defined - - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Extract CUDA runfile name for aarch64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_aarch64 is defined - - slurm_custom_aarch64.slurm_node is defined - - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" diff --git a/provision/roles/slurm_config/tasks/hpc_tools.yml b/provision/roles/slurm_config/tasks/hpc_tools.yml index 46260da267..37a2c166d7 100644 --- a/provision/roles/slurm_config/tasks/hpc_tools.yml +++ b/provision/roles/slurm_config/tasks/hpc_tools.yml @@ -22,7 +22,6 @@ mode: "{{ common_mode }}" loop: - cuda - - runfile - scripts - container_images - nvidia_sdk diff --git a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml 
b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml index c61e8d92a9..5b99d35c30 100644 --- a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -92,3 +92,11 @@ ansible.builtin.set_fact: controller_ip: "{{ ip_name_map[ctld_list | first] }}" when: ctld_list | length > 0 + +- name: Set login_compiler_node_present flag + ansible.builtin.set_fact: + login_compiler_node_present: "{{ compiler_login_list | length > 0 }}" + +- name: Set slurm_node_present flag + ansible.builtin.set_fact: + slurm_node_present: "{{ cmpt_list | length > 0 }}" diff --git a/provision/roles/slurm_config/vars/main.yml b/provision/roles/slurm_config/vars/main.yml index 580d776d92..d2f60542ed 100644 --- a/provision/roles/slurm_config/vars/main.yml +++ b/provision/roles/slurm_config/vars/main.yml @@ -171,16 +171,6 @@ parallel_copy_max_workers: 4 parallel_copy_candidates: - # CUDA Runfile (aarch64 repo path) - - name: cuda_runfile_aarch64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - - # CUDA Runfile (x86_64 repo path) - - name: cuda_runfile_x86_64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - # NVIDIA HPC SDK (x86_64 tarball extracted dir) - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"