diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json
index de29f815fe..b3e950b4f3 100644
--- a/input/config/aarch64/rhel/10.0/slurm_custom.json
+++ b/input/config/aarch64/rhel/10.0/slurm_custom.json
@@ -24,11 +24,6 @@
             {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"},
             {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"},
             {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"},
-            {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"},
-            {"package": "cuda-run",
-             "type": "iso",
-             "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run"
-            },
             {
             "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0",
             "type": "tarball",
diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json
index dc0c23452f..852944cb70 100644
--- a/input/config/x86_64/rhel/10.0/slurm_custom.json
+++ b/input/config/x86_64/rhel/10.0/slurm_custom.json
@@ -28,11 +28,6 @@
             {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"},
             {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"},
             {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"},
-            {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"},
-            {"package": "cuda-run",
-             "type": "iso",
-             "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"
-            },
             {
             "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0",
             "type": "tarball",
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index 1a7c33b55b..1750e50be0 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -77,96 +77,25 @@
                 IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa
                 IdentitiesOnly yes
 
-        - path: /usr/local/bin/install_cuda_toolkit.sh
-          permissions: '0755'
+{% if login_compiler_node_present %}
+        - path: /usr/local/bin/generate_install_uuid.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
           content: |
-            #!/bin/bash
-            LOGFILE="/var/log/cuda_toolkit_install.log"
-            exec > >(tee -a "$LOGFILE") 2>&1
-
-            echo "===== Starting CUDA Toolkit installation ====="
-
-            # Check if CUDA toolkit is already installed
-            if command -v nvcc &>/dev/null; then
-                CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//')
-                echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting."
-                exit 0
-            fi
-
-            echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..."
-            mkdir -p /cuda-runfile
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile
-
-            if [ $? -ne 0 ]; then
-                echo "[ERROR] Failed to mount NFS runfile share. Exiting."
-                exit 1
-            fi
-
-            echo "[INFO] Setting up shared CUDA directory..."
-            # Create and mount shared directory for compute nodes
-            mkdir -p /shared-cuda-toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
-
-            if [ $? -ne 0 ]; then
-                echo "[ERROR] Failed to mount NFS cuda share. Exiting."
-                umount /cuda-runfile 2>/dev/null
-                exit 1
-            fi
-
-            echo "[INFO] Installing CUDA toolkit directly to shared NFS location..."
-            if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then
-                mkdir -p /shared-cuda-toolkit/tmp
-                # Install toolkit directly to the NFS-mounted shared location
-                bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override
-
-                if [ $? -eq 0 ]; then
-                    echo "[SUCCESS] CUDA toolkit installed successfully to shared location."
-
-                    # Set up environment variables pointing to shared location
-                    cat > /etc/profile.d/cuda.sh << 'ENDOFFILE'
-            export PATH=/shared-cuda-toolkit/bin:$PATH
-            export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH
-            export CUDA_HOME=/shared-cuda-toolkit
-            ENDOFFILE
-
-                    # Apply environment variables for current session
-                    export PATH=/shared-cuda-toolkit/bin:$PATH
-                    export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH
-                    export CUDA_HOME=/shared-cuda-toolkit
-
-                    echo "[INFO] CUDA environment configured"
-                else
-                    echo "[ERROR] CUDA toolkit installation failed."
-                fi
-            else
-                echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/"
-            fi
-
-            echo "[INFO] Verifying CUDA toolkit installation..."
-            if command -v nvcc &>/dev/null; then
-                CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//')
-                echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION"
-                echo "[INFO] CUDA installation path: $(which nvcc)"
-            else
-                echo "[ERROR] CUDA toolkit (nvcc) not found after installation."
-            fi
-
-            echo "[INFO] Setting up shared CUDA directory for compute nodes..."
-            # Create shared directory for compute nodes to mount
-            mkdir -p /shared-cuda-toolkit
-            # Mount the shared NFS location where compute nodes will access the toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
+            {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }}
 
-            echo "[INFO] Copying CUDA toolkit to shared location..."
-            # Copy the installed CUDA toolkit to the shared location for compute nodes
-            #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/'
-            cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true
-
-            echo "[INFO] Cleaning up temporary mounts..."
-            umount /cuda-runfile 2>/dev/null
-            rmdir /cuda-runfile 2>/dev/null
+        - path: /usr/local/bin/cuda_lock_manager.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }}
 
-            echo "===== CUDA Toolkit installation completed ====="
+        - path: /usr/local/bin/install_cuda_toolkit.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }}
+{% endif %}
 
 {% if hostvars['localhost']['openldap_support'] %}
         - path: /etc/sssd/sssd.conf
@@ -246,7 +175,6 @@
 
       runcmd:
         - /usr/local/bin/set-ssh.sh
-        - /usr/local/bin/install_cuda_toolkit.sh
         # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
         - mkdir -p {{ client_mount_path }}/slurm/ssh
         - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
@@ -265,7 +193,14 @@
         - /usr/local/bin/configure_vast_installation.sh
         - mount -a
 
+
+{% if login_compiler_node_present %}
+        - /usr/local/bin/generate_install_uuid.sh
+        - /usr/local/bin/install_cuda_toolkit.sh
+{% endif %}
+
 {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %}
+
         # Add NFS entry and mount
         - mkdir -p {{ client_mount_path }}
         - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -397,4 +332,4 @@
         # nvidia sdk install
         - /usr/local/bin/install_nvhpc_sdk.sh
         - /usr/local/bin/configure_nvhpc_env.sh
-        - echo "Cloud-Init has completed successfully."
+        - echo "Cloud-Init has completed successfully."
\ No newline at end of file
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index fa9cfbee2e..7ee7580733 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -77,96 +77,25 @@
                 IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa
                 IdentitiesOnly yes
 
-        - path: /usr/local/bin/install_cuda_toolkit.sh
-          permissions: '0755'
+{% if login_compiler_node_present %}
+        - path: /usr/local/bin/generate_install_uuid.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
           content: |
-            #!/bin/bash
-            LOGFILE="/var/log/cuda_toolkit_install.log"
-            exec > >(tee -a "$LOGFILE") 2>&1
-
-            echo "===== Starting CUDA Toolkit installation ====="
-
-            # Check if CUDA toolkit is already installed
-            if command -v nvcc &>/dev/null; then
-                CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//')
-                echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting."
-                exit 0
-            fi
-
-            echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..."
-            mkdir -p /cuda-runfile
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile
-
-            if [ $? -ne 0 ]; then
-                echo "[ERROR] Failed to mount NFS runfile share. Exiting."
-                exit 1
-            fi
-
-            echo "[INFO] Setting up shared CUDA directory..."
-            # Create and mount shared directory for compute nodes
-            mkdir -p /shared-cuda-toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
-
-            if [ $? -ne 0 ]; then
-                echo "[ERROR] Failed to mount NFS cuda share. Exiting."
-                umount /cuda-runfile 2>/dev/null
-                exit 1
-            fi
-
-            echo "[INFO] Installing CUDA toolkit directly to shared NFS location..."
-            if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then
-                mkdir -p /shared-cuda-toolkit/tmp
-                # Install toolkit directly to the NFS-mounted shared location
-                bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override
-
-                if [ $? -eq 0 ]; then
-                    echo "[SUCCESS] CUDA toolkit installed successfully to shared location."
-
-                    # Set up environment variables pointing to shared location
-                    cat > /etc/profile.d/cuda.sh << 'ENDOFFILE'
-            export PATH=/shared-cuda-toolkit/bin:$PATH
-            export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH
-            export CUDA_HOME=/shared-cuda-toolkit
-            ENDOFFILE
-
-                    # Apply environment variables for current session
-                    export PATH=/shared-cuda-toolkit/bin:$PATH
-                    export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH
-                    export CUDA_HOME=/shared-cuda-toolkit
-
-                    echo "[INFO] CUDA environment configured"
-                else
-                    echo "[ERROR] CUDA toolkit installation failed."
-                fi
-            else
-                echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/"
-            fi
-
-            echo "[INFO] Verifying CUDA toolkit installation..."
-            if command -v nvcc &>/dev/null; then
-                CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//')
-                echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION"
-                echo "[INFO] CUDA installation path: $(which nvcc)"
-            else
-                echo "[ERROR] CUDA toolkit (nvcc) not found after installation."
-            fi
-
-            echo "[INFO] Setting up shared CUDA directory for compute nodes..."
-            # Create shared directory for compute nodes to mount
-            mkdir -p /shared-cuda-toolkit
-            # Mount the shared NFS location where compute nodes will access the toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
-
-            echo "[INFO] Copying CUDA toolkit to shared location..."
-            # Copy the installed CUDA toolkit to the shared location for compute nodes
-            #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/'
-            cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true
+            {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }}
 
-            echo "[INFO] Cleaning up temporary mounts..."
-            umount /cuda-runfile 2>/dev/null
-            rmdir /cuda-runfile 2>/dev/null
+        - path: /usr/local/bin/cuda_lock_manager.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }}
 
-            echo "===== CUDA Toolkit installation completed ====="
+        - path: /usr/local/bin/install_cuda_toolkit.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }}
+{% endif %}
 
 {% if hostvars['localhost']['openldap_support'] %}
         - path: /etc/sssd/sssd.conf
@@ -246,7 +175,7 @@
 
       runcmd:
         - /usr/local/bin/set-ssh.sh
-        - /usr/local/bin/install_cuda_toolkit.sh
+
 
         # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
         - mkdir -p {{ client_mount_path }}/slurm/ssh 
@@ -262,10 +191,15 @@
         - echo "{{ cloud_init_nfs_path }}/packages  /var/lib/packages   nfs defaults,_netdev 0 0" >> /etc/fstab
 
         - chmod {{ file_mode }} /etc/fstab
+        - mount -a
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - /usr/local/bin/configure_vast_installation.sh
-        - mount -a
+
+{% if login_compiler_node_present %}
+        - /usr/local/bin/generate_install_uuid.sh
+        - /usr/local/bin/install_cuda_toolkit.sh
+{% endif %}
 
 {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %}
         # Add NFS entry and mount
@@ -368,6 +302,7 @@
         - systemctl restart sshd
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
+
         - mkdir -p /etc/containers/registries.conf.d
         - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf
 
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index 30a388d7ef..c884c40dc9 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -95,37 +95,21 @@
                 exit 0
             fi
 
-            echo "[INFO] NVIDIA GPU detected. Proceeding with setup."
+            echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation."
 
             # Check if NVIDIA driver is already installed
             if command -v nvidia-smi &>/dev/null; then
                 echo "[INFO] NVIDIA driver already installed. Skipping driver installation."
             else
-                echo "[INFO] Mounting NFS runfile directory for driver installation..."
-                mkdir -p /gpu-runfile
-                mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile
-
-                if [ $? -ne 0 ]; then
-                    echo "[ERROR] Failed to mount NFS runfile share. Exiting."
-                    exit 1
-                fi
-
-                echo "[INFO] Installing NVIDIA driver..."
-                if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then
-                    bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build
-                    if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
-                        echo "[SUCCESS] NVIDIA driver installed successfully."
-                        nvidia-smi -pm 1
-                    else
-                        echo "[ERROR] NVIDIA driver installation failed."
-                    fi
+                echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..."
+                dnf install -y cuda-drivers
+                if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
+                    echo "[SUCCESS] NVIDIA driver installed successfully."
+                    nvidia-smi -pm 1
                 else
-                    echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/"
+                    echo "[ERROR] NVIDIA driver installation failed."
+                    exit 1
                 fi
-
-                echo "[INFO] Cleaning up temporary NFS mount..."
-                umount /gpu-runfile 2>/dev/null
-                rmdir /gpu-runfile 2>/dev/null
             fi
 
             echo "[INFO] Setting up CUDA toolkit mount..."
@@ -214,7 +198,6 @@
 
             echo "===== NVIDIA GPU setup completed ====="
 
-{% if dcgm_support %}
         - path: /usr/local/bin/setup_dcgm.sh
           permissions: '0755'
           content: |
@@ -236,12 +219,50 @@
                 exit 0
             fi
             echo "[INFO] NVIDIA driver prerequisite satisfied."
+
+            # Display nvidia-smi output for verification
+            echo "========== NVIDIA Driver & GPU Information =========="
+            nvidia-smi 2>&1
+            echo "====================================================="
+
+            # Detect CUDA major version for DCGM package selection
+            echo "[INFO] Detecting CUDA version for DCGM package compatibility..."
+            # Try nvidia-smi first; keep only a numeric major version (the header can read "CUDA Version: N/A")
+            CUDA_VERSION=$(nvidia-smi | awk '/CUDA Version/ {split($9, v, "."); if (v[1] ~ /^[0-9]+$/) print v[1]}')
+
+            # Fallback: Try to get CUDA version from nvcc if available
+            if [ -z "$CUDA_VERSION" ]; then
+                if command -v nvcc &>/dev/null; then
+                    CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1)
+                    echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION"
+                else
+                    echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc."
+                    echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup."
+                    exit 1
+                fi
+            else
+                echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION"
+            fi
 
-            # Check if datacenter-gpu-manager package is installed
-            if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then
-                echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup."
+            # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies
+            echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..."
+            if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then
+                echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup."
                 exit 1
             fi
+            echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully."
+
+            # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC)
+            if [ "$CUDA_VERSION" -ge "12" ]; then
+                echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..."
+                if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then
+                    echo "[INFO] DCGM multinode plugin installed successfully."
+                else
+                    echo "[WARN] Failed to install multinode plugin. Continuing without it."
+                fi
+            else
+                echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping."
+            fi
 
             # Enable and start DCGM daemon (SB-003)
             echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..."
@@ -274,16 +295,17 @@
             # GPU discovery (SB-004)
             echo "[INFO] Enumerating GPUs via dcgmi discovery..."
             if command -v dcgmi &>/dev/null; then
-                dcgmi discovery -l
-                echo "[SUCCESS] GPU discovery completed."
+                echo "========== GPU Discovery Output =========="
+                dcgmi discovery -l 2>&1
+                GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | awk '/GPUs? found/ {n=$1} END {print n+0}')  # read the "N GPU(s) found." summary; a bare "GPU" match also counts table headers and UUID rows
+                echo "=========================================="
+                echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)."
             else
                 echo "[WARN] dcgmi command not found. Skipping GPU enumeration."
             fi
 
             echo "===== NVIDIA DCGM setup completed ====="
 
-{% endif %}
-
 {% if hostvars['localhost']['openldap_support'] %}
         - path: /etc/sssd/sssd.conf
           owner: root:root
@@ -305,6 +327,53 @@
           content: |
             {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
 {% endif %}
+
+{% if slurm_node_present %}
+        - path: /usr/local/bin/slurm_cuda_coordinator.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_cuda_driver.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_nvidia_peermem.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_dcgm.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }}
+
+{% if not login_compiler_node_present %}
+        - path: /usr/local/bin/generate_install_uuid.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/cuda_lock_manager.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_cuda_toolkit.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }}
+{% endif %}
+{% endif %}
+
         - path: /usr/local/bin/configure_vast_installation.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
@@ -321,7 +390,7 @@
             echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) ====="
             mkdir -p {{ client_mount_path }}/slurm/ssh
             echo "[INFO] Creating base directories for Slurm and Munge"
-            mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
+            mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts /hpc_tools/cuda
 
             echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths"
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -332,6 +401,7 @@
             echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images  /hpc_tools/container_images   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts  /hpc_tools/scripts   nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda  /hpc_tools/cuda   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/packages  /var/lib/packages   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -558,10 +628,7 @@
       runcmd:
         - rm -rf /var/lib/cloud/instance
         - /usr/local/bin/set-ssh.sh
-        - /usr/local/bin/install_nvidia_driver.sh
-{% if dcgm_support %}
-        - /usr/local/bin/setup_dcgm.sh
-{% endif %}
+        # slurm user and group created in the users module
 
         - /usr/local/bin/configure_vast_installation.sh
         - /usr/local/bin/configure_dirs_and_mounts.sh
@@ -621,6 +688,16 @@
 
         - /usr/local/bin/setup_nvhpc_sdk.sh
         - /usr/local/bin/export_nvhpc_env.sh
+{% if slurm_node_present %}
+        - |
+          set -e
+          /usr/local/bin/slurm_cuda_coordinator.sh
+          /usr/local/bin/install_cuda_driver.sh
+{% if dcgm_support %}
+          /usr/local/bin/install_dcgm.sh
+{% endif %}
+          /usr/local/bin/install_nvidia_peermem.sh
+{% endif %}
         - systemctl restart slurmd
 
         - echo "Cloud-Init has completed successfully."
\ No newline at end of file
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 4756e8f1d3..401108acae 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -96,37 +96,21 @@
                 exit 0
             fi
 
-            echo "[INFO] NVIDIA GPU detected. Proceeding with setup."
+            echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation."
 
             # Check if NVIDIA driver is already installed
             if command -v nvidia-smi &>/dev/null; then
                 echo "[INFO] NVIDIA driver already installed. Skipping driver installation."
             else
-                echo "[INFO] Mounting NFS runfile directory for driver installation..."
-                mkdir -p /gpu-runfile
-                mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile
-
-                if [ $? -ne 0 ]; then
-                    echo "[ERROR] Failed to mount NFS runfile share. Exiting."
-                    exit 1
-                fi
-
-                echo "[INFO] Installing NVIDIA driver..."
-                if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then
-                    bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build
-                    if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
-                        echo "[SUCCESS] NVIDIA driver installed successfully."
-                        nvidia-smi -pm 1
-                    else
-                        echo "[ERROR] NVIDIA driver installation failed."
-                    fi
+                echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..."
+                dnf install -y cuda-drivers
+                if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
+                    echo "[SUCCESS] NVIDIA driver installed successfully."
+                    nvidia-smi -pm 1
                 else
-                    echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/"
+                    echo "[ERROR] NVIDIA driver installation failed."
+                    exit 1
                 fi
-
-                echo "[INFO] Cleaning up temporary NFS mount..."
-                umount /gpu-runfile 2>/dev/null
-                rmdir /gpu-runfile 2>/dev/null
             fi
 
             echo "[INFO] Setting up CUDA toolkit mount..."
@@ -143,12 +127,12 @@
 
             if [ $? -eq 0 ]; then
                 echo "[SUCCESS] CUDA toolkit NFS mount successful"
-
+
                 # Add to fstab for persistence
                 grep -q "$cuda_nfs_share" /etc/fstab || echo "$cuda_nfs_share /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
-
+
                 echo "[INFO] Configuring persistent CUDA environment..."
-
+
                 # System-wide profile for login shells
                 cat > /etc/profile.d/cuda.sh << 'EOF'
             export PATH=/usr/local/cuda/bin:$PATH
@@ -156,7 +140,7 @@
             export CUDA_HOME=/usr/local/cuda
             EOF
                 chmod +x /etc/profile.d/cuda.sh
-
+
                 # Bashrc for non-login shells
                 cat > /etc/bashrc.cuda << 'EOF'
             if [ -d "/usr/local/cuda/bin" ]; then
@@ -166,7 +150,7 @@
             fi
             EOF
                 grep -q "bashrc.cuda" /etc/bashrc || echo "source /etc/bashrc.cuda" >> /etc/bashrc
-
+                
                 # Slurm prolog for job environment
                 mkdir -p /etc/slurm/prolog.d
                 cat > /etc/slurm/prolog.d/cuda.sh << 'EOF'
@@ -176,12 +160,12 @@
             export CUDA_HOME=/usr/local/cuda
             EOF
                 chmod +x /etc/slurm/prolog.d/cuda.sh
-
+                
                 # Apply immediately for current session
                 export PATH=/usr/local/cuda/bin:$PATH
                 export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
                 export CUDA_HOME=/usr/local/cuda
-
+                
                 echo "[SUCCESS] Persistent CUDA environment configured"
             else
                 echo "[ERROR] Failed to mount CUDA toolkit NFS share"
@@ -215,8 +199,6 @@
 
             echo "===== NVIDIA GPU setup completed ====="
 
-
-{% if dcgm_support %}
         - path: /usr/local/bin/setup_dcgm.sh
           permissions: '0755'
           content: |
@@ -238,12 +220,50 @@
                 exit 0
             fi
             echo "[INFO] NVIDIA driver prerequisite satisfied."
+            
+            # Display nvidia-smi output for verification
+            echo "========== NVIDIA Driver & GPU Information =========="
+            nvidia-smi 2>&1
+            echo "====================================================="
+
+            # Detect CUDA major version for DCGM package selection
+            echo "[INFO] Detecting CUDA version for DCGM package compatibility..."
+            # Try to get CUDA version from nvidia-smi
+            CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1)
+
+            # Fallback: Try to get CUDA version from nvcc if available
+            if [ -z "$CUDA_VERSION" ]; then
+                if command -v nvcc &>/dev/null; then
+                    CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1)
+                    echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION"
+                else
+                    echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc."
+                    echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup."
+                    exit 1
+                fi
+            else
+                echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION"
+            fi
 
-            # Check if datacenter-gpu-manager package is installed
-            if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then
-                echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup."
+            # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies
+            echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..."
+            if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then
+                echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup."
                 exit 1
             fi
+            echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully."
+
+            # Install multinode diagnostic plugin for CUDA 12+ (optional; GPG-verified like the core DCGM package)
+            if [ "$CUDA_VERSION" -ge "12" ]; then
+                echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..."
+                if dnf install -y datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then
+                    echo "[INFO] DCGM multinode plugin installed successfully."
+                else
+                    echo "[WARN] Failed to install multinode plugin. Continuing without it."
+                fi
+            else
+                echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping."
+            fi
 
             # Enable and start DCGM daemon (SB-003)
             echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..."
@@ -276,16 +296,17 @@
             # GPU discovery (SB-004)
             echo "[INFO] Enumerating GPUs via dcgmi discovery..."
             if command -v dcgmi &>/dev/null; then
-                dcgmi discovery -l
-                echo "[SUCCESS] GPU discovery completed."
+                echo "========== GPU Discovery Output =========="
+                dcgmi discovery -l 2>&1
+                GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "Device UUID: GPU-")  # bare "GPU" also matches header/summary rows
+                echo "=========================================="
+                echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)."
             else
                 echo "[WARN] dcgmi command not found. Skipping GPU enumeration."
             fi
 
             echo "===== NVIDIA DCGM setup completed ====="
 
-{% endif %}
-
 {% if hostvars['localhost']['openldap_support'] %}
         - path: /etc/sssd/sssd.conf
           owner: root:root
@@ -308,6 +329,52 @@
             {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
 {% endif %}
 
+{% if slurm_node_present %}
+        - path: /usr/local/bin/slurm_cuda_coordinator.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_cuda_driver.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_nvidia_peermem.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_dcgm.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }}
+
+{% if not login_compiler_node_present %}
+        - path: /usr/local/bin/generate_install_uuid.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/cuda_lock_manager.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_cuda_toolkit.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }}
+{% endif %}
+{% endif %}
+
         - path: /etc/hosts
           append: true
           content: |
@@ -349,6 +416,7 @@
             echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images  /hpc_tools/container_images   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts  /hpc_tools/scripts   nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path}}/hpc_tools/  /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/packages  /var/lib/packages   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -562,10 +630,7 @@
       runcmd:
         - rm -rf /var/lib/cloud/instance
         - /usr/local/bin/set-ssh.sh
-        - /usr/local/bin/install_nvidia_driver.sh
-{% if dcgm_support %}
-        - /usr/local/bin/setup_dcgm.sh
-{% endif %}
+        
         # slurm user and group created in the users module
         - /usr/local/bin/configure_vast_installation.sh
         - /usr/local/bin/configure_dirs_and_mounts.sh
@@ -628,6 +693,16 @@
 {% endif %}
         - /usr/local/bin/setup_nvhpc_sdk.sh
         - /usr/local/bin/export_nvhpc_env.sh
+{% if slurm_node_present %}
+        - |
+          set -e
+          /usr/local/bin/slurm_cuda_coordinator.sh
+          /usr/local/bin/install_cuda_driver.sh
+{% if dcgm_support %}
+          /usr/local/bin/install_dcgm.sh
+{% endif %}
+          /usr/local/bin/install_nvidia_peermem.sh
+{% endif %}
         - systemctl restart slurmd
 
         - echo "Cloud-Init has completed successfully."
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2
new file mode 100644
index 0000000000..c037204a28
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Distributed lock manager for CUDA toolkit install on shared NFS.
+# Backed by atomic mkdir on /hpc_tools/cuda/.nfs_lock_cuda.
+# Exposes: acquire | release | wait | is_stale
+set -euo pipefail
+
+LOCK_ROOT="/hpc_tools/cuda"
+LOCK_DIR="$LOCK_ROOT/.nfs_lock_cuda"
+OWNER_FILE="$LOCK_DIR/owner.txt"
+DONE_FILE="$LOCK_ROOT/.done_cuda"
+STATUS_LOG="$LOCK_ROOT/.cuda_install_status.log"
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+INSTALL_TIMEOUT="${INSTALL_TIMEOUT:-1800}"   # seconds; in this script only used to derive GLOBAL_WAIT_TIMEOUT's default
+POLL_INTERVAL="${POLL_INTERVAL:-5}"          # seconds slept between polls in wait_for_done_or_takeover
+TAKEOVER_MIN="${TAKEOVER_MIN:-5}"            # lower bound (seconds) of the random sleep once the lock has vanished
+TAKEOVER_MAX="${TAKEOVER_MAX:-15}"           # upper bound (seconds) of that random sleep
+GLOBAL_WAIT_TIMEOUT="${GLOBAL_WAIT_TIMEOUT:-$((INSTALL_TIMEOUT * 2))}"   # waiter gives up after this many seconds
+
+log_status() {
+    # Append one record: timestamp, short host, install identity, role ($1), result ($2).
+    local stamp ident
+    stamp=$(date '+%Y-%m-%d %H:%M:%S')
+    ident=$(cat "$HOSTNAME_FILE" 2>/dev/null || echo UNKNOWN)
+    printf '%s  %s  %s  %s  %s\n' "$stamp" "$(hostname -s)" "$ident" "$1" "$2" >> "$STATUS_LOG"
+}
+
+acquire() {
+    # Returns 0 = this node is the installer, 1 = lock held elsewhere, 2 = already done.
+    [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 2; }
+    mkdir "$LOCK_DIR" 2>/dev/null || return 1   # atomic mkdir lost the race → waiter
+    # Record the owner. Fall back to `hostname` so a missing identity file cannot abort
+    # under `set -e` and orphan the freshly created lock; drop the lock if the write fails.
+    { cat "$HOSTNAME_FILE" 2>/dev/null || hostname; } > "$OWNER_FILE" || { release; return 1; }
+    log_status installer lock_acquired || true   # a logging failure must not strand the lock
+    return 0
+}
+
+release() { rm -rf "$LOCK_DIR"; }   # removes the lock directory together with the owner.txt inside it
+
+is_stale() {
+    # Owner identity → host column of the status log → ping. Log fields are split by TWO
+    # spaces and the timestamp contains one, so awk needs -F'  ' for $2=host, $3=identity.
+    local owner_hostname host
+    owner_hostname=$(cat "$OWNER_FILE" 2>/dev/null || echo "")
+    [ -z "$owner_hostname" ] && return 1
+    host=$(awk -F'  ' -v h="$owner_hostname" '$3==h {print $2; exit}' "$STATUS_LOG" 2>/dev/null || true)
+    [ -z "$host" ] && return 1
+    ! ping -c1 -W2 "$host" >/dev/null 2>&1   # 0 = unreachable → stale, 1 = host alive
+}
+
+wait_for_done_or_takeover() {
+    # Poll until .done_cuda appears (0), the lock vanishes (10 = caller should retry
+    # acquire), or GLOBAL_WAIT_TIMEOUT seconds pass (1). Stale locks are released here.
+    local t0 jitter
+    t0=$(date +%s)
+    while :; do
+        if [ -f "$DONE_FILE" ]; then log_status waiter skip_done; return 0; fi
+        if [ ! -d "$LOCK_DIR" ]; then
+            # Lock gone: sleep a random TAKEOVER_MIN..TAKEOVER_MAX seconds, then hand back.
+            jitter=$(( RANDOM % (TAKEOVER_MAX - TAKEOVER_MIN + 1) + TAKEOVER_MIN ))
+            sleep "$jitter"; return 10
+        fi
+        if is_stale; then log_status waiter crash_detected; release; continue; fi
+        if (( $(date +%s) - t0 > GLOBAL_WAIT_TIMEOUT )); then
+            log_status timeout_waiter fail; return 1
+        fi
+        sleep "$POLL_INTERVAL"
+    done
+}
+
+case "${1:-}" in
+    acquire) acquire ;;   # exit status: 0 installer, 1 waiter, 2 already done
+    release) release ;;
+    wait)    wait_for_done_or_takeover ;;   # exit status: 0 done, 10 retry acquire, 1 timed out
+    is_stale) is_stale ;;   # exit status: 0 stale, non-zero otherwise
+    *) echo "usage: $0 {acquire|release|wait|is_stale}" >&2; exit 64 ;;
+esac
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2
new file mode 100644
index 0000000000..be8fb867b3
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Persist this node's hostname as its identity for CUDA install lock ownership.
+# Safe to re-run: the file is simply rewritten from `hostname` each time.
+set -euo pipefail
+
+readonly HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+hostname > "$HOSTNAME_FILE"
+
+printf '%s\n' "[INFO] CUDA install hostname for this node: $(< "$HOSTNAME_FILE")"
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2
new file mode 100644
index 0000000000..ba2cde8f3d
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Local NVIDIA driver install. Always runs on Slurm nodes. Idempotent.
+# Never touches NFS lock artifacts. Never touches /hpc_tools/cuda contents.
+set -euo pipefail
+
+LOGFILE="/var/log/nvidia_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== NVIDIA driver install ====="
+
+if ! lspci | grep -i nvidia >/dev/null; then   # no -q: an early grep exit can SIGPIPE lspci and, with pipefail, fake "no GPU"
+    echo "[INFO] No NVIDIA GPU detected. Exiting."
+    exit 0
+fi
+
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "[INFO] NVIDIA driver already installed. Skipping."
+else
+    echo "[INFO] Installing NVIDIA driver via dnf..."
+    dnf install -y cuda-drivers
+    command -v nvidia-smi >/dev/null 2>&1 || { echo "[ERROR] Driver install failed."; exit 1; }
+fi
+
+nvidia-smi -pm 1 || true
+
+# Mount shared toolkit at /usr/local/cuda (harmless if already mounted)
+mkdir -p /usr/local/cuda
+CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+if ! mountpoint -q /usr/local/cuda; then
+    mount -t nfs "$CUDA_NFS" /usr/local/cuda || true
+fi
+grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+    echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+
+cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/usr/local/cuda
+EOF
+chmod +x /etc/profile.d/cuda.sh
+
+echo "===== NVIDIA driver install completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
new file mode 100644
index 0000000000..471c3be291
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
@@ -0,0 +1,166 @@
+#!/bin/bash
+# Lock-aware CUDA toolkit installer. Publishes to /hpc_tools/cuda on NFS.
+# Exits 0 if the toolkit is already present (.done_cuda), if this node installed it, or if another node holds the lock; exits 1 on failure.
+set -euo pipefail
+
+LOGFILE="/var/log/cuda_toolkit_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+LOCK_ROOT="/hpc_tools/cuda"
+DONE_FILE="$LOCK_ROOT/.done_cuda"
+LOCK_MGR="/usr/local/bin/cuda_lock_manager.sh"
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+# Write /etc/profile.d/cuda.sh exporting PATH, LD_LIBRARY_PATH and CUDA_HOME for the shared /hpc_tools/cuda toolkit.
+setup_cuda_env() {
+    echo "[INFO] Setting up CUDA environment variables for shared location..."
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/hpc_tools/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/hpc_tools/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/hpc_tools/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured successfully"
+}
+
+# Generate hostname for lock ownership (idempotent)
+/usr/local/bin/generate_install_uuid.sh
+
+# Fast-path: already done
+[ -f "$DONE_FILE" ] && {
+    echo "[INFO] CUDA toolkit already installed on shared storage by another node."
+    echo "[INFO] This node will use the existing CUDA installation."
+    setup_cuda_env
+    echo "[INFO] CUDA environment configured successfully."
+    exit 0
+}
+
+# Check if running in manual mode (not cloud-init)
+MANUAL_MODE="${CUDA_INSTALL_MANUAL:-false}"
+if [ "$MANUAL_MODE" = "true" ]; then
+    echo "[INFO] Running in manual mode - will force acquire lock if held"
+    FORCE_LOCK=true
+else
+    echo "[INFO] Running in cloud-init mode - will proceed without waiting if lock held"
+    FORCE_LOCK=false
+fi
+
+# Attempt lock acquisition
+set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+
+# In manual mode, if lock is held, release it explicitly then acquire again
+if [ "$FORCE_LOCK" = "true" ] && [ "$rc" = "1" ]; then
+    echo "[WARN] Lock is held by another node. In manual mode, releasing lock first..."
+    "$LOCK_MGR" release
+    echo "[INFO] Lock released. Now acquiring lock..."
+    set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+fi
+case $rc in
+    0)   # installer
+        echo "[INFO] Acquired lock. Installing toolkit..."
+        mkdir -p /shared-cuda-toolkit
+        # Test the mount inside `if`: under `set -e` a bare failing mount would abort
+        # the script before the lock could be released, leaving it held.
+        if ! mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit; then
+            echo "[ERROR] Failed to mount NFS cuda share."
+            echo "[ERROR] CUDA toolkit installation failed on this node."
+            echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+            "$LOCK_MGR" release
+            exit 1
+        fi
+
+        # Check if CUDA toolkit is already installed on NFS
+        if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then
+            echo "[INFO] CUDA toolkit already installed on NFS. Exiting."
+            "$LOCK_MGR" release
+            exit 0
+        fi
+
+        # Install CUDA toolkit to local location using dnf
+        echo "[INFO] Installing CUDA toolkit to local location using dnf..."
+        mkdir -p /cuda
+        # Copy host repository configuration to installroot (only for manual mode)
+        if [ "$MANUAL_MODE" = "true" ]; then
+            mkdir -p /cuda/etc/yum.repos.d
+            cp -r /etc/yum.repos.d/* /cuda/etc/yum.repos.d/ 2>/dev/null || true
+        fi
+        if timeout "${INSTALL_TIMEOUT:-1800}" dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit; then
+            echo "[SUCCESS] CUDA toolkit installed successfully."
+
+            # Clean up repository configuration from installroot (if copied for manual mode)
+            if [ "$MANUAL_MODE" = "true" ]; then
+                rm -rf /cuda/etc/yum.repos.d
+            fi
+
+            # Copy CUDA toolkit to shared location
+            echo "[INFO] Copying CUDA toolkit to shared location..."
+            CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1)
+            if [ -z "$CUDA_SRC_DIR" ]; then
+                echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/"
+                echo "[ERROR] CUDA toolkit installation failed on this node."
+                echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+                "$LOCK_MGR" release
+                exit 1
+            fi
+
+            echo "[INFO] Found CUDA at: $CUDA_SRC_DIR"
+            echo "[INFO] Copying contents directly to /shared-cuda-toolkit..."
+            cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true
+
+            # Verify CUDA toolkit installation
+            echo "[INFO] Verifying CUDA toolkit installation..."
+            if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then
+                echo "[SUCCESS] CUDA toolkit verified."
+            else
+                echo "[ERROR] CUDA toolkit (nvcc) not found after installation."
+                echo "[ERROR] CUDA toolkit installation failed on this node."
+                echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+                "$LOCK_MGR" release
+                exit 1
+            fi
+
+            # Atomic publish of .done_cuda (see §4.4). Never use `touch`.
+            TMP="$LOCK_ROOT/.done_cuda.tmp.$(cat "$HOSTNAME_FILE")"
+            printf 'installed_by=%s\nts=%s\n' \
+                "$(hostname -s)" "$(date -Iseconds)" > "$TMP"
+            sync -f "$TMP" 2>/dev/null || sync
+            mv -f -- "$TMP" "$DONE_FILE"
+            "$LOCK_MGR" release
+            # log pass
+            printf '%s  %s  %s  installer  pass\n' \
+                "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+                "$(cat "$HOSTNAME_FILE")" \
+                >> "$LOCK_ROOT/.cuda_install_status.log"
+
+            setup_cuda_env
+            # Best-effort: under `set -e` a failed umount must not fail an otherwise successful install.
+            umount /shared-cuda-toolkit 2>/dev/null || true
+            exit 0
+        else
+            result=$?
+            "$LOCK_MGR" release
+            [ "$result" = "124" ] && st="timeout_killed" || st="fail"
+            printf '%s  %s  %s  installer  %s\n' \
+                "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+                "$(cat $HOSTNAME_FILE)" "$st" \
+                >> "$LOCK_ROOT/.cuda_install_status.log"
+            echo "[ERROR] CUDA toolkit installation failed on this node."
+            echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+            exit 1
+        fi
+        ;;
+    1)   # waiter - another node is installing
+        echo "[INFO] Another node is installing CUDA toolkit. Proceeding with cloud-init without waiting."
+        echo "[INFO] This node will use the shared CUDA toolkit once installation completes."
+        setup_cuda_env
+        echo "[INFO] CUDA environment configured (will work once installation completes)"
+        
+        exit 0
+        ;;
+    2)  # already done
+        echo "[INFO] CUDA toolkit already installed on shared storage."
+        setup_cuda_env
+        exit 0
+        ;;
+    *)  echo "[ERROR] acquire rc=$rc"; exit 1 ;;
+esac
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2
new file mode 100644
index 0000000000..158e089805
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2
@@ -0,0 +1,98 @@
+#!/bin/bash
+LOGFILE="/var/log/dcgm_setup.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA DCGM setup ====="
+
+# GPU detection gate - DCGM requires NVIDIA GPU hardware
+if ! lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping DCGM setup."
+    exit 0
+fi
+
+# CUDA prerequisite gate
+echo "[INFO] Validating NVIDIA driver prerequisite..."
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping DCGM setup."
+    exit 0
+fi
+
+if ! nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping DCGM setup."
+    exit 0
+fi
+echo "[INFO] NVIDIA driver prerequisite satisfied."
+
+# Display nvidia-smi output for verification
+echo "========== NVIDIA Driver & GPU Information =========="
+nvidia-smi 2>&1
+echo "====================================================="
+
+# Detect CUDA major version for DCGM package selection
+echo "[INFO] Detecting CUDA version for DCGM package compatibility..."
+# Try to get CUDA version from nvidia-smi
+CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1)
+
+# Fallback: Try to get CUDA version from nvcc if available
+if [ -z "$CUDA_VERSION" ]; then
+    if command -v nvcc &>/dev/null; then
+        CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1)
+        echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION"
+    else
+        echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc."
+        echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup."
+        exit 1
+    fi
+else
+    echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION"
+fi
+
+# Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies
+echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..."
+if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then
+    echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup."
+    exit 1
+fi
+echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully."
+
+# Enable and start DCGM daemon (SB-003)
+echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..."
+systemctl enable {{ dcgm_service_name }}
+
+RETRIES={{ dcgm_health_check_retries }}
+ATTEMPT=0
+DCGM_STARTED=false
+
+while [ $ATTEMPT -lt $RETRIES ]; do
+    ATTEMPT=$((ATTEMPT + 1))
+    echo "[INFO] Starting {{ dcgm_service_name }} (attempt $ATTEMPT/$RETRIES)..."
+    systemctl start {{ dcgm_service_name }}
+    sleep 3
+
+    if systemctl is-active --quiet {{ dcgm_service_name }}; then
+        DCGM_STARTED=true
+        echo "[SUCCESS] {{ dcgm_service_name }}.service is active."
+        break
+    else
+        echo "[WARN] {{ dcgm_service_name }} failed to start on attempt $ATTEMPT."
+    fi
+done
+
+if [ "$DCGM_STARTED" != "true" ]; then
+    echo "[ERROR] {{ dcgm_service_name }} failed to start after $RETRIES attempts. Service will stay down (BL-002)."
+    exit 1
+fi
+
+# GPU discovery (SB-004). Count per-GPU "Device UUID: GPU-" rows: a bare "GPU" also matches header/summary rows.
+echo "[INFO] Enumerating GPUs via dcgmi discovery..."
+if command -v dcgmi &>/dev/null; then
+    echo "========== GPU Discovery Output =========="
+    dcgmi discovery -l 2>&1
+    GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "Device UUID: GPU-")
+    echo "=========================================="
+    echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)."
+else
+    echo "[WARN] dcgmi command not found. Skipping GPU enumeration."
+fi
+
+echo "===== NVIDIA DCGM setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
new file mode 100644
index 0000000000..4a51c179ae
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
@@ -0,0 +1,141 @@
+#!/bin/bash
+# NVIDIA Peer Memory (nvidia-peermem) DKMS installation for GPUDirect RDMA support.
+# SHALL be installed on all compute nodes where GPU hardware is detected.
+# Required on RDMA-capable GPU nodes only.
+# Idempotent: skips installation if module is already loaded.
+set -euo pipefail
+
+LOGFILE="/var/log/nvidia_peermem_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA Peer Memory (nvidia-peermem) setup ====="
+
+# GPU detection gate - only proceed if NVIDIA GPU is present
+echo "[INFO] Checking for NVIDIA GPU hardware..."
+if ! lspci | grep -i nvidia >/dev/null; then   # no -q: an early grep exit can SIGPIPE lspci and, with pipefail, fake "no GPU"
+    echo "[INFO] No NVIDIA GPU detected. Skipping nvidia-peermem installation."
+    exit 0
+fi
+
+# NVIDIA driver prerequisite gate
+echo "[INFO] Validating NVIDIA driver prerequisite..."
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping nvidia-peermem."
+    exit 0
+fi
+
+if ! nvidia-smi &>/dev/null; then
+    echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping nvidia-peermem."
+    exit 0
+fi
+echo "[INFO] NVIDIA driver prerequisite satisfied."
+
+# Check if nvidia-peermem module is already loaded
+echo "[INFO] Checking if nvidia-peermem module is already loaded..."
+if lsmod | grep -E 'nv_peer_mem|nvidia_peermem' >/dev/null; then   # no -q: keeps pipefail from seeing a SIGPIPE'd lsmod
+    echo "[INFO] nvidia-peermem module is already loaded. Skipping installation."
+    # Verify module metadata
+    if modinfo nvidia-peermem &>/dev/null; then
+        echo "[INFO] nvidia-peermem module metadata verified."
+    else
+        echo "[WARN] nvidia-peermem module loaded but modinfo failed. This may indicate a corrupted module."
+    fi
+    exit 0
+fi
+
+# Check running kernel
+KERNEL_VERSION=$(uname -r)
+echo "[INFO] Running kernel version: $KERNEL_VERSION"
+
+# Check if kernel headers are available (required for DKMS)
+if [ ! -d "/lib/modules/$KERNEL_VERSION/build" ]; then
+    echo "[ERROR] Kernel headers not found for kernel $KERNEL_VERSION."
+    echo "[ERROR] Required for DKMS build. Please install kernel-devel package."
+    exit 1
+fi
+
+# Get NVIDIA driver version from nvidia-smi
+NVIDIA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | tr -d ' ')
+if [ -z "$NVIDIA_VERSION" ]; then
+    echo "[ERROR] Could not determine NVIDIA driver version from nvidia-smi."
+    exit 1
+fi
+echo "[INFO] NVIDIA driver version: $NVIDIA_VERSION"
+
+# Check current DKMS status
+echo "[INFO] Checking current DKMS status..."
+dkms status || true
+
+# Add NVIDIA driver to DKMS if not already added
+if ! dkms status | grep -q "nvidia/$NVIDIA_VERSION"; then
+    echo "[INFO] Adding NVIDIA driver $NVIDIA_VERSION to DKMS..."
+    if ! dkms add -m nvidia -v "$NVIDIA_VERSION"; then
+        echo "[ERROR] Failed to add NVIDIA driver to DKMS."
+        exit 1
+    fi
+    echo "[INFO] NVIDIA driver added to DKMS successfully."
+else
+    echo "[INFO] NVIDIA driver $NVIDIA_VERSION already in DKMS."
+fi
+
+# Build NVIDIA module for the running kernel
+echo "[INFO] Building NVIDIA module for kernel $KERNEL_VERSION..."
+if ! dkms build -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then
+    echo "[ERROR] Failed to build NVIDIA module for kernel $KERNEL_VERSION."
+    echo "[ERROR] Check kernel logs for build errors."
+    exit 1
+fi
+echo "[INFO] NVIDIA module built successfully."
+
+# Install the built module
+echo "[INFO] Installing NVIDIA module for kernel $KERNEL_VERSION..."
+if ! dkms install -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then
+    echo "[ERROR] Failed to install NVIDIA module for kernel $KERNEL_VERSION."
+    exit 1
+fi
+echo "[INFO] NVIDIA module installed successfully."
+
+# Verify nvidia-peermem module metadata
+echo "[INFO] Verifying nvidia-peermem module metadata..."
+if modinfo nvidia-peermem &>/dev/null; then
+    echo "[INFO] nvidia-peermem module metadata verified."
+    modinfo nvidia-peermem
+else
+    echo "[ERROR] nvidia-peermem module metadata not found after DKMS install."
+    echo "[ERROR] This may indicate the module was not built or installed correctly."
+    exit 1
+fi
+
+# Ensure base NVIDIA modules are loaded first; a failed modprobe is reported
+# but does not abort the script.
+echo "Loading base NVIDIA modules..."
+for mod in nvidia nvidia-uvm nvidia-modeset nvidia-drm; do
+    modprobe "$mod" 2>/dev/null || echo "$mod module not available or failed to load"
+done
+
+# Load the nvidia-peermem module
+echo "[INFO] Loading nvidia-peermem module..."
+if modprobe nvidia-peermem; then
+    echo "[SUCCESS] nvidia-peermem module loaded successfully."
+else
+    echo "[WARN] Failed to load nvidia-peermem module with modprobe."
+    echo "[WARN] This may not be critical if RDMA is not required on this node."
+    echo "[WARN] Check kernel logs for detailed error information."
+    dmesg | grep -i peermem || true
+    # Continue with warning unless RDMA dependency exists
+    # (RDMA dependency check would be environment-specific)
+fi
+
+# Confirm module is loaded
+if lsmod | grep nvidia_peermem >/dev/null; then   # no -q: keeps pipefail from seeing a SIGPIPE'd lsmod
+    echo "[SUCCESS] nvidia_peermem is loaded in kernel."
+else
+    echo "[WARN] nvidia_peermem not found in lsmod output."
+    echo "[WARN] Module may have failed to load or may not be required for this configuration."
+fi
+
+# Check kernel logs for peer memory messages or errors
+echo "[INFO] Checking kernel logs for peer memory messages..."
+dmesg | grep -i peermem || echo "[INFO] No peermem messages found in recent kernel logs."
+
+echo "===== NVIDIA Peer Memory (nvidia-peermem) setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
new file mode 100644
index 0000000000..79d72db10b
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
@@ -0,0 +1,50 @@
+#!/bin/bash
+# CUDA coordinator for Slurm nodes: mounts the shared NFS toolkit when
+# login_compiler_node_present is true, otherwise runs install_cuda_toolkit.sh.
+
+set -euo pipefail
+
+SLURM_NODE_PRESENT="{{ slurm_node_present | lower }}"
+LOGIN_COMPILER_PRESENT="{{ login_compiler_node_present | lower }}"
+
+if [ "$SLURM_NODE_PRESENT" != "true" ]; then echo "[INFO] Not a Slurm node."; exit 0; fi
+
+# GPU detection gate - skip CUDA setup when no NVIDIA device is listed. Output is captured first: with pipefail, "lspci | grep -q" can return 141 (SIGPIPE) on a match and wrongly skip GPU nodes.
+if ! grep -qi nvidia <<<"$(lspci || true)"; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping CUDA toolkit and driver installation."
+    exit 0
+fi
+
+if [ "$LOGIN_COMPILER_PRESENT" = "true" ]; then
+    echo "[INFO] Login/compiler nodes present → mounting shared toolkit from NFS."
+    # Mount the shared toolkit unless already mounted; a failed mount is logged but stays non-fatal
+    mkdir -p /usr/local/cuda
+    CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+    mountpoint -q /usr/local/cuda || mount -t nfs "$CUDA_NFS" /usr/local/cuda || \
+        echo "[WARN] Failed to mount $CUDA_NFS at /usr/local/cuda; continuing."
+    # Persist the mount once; -F matches the entry literally (the path may contain regex metacharacters)
+    grep -qF "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+        echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+    # Export CUDA environment variables; ${VAR:+...} avoids a trailing ':' (an empty entry means the current directory) and an unbound-variable error when LD_LIBRARY_PATH is unset
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+export CUDA_HOME=/usr/local/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured from shared NFS toolkit."
+else
+    echo "[INFO] No login/compiler nodes → participating in lock."
+    # install_cuda_toolkit.sh is lock-aware:
+    #   - if this node wins the lock, it runs the install and publishes .done_cuda
+    #   - if this node loses the lock, it returns immediately without waiting
+    if ! /usr/local/bin/install_cuda_toolkit.sh; then
+        echo "[ERROR] install_cuda_toolkit.sh returned non-zero."
+        exit 1
+    fi
+    echo "[INFO] CUDA toolkit installation handled by another node or completed by this node."
+    echo "[INFO] Proceeding with driver, DCGM, and nvidia-peermem installation."
+fi
+
+echo "[SUCCESS] CUDA coordinator completed."
+exit 0
diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml
index 9be62ddcbe..8de6ceb69a 100644
--- a/provision/roles/configure_ochami/vars/main.yml
+++ b/provision/roles/configure_ochami/vars/main.yml
@@ -102,15 +102,14 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*')
 # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role)
 all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}"
 
-# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role)
-cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}"
-cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}"
-
+# Login/compiler and Slurm node presence flags (set by slurm_config role)
+login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}"
+slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}"
 # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2
 # NVIDIA DCGM (Data Center GPU Manager) configuration
 dcgm_service_name: "nvidia-dcgm"
 dcgm_health_check_retries: 3
-dcgm_support: "{{ telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) }}"
+dcgm_support: "{{ hostvars['localhost'].get('telemetry_sources', {}).get('dcgm', {}).get('metrics_enabled', true) | bool }}"
 
 # Usage: fetch_additional_images.yml
 input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
diff --git a/provision/roles/slurm_config/tasks/create_slurm_dir.yml b/provision/roles/slurm_config/tasks/create_slurm_dir.yml
index b68bcbbded..a89b33aeb3 100644
--- a/provision/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/provision/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -30,22 +30,6 @@
     name: slurm_custom_aarch64
   failed_when: false
 
-- name: Extract CUDA runfile name for x86_64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_x86_64 is defined
-    - slurm_custom_x86_64.slurm_node is defined
-    - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
-- name: Extract CUDA runfile name for aarch64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_aarch64 is defined
-    - slurm_custom_aarch64.slurm_node is defined
-    - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
 - name: Set facts for slurm
   ansible.builtin.set_fact:
     nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
diff --git a/provision/roles/slurm_config/tasks/hpc_tools.yml b/provision/roles/slurm_config/tasks/hpc_tools.yml
index 46260da267..37a2c166d7 100644
--- a/provision/roles/slurm_config/tasks/hpc_tools.yml
+++ b/provision/roles/slurm_config/tasks/hpc_tools.yml
@@ -22,7 +22,6 @@
     mode: "{{ common_mode }}"
   loop:
     - cuda
-    - runfile
     - scripts
     - container_images
     - nvidia_sdk
diff --git a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml
index c61e8d92a9..5b99d35c30 100644
--- a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml
+++ b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml
@@ -92,3 +92,11 @@
   ansible.builtin.set_fact:
     controller_ip: "{{ ip_name_map[ctld_list | first] }}"
   when: ctld_list | length > 0
+
+- name: Set login_compiler_node_present flag
+  ansible.builtin.set_fact:
+    login_compiler_node_present: "{{ (compiler_login_list | length) > 0 }}"  # true when compiler_login_list is non-empty
+
+- name: Set slurm_node_present flag
+  ansible.builtin.set_fact:
+    slurm_node_present: "{{ (cmpt_list | length) > 0 }}"  # true when cmpt_list is non-empty
diff --git a/provision/roles/slurm_config/vars/main.yml b/provision/roles/slurm_config/vars/main.yml
index 580d776d92..d2f60542ed 100644
--- a/provision/roles/slurm_config/vars/main.yml
+++ b/provision/roles/slurm_config/vars/main.yml
@@ -171,16 +171,6 @@ parallel_copy_max_workers: 4
 
 parallel_copy_candidates:
 
-  # CUDA Runfile (aarch64 repo path)
-  - name: cuda_runfile_aarch64
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-
-  # CUDA Runfile (x86_64 repo path)
-  - name: cuda_runfile_x86_64
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-
   # NVIDIA HPC SDK (x86_64 tarball extracted dir)
   - name: nvhpc_sdk_x86_64
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"