From 5962ce9a0742e2746117e5522a34ac0dbcf91f25 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 16 Apr 2026 15:28:57 +0530 Subject: [PATCH 01/13] adding the cuda and nvidia driver version gate check --- .../ci-group-slurm_node_aarch64.yaml.j2 | 20 +++++++++++++++++++ .../ci-group-slurm_node_x86_64.yaml.j2 | 20 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 49e5322195..73dbf7f494 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -237,6 +237,26 @@ fi echo "[INFO] NVIDIA driver prerequisite satisfied." + # NVIDIA driver version check (Eng Spec Section 11, Pipeline 1 Step 5) + DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]') + DRIVER_MAJOR=$(echo "$DRIVER_VERSION" | cut -d. -f1) + if [ -n "$DRIVER_MAJOR" ] && [ "$DRIVER_MAJOR" -ge 550 ] 2>/dev/null; then + echo "[INFO] NVIDIA driver version $DRIVER_VERSION meets minimum requirement (>= 550.x)." + else + echo "[ERROR] NVIDIA driver version ${DRIVER_VERSION:-unknown} does not meet minimum requirement (>= 550.x). Skipping DCGM setup." + exit 1 + fi + + # CUDA version check (Functional Spec VC-007: minimum CUDA 12.x required) + CUDA_VERSION=$(nvidia-smi 2>/dev/null | grep "CUDA Version" | sed 's/.*CUDA Version: *//' | sed 's/ .*//') + CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) + if [ -n "$CUDA_MAJOR" ] && [ "$CUDA_MAJOR" -ge 12 ] 2>/dev/null; then + echo "[INFO] CUDA version $CUDA_VERSION meets minimum requirement (>= 12.x)." + else + echo "[ERROR] CUDA version ${CUDA_VERSION:-unknown} does not meet minimum requirement (>= 12.x). Skipping DCGM setup." 
+ exit 1 + fi + # Check if datacenter-gpu-manager package is installed if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index ccffc5cd9e..418a8cc677 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -239,6 +239,26 @@ fi echo "[INFO] NVIDIA driver prerequisite satisfied." + # NVIDIA driver version check (Eng Spec Section 11, Pipeline 1 Step 5) + DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]') + DRIVER_MAJOR=$(echo "$DRIVER_VERSION" | cut -d. -f1) + if [ -n "$DRIVER_MAJOR" ] && [ "$DRIVER_MAJOR" -ge 550 ] 2>/dev/null; then + echo "[INFO] NVIDIA driver version $DRIVER_VERSION meets minimum requirement (>= 550.x)." + else + echo "[ERROR] NVIDIA driver version ${DRIVER_VERSION:-unknown} does not meet minimum requirement (>= 550.x). Skipping DCGM setup." + exit 1 + fi + + # CUDA version check (Functional Spec VC-007: minimum CUDA 12.x required) + CUDA_VERSION=$(nvidia-smi 2>/dev/null | grep "CUDA Version" | sed 's/.*CUDA Version: *//' | sed 's/ .*//') + CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) + if [ -n "$CUDA_MAJOR" ] && [ "$CUDA_MAJOR" -ge 12 ] 2>/dev/null; then + echo "[INFO] CUDA version $CUDA_VERSION meets minimum requirement (>= 12.x)." + else + echo "[ERROR] CUDA version ${CUDA_VERSION:-unknown} does not meet minimum requirement (>= 12.x). Skipping DCGM setup." + exit 1 + fi + # Check if datacenter-gpu-manager package is installed if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. 
Skipping DCGM setup." From 9bdacdd62b64a0453a508b9f21b28771c8188aaf Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 23 Apr 2026 20:16:06 +0530 Subject: [PATCH 02/13] initial cuda dnf installation --- .../schema/telemetry_config.json | 6 +- .../aarch64/rhel/10.0/slurm_custom.json | 5 - .../config/x86_64/rhel/10.0/slurm_custom.json | 6 +- input/telemetry_config.yml | 22 ---- ...-group-login_compiler_node_aarch64.yaml.j2 | 114 +++++++++-------- ...i-group-login_compiler_node_x86_64.yaml.j2 | 105 +++++++-------- .../ci-group-slurm_node_aarch64.yaml.j2 | 86 +++++-------- .../ci-group-slurm_node_x86_64.yaml.j2 | 120 +++++++++--------- .../roles/configure_ochami/vars/main.yml | 5 - .../slurm_config/tasks/create_slurm_dir.yml | 16 --- .../roles/slurm_config/tasks/hpc_tools.yml | 1 - provision/roles/slurm_config/vars/main.yml | 10 -- 12 files changed, 211 insertions(+), 285 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index a6523462e8..79f602eee3 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -6,10 +6,6 @@ "idrac_telemetry_support": { "type": "boolean" }, - "dcgm_support": { - "type": "boolean", - "description": "Enable or disable NVIDIA DCGM (Data Center GPU Manager) on GPU compute nodes. When true, nvidia-dcgm.service is started during cloud-init provisioning. 
Default: true" - }, "idrac_telemetry_collection_type": { "anyOf": [ { @@ -126,7 +122,7 @@ ] } }, - "required": ["idrac_telemetry_support", "dcgm_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port" ], + "required": ["idrac_telemetry_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port" ], "$defs": { "kafka_configurations": { "type": "object", diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index de29f815fe..b3e950b4f3 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -24,11 +24,6 @@ {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"}, {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"}, {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"}, - {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"}, - {"package": "cuda-run", - "type": "iso", - "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run" - }, { "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0", "type": "tarball", diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index dc0c23452f..0a6b097414 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -28,11 +28,7 @@ {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"}, {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"}, {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"}, - {"package": "datacenter-gpu-manager-4-core", "type": "rpm", "repo_name": "cuda"}, - {"package": "cuda-run", - "type": "iso", - "url": 
"https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run" - }, + {"package": "mpich", "url": ""}, { "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", "type": "tarball", diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index 397806c594..c568b17186 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -85,28 +85,6 @@ idrac_telemetry_support: true # Default: "victoria,kafka" idrac_telemetry_collection_type: "victoria,kafka" -# ============================================================================ -# NVIDIA DCGM (Data Center GPU Manager) CONFIGURATION -# ============================================================================ -# DCGM monitors NVIDIA GPU health and collects GPU telemetry metrics from -# compute nodes equipped with NVIDIA GPUs. -# -# When enabled, the DCGM daemon (nvidia-dcgm.service) is started on GPU nodes -# during cloud-init provisioning. It validates the NVIDIA driver, enumerates -# GPUs via dcgmi discovery, and exposes GPU health data. -# -# PREREQUISITE: NVIDIA GPU driver must be installed on the target nodes. -# The driver is installed automatically via the CUDA runfile during -# cloud-init when an NVIDIA GPU is detected via lspci. -# -# NOTE: DCGM exporter deployment (dcgm-exporter on port 9400) and Prometheus -# scrape configuration are planned for a future release. 
- -# Enable or disable NVIDIA DCGM support on GPU compute nodes -# Accepted values: true or false -# Default: true -dcgm_support: true - # ============================================================================ # VICTORIAMETRICS CONFIGURATION # ============================================================================ diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index fe6966c4be..16918bb8aa 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -93,78 +93,90 @@ exit 0 fi - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes + # Mount NFS cuda directory + echo "[INFO] Mounting NFS cuda directory..." mkdir -p /shared-cuda-toolkit mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null exit 1 fi - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." 
+ # Check if CUDA toolkit is already installed on NFS + if [ -f "/shared-cuda-toolkit/usr/local/cuda/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/usr/local/cuda/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." + umount /shared-cuda-toolkit 2>/dev/null + exit 0 + fi - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE + echo "[INFO] Installing CUDA toolkit to local location using dnf..." + mkdir -p /cuda + dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False --nogpgcheck cuda-toolkit - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit + if [ $? -eq 0 ]; then + echo "[SUCCESS] CUDA toolkit installed successfully to shared location." + else + echo "[ERROR] CUDA toolkit installation failed." + umount /shared-cuda-toolkit 2>/dev/null + exit 1 + fi - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." + echo "[INFO] Copying CUDA toolkit to shared location..." + # Copy the installed CUDA toolkit to the shared location for compute nodes + cp -r /cuda/* /shared-cuda-toolkit/ 2>/dev/null || true + + echo "[INFO] Fixing CUDA symlink for NFS compatibility..." + # Fix the cuda symlink to point to versioned directory instead of /etc/alternatives + cd /shared-cuda-toolkit/usr/local/ + # Remove all symlinks (cuda, cuda-13, etc.) 
+ rm -f cuda cuda-13 cuda-12 2>/dev/null + # Find the actual CUDA directory (not symlink) - look for directory with bin/nvcc + for dir in cuda-*; do + if [ -d "$dir" ] && [ -f "$dir/bin/nvcc" ]; then + CUDA_VERSION_DIR="$dir" + break fi + done + if [ -n "$CUDA_VERSION_DIR" ]; then + ln -sf "/shared-cuda-toolkit/usr/local/$CUDA_VERSION_DIR" cuda + echo "[INFO] Created symlink: cuda -> /shared-cuda-toolkit/usr/local/$CUDA_VERSION_DIR" + ls -la cuda + echo "[DEBUG] Checking nvcc at: /shared-cuda-toolkit/usr/local/cuda/bin/nvcc" + ls -la /shared-cuda-toolkit/usr/local/cuda/bin/nvcc 2>&1 || echo "[DEBUG] nvcc not found" else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" + echo "[ERROR] Could not find CUDA installation directory" + exit 1 fi - + echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + if [ -f "/shared-cuda-toolkit/usr/local/cuda/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/usr/local/cuda/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" + echo "[INFO] CUDA installation path: /shared-cuda-toolkit/usr/local/cuda" else echo "[ERROR] CUDA toolkit (nvcc) not found after installation." + umount /shared-cuda-toolkit 2>/dev/null + exit 1 fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." 
- # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit + # Set up environment variables pointing to shared location + cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' + export PATH=/shared-cuda-toolkit/usr/local/cuda/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/usr/local/cuda/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit/usr/local/cuda + ENDOFFILE - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true + # Apply environment variables for current session + export PATH=/shared-cuda-toolkit/usr/local/cuda/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/usr/local/cuda/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit/usr/local/cuda + + echo "[INFO] CUDA environment configured" - echo "[INFO] Cleaning up temporary mounts..." - umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + echo "[INFO] Cleaning up temporary mount..." + umount /shared-cuda-toolkit 2>/dev/null echo "===== CUDA Toolkit installation completed =====" @@ -389,4 +401,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 1ee1fce5e1..c43d768d99 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -93,78 +93,77 @@ exit 0 fi - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes + # Mount NFS cuda directory + echo "[INFO] Mounting NFS cuda directory..." mkdir -p /shared-cuda-toolkit mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null exit 1 fi - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." 
- - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE + # Check if CUDA toolkit is already installed on NFS + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." + umount /shared-cuda-toolkit 2>/dev/null + exit 0 + fi - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit + echo "[INFO] Installing CUDA toolkit to local location using dnf..." + mkdir -p /cuda + dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." - fi + if [ $? -eq 0 ]; then + echo "[SUCCESS] CUDA toolkit installed successfully." else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" + echo "[ERROR] CUDA toolkit installation failed." + umount /shared-cuda-toolkit 2>/dev/null + exit 1 + fi + + echo "[INFO] Copying CUDA toolkit to shared location..." + # Find the CUDA installation directory in /cuda/usr/local/ + CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) + if [ -z "$CUDA_SRC_DIR" ]; then + echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" + umount /shared-cuda-toolkit 2>/dev/null + exit 1 fi + + echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" + echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." 
+ # Copy only the contents of cuda-13.2 directly to /shared-cuda-toolkit + cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" + echo "[INFO] CUDA installation path: /shared-cuda-toolkit" else echo "[ERROR] CUDA toolkit (nvcc) not found after installation." + umount /shared-cuda-toolkit 2>/dev/null + exit 1 fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit + # Set up environment variables pointing to shared location + cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit + ENDOFFILE - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true + # Apply environment variables for current session + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit + + echo "[INFO] CUDA environment configured" - echo "[INFO] Cleaning up temporary mounts..." 
- umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + echo "[INFO] Cleaning up temporary mount..." + umount /shared-cuda-toolkit 2>/dev/null echo "===== CUDA Toolkit installation completed =====" @@ -240,7 +239,7 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh @@ -259,6 +258,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/install_cuda_toolkit.sh {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount @@ -360,6 +360,7 @@ - systemctl restart sshd - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - mkdir -p /etc/containers/registries.conf.d - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 73dbf7f494..cb62f3fe98 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -95,37 +95,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." # Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." else - echo "[INFO] Mounting NFS runfile directory for driver installation..." 
- mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." - nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -135,7 +119,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda/usr/local/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda @@ -214,7 +198,6 @@ echo "===== NVIDIA GPU setup completed =====" -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh permissions: '0755' content: | @@ -237,31 +220,30 @@ fi echo "[INFO] NVIDIA driver prerequisite satisfied." - # NVIDIA driver version check (Eng Spec Section 11, Pipeline 1 Step 5) - DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]') - DRIVER_MAJOR=$(echo "$DRIVER_VERSION" | cut -d. 
-f1) - if [ -n "$DRIVER_MAJOR" ] && [ "$DRIVER_MAJOR" -ge 550 ] 2>/dev/null; then - echo "[INFO] NVIDIA driver version $DRIVER_VERSION meets minimum requirement (>= 550.x)." - else - echo "[ERROR] NVIDIA driver version ${DRIVER_VERSION:-unknown} does not meet minimum requirement (>= 550.x). Skipping DCGM setup." - exit 1 - fi - - # CUDA version check (Functional Spec VC-007: minimum CUDA 12.x required) - CUDA_VERSION=$(nvidia-smi 2>/dev/null | grep "CUDA Version" | sed 's/.*CUDA Version: *//' | sed 's/ .*//') - CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) - if [ -n "$CUDA_MAJOR" ] && [ "$CUDA_MAJOR" -ge 12 ] 2>/dev/null; then - echo "[INFO] CUDA version $CUDA_VERSION meets minimum requirement (>= 12.x)." + # Detect CUDA major version from nvidia-smi + echo "[INFO] Detecting CUDA version from NVIDIA driver..." + CUDA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | awk -F'.' '{print $1}') + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ] || [ "$CUDA_VERSION" -lt "12" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc. Skipping DCGM setup." + exit 1 + fi else - echo "[ERROR] CUDA version ${CUDA_VERSION:-unknown} does not meet minimum requirement (>= 12.x). Skipping DCGM setup." - exit 1 + echo "[INFO] CUDA major version detected: $CUDA_VERSION" fi - # Check if datacenter-gpu-manager package is installed - if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! 
dnf install -y --nogpgcheck --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." @@ -302,8 +284,6 @@ echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -335,7 +315,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts /hpc_tools/cuda echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -346,6 +326,7 @@ echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ 
cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -572,14 +553,13 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/install_nvidia_driver.sh + - /usr/local/bin/setup_dcgm.sh - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 418a8cc677..4c4a5c594d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -96,37 +96,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." # Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." 
else - echo "[INFO] Mounting NFS runfile directory for driver installation..." - mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." - nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -143,12 +127,12 @@ if [ $? -eq 0 ]; then echo "[SUCCESS] CUDA toolkit NFS mount successful" - + # Add to fstab for persistence grep -q "$cuda_nfs_share" /etc/fstab || echo "$cuda_nfs_share /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab - + echo "[INFO] Configuring persistent CUDA environment..." 
- + # System-wide profile for login shells cat > /etc/profile.d/cuda.sh << 'EOF' export PATH=/usr/local/cuda/bin:$PATH @@ -156,7 +140,7 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/profile.d/cuda.sh - + # Bashrc for non-login shells cat > /etc/bashrc.cuda << 'EOF' if [ -d "/usr/local/cuda/bin" ]; then @@ -166,7 +150,7 @@ fi EOF grep -q "bashrc.cuda" /etc/bashrc || echo "source /etc/bashrc.cuda" >> /etc/bashrc - + # Slurm prolog for job environment mkdir -p /etc/slurm/prolog.d cat > /etc/slurm/prolog.d/cuda.sh << 'EOF' @@ -176,12 +160,12 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/slurm/prolog.d/cuda.sh - + # Apply immediately for current session export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export CUDA_HOME=/usr/local/cuda - + echo "[SUCCESS] Persistent CUDA environment configured" else echo "[ERROR] Failed to mount CUDA toolkit NFS share" @@ -215,8 +199,6 @@ echo "===== NVIDIA GPU setup completed =====" - -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh permissions: '0755' content: | @@ -238,31 +220,49 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." - - # NVIDIA driver version check (Eng Spec Section 11, Pipeline 1 Step 5) - DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]') - DRIVER_MAJOR=$(echo "$DRIVER_VERSION" | cut -d. -f1) - if [ -n "$DRIVER_MAJOR" ] && [ "$DRIVER_MAJOR" -ge 550 ] 2>/dev/null; then - echo "[INFO] NVIDIA driver version $DRIVER_VERSION meets minimum requirement (>= 550.x)." + + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." 
+ # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi else - echo "[ERROR] NVIDIA driver version ${DRIVER_VERSION:-unknown} does not meet minimum requirement (>= 550.x). Skipping DCGM setup." - exit 1 + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" fi - # CUDA version check (Functional Spec VC-007: minimum CUDA 12.x required) - CUDA_VERSION=$(nvidia-smi 2>/dev/null | grep "CUDA Version" | sed 's/.*CUDA Version: *//' | sed 's/ .*//') - CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) - if [ -n "$CUDA_MAJOR" ] && [ "$CUDA_MAJOR" -ge 12 ] 2>/dev/null; then - echo "[INFO] CUDA version $CUDA_VERSION meets minimum requirement (>= 12.x)." - else - echo "[ERROR] CUDA version ${CUDA_VERSION:-unknown} does not meet minimum requirement (>= 12.x). Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." - # Check if datacenter-gpu-manager package is installed - if ! 
rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." - exit 1 + # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." fi # Enable and start DCGM daemon (SB-003) @@ -296,16 +296,17 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." 
fi echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -576,10 +577,7 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module - /usr/local/bin/configure_dirs_and_mounts.sh @@ -641,6 +639,8 @@ {% endif %} - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh + - /usr/local/bin/install_nvidia_driver.sh + - /usr/local/bin/setup_dcgm.sh - systemctl restart slurmd - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 40590bd1a6..0f9a5bf620 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -102,15 +102,10 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" -# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role) -cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}" -cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}" - # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 -dcgm_support: "{{ hostvars['localhost']['dcgm_support'] | default(true) }}" # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" diff --git 
a/provision/roles/slurm_config/tasks/create_slurm_dir.yml b/provision/roles/slurm_config/tasks/create_slurm_dir.yml index b68bcbbded..a89b33aeb3 100644 --- a/provision/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/provision/roles/slurm_config/tasks/create_slurm_dir.yml @@ -30,22 +30,6 @@ name: slurm_custom_aarch64 failed_when: false -- name: Extract CUDA runfile name for x86_64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_x86_64 is defined - - slurm_custom_x86_64.slurm_node is defined - - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Extract CUDA runfile name for aarch64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_aarch64 is defined - - slurm_custom_aarch64.slurm_node is defined - - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" diff --git a/provision/roles/slurm_config/tasks/hpc_tools.yml b/provision/roles/slurm_config/tasks/hpc_tools.yml index 46260da267..37a2c166d7 100644 --- a/provision/roles/slurm_config/tasks/hpc_tools.yml +++ b/provision/roles/slurm_config/tasks/hpc_tools.yml @@ -22,7 +22,6 @@ mode: "{{ common_mode }}" loop: - cuda - - runfile - scripts - container_images - nvidia_sdk diff --git a/provision/roles/slurm_config/vars/main.yml b/provision/roles/slurm_config/vars/main.yml index 580d776d92..d2f60542ed 100644 --- a/provision/roles/slurm_config/vars/main.yml +++ b/provision/roles/slurm_config/vars/main.yml @@ -171,16 +171,6 @@ parallel_copy_max_workers: 4 
parallel_copy_candidates: - # CUDA Runfile (aarch64 repo path) - - name: cuda_runfile_aarch64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - - # CUDA Runfile (x86_64 repo path) - - name: cuda_runfile_x86_64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - # NVIDIA HPC SDK (x86_64 tarball extracted dir) - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" From ff60b1138f32abf4146cce5e4d314aeebd9ee574 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 23 Apr 2026 22:42:34 +0530 Subject: [PATCH 03/13] updating arch files --- .../config/x86_64/rhel/10.0/slurm_custom.json | 1 - ...-group-login_compiler_node_aarch64.yaml.j2 | 62 ++++++++----------- .../ci-group-slurm_node_aarch64.yaml.j2 | 50 +++++++++++---- 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 0a6b097414..852944cb70 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -28,7 +28,6 @@ {"package": "slurm-pam_slurm", "type": "rpm", "repo_name": "slurm_custom"}, {"package": "kernel-devel", "type": "rpm", "repo_name": "appstream"}, {"package": "kernel-headers", "type": "rpm", "repo_name": "appstream"}, - {"package": "mpich", "url": ""}, { "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", "type": "tarball", diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 16918bb8aa..ebaea939f8 100644 --- 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -104,8 +104,8 @@ fi # Check if CUDA toolkit is already installed on NFS - if [ -f "/shared-cuda-toolkit/usr/local/cuda/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/usr/local/cuda/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." umount /shared-cuda-toolkit 2>/dev/null exit 0 @@ -113,10 +113,10 @@ echo "[INFO] Installing CUDA toolkit to local location using dnf..." mkdir -p /cuda - dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False --nogpgcheck cuda-toolkit + dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." + echo "[SUCCESS] CUDA toolkit installed successfully." else echo "[ERROR] CUDA toolkit installation failed." umount /shared-cuda-toolkit 2>/dev/null @@ -124,37 +124,24 @@ fi echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - cp -r /cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Fixing CUDA symlink for NFS compatibility..." - # Fix the cuda symlink to point to versioned directory instead of /etc/alternatives - cd /shared-cuda-toolkit/usr/local/ - # Remove all symlinks (cuda, cuda-13, etc.) 
- rm -f cuda cuda-13 cuda-12 2>/dev/null - # Find the actual CUDA directory (not symlink) - look for directory with bin/nvcc - for dir in cuda-*; do - if [ -d "$dir" ] && [ -f "$dir/bin/nvcc" ]; then - CUDA_VERSION_DIR="$dir" - break - fi - done - if [ -n "$CUDA_VERSION_DIR" ]; then - ln -sf "/shared-cuda-toolkit/usr/local/$CUDA_VERSION_DIR" cuda - echo "[INFO] Created symlink: cuda -> /shared-cuda-toolkit/usr/local/$CUDA_VERSION_DIR" - ls -la cuda - echo "[DEBUG] Checking nvcc at: /shared-cuda-toolkit/usr/local/cuda/bin/nvcc" - ls -la /shared-cuda-toolkit/usr/local/cuda/bin/nvcc 2>&1 || echo "[DEBUG] nvcc not found" - else - echo "[ERROR] Could not find CUDA installation directory" + # Find the CUDA installation directory in /cuda/usr/local/ + CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) + if [ -z "$CUDA_SRC_DIR" ]; then + echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" + umount /shared-cuda-toolkit 2>/dev/null exit 1 fi + echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" + echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." + # Copy only the contents of cuda-13.2 directly to /shared-cuda-toolkit + cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true + echo "[INFO] Verifying CUDA toolkit installation..." - if [ -f "/shared-cuda-toolkit/usr/local/cuda/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/usr/local/cuda/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: /shared-cuda-toolkit/usr/local/cuda" + echo "[INFO] CUDA installation path: /shared-cuda-toolkit" else echo "[ERROR] CUDA toolkit (nvcc) not found after installation." 
umount /shared-cuda-toolkit 2>/dev/null @@ -163,15 +150,15 @@ # Set up environment variables pointing to shared location cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit ENDOFFILE # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit echo "[INFO] CUDA environment configured" @@ -252,7 +239,6 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools @@ -269,6 +255,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + + - /usr/local/bin/install_cuda_toolkit.sh {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cb62f3fe98..ed787eb0f0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -119,7 +119,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda/usr/local/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda @@ -219,32 +219,51 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." - - # Detect CUDA major version from nvidia-smi - echo "[INFO] Detecting CUDA version from NVIDIA driver..." - CUDA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | awk -F'.' 
'{print $1}') + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." + # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + # Fallback: Try to get CUDA version from nvcc if available - if [ -z "$CUDA_VERSION" ] || [ "$CUDA_VERSION" -lt "12" ]; then + if [ -z "$CUDA_VERSION" ]; then if command -v nvcc &>/dev/null; then CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" else - echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc. Skipping DCGM setup." + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." exit 1 fi else - echo "[INFO] CUDA major version detected: $CUDA_VERSION" + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" fi # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." - if ! dnf install -y --nogpgcheck --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." 
+ # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." + fi + # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." systemctl enable {{ dcgm_service_name }} @@ -276,8 +295,11 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." 
fi @@ -327,6 +349,7 @@ echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -558,8 +581,7 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - /usr/local/bin/install_nvidia_driver.sh - - /usr/local/bin/setup_dcgm.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh @@ -613,6 +635,8 @@ - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh + - /usr/local/bin/install_nvidia_driver.sh + - /usr/local/bin/setup_dcgm.sh - systemctl restart slurmd - echo "Cloud-Init has completed successfully." 
\ No newline at end of file From 70ed9038c7fc0c84b3665c5a6614f32edc0d380c Mon Sep 17 00:00:00 2001 From: mcas Date: Tue, 28 Apr 2026 15:22:48 +0530 Subject: [PATCH 04/13] login compiler node cuda lock installation --- ...i-group-login_compiler_node_x86_64.yaml.j2 | 106 +++--------- .../hpc_tools/cuda_lock_manager.sh.j2 | 78 +++++++++ .../hpc_tools/generate_install_uuid.sh.j2 | 10 ++ .../hpc_tools/install_cuda_toolkit.sh.j2 | 151 ++++++++++++++++++ .../roles/configure_ochami/vars/main.yml | 3 + .../tasks/read_slurm_hostnames.yml | 8 + 6 files changed, 270 insertions(+), 86 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index c43d768d99..1847869058 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -77,95 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo 
"[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - # Mount NFS cuda directory - echo "[INFO] Mounting NFS cuda directory..." - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - exit 1 - fi - - # Check if CUDA toolkit is already installed on NFS - if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - umount /shared-cuda-toolkit 2>/dev/null - exit 0 - fi - - echo "[INFO] Installing CUDA toolkit to local location using dnf..." - mkdir -p /cuda - dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." - else - echo "[ERROR] CUDA toolkit installation failed." - umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - echo "[INFO] Copying CUDA toolkit to shared location..." - # Find the CUDA installation directory in /cuda/usr/local/ - CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) - if [ -z "$CUDA_SRC_DIR" ]; then - echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" - umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi - - echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" - echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." - # Copy only the contents of cuda-13.2 directly to /shared-cuda-toolkit - cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Verifying CUDA toolkit installation..." 
- if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: /shared-cuda-toolkit" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." - umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi - - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE - - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - - echo "[INFO] Cleaning up temporary mount..." - umount /shared-cuda-toolkit 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -258,7 +188,11 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or 
hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 new file mode 100644 index 0000000000..c037204a28 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 @@ -0,0 +1,78 @@ +#!/bin/bash +# Distributed lock manager for CUDA toolkit install on shared NFS. +# Backed by atomic mkdir on /hpc_tools/cuda/.nfs_lock_cuda. +# Exposes: acquire | release | wait | is_stale +set -euo pipefail + +LOCK_ROOT="/hpc_tools/cuda" +LOCK_DIR="$LOCK_ROOT/.nfs_lock_cuda" +OWNER_FILE="$LOCK_DIR/owner.txt" +DONE_FILE="$LOCK_ROOT/.done_cuda" +STATUS_LOG="$LOCK_ROOT/.cuda_install_status.log" +HOSTNAME_FILE="/var/run/cuda_install_hostname" + +INSTALL_TIMEOUT="${INSTALL_TIMEOUT:-1800}" +POLL_INTERVAL="${POLL_INTERVAL:-5}" +TAKEOVER_MIN="${TAKEOVER_MIN:-5}" +TAKEOVER_MAX="${TAKEOVER_MAX:-15}" +GLOBAL_WAIT_TIMEOUT="${GLOBAL_WAIT_TIMEOUT:-$((INSTALL_TIMEOUT * 2))}" + +log_status() { + # ts host hostname role result + printf '%s %s %s %s %s\n' \ + "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \ + "$(cat "$HOSTNAME_FILE" 2>/dev/null || echo UNKNOWN)" \ + "$1" "$2" >> "$STATUS_LOG" +} + +acquire() { + # Fast path: already done + [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 2; } + if mkdir "$LOCK_DIR" 2>/dev/null; then + cat "$HOSTNAME_FILE" > "$OWNER_FILE" + log_status installer lock_acquired + return 0 # we are installer + fi + return 1 # we are waiter +} + +release() { rm -rf "$LOCK_DIR"; } + +is_stale() { + # Owner hostname → hostname from status log → ping + local owner_hostname host + owner_hostname=$(cat "$OWNER_FILE" 2>/dev/null || echo "") + [ -z "$owner_hostname" ] && return 1 + host=$(awk -v h="$owner_hostname" '$3==h {print $2; exit}' "$STATUS_LOG") + [ -z "$host" ] && return 1 + ping -c1 -W2 "$host" >/dev/null 2>&1 && return 1 + return 0 # host 
unreachable → stale +} + +wait_for_done_or_takeover() { + local started; started=$(date +%s) + while true; do + [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 0; } + if [ ! -d "$LOCK_DIR" ]; then + sleep $(( RANDOM % (TAKEOVER_MAX - TAKEOVER_MIN + 1) + TAKEOVER_MIN )) + return 10 # caller should retry acquire + fi + if is_stale; then + log_status waiter crash_detected + release + continue + fi + (( $(date +%s) - started > GLOBAL_WAIT_TIMEOUT )) && { + log_status timeout_waiter fail; return 1; + } + sleep "$POLL_INTERVAL" + done +} + +case "${1:-}" in + acquire) acquire ;; + release) release ;; + wait) wait_for_done_or_takeover ;; + is_stale) is_stale ;; + *) echo "usage: $0 {acquire|release|wait|is_stale}" >&2; exit 64 ;; +esac diff --git a/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 new file mode 100644 index 0000000000..be8fb867b3 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 @@ -0,0 +1,10 @@ +#!/bin/bash +# Generate hostname for lock ownership identity. +# Idempotent: uses hostname directly. +set -euo pipefail + +HOSTNAME_FILE="/var/run/cuda_install_hostname" + +hostname > "$HOSTNAME_FILE" + +echo "[INFO] CUDA install hostname for this node: $(cat "$HOSTNAME_FILE")" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 new file mode 100644 index 0000000000..5ebb472a4e --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 @@ -0,0 +1,151 @@ +#!/bin/bash +# Lock-aware CUDA toolkit installer. Publishes to /hpc_tools/cuda on NFS. +# Exits 0 if toolkit is already present (.done_cuda), if this node installed it, +# or if another node installed it while we were waiting. 
+set -euo pipefail + +LOGFILE="/var/log/cuda_toolkit_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +LOCK_ROOT="/hpc_tools/cuda" +DONE_FILE="$LOCK_ROOT/.done_cuda" +LOCK_MGR="/usr/local/bin/cuda_lock_manager.sh" +HOSTNAME_FILE="/var/run/cuda_install_hostname" + +# Function to set up CUDA environment variables +setup_cuda_env() { + echo "[INFO] Setting up CUDA environment variables for shared location..." + cat > /etc/profile.d/cuda.sh <<'EOF' +export PATH=/hpc_tools/cuda/bin:$PATH +export LD_LIBRARY_PATH=/hpc_tools/cuda/lib64:$LD_LIBRARY_PATH +export CUDA_HOME=/hpc_tools/cuda +EOF + chmod +x /etc/profile.d/cuda.sh + echo "[INFO] CUDA environment configured successfully" +} + +# Generate hostname for lock ownership (idempotent) +/usr/local/bin/generate_install_uuid.sh + +# Fast-path: already done +[ -f "$DONE_FILE" ] && { echo "[INFO] .done_cuda present, skipping."; exit 0; } + +# Check if running in manual mode (not cloud-init) +MANUAL_MODE="${CUDA_INSTALL_MANUAL:-false}" +if [ "$MANUAL_MODE" = "true" ]; then + echo "[INFO] Running in manual mode - will force acquire lock if held" + FORCE_LOCK=true +else + echo "[INFO] Running in cloud-init mode - will proceed without waiting if lock held" + FORCE_LOCK=false +fi + +# Attempt lock acquisition +set +e; "$LOCK_MGR" acquire; rc=$?; set -e + +# In manual mode, if lock is held, release it explicitly then acquire again +if [ "$FORCE_LOCK" = "true" ] && [ "$rc" = "1" ]; then + echo "[WARN] Lock is held by another node. In manual mode, releasing lock first..." + "$LOCK_MGR" release + echo "[INFO] Lock released. Now acquiring lock..." + set +e; "$LOCK_MGR" acquire; rc=$?; set -e +fi +case $rc in + 0) # installer + echo "[INFO] Acquired lock. Installing toolkit..." + mkdir -p /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit + + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to mount NFS cuda share." + echo "[ERROR] CUDA toolkit installation failed on this node." 
+ echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + # Check if CUDA toolkit is already installed on NFS + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + echo "[INFO] CUDA toolkit already installed on NFS. Exiting." + "$LOCK_MGR" release + exit 0 + fi + + # Install CUDA toolkit to local location using dnf + echo "[INFO] Installing CUDA toolkit to local location using dnf..." + mkdir -p /cuda + if timeout "${INSTALL_TIMEOUT:-1800}" dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit; then + echo "[SUCCESS] CUDA toolkit installed successfully." + + # Copy CUDA toolkit to shared location + echo "[INFO] Copying CUDA toolkit to shared location..." + CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) + if [ -z "$CUDA_SRC_DIR" ]; then + echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" + echo "[ERROR] CUDA toolkit installation failed on this node." + echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" + echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." + cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true + + # Verify CUDA toolkit installation + echo "[INFO] Verifying CUDA toolkit installation..." + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + echo "[SUCCESS] CUDA toolkit verified." + else + echo "[ERROR] CUDA toolkit (nvcc) not found after installation." + echo "[ERROR] CUDA toolkit installation failed on this node." + echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + # Atomic publish of .done_cuda (see §4.4). Never use `touch`. 
+        TMP="$LOCK_ROOT/.done_cuda.tmp.$(cat "$HOSTNAME_FILE")"
+        printf 'installed_by=%s\nts=%s\n' \
+            "$(hostname -s)" "$(date -Iseconds)" > "$TMP"
+        sync -f "$TMP" 2>/dev/null || sync
+        mv -f -- "$TMP" "$DONE_FILE"
+        "$LOCK_MGR" release
+        # log pass
+        printf '%s %s %s installer pass\n' \
+            "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+            "$(cat "$HOSTNAME_FILE")" \
+            >> "$LOCK_ROOT/.cuda_install_status.log"
+
+        setup_cuda_env
+        # Best-effort unmount: .done_cuda is already published, so a failed umount must not abort under `set -e`.
+        umount /shared-cuda-toolkit 2>/dev/null || echo "[WARN] Could not unmount /shared-cuda-toolkit; continuing."
+        exit 0
+    else
+        result=$?
+        "$LOCK_MGR" release
+        if [ "$result" = "124" ]; then st="timeout_killed"; else st="fail"; fi
+        printf '%s %s %s installer %s\n' \
+            "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \
+            "$(cat "$HOSTNAME_FILE")" "$st" \
+            >> "$LOCK_ROOT/.cuda_install_status.log"
+        echo "[ERROR] CUDA toolkit installation failed on this node."
+        echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh"
+        exit 1
+    fi
+    ;;
+  1) # waiter - another node is installing
+    echo "[INFO] Another node is installing CUDA toolkit. Proceeding with cloud-init without waiting."
+    echo "[INFO] This node will use the shared CUDA toolkit once installation completes."
+    setup_cuda_env
+    echo "[INFO] CUDA environment configured (will work once installation completes)"
+
+    exit 0
+    ;;
+  2) # already done
+    echo "[INFO] CUDA toolkit already installed on shared storage."
+ setup_cuda_env + exit 0 + ;; + *) echo "[ERROR] acquire rc=$rc"; exit 1 ;; +esac diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 0f9a5bf620..e57f9a301c 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -102,6 +102,9 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" +# Login/compiler node presence flag (set by slurm_config role) +login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}" +slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}" # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" diff --git a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml index c61e8d92a9..5b99d35c30 100644 --- a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -92,3 +92,11 @@ ansible.builtin.set_fact: controller_ip: "{{ ip_name_map[ctld_list | first] }}" when: ctld_list | length > 0 + +- name: Set login_compiler_node_present flag + ansible.builtin.set_fact: + login_compiler_node_present: "{{ compiler_login_list | length > 0 }}" + +- name: Set slurm_node_present flag + ansible.builtin.set_fact: + slurm_node_present: "{{ cmpt_list | length > 0 }}" From 2b9ecde9d55aacd24e54cf99881bf15c54d7fe55 Mon Sep 17 00:00:00 2001 From: mcas Date: Tue, 28 Apr 2026 18:07:14 +0530 Subject: [PATCH 05/13] updated the message for nodes where installation skips --- .../templates/hpc_tools/install_cuda_toolkit.sh.j2 | 8 
+++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 index 5ebb472a4e..389d0ebade 100644 --- a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 @@ -28,7 +28,13 @@ EOF /usr/local/bin/generate_install_uuid.sh # Fast-path: already done -[ -f "$DONE_FILE" ] && { echo "[INFO] .done_cuda present, skipping."; exit 0; } +[ -f "$DONE_FILE" ] && { + echo "[INFO] CUDA toolkit already installed on shared storage by another node." + echo "[INFO] This node will use the existing CUDA installation." + setup_cuda_env + echo "[INFO] CUDA environment configured successfully." + exit 0 +} # Check if running in manual mode (not cloud-init) MANUAL_MODE="${CUDA_INSTALL_MANUAL:-false}" From cdf070b4100a11e77e3b21e8a5ded78dd380b9b1 Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 29 Apr 2026 11:51:00 +0530 Subject: [PATCH 06/13] slurm cuda,dcg,peermem lock installation --- ...-group-login_compiler_node_aarch64.yaml.j2 | 107 +++----------- .../ci-group-slurm_node_aarch64.yaml.j2 | 55 ++++++- .../ci-group-slurm_node_x86_64.yaml.j2 | 56 +++++++- .../hpc_tools/install_cuda_driver.sh.j2 | 42 ++++++ .../hpc_tools/install_cuda_toolkit.sh.j2 | 11 +- .../templates/hpc_tools/install_dcgm.sh.j2 | 98 +++++++++++++ .../hpc_tools/install_nvidia_peermem.sh.j2 | 134 ++++++++++++++++++ .../hpc_tools/slurm_cuda_coordinator.sh.j2 | 50 +++++++ 8 files changed, 461 insertions(+), 92 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 create mode 100644 
provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 7c4d006b2b..5cca3891ea 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -77,95 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - # Mount NFS cuda directory - echo "[INFO] Mounting NFS cuda directory..." - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - exit 1 - fi - - # Check if CUDA toolkit is already installed on NFS - if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - umount /shared-cuda-toolkit 2>/dev/null - exit 0 - fi - - echo "[INFO] Installing CUDA toolkit to local location using dnf..." 
- mkdir -p /cuda - dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." - else - echo "[ERROR] CUDA toolkit installation failed." - umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi - - echo "[INFO] Copying CUDA toolkit to shared location..." - # Find the CUDA installation directory in /cuda/usr/local/ - CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) - if [ -z "$CUDA_SRC_DIR" ]; then - echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" - umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi - - echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" - echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." - # Copy only the contents of cuda-13.2 directly to /shared-cuda-toolkit - cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Verifying CUDA toolkit installation..." - if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then - CUDA_VERSION=$(/shared-cuda-toolkit/bin/nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: /shared-cuda-toolkit" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." 
- umount /shared-cuda-toolkit 2>/dev/null - exit 1 - fi - - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - - echo "[INFO] Cleaning up temporary mount..." - umount /shared-cuda-toolkit 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -255,8 +185,11 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - + +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 
efcdb86926..d4dc32cda5 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -326,6 +326,51 @@ permissions: '0755' content: | {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} +{% endif %} +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% endif %} - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' 
@@ -635,8 +680,14 @@ - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh - - /usr/local/bin/install_nvidia_driver.sh - - /usr/local/bin/setup_dcgm.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh + /usr/local/bin/install_dcgm.sh + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index a52a83e1f9..bd1db1bc8b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -329,6 +329,52 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + 
permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} +{% endif %} + - path: /etc/hosts append: true content: | @@ -640,8 +686,14 @@ {% endif %} - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh - - /usr/local/bin/install_nvidia_driver.sh - - /usr/local/bin/setup_dcgm.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh + /usr/local/bin/install_dcgm.sh + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 new file mode 100644 index 0000000000..ba2cde8f3d --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 @@ -0,0 +1,42 @@ +#!/bin/bash +# Local NVIDIA driver install. Always runs on Slurm nodes. Idempotent. +# Never touches NFS lock artifacts. Never touches /hpc_tools/cuda contents. +set -euo pipefail + +LOGFILE="/var/log/nvidia_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== NVIDIA driver install =====" + +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Exiting." + exit 0 +fi + +if command -v nvidia-smi >/dev/null 2>&1; then + echo "[INFO] NVIDIA driver already installed. Skipping." 
+else + echo "[INFO] Installing NVIDIA driver via dnf..." + dnf install -y cuda-drivers + command -v nvidia-smi >/dev/null 2>&1 || { echo "[ERROR] Driver install failed."; exit 1; } +fi + +nvidia-smi -pm 1 || true + +# Mount shared toolkit at /usr/local/cuda (harmless if already mounted) +mkdir -p /usr/local/cuda +CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda" +if ! mountpoint -q /usr/local/cuda; then + mount -t nfs "$CUDA_NFS" /usr/local/cuda || true +fi +grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \ + echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab + +cat > /etc/profile.d/cuda.sh <<'EOF' +export PATH=/usr/local/cuda/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export CUDA_HOME=/usr/local/cuda +EOF +chmod +x /etc/profile.d/cuda.sh + +echo "===== NVIDIA driver install completed =====" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 index 389d0ebade..471c3be291 100644 --- a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 @@ -1,7 +1,6 @@ #!/bin/bash # Lock-aware CUDA toolkit installer. Publishes to /hpc_tools/cuda on NFS. # Exits 0 if toolkit is already present (.done_cuda), if this node installed it, -# or if another node installed it while we were waiting. set -euo pipefail LOGFILE="/var/log/cuda_toolkit_install.log" @@ -80,9 +79,19 @@ case $rc in # Install CUDA toolkit to local location using dnf echo "[INFO] Installing CUDA toolkit to local location using dnf..." 
mkdir -p /cuda + # Copy host repository configuration to installroot (only for manual mode) + if [ "$MANUAL_MODE" = "true" ]; then + mkdir -p /cuda/etc/yum.repos.d + cp -r /etc/yum.repos.d/* /cuda/etc/yum.repos.d/ 2>/dev/null || true + fi if timeout "${INSTALL_TIMEOUT:-1800}" dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit; then echo "[SUCCESS] CUDA toolkit installed successfully." + # Clean up repository configuration from installroot (if copied for manual mode) + if [ "$MANUAL_MODE" = "true" ]; then + rm -rf /cuda/etc/yum.repos.d + fi + # Copy CUDA toolkit to shared location echo "[INFO] Copying CUDA toolkit to shared location..." CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 new file mode 100644 index 0000000000..158e089805 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 @@ -0,0 +1,98 @@ +#!/bin/bash +LOGFILE="/var/log/dcgm_setup.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Starting NVIDIA DCGM setup =====" + +# GPU detection gate - DCGM requires NVIDIA GPU hardware +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Skipping DCGM setup." + exit 0 +fi + +# CUDA prerequisite gate +echo "[INFO] Validating NVIDIA driver prerequisite..." +if ! command -v nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping DCGM setup." + exit 0 +fi + +if ! nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping DCGM setup." + exit 0 +fi +echo "[INFO] NVIDIA driver prerequisite satisfied." 
+ +# Display nvidia-smi output for verification +echo "========== NVIDIA Driver & GPU Information ==========" +nvidia-smi 2>&1 +echo "=====================================================" + +# Detect CUDA major version for DCGM package selection +echo "[INFO] Detecting CUDA version for DCGM package compatibility..." +# Try to get CUDA version from nvidia-smi +CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + +# Fallback: Try to get CUDA version from nvcc if available +if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi +else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" +fi + +# Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies +echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." +if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." + exit 1 +fi +echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." + +# Enable and start DCGM daemon (SB-003) +echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." +systemctl enable {{ dcgm_service_name }} + +RETRIES={{ dcgm_health_check_retries }} +ATTEMPT=0 +DCGM_STARTED=false + +while [ $ATTEMPT -lt $RETRIES ]; do + ATTEMPT=$((ATTEMPT + 1)) + echo "[INFO] Starting {{ dcgm_service_name }} (attempt $ATTEMPT/$RETRIES)..." 
+    systemctl start {{ dcgm_service_name }}
+    sleep 3
+
+    if systemctl is-active --quiet {{ dcgm_service_name }}; then
+        DCGM_STARTED=true
+        echo "[SUCCESS] {{ dcgm_service_name }}.service is active."
+        break
+    else
+        echo "[WARN] {{ dcgm_service_name }} failed to start on attempt $ATTEMPT."
+    fi
+done
+
+if [ "$DCGM_STARTED" != "true" ]; then
+    echo "[ERROR] {{ dcgm_service_name }} failed to start after $RETRIES attempts. Service will stay down (BL-002)."
+    exit 1
+fi
+
+# GPU discovery (SB-004). grep -c "GPU" counted every line containing "GPU", not devices; NOTE(review): assumes dcgmi prints an "N GPU(s) found." summary line - confirm.
+echo "[INFO] Enumerating GPUs via dcgmi discovery..."
+if command -v dcgmi &>/dev/null; then
+    echo "========== GPU Discovery Output =========="
+    dcgmi discovery -l 2>&1
+    GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | awk '/GPUs? found/ {print $1; exit}')
+    echo "=========================================="
+    echo "[SUCCESS] GPU discovery completed. Found ${GPU_COUNT:-unknown} GPU(s)."
+else
+    echo "[WARN] dcgmi command not found. Skipping GPU enumeration."
+fi
+
+echo "===== NVIDIA DCGM setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
new file mode 100644
index 0000000000..6ad516a28f
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2
@@ -0,0 +1,134 @@
+#!/bin/bash
+# NVIDIA Peer Memory (nvidia-peermem) DKMS installation for GPUDirect RDMA support.
+# SHALL be installed on all compute nodes where GPU hardware is detected.
+# Required on RDMA-capable GPU nodes only.
+# Idempotent: skips installation if module is already loaded.
+set -euo pipefail
+
+LOGFILE="/var/log/nvidia_peermem_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA Peer Memory (nvidia-peermem) setup ====="
+
+# GPU detection gate - only proceed if NVIDIA GPU is present
+echo "[INFO] Checking for NVIDIA GPU hardware..."
+if ! lspci | grep -qi nvidia; then
+    echo "[INFO] No NVIDIA GPU detected.
Skipping nvidia-peermem installation." + exit 0 +fi + +# NVIDIA driver prerequisite gate +echo "[INFO] Validating NVIDIA driver prerequisite..." +if ! command -v nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping nvidia-peermem." + exit 0 +fi + +if ! nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping nvidia-peermem." + exit 0 +fi +echo "[INFO] NVIDIA driver prerequisite satisfied." + +# Check if nvidia-peermem module is already loaded +echo "[INFO] Checking if nvidia-peermem module is already loaded..." +if lsmod | grep -qE 'nv_peer_mem|nvidia_peermem'; then + echo "[INFO] nvidia-peermem module is already loaded. Skipping installation." + # Verify module metadata + if modinfo nvidia-peermem &>/dev/null; then + echo "[INFO] nvidia-peermem module metadata verified." + else + echo "[WARN] nvidia-peermem module loaded but modinfo failed. This may indicate a corrupted module." + fi + exit 0 +fi + +# Check running kernel +KERNEL_VERSION=$(uname -r) +echo "[INFO] Running kernel version: $KERNEL_VERSION" + +# Check if kernel headers are available (required for DKMS) +if [ ! -d "/lib/modules/$KERNEL_VERSION/build" ]; then + echo "[ERROR] Kernel headers not found for kernel $KERNEL_VERSION." + echo "[ERROR] Required for DKMS build. Please install kernel-devel package." + exit 1 +fi + +# Get NVIDIA driver version from nvidia-smi +NVIDIA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | tr -d ' ') +if [ -z "$NVIDIA_VERSION" ]; then + echo "[ERROR] Could not determine NVIDIA driver version from nvidia-smi." + exit 1 +fi +echo "[INFO] NVIDIA driver version: $NVIDIA_VERSION" + +# Check current DKMS status +echo "[INFO] Checking current DKMS status..." +dkms status || true + +# Add NVIDIA driver to DKMS if not already added +if ! dkms status | grep -q "nvidia/$NVIDIA_VERSION"; then + echo "[INFO] Adding NVIDIA driver $NVIDIA_VERSION to DKMS..." 
+ if ! dkms add -m nvidia -v "$NVIDIA_VERSION"; then + echo "[ERROR] Failed to add NVIDIA driver to DKMS." + exit 1 + fi + echo "[INFO] NVIDIA driver added to DKMS successfully." +else + echo "[INFO] NVIDIA driver $NVIDIA_VERSION already in DKMS." +fi + +# Build NVIDIA module for the running kernel +echo "[INFO] Building NVIDIA module for kernel $KERNEL_VERSION..." +if ! dkms build -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then + echo "[ERROR] Failed to build NVIDIA module for kernel $KERNEL_VERSION." + echo "[ERROR] Check kernel logs for build errors." + exit 1 +fi +echo "[INFO] NVIDIA module built successfully." + +# Install the built module +echo "[INFO] Installing NVIDIA module for kernel $KERNEL_VERSION..." +if ! dkms install -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then + echo "[ERROR] Failed to install NVIDIA module for kernel $KERNEL_VERSION." + exit 1 +fi +echo "[INFO] NVIDIA module installed successfully." + +# Verify nvidia-peermem module metadata +echo "[INFO] Verifying nvidia-peermem module metadata..." +if modinfo nvidia-peermem &>/dev/null; then + echo "[INFO] nvidia-peermem module metadata verified." + modinfo nvidia-peermem +else + echo "[ERROR] nvidia-peermem module metadata not found after DKMS install." + echo "[ERROR] This may indicate the module was not built or installed correctly." + exit 1 +fi + +# Load the nvidia-peermem module +echo "[INFO] Loading nvidia-peermem module..." +if modprobe nvidia-peermem; then + echo "[SUCCESS] nvidia-peermem module loaded successfully." +else + echo "[WARN] Failed to load nvidia-peermem module with modprobe." + echo "[WARN] This may not be critical if RDMA is not required on this node." + echo "[WARN] Check kernel logs for detailed error information." 
+ dmesg | grep -i peermem || true + # Continue with warning unless RDMA dependency exists + # (RDMA dependency check would be environment-specific) +fi + +# Confirm module is loaded +if lsmod | grep -q nvidia_peermem; then + echo "[SUCCESS] nvidia_peermem is loaded in kernel." +else + echo "[WARN] nvidia_peermem not found in lsmod output." + echo "[WARN] Module may have failed to load or may not be required for this configuration." +fi + +# Check kernel logs for peer memory messages or errors +echo "[INFO] Checking kernel logs for peer memory messages..." +dmesg | grep -i peermem || echo "[INFO] No peermem messages found in recent kernel logs." + +echo "===== NVIDIA Peer Memory (nvidia-peermem) setup completed =====" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 new file mode 100644 index 0000000000..79d72db10b --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 @@ -0,0 +1,50 @@ +#!/bin/bash +# Slurm-node entry point. Decides toolkit path based on login_compiler_node_present. +# GPU detection gate ensures CUDA operations only run on nodes with NVIDIA hardware. + +set -euo pipefail + +LOGIN_COMPILER_PRESENT="{{ login_compiler_node_present | lower }}" +SLURM_NODE_PRESENT="{{ slurm_node_present | lower }}" + +[ "$SLURM_NODE_PRESENT" = "true" ] || { echo "[INFO] Not a Slurm node."; exit 0; } + +# GPU detection gate - if no GPU present, skip CUDA toolkit and driver installation +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Skipping CUDA toolkit and driver installation." + exit 0 +fi + +if [ "$LOGIN_COMPILER_PRESENT" = "true" ]; then + echo "[INFO] Login/compiler nodes present → mounting shared toolkit from NFS." + # Mount shared toolkit at /usr/local/cuda + mkdir -p /usr/local/cuda + CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda" + if ! 
mountpoint -q /usr/local/cuda; then + mount -t nfs "$CUDA_NFS" /usr/local/cuda || true + fi + grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \ + echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab + # Export CUDA environment variables + cat > /etc/profile.d/cuda.sh <<'EOF' +export PATH=/usr/local/cuda/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export CUDA_HOME=/usr/local/cuda +EOF + chmod +x /etc/profile.d/cuda.sh + echo "[INFO] CUDA environment configured from shared NFS toolkit." +else + echo "[INFO] No login/compiler nodes → participating in lock." + # install_cuda_toolkit.sh is lock-aware: + # - if this node wins the lock, it runs the install and publishes .done_cuda + # - if this node loses the lock, it returns immediately without waiting + if ! /usr/local/bin/install_cuda_toolkit.sh; then + echo "[ERROR] install_cuda_toolkit.sh returned non-zero." + exit 1 + fi + echo "[INFO] CUDA toolkit installation handled by another node or completed by this node." + echo "[INFO] Proceeding with driver, DCGM, and nvidia-peermem installation." +fi + +echo "[SUCCESS] CUDA coordinator completed." 
+exit 0 From f06835bc985b26884bfef606e65c3cde47f54b92 Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 29 Apr 2026 16:11:08 +0530 Subject: [PATCH 07/13] adding mod[rpbe for peermem --- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 2 +- .../templates/hpc_tools/install_nvidia_peermem.sh.j2 | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index af308509c2..90a2a0e070 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -191,10 +191,10 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - /usr/local/bin/configure_vast_installation.sh - - mount -a {% if login_compiler_node_present %} - /usr/local/bin/generate_install_uuid.sh diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 index 6ad516a28f..4a51c179ae 100644 --- a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 @@ -106,6 +106,13 @@ else exit 1 fi +# Ensure base NVIDIA modules are loaded first + echo "Loading base NVIDIA modules..." 
+ modprobe nvidia 2>/dev/null || echo "nvidia module not available or failed to load" + modprobe nvidia-uvm 2>/dev/null || echo "nvidia-uvm module not available or failed to load" + modprobe nvidia-modeset 2>/dev/null || echo "nvidia-modeset module not available or failed to load" + modprobe nvidia-drm 2>/dev/null || echo "nvidia-drm module not available or failed to load" + # Load the nvidia-peermem module echo "[INFO] Loading nvidia-peermem module..." if modprobe nvidia-peermem; then From b9d941d5f4ece3fa56c79efea001824a30165efd Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 30 Apr 2026 11:54:56 +0530 Subject: [PATCH 08/13] adding cuda mount for slurm --- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 1 - .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index fcbc6d8468..6e02844ee7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -402,7 +402,6 @@ echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab 
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 1fed916565..7b342f105b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -416,6 +416,7 @@ echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/ /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab From 766482722551f3f74974663af34f188c320e8279 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 30 Apr 2026 13:29:02 +0530 Subject: [PATCH 09/13] adding the dcgm_support --- .../input_validation/schema/telemetry_config.json | 7 ++++++- input/telemetry_config.yml | 8 ++++++++ .../cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 2 ++ .../cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 2 ++ .../tasks/transform_telemetry_config.yml | 2 ++ .../import_input_parameters/templates/telemetry_config.j2 | 8 ++++++++ 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index 7da811bf91..d5f0b6884f 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json 
+++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -39,6 +39,11 @@ "default": 10001, "description": "LDMS sampler port on compute nodes. Valid range: 10001-10100. Default: 10001" }, + "dcgm_support": { + "type": "boolean", + "default": true, + "description": "Enable or disable DCGM installation on Slurm GPU nodes." + }, "powerscale_configurations": { "type": "object", "properties": { @@ -169,7 +174,7 @@ ] } }, - "required": ["idrac_telemetry_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port" ], + "required": ["idrac_telemetry_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port", "dcgm_support" ], "$defs": { "kafka_configurations": { "type": "object", diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index 4fb91aae7f..af30f8030e 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -269,6 +269,14 @@ ldms_sampler_configurations: config_parameters: "" # Monitor all interfaces activation_parameters: "interval=30000000 offset=0" # interval=30000000 microseconds (30 seconds), offset=0 +# ============================================================================ +# DCGM (NVIDIA Data Center GPU Manager) CONFIGURATION +# ============================================================================ +# Enable or disable DCGM installation on Slurm GPU nodes. 
+# Accepted values: true or false +# Default: true +dcgm_support: true + # ============================================================================ # POWERSCALE TELEMETRY CONFIGURATION # ============================================================================ diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 6e02844ee7..1668713db6 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -693,7 +693,9 @@ set -e /usr/local/bin/slurm_cuda_coordinator.sh /usr/local/bin/install_cuda_driver.sh +{% if hostvars['localhost']['dcgm_support'] | default(true) | bool %} /usr/local/bin/install_dcgm.sh +{% endif %} /usr/local/bin/install_nvidia_peermem.sh {% endif %} - systemctl restart slurmd diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 7b342f105b..7f2a73ad6f 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -698,7 +698,9 @@ set -e /usr/local/bin/slurm_cuda_coordinator.sh /usr/local/bin/install_cuda_driver.sh +{% if hostvars['localhost']['dcgm_support'] | default(true) | bool %} /usr/local/bin/install_dcgm.sh +{% endif %} /usr/local/bin/install_nvidia_peermem.sh {% endif %} - systemctl restart slurmd diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 9e431f6671..77a74bd581 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ 
b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -72,6 +72,7 @@ telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}" telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}" telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}" + telemetry_dcgm_support: "{{ backup_telemetry_config.dcgm_support | default(true) }}" telemetry_ldms_sampler_configurations: >- {{ backup_telemetry_config.ldms_sampler_configurations @@ -122,6 +123,7 @@ telemetry_ldms_agg_port: "{{ telemetry_ldms_agg_port }}" telemetry_ldms_store_port: "{{ telemetry_ldms_store_port }}" telemetry_ldms_sampler_port: "{{ telemetry_ldms_sampler_port }}" + telemetry_dcgm_support: "{{ telemetry_dcgm_support }}" telemetry_ldms_sampler_configurations: "{{ telemetry_ldms_sampler_configurations }}" - name: Validate YAML syntax of transformed telemetry_config.yml diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index ae57457882..569f90a1ed 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -188,6 +188,14 @@ ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }} # Default: 10001 ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }} +# ============================================================================ +# DCGM (NVIDIA Data Center GPU Manager) CONFIGURATION +# ============================================================================ +# Enable or disable DCGM installation on Slurm GPU nodes. 
+# Accepted values: true or false +# Default: true +dcgm_support: {{ telemetry_dcgm_support | default(true) | bool | ternary('true', 'false') }} + # LDMS Sampler Plugin Configurations # ---------------------------------------------------------------------------- # Configure which metrics to collect from compute nodes and collection intervals. From 4da575bc0470c5a725d1f92eed72670eacb51e49 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 30 Apr 2026 14:04:56 +0530 Subject: [PATCH 10/13] adding variable approach --- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 2 +- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 2 +- provision/roles/configure_ochami/vars/main.yml | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 1668713db6..e5c5664ea8 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -693,7 +693,7 @@ set -e /usr/local/bin/slurm_cuda_coordinator.sh /usr/local/bin/install_cuda_driver.sh -{% if hostvars['localhost']['dcgm_support'] | default(true) | bool %} +{% if dcgm_support %} /usr/local/bin/install_dcgm.sh {% endif %} /usr/local/bin/install_nvidia_peermem.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 7f2a73ad6f..90447757e7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -698,7 +698,7 @@ set -e /usr/local/bin/slurm_cuda_coordinator.sh /usr/local/bin/install_cuda_driver.sh -{% if 
hostvars['localhost']['dcgm_support'] | default(true) | bool %} +{% if dcgm_support %} /usr/local/bin/install_dcgm.sh {% endif %} /usr/local/bin/install_nvidia_peermem.sh diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index e57f9a301c..90cc13b6e7 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -106,6 +106,7 @@ all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | defaul login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}" slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}" # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 +dcgm_support: "{{ hostvars['localhost']['dcgm_support'] | default(true) | bool }}" # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 From 667454dd3e0541f3b3fac196c1bd14fd7454a5df Mon Sep 17 00:00:00 2001 From: mcas Date: Sat, 2 May 2026 14:37:06 +0530 Subject: [PATCH 11/13] variable chnages wrt new telemetry_config.yml chnages --- provision/roles/configure_ochami/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 90cc13b6e7..8de6ceb69a 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -106,7 +106,7 @@ all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | defaul login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}" slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}" # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 -dcgm_support: "{{ hostvars['localhost']['dcgm_support'] | default(true) | bool }}" 
+dcgm_support: "{{ hostvars['localhost'].get('telemetry_sources', {}).get('dcgm', {}).get('metrics_enabled', true) | bool }}" # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 From 64423172e4cef25306652263395d66ea5e0af79e Mon Sep 17 00:00:00 2001 From: mcas Date: Mon, 4 May 2026 10:25:53 +0530 Subject: [PATCH 12/13] removing dcgm_support variable --- .../tasks/transform_telemetry_config.yml | 1 - .../import_input_parameters/templates/telemetry_config.j2 | 8 -------- 2 files changed, 9 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 77a74bd581..b4fa2cf1c3 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -72,7 +72,6 @@ telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}" telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}" telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}" - telemetry_dcgm_support: "{{ backup_telemetry_config.dcgm_support | default(true) }}" telemetry_ldms_sampler_configurations: >- {{ backup_telemetry_config.ldms_sampler_configurations diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index 569f90a1ed..ae57457882 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -188,14 +188,6 @@ ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }} # Default: 10001 ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }} -# ============================================================================ -# 
DCGM (NVIDIA Data Center GPU Manager) CONFIGURATION -# ============================================================================ -# Enable or disable DCGM installation on Slurm GPU nodes. -# Accepted values: true or false -# Default: true -dcgm_support: {{ telemetry_dcgm_support | default(true) | bool | ternary('true', 'false') }} - # LDMS Sampler Plugin Configurations # ---------------------------------------------------------------------------- # Configure which metrics to collect from compute nodes and collection intervals. From 0571cc013a5d7b9d8c5933580476792db0b8a40a Mon Sep 17 00:00:00 2001 From: mcas Date: Mon, 4 May 2026 10:47:07 +0530 Subject: [PATCH 13/13] removing stale entries --- .../import_input_parameters/tasks/transform_telemetry_config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index b4fa2cf1c3..9e431f6671 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -122,7 +122,6 @@ telemetry_ldms_agg_port: "{{ telemetry_ldms_agg_port }}" telemetry_ldms_store_port: "{{ telemetry_ldms_store_port }}" telemetry_ldms_sampler_port: "{{ telemetry_ldms_sampler_port }}" - telemetry_dcgm_support: "{{ telemetry_dcgm_support }}" telemetry_ldms_sampler_configurations: "{{ telemetry_ldms_sampler_configurations }}" - name: Validate YAML syntax of transformed telemetry_config.yml