diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index b3e950b4f3..862f4a0fef 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,7 +7,13 @@ {"package": "pmix", "type": "rpm", "repo_name": "appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }, + {"package": "imb", "type": "tarball", "url": "https://github.com/intel/mpi-benchmarks/archive/refs/tags/IMB-v2021.8.tar.gz"}, + {"package": "osu-micro-benchmarks", "type": "tarball", "url": "https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-7.5.tar.gz"}, + {"package": "likwid", "type": "tarball", "url": "https://github.com/RRZE-HPC/likwid/archive/refs/tags/v5.4.1.tar.gz"}, + {"package": "geopm", "type": "tarball", "url": "https://github.com/geopm/geopm/archive/refs/tags/v3.1.0.tar.gz"}, + {"package": "papi", "type": "tarball", "url": "https://icl.utk.edu/projects/papi/downloads/papi-7.1.0.tar.gz"}, + {"package": "sionlib", "type": "tarball", "url": "https://apps.fz-juelich.de/jsc/sionlib/download.php?version=1.7.7"} ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 852944cb70..74920d9e24 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,7 +7,14 @@ {"package": "pmix", "type": "rpm", "repo_name": "appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", 
"repo_name": "doca" }, + {"package": "imb", "type": "tarball", "url": "https://github.com/intel/mpi-benchmarks/archive/refs/tags/IMB-v2021.8.tar.gz"}, + {"package": "osu-micro-benchmarks", "type": "tarball", "url": "https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-7.5.tar.gz"}, + {"package": "likwid", "type": "tarball", "url": "https://github.com/RRZE-HPC/likwid/archive/refs/tags/v5.4.1.tar.gz"}, + {"package": "geopm", "type": "tarball", "url": "https://github.com/geopm/geopm/archive/refs/tags/v3.1.0.tar.gz"}, + {"package": "papi", "type": "tarball", "url": "https://icl.utk.edu/projects/papi/downloads/papi-7.1.0.tar.gz"}, + {"package": "msr-safe", "type": "tarball", "url": "https://github.com/llnl/msr-safe/archive/refs/tags/v1.7.0.tar.gz"}, + {"package": "sionlib", "type": "tarball", "url": "https://apps.fz-juelich.de/jsc/sionlib/download.php?version=1.7.7"} ] }, "slurm_control_node": { diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 317ff1aa26..176e5e6325 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -144,6 +144,7 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ 
cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 593cef9d00..c7c2036a4b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -144,6 +144,7 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index c884c40dc9..41b0df8707 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -402,6 +402,7 @@ echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/ /hpc_tools nfs defaults,_netdev 0 0" >> 
/etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab
diff --git a/provision/roles/slurm_config/files/pull_benchmarks.sh b/provision/roles/slurm_config/files/pull_benchmarks.sh
new file mode 100644
index 0000000000..6bd444b944
--- /dev/null
+++ b/provision/roles/slurm_config/files/pull_benchmarks.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#
+# pull_benchmarks.sh - Pull and organize HPC benchmark artifacts from local repository
+# Usage: ./pull_benchmarks.sh [arch] [config_path]
+#   arch: x86_64 or aarch64 (default: x86_64)
+#   config_path: Optional path to the directory holding slurm_custom.json (default: /opt/omnia/config)
+#
+# Each package of type "tarball" or "source" in slurm_custom.json is copied from
+# the offline repository to /hpc_tools/PACKAGE. If the copy stages nothing and
+# the package has a "url", the archive is downloaded from that URL instead.
+#
+# Exit status: 1 on invalid arguments or missing prerequisites, otherwise 0
+# (per-package failures are reported in the summary line only).
+#
+
+set -e
+
+ARCH="${1:-x86_64}"
+CONFIG_PATH="${2:-/opt/omnia/config}"
+HPC_TOOLS_BASE="/hpc_tools"
+LOCAL_REPO_BASE="/var/lib/pulp/content"
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Validate architecture
+if [[ "${ARCH}" != "x86_64" && "${ARCH}" != "aarch64" ]]; then
+    log_error "Invalid architecture: ${ARCH}. Must be x86_64 or aarch64."
+    exit 1
+fi
+
+# jq is needed for every read of slurm_custom.json
+if ! command -v jq &>/dev/null; then
+    log_error "jq is required but was not found in PATH."
+    exit 1
+fi
+
+# Check if hpc_tools directory exists
+if [[ ! -d "${HPC_TOOLS_BASE}" ]]; then
+    log_error "hpc_tools base directory does not exist: ${HPC_TOOLS_BASE}"
+    log_error "Ensure NFS mount for hpc_tools is available."
+    exit 1
+fi
+
+# Find slurm_custom.json
+SLURM_CUSTOM_FILE=""
+for path in "${CONFIG_PATH}/slurm_custom.json" "/etc/omnia/slurm_custom.json" "/opt/omnia/slurm_custom.json"; do
+    if [[ -f "${path}" ]]; then
+        SLURM_CUSTOM_FILE="${path}"
+        break
+    fi
+done
+
+if [[ -z "${SLURM_CUSTOM_FILE}" ]]; then
+    log_error "slurm_custom.json not found in standard locations."
+    exit 1
+fi
+
+log_info "Using slurm_custom.json: ${SLURM_CUSTOM_FILE}"
+
+# Parse benchmark packages from slurm_custom.json
+# Look for packages with type "tarball" or "source". Entries are matched at any
+# nesting depth (..), so the lookup does not depend on which section of the
+# file holds the package list.
+BENCHMARK_PACKAGES=$(jq -r '[.. | objects | select(.type == "tarball" or .type == "source") | .package | strings] | unique | .[]' "${SLURM_CUSTOM_FILE}" 2>/dev/null || echo "")
+
+if [[ -z "${BENCHMARK_PACKAGES}" ]]; then
+    log_warn "No benchmark packages found in slurm_custom.json."
+    exit 0
+fi
+
+log_info "Found benchmark packages: ${BENCHMARK_PACKAGES}"
+
+# Stage a single benchmark package under ${HPC_TOOLS_BASE}/PACKAGE.
+# Arguments: $1 = package name as listed in slurm_custom.json
+# Returns: 0 if the destination holds at least one entry afterwards, else 1
+pull_benchmark() {
+    local pkg_name="$1"
+    local pkg_url
+    local dest_dir="${HPC_TOOLS_BASE}/${pkg_name}"
+    local artifact_path=""
+    local search_path
+    local archive
+
+    # --arg hands the name to jq as data, so it is never parsed as jq syntax
+    pkg_url=$(jq -r --arg name "${pkg_name}" \
+        'first(.. | objects | select(.package == $name) | .url | strings)' \
+        "${SLURM_CUSTOM_FILE}" 2>/dev/null) || pkg_url=""
+
+    # Create destination directory
+    log_info "Creating directory: ${dest_dir}"
+    mkdir -p "${dest_dir}"
+
+    # Check if artifact exists in local repo
+    # Search in offline_repo structure
+    for search_path in "${LOCAL_REPO_BASE}/offline_repo/cluster/${ARCH}/rhel/10.0/source/${pkg_name}" \
+                       "${LOCAL_REPO_BASE}/offline_repo/cluster/${ARCH}/rhel/10.0/tarball/${pkg_name}"; do
+        if [[ -d "${search_path}" ]]; then
+            artifact_path="${search_path}"
+            break
+        fi
+    done
+
+    if [[ -z "${artifact_path}" ]]; then
+        log_warn "Artifact not found in local repository for ${pkg_name}, skipping."
+        return 1
+    fi
+
+    # Copy artifacts to destination. A failed copy is reported but not fatal:
+    # the direct-download fallback below may still stage the package.
+    log_info "Copying artifacts from ${artifact_path} to ${dest_dir}"
+    if ! cp -r "${artifact_path}"/* "${dest_dir}/" 2>/dev/null; then
+        log_warn "Could not copy artifacts from ${artifact_path}"
+    fi
+
+    # If URL is provided and the local copy staged nothing, attempt direct pull.
+    # Emptiness is tested with "ls -A" because globs are not expanded inside
+    # [[ ]]; a "-f dir/*" test there is false for every directory.
+    if [[ -n "${pkg_url}" && -z "$(ls -A "${dest_dir}" 2>/dev/null)" ]]; then
+        archive="${dest_dir}/${pkg_name}.tar.gz"
+        log_info "Attempting direct pull from URL: ${pkg_url}"
+        # Download to an absolute path instead of cd-ing so the working
+        # directory of the script is untouched, and drop the partial file when
+        # the transfer fails so it is not mistaken for a staged artifact.
+        if command -v wget &>/dev/null; then
+            if ! wget -q "${pkg_url}" -O "${archive}"; then
+                log_warn "Failed to download ${pkg_url}"
+                rm -f "${archive}"
+            fi
+        elif command -v curl &>/dev/null; then
+            if ! curl -sSfL "${pkg_url}" -o "${archive}"; then
+                log_warn "Failed to download ${pkg_url}"
+                rm -f "${archive}"
+            fi
+        fi
+    fi
+
+    # Verify files were copied
+    if [[ -n "$(ls -A "${dest_dir}" 2>/dev/null)" ]]; then
+        log_info "Successfully staged ${pkg_name}"
+        return 0
+    else
+        log_warn "No files staged for ${pkg_name}"
+        return 1
+    fi
+}
+
+# Pull each benchmark
+SUCCESS_COUNT=0
+FAIL_COUNT=0
+
+# Counters use plain assignments: ((VAR++)) returns a non-zero status when VAR
+# is 0, which would abort the script under "set -e" on the first package.
+# Names are read line by line so they are never word-split or glob-expanded.
+while IFS= read -r pkg; do
+    if [[ -z "${pkg}" ]]; then
+        continue
+    fi
+    if pull_benchmark "${pkg}"; then
+        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
+    else
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+    fi
+done <<< "${BENCHMARK_PACKAGES}"
+
+log_info "Benchmark staging complete: ${SUCCESS_COUNT} succeeded, ${FAIL_COUNT} failed"
+
+exit 0
diff --git a/provision/roles/slurm_config/tasks/hpc_tools.yml b/provision/roles/slurm_config/tasks/hpc_tools.yml index 37a2c166d7..940a71137f 100644 --- a/provision/roles/slurm_config/tasks/hpc_tools.yml +++ b/provision/roles/slurm_config/tasks/hpc_tools.yml @@ -42,6 +42,22 @@ group: "{{ root_group }}" mode: "0644" +- name: Deploy pull_benchmarks.sh to NFS share + ansible.builtin.template: + src: "pull_benchmarks.sh.j2" + dest: "{{ pull_benchmarks_script_path }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "0755" + +- name: Deploy benchmark_tools.list to NFS share + ansible.builtin.template: + src: "benchmark_tools.list.j2" + dest: "{{ benchmark_tools_list_path }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "0644" + - name: Set fact for pulp mirror ansible.builtin.set_fact: pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" diff --git a/provision/roles/slurm_config/templates/benchmark_tools.list.j2 b/provision/roles/slurm_config/templates/benchmark_tools.list.j2 new file mode 100644 index 0000000000..1fe5865ed2 --- /dev/null +++ 
b/provision/roles/slurm_config/templates/benchmark_tools.list.j2
@@ -0,0 +1,14 @@
+# Benchmark Tools List
+# Lists HPC benchmark tools to pull from the local offline repository to /hpc_tools/TOOL_NAME/
+# Format: TOOL_NAME (one per line)
+# Lines starting with # are ignored. Empty lines are ignored.
+# Architecture is auto-detected at runtime (uname -m).
+# Note: msr-safe is x86_64 only — automatically skipped on aarch64.
+
+osu-micro-benchmarks
+imb
+likwid
+papi
+geopm
+sionlib
+msr-safe
diff --git a/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2 b/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2
new file mode 100644
index 0000000000..8d5b505713
--- /dev/null
+++ b/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2
@@ -0,0 +1,181 @@
+#!/bin/bash
+# HPC benchmark tarball pull script (Pulp only)
+# Deployed via NFS share for all nodes
+# Reads benchmark tool names from benchmark_tools.list file and downloads tarballs from Pulp
+# Downloads from Pulp mirror only (no internet fallback)
+# Usage: pull_benchmarks.sh
+# Exit status: 0 if no tool failed; 1 if a prerequisite check or any tool failed
+
+LOGFILE="/var/log/pull_benchmarks.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+# Configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_TOOLS_LIST="${SCRIPT_DIR}/benchmark_tools.list"
+HPC_TOOLS_DIR="/hpc_tools"
+ARCH="$(uname -m)"
+OS_VERSION="{{ hostvars['localhost']['cluster_os_version'] }}"
+PULP_SERVER="{{ hostvars['localhost']['admin_nic_ip'] }}:2225"
+PULP_CONTENT_BASE="https://${PULP_SERVER}/pulp/content{{ hostvars['localhost']['oim_shared_path'] }}/omnia/offline_repo/cluster"
+
+echo "===== Starting Benchmark Tarball Pull Script (Pulp Only) ====="
+echo "[INFO] Timestamp: $(date)"
+echo "[INFO] Architecture: $ARCH"
+echo "[INFO] Pulp mirror: $PULP_SERVER"
+
+# Check prerequisites
+if [[ "$ARCH" != "x86_64" && "$ARCH" != "aarch64" ]]; then
+    echo "[ERROR] Unsupported architecture: $ARCH"
+    exit 1
+fi
+
+if [ ! -f "$BENCHMARK_TOOLS_LIST" ]; then
+    echo "[ERROR] Benchmark tools list not found: $BENCHMARK_TOOLS_LIST"
+    echo "[INFO] Please create the file with one tool name per line."
+    echo "[INFO] Expected location: ${SCRIPT_DIR}/benchmark_tools.list"
+    exit 1
+fi
+
+if [ ! -d "$HPC_TOOLS_DIR" ]; then
+    echo "[ERROR] /hpc_tools is not mounted or does not exist"
+    exit 1
+fi
+
+# Function to download all files from a Pulp content directory into a local directory.
+# Uses wget (preferred) or curl as fallback.
+# Arguments: $1 = Pulp directory URL, $2 = local destination directory
+# Returns: 0 on success; non-zero when the listing or any download fails
+# NOTE(review): --no-check-certificate (wget) and -k (curl) disable TLS
+# certificate verification, so the mirror is not authenticated. Consider
+# trusting the mirror's CA on the nodes instead.
+pull_from_pulp() {
+    local url="$1"
+    local dest="$2"
+
+    if command -v wget &>/dev/null; then
+        wget -q -r -np -nd -R "index.html*" --no-check-certificate \
+            -P "$dest" "${url}/" 2>&1
+        return $?
+    elif command -v curl &>/dev/null; then
+        local page
+        local -a files
+        local f
+        local rc=0
+        page="$(curl -ksfL "${url}/" 2>/dev/null)" || return 1
+        # Keep only plain file names from the listing: parent links,
+        # sub-directories, absolute or nested paths and sort-order query links
+        # cannot be saved as "${dest}/NAME".
+        mapfile -t files < <(echo "$page" \
+            | grep -oP 'href="\K[^"]+' \
+            | grep -vE '^\.\.?/?$|index\.html|/|^\?')
+        [ "${#files[@]}" -eq 0 ] && return 1
+        for f in "${files[@]}"; do
+            curl -ksfL "${url}/${f}" -o "${dest}/${f}" || rc=1
+        done
+        return $rc
+    else
+        echo "[ERROR] Neither wget nor curl is available."
+        return 1
+    fi
+}
+
+echo "[INFO] Reading benchmark tools from: $BENCHMARK_TOOLS_LIST"
+
+TOTAL=0
+SUCCESS_COUNT=0
+FAILED_COUNT=0
+SKIPPED_COUNT=0
+FAILED_TOOLS=""
+
+while IFS= read -r tool || [ -n "$tool" ]; do
+    # Tolerate CRLF line endings in a hand-edited list
+    tool="${tool%$'\r'}"
+
+    # Skip empty lines and comments
+    [[ -z "$tool" || "$tool" =~ ^[[:space:]]*# ]] && continue
+
+    # Trim whitespace
+    tool=$(echo "$tool" | xargs)
+    [[ -z "$tool" ]] && continue
+
+    ((TOTAL++))
+
+    echo ""
+    echo "===== Processing Tool $TOTAL: $tool ====="
+
+    # The name becomes a path component of the URL and of the destination
+    # directory (which is removed with rm -rf on failure), so accept plain
+    # file-name tokens only.
+    if [[ ! "$tool" =~ ^[A-Za-z0-9][A-Za-z0-9._+-]*$ ]]; then
+        echo "[ERROR] Invalid tool name '$tool' in $BENCHMARK_TOOLS_LIST. Skipping."
+        ((FAILED_COUNT++))
+        FAILED_TOOLS="${FAILED_TOOLS}\n - ${tool} (invalid name)"
+        continue
+    fi
+
+    # Architecture-specific skip
+    if [[ "$tool" == "msr-safe" && "$ARCH" != "x86_64" ]]; then
+        echo "[WARN] $tool is x86_64 only. Skipping on $ARCH."
+        ((SKIPPED_COUNT++))
+        continue
+    fi
+
+    PULP_URL="${PULP_CONTENT_BASE}/${ARCH}/rhel/${OS_VERSION}/tarball/${tool}"
+    DEST_DIR="${HPC_TOOLS_DIR}/${tool}"
+
+    echo "[INFO] Pulp URL: $PULP_URL"
+    echo "[INFO] Destination: $DEST_DIR"
+
+    # Skip if already staged
+    if [ -d "$DEST_DIR" ] && [ -n "$(ls -A "$DEST_DIR" 2>/dev/null)" ]; then
+        echo "[WARN] $tool already present at $DEST_DIR. Skipping."
+        echo "[INFO] Remove the directory to re-download."
+        ((SKIPPED_COUNT++))
+        continue
+    fi
+
+    mkdir -p "$DEST_DIR"
+
+    echo "[INFO] Pulling from Pulp mirror..."
+
+    if pull_from_pulp "$PULP_URL" "$DEST_DIR"; then
+        if [ -n "$(ls -A "$DEST_DIR" 2>/dev/null)" ]; then
+            echo "[SUCCESS] $tool staged at $DEST_DIR"
+            echo "[SOURCE] Downloaded from: PULP MIRROR ($PULP_SERVER)"
+            ls -lh "$DEST_DIR"
+            ((SUCCESS_COUNT++))
+        else
+            echo "[ERROR] Pull returned success but no files found for $tool"
+            rmdir "$DEST_DIR" 2>/dev/null
+            ((FAILED_COUNT++))
+            FAILED_TOOLS="${FAILED_TOOLS}\n - ${tool} (no files downloaded)"
+        fi
+    else
+        echo "[ERROR] Failed to pull $tool from Pulp mirror."
+        echo "[INFO] Tool may not be available in Pulp or download was interrupted."
+        # Remove partial downloads too: a leftover non-empty directory would
+        # make the "already staged" check skip this tool on every later run.
+        rm -rf -- "$DEST_DIR"
+        ((FAILED_COUNT++))
+        FAILED_TOOLS="${FAILED_TOOLS}\n - ${tool}"
+    fi
+
+done < "$BENCHMARK_TOOLS_LIST"
+
+echo ""
+echo "===== Benchmark Pull Summary ====="
+echo "[INFO] Total tools processed: $TOTAL"
+echo "[INFO] Successful: $SUCCESS_COUNT"
+echo "[INFO] Skipped: $SKIPPED_COUNT"
+echo "[INFO] Failed: $FAILED_COUNT"
+
+if [ $FAILED_COUNT -gt 0 ]; then
+    echo -e "[ERROR] Failed tools:$FAILED_TOOLS"
+    EXIT_CODE=1
+else
+    EXIT_CODE=0
+fi
+
+echo ""
+echo "===== Benchmark Pull Completed ====="
+exit ${EXIT_CODE:-0}
diff --git a/provision/roles/slurm_config/vars/main.yml b/provision/roles/slurm_config/vars/main.yml index d2f60542ed..1c47bc3b50 100644 --- a/provision/roles/slurm_config/vars/main.yml +++ b/provision/roles/slurm_config/vars/main.yml @@ -162,6 +162,10 @@ nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/{{ hostvars['l nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" +# Benchmark pull script path +pull_benchmarks_script_path: "{{ slurm_config_path }}/hpc_tools/scripts/pull_benchmarks.sh" +benchmark_tools_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/benchmark_tools.list" + # parallel file copy parallel_copy_max_workers: 4