Skip to content

Commit

Permalink
Add multihost GPU CI run with last public jaxlib release
Browse files Browse the repository at this point in the history
  • Loading branch information
sudhakarsingh27 committed Sep 30, 2022
1 parent 3c7d927 commit 4fbc9a1
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 4 deletions.
45 changes: 42 additions & 3 deletions .github/workflows/nightly-ci-multiprocess-gpu.yml
Expand Up @@ -15,7 +15,7 @@ on:
- '**workflows/nightly-ci-multiprocess-gpu.yml'

jobs:
build:
jaxlib-nightly:
runs-on: self-hosted
steps:
- uses: actions/checkout@v3
Expand All @@ -24,7 +24,46 @@ jobs:
run: |
export JOBSCRIPTSDIR=${GITHUB_WORKSPACE}/.github/workflows/slurm_job_scripts
source $JOBSCRIPTSDIR/slurm_utils_common.sh
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest.sub | tee output.log
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest_jaxlib_nightly.sub | tee output.log
sleep 2m
export SLURM_JOBID=$(grep 'Submitted batch job' "output.log" | awk '{ print $4 }')
export SLURM_OUTPUT=$(scontrol show job "${SLURM_JOBID}" | grep 'StdOut' | awk -F '=' '{ print $2 }')
job_wait "${SLURM_JOBID}" & PID=$!
touch "${SLURM_OUTPUT}"
echo -e " ---------------------------------------------------\n" \
"----------WAITING FOR SLURM JOB TO BEGIN-----------\n" \
"---------------------------------------------------\n"
tail --pid="${PID}" -f "${SLURM_OUTPUT}"
export SLURM_STATE=$(job_state "${SLURM_JOBID}"); echo "SLURM_JOBID=${SLURM_JOBID} SLURM_STATE='${SLURM_STATE}'"
export SLURM_WALLTIME=$(job_time "${SLURM_JOBID}"); echo "SLURM_WALLTIME=${SLURM_WALLTIME} secs"
export SLURM_EXITCODE=$(job_exit_code "${SLURM_JOBID}" || echo $?); echo "SLURM_EXITCODE='${SLURM_EXITCODE}'"
if [ "${SLURM_EXITCODE}" != "0" ]; then exit ${SLURM_EXITCODE:-999}; fi
if [ "${SLURM_STATE}" != "COMPLETED" ]; then exit 1; fi
- name: Publish Test Results
uses: EnricoMi/publish-unit-test-result-action@v2
if: always()
with:
junit_files: "outputs/*.xml"

- name: Upload run results from all nodes
uses: actions/upload-artifact@v3
if: always()
with:
name: output-from-nodes
path: "outputs/*.txt"

jaxlib-release:
runs-on: self-hosted
needs: jaxlib-nightly
steps:
- uses: actions/checkout@v3

- name: Launch slurm job and hook output to this shell
run: |
export JOBSCRIPTSDIR=${GITHUB_WORKSPACE}/.github/workflows/slurm_job_scripts
source $JOBSCRIPTSDIR/slurm_utils_common.sh
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest_jaxlib_release.sub | tee output.log
sleep 2m
export SLURM_JOBID=$(grep 'Submitted batch job' "output.log" | awk '{ print $4 }')
export SLURM_OUTPUT=$(scontrol show job "${SLURM_JOBID}" | grep 'StdOut' | awk -F '=' '{ print $2 }')
Expand Down Expand Up @@ -55,7 +94,7 @@ jobs:

report:
name: report
needs: build
needs: [jaxlib-nightly, jaxlib-release]
if: |
failure()
&& github.event_name == 'schedule'
Expand Down
Expand Up @@ -50,7 +50,7 @@ OUTPUT_DIR="${BASE_WORKSPACE_DIR}/outputs/"
mkdir -p $OUTPUT_DIR

# redirect both stdout and stderr in the same file for ease of analysis
OUTFILE="${OUTPUT_DIR}/output-%j-%n.txt"
OUTFILE="${OUTPUT_DIR}/output-test-jaxlib-nightly-%j-%n.txt"

# Run any setup commands before the actual pytest command to make sure
# that the processes are launched together
Expand Down
@@ -0,0 +1,80 @@
#!/bin/bash
# Slurm batch script for the nightly multi-node GPU CI run that tests JAX
# against the last *released* jaxlib (installed from the public jax-releases
# wheel index at job start). Submitted with `sbatch -N 2` by the
# `jaxlib-release` job in .github/workflows/nightly-ci-multiprocess-gpu.yml.
#SBATCH -A ci-jax-gpu
#SBATCH -p compute
#SBATCH -N 2 # number of nodes
#SBATCH -t 00:15:00 # wall time
#SBATCH -J "ci-jax-gpu" # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failures
#SBATCH --overcommit # Needed for pytorch (NOTE(review): presumably inherited from a PyTorch template — confirm still needed for JAX)

# Echo every command for post-mortem debugging of the CI log.
set -x

# File system and volume glue code
#-------------------------------------------------------------------------------
# Container image (CUDA 11.4 + cuDNN 8.2, multi-Python manylinux build) run via
# the pyxis/enroot `--container-*` srun flags below.
CONTAINER="nvcr.io/nvidian/jax_t5x:cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython"
CONTAINER_NAME="multinode_ci_test_container"

# GITHUB_WORKSPACE is exported by the GitHub Actions runner that submits this
# job (the checked-out jax repository).
BASE_WORKSPACE_DIR=$GITHUB_WORKSPACE
WORKSPACE_DIR=/workspace

# NOTE(review): WORKSPACE_DIR already begins with '/', so this expands to
# "...://workspace" (double slash). Harmless on Linux paths, but worth
# normalizing to `$BASE_WORKSPACE_DIR:$WORKSPACE_DIR` at some point.
MOUNTS="--container-mounts=$BASE_WORKSPACE_DIR:/$WORKSPACE_DIR"

# Since the docker container doesn't contain MLX drivers for IB, following flags
# are needed to make NCCL work with an ethernet setup
# Note:@sudhakarsingh27 This is very specific, need to abstract this out
EXPORTS="--export=ALL,NCCL_SOCKET_IFNAME=enp45s0f0,NCCL_SOCKET_NTHREADS=2,NCCL_NSOCKS_PERTHREAD=2"
#-------------------------------------------------------------------------------

# Setup command to be run before the actual pytest command.
# Installing "jax[cuda]" from the public release index pulls the latest
# *released* jaxlib (this is what distinguishes this script from the
# jaxlib-nightly variant).
# NOTE: `read -d ''` always returns non-zero at EOF; safe here only because
# this script does not use `set -e`.
read -r -d '' setup_cmd <<EOF
python3.8 -m pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
&& python3.8 -m pip install pytest \
&& python3.8 -m pip install pytest-forked \
&& mkdir -p /workspace/outputs/
EOF

# Main pytest command that runs the tests.
# \${SLURM_PROCID} is backslash-escaped so it expands inside each srun task
# (not here), giving every rank (2 nodes x 8 tasks/node) its own junit file.
read -r -d '' cmd <<EOF
date \
&& python3.8 -m pip list | grep jax \
&& python3.8 -m pytest -m SlurmMultiNodeGpuTest --forked -v -s --continue-on-collection-errors \
--junit-xml=/workspace/outputs/junit_output_\${SLURM_PROCID}.xml \
/workspace/tests/multiprocess_gpu_test.py
EOF

# create run specific output directory for ease of analysis
OUTPUT_DIR="${BASE_WORKSPACE_DIR}/outputs/"
mkdir -p $OUTPUT_DIR

# redirect both stdout and stderr in the same file for ease of analysis
# (%j = Slurm job id, %n = node id; the "jaxlib-release" infix distinguishes
# these files from the jaxlib-nightly run writing to the same outputs/ dir)
OUTFILE="${OUTPUT_DIR}/output-test-jaxlib-release-%j-%n.txt"

# Run any setup commands before the actual pytest command to make sure
# that the processes are launched together.
# One task per node: the pip installs land in the node-local named container,
# which the pytest srun below reuses via --container-name.
echo $setup_cmd
srun -o $OUTFILE -e $OUTFILE \
--ntasks-per-node=1 \
--container-writable \
--container-image="$CONTAINER" \
--container-name=$CONTAINER_NAME \
$MOUNTS \
$EXPORTS \
bash -c "${setup_cmd}"

# Barrier command
# NOTE(review): the srun above runs in the foreground and nothing here is
# backgrounded, so this `wait` is effectively a no-op kept as a safeguard.
wait

# Run the actual pytest command: 8 tasks per node (16 ranks total).
# --open-mode=append keeps the setup output in the same per-node OUTFILE;
# --container-name reuses the container prepared by the setup step.
echo $cmd
srun -o $OUTFILE -e $OUTFILE \
--ntasks-per-node=8 \
--open-mode=append \
--container-writable \
--container-image="$CONTAINER" \
--container-name=$CONTAINER_NAME \
$MOUNTS \
$EXPORTS \
bash -c "${cmd}"
set +x

0 comments on commit 4fbc9a1

Please sign in to comment.