# Santos Dumont (SD) - Numba GPU nó B715

Código fonte: ng2.py

* A fila *nvidia_dev* tem 4 nós, cada nó com 2 GPUs, totalizando 8 GPUs
* Em cada nó, ao utilizar as 2 GPUs de forma exclusiva, 22 CPUs ficam sem uso
* 2 CPUs são utilizadas, uma para cada GPU

In [1]:
! cp ng2.py /scratch${PWD#/prj}

In [2]:
%%writefile ng2.srm
#!/bin/bash
#SBATCH --job-name ng2          # Job name
#SBATCH --partition nvidia_dev # Select partition
#SBATCH --ntasks-per-node=2    # Tasks per node
#SBATCH --nodes=2              # Minimum to be allocated
#SBATCH --ntasks=4             # Total tasks
#SBATCH --time=00:20:00        # Limit execution time
#SBATCH --exclusive            # Exclusive acccess to nodes

echo '========================================'
echo '- Job ID:' $SLURM_JOB_ID
echo '- Tasks per node:' $SLURM_NTASKS_PER_NODE
echo '- # of nodes in the job:' $SLURM_JOB_NUM_NODES
echo '- # of tasks:' $SLURM_NTASKS
echo '- Dir from which sbatch was invoked:' ${SLURM_SUBMIT_DIR##*/}
cd $SLURM_SUBMIT_DIR
echo -n '- List of nodes allocated to the job: '
nodeset -e $SLURM_JOB_NODELIST

# Environment
echo '-- modules ----------------------------'
echo 'conda activate env2, --stack env3'
cd
SCR=/scratch${PWD#/prj}
cd $SCR
source $SCR/env2/etc/profile.d/conda.sh
conda activate $SCR/env2
conda activate --stack $SCR/env3
cd $SCR/b715

# Executable
EXEC="python ng2.py"

# Start
OPT='--mpi=pmi2 --cpu_bind=cores --distribution=block:cyclic'
echo '-- run --------------------------------'
echo '$ srun -n' $SLURM_NTASKS ${EXEC##*/}
echo '-- output -----------------------------'
srun  $OPT  -n $SLURM_NTASKS  $EXEC  | sort
echo '~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

Writing ng2.srm


<hr style="height:6px;border-width:0;color:gray;background-color:gray">

## 1 processo, 1 GPU, 1 nó

In [3]:
! sbatch  --ntasks-per-node=1  --nodes=1  --ntasks=1 ng2.srm

Submitted batch job 3611551


In [5]:
! squeue --name ng2 --partition=nvidia_dev --format "%.19S  %.7i  %.10P %.5j  %.2t %.5M %.5D %.4C"

         START_TIME    JOBID   PARTITION  NAME  ST  TIME NODES CPUS
2021-11-07T11:50:38  3611551  nvidia_dev   ng2   R  0:05     1   24


In [8]:
! squeue --name ng2 --partition=nvidia_dev --format "%.19S  %.7i  %.10P %.5j  %.2t %.5M %.5D %.4C"

         START_TIME    JOBID   PARTITION  NAME  ST  TIME NODES CPUS


In [9]:
! cat /scratch${PWD#/prj}/slurm-3611551.out

- Job ID: 3611551
- Tasks per node: 1
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: 2021-11-08
- List of nodes allocated to the job: sdumont3052
-- modules ----------------------------
conda activate env2, --stack env3
-- run --------------------------------
$ srun -n 1 python ng2.py
-- output -----------------------------
1. hostname    rank crank  cid
2. ----------- ---- ----- ----
3. sdumont3052   00    00   00
4. ---------------------------
5. Heat:1500.0000, TT:105.8648, KT:2.6595, CT:103.1252, MPI:1, dim:4800, ite:500
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


<hr style="height:6px;border-width:0;color:gray;background-color:gray">

## 8 processos, 8 GPUs, 4 nós

In [10]:
! sbatch  --ntasks-per-node=2  --nodes=4  ng2.srm

Submitted batch job 3614181


In [11]:
! squeue --name ng2 --format "%.19S  %.7i  %.10P %.5j  %.2t %.5M %.5D %.4C"

         START_TIME    JOBID   PARTITION  NAME  ST  TIME NODES CPUS
2021-11-07T11:53:45  3614181  nvidia_dev   ng2   R  0:01     4   96


In [12]:
! squeue --name ng2 --format "%.19S  %.7i  %.10P %.5j  %.2t %.5M %.5D %.4C"

         START_TIME    JOBID   PARTITION  NAME  ST  TIME NODES CPUS


In [13]:
! cat /scratch${PWD#/prj}/slurm-3614181.out

- Job ID: 3614181
- Tasks per node: 2
- # of nodes in the job: 4
- # of tasks: 4
- Dir from which sbatch was invoked: 2021-11-08
- List of nodes allocated to the job: sdumont3052 sdumont3053 sdumont3054 sdumont3055
-- modules ----------------------------
conda activate env2, --stack env3
-- run --------------------------------
$ srun -n 4 python ng2.py
-- output -----------------------------
1. hostname    rank crank  cid
2. ----------- ---- ----- ----
3. sdumont3052   00    00   00
3. sdumont3053   01    00   00
3. sdumont3054   02    00   00
3. sdumont3055   03    00   00
4. ---------------------------
5. Heat:1500.0000, TT:27.8101, KT:0.7045, CT:26.7243, MPI:4, dim:4800, ite:500
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
