# Numba CPU MPI nó Sequana

In [1]:
%%bash
module load intel_psxe/2020
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
python --version

Python 3.7.7 :: Intel(R) Corporation


In [2]:
# Mostra os recursos do nó de login
! lscpu | head -n 15 | grep "Model \|CPU(s):\|Thre\|Core\|NUMA\|MHz"

CPU(s):                24
Thread(s) per core:    1
Core(s) per socket:    12
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) CPU E5-2695 v2 @ 2.40GHz
CPU MHz:               2865.820


## Código fonte

In [3]:
%%writefile numbampi.py
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads
from mpi4py import MPI   

# computationally intensive core
@njit('(float64[:,:],float64[:,:])', parallel=True, fastmath=True, nogil=True)
def kernel(anew, aold) :
    anew[1:-1,1:-1] = (aold[1:-1,1:-1]/2.0
        +(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2])/8.0)

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize three heat sources
nsources     = 3    # sources of energy
sources      = np.zeros((nsources, 2), np.int16)
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]

# main routine
comm    = MPI.COMM_WORLD
mpisize = comm.size
mpirank = comm.rank
if not mpirank : t0 = -time()

# determine my coordinates (x,y)
pdims = MPI.Compute_dims(mpisize, 2)
px    = pdims[0]
py    = pdims[1]
rx    = mpirank % px
ry    = mpirank // px

# determine my four neighbors
north = (ry - 1) * px + rx
if (ry - 1) < 0 :
    north = MPI.PROC_NULL
south = (ry + 1) * px + rx
if (ry + 1) >= py :
    south = MPI.PROC_NULL
west = ry * px + rx - 1
if (rx - 1) < 0 :
    west = MPI.PROC_NULL
east = ry * px + rx + 1
if (rx + 1) >= px :
    east = MPI.PROC_NULL

# decompose the domain
bx   = n // px          # block size in x
by   = n // py          # block size in y
offx = rx * bx + 1      # offset in x
offy = ry * by + 1      # offset in y

# sources in my area, local to my rank
locnsources = 0
locsources  = np.empty((nsources, 2), np.int16)

# determine which sources are in my patch
for i in range(nsources) :
    locx = sources[i, 0] - offx
    locy = sources[i, 1] - offy
    if(locx >= 0 and locx <= bx and locy >= 0 and locy <= by) :
        locsources[locnsources, 0] = locx
        locsources[locnsources, 1] = locy
        locnsources += 1

# working arrays with 1-wide halo zones
anew = np.zeros((bx+2, by+2), np.float64)
aold = np.zeros((bx+2, by+2), np.float64)

# system total heat
rheat = np.zeros(1, np.float64)
bheat = np.zeros(1, np.float64)

for _ in range(0, niters, 2) :
    # exchange data with neighbors
    if north != MPI.PROC_NULL :
        r1=comm.irecv(source=north, tag=1)
        s1=comm.isend(aold[1, 1:bx+1], dest=north, tag=1)
    if south != MPI.PROC_NULL :
        r2=comm.irecv(source=south, tag=1)
        s2=comm.isend(aold[bx, 1:bx+1], dest=south, tag=1)
    if east != MPI.PROC_NULL :
        r3 = comm.irecv(source=east, tag=1)
        s3 = comm.isend(aold[1:bx+1, bx], dest=east, tag=1)
    if west != MPI.PROC_NULL :
        r4 = comm.irecv(source=west, tag=1)
        s4 = comm.isend(aold[1:bx+1, 1], dest=west, tag=1)
    # wait
    if north != MPI.PROC_NULL :
        s1.wait()
        aold[0, 1:bx+1] = r1.wait()
    if south != MPI.PROC_NULL :
        s2.wait()
        aold[bx+1, 1:bx+1] = r2.wait()
    if east != MPI.PROC_NULL :
        s3.wait()
        aold[1:bx+1, bx+1] = r3.wait()
    if west != MPI.PROC_NULL :
        s4.wait
        aold[1:bx+1, 0] = r4.wait()

    # update grid
    kernel(anew, aold)

    # refresh heat sources
    anew[locsources[:locnsources, 0], locsources[:locnsources, 1]] += energy

    # exchange data with neighbors
    if north != MPI.PROC_NULL :
        r1=comm.irecv(source=north, tag=1)
        s1=comm.isend(anew[1, 1:bx+1], dest=north, tag=1)
    if south != MPI.PROC_NULL :
        r2=comm.irecv(source=south, tag=1)
        s2=comm.isend(anew[bx, 1:bx+1], dest=south, tag=1)
    if east != MPI.PROC_NULL :
        r3 = comm.irecv(source=east, tag=1)
        s3 = comm.isend(anew[1:bx+1, bx], dest=east, tag=1)
    if west != MPI.PROC_NULL :
        r4 = comm.irecv(source=west, tag=1)
        s4 = comm.isend(anew[1:bx+1, 1], dest=west, tag=1)
    # wait
    if north != MPI.PROC_NULL :
        s1.wait()
        anew[0, 1:bx+1] = r1.wait()
    if south != MPI.PROC_NULL :
        s2.wait()
        anew[bx+1, 1:bx+1] = r2.wait()
    if east != MPI.PROC_NULL :
        s3.wait()
        anew[1:bx+1, bx+1] = r3.wait()
    if west != MPI.PROC_NULL :
        s4.wait
        anew[1:bx+1, 0] = r4.wait()

    # update grid
    kernel(aold, anew)

    # refresh heat sources
    aold[locsources[:locnsources, 0], locsources[:locnsources, 1]] += energy 

# get final heat in the system
bheat[0] = np.sum(aold[1:-1, 1:-1])
comm.Reduce(bheat, rheat)

if not mpirank :
    t0 += time()
    print(f"Heat: {rheat[0]:.4f}", end=" | ")
    print(f"Time: {t0:.4f}", end=" | ")
    print(f"MPISize: {mpisize}")

Writing numbampi.py


### Testa a execução

É importante testar para verificar possíveis erros, antes de enviar para as filas de execução

In [4]:
%%bash
module load intel_psxe/2020
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
unset I_MPI_PMI_LIBRARY
time mpiexec -n 1 python -m cProfile -s cumtime numbampi.py > numbampi.txt


real	0m8.299s
user	0m11.938s
sys	0m1.350s


In [6]:
! head -20 numbampi.txt

Heat: 750.0000 | Time: 0.6470 | MPISize: 1
         2638012 function calls (2405034 primitive calls) in 7.454 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    910/1    0.009    0.000    7.457    7.457 {built-in method builtins.exec}
        1    0.033    0.033    7.457    7.457 numbampi.py:1(<module>)
   629/32    0.005    0.000    4.783    0.149 <frozen importlib._bootstrap>:978(_find_and_load)
   629/32    0.004    0.000    4.782    0.149 <frozen importlib._bootstrap>:948(_find_and_load_unlocked)
   605/32    0.004    0.000    4.760    0.149 <frozen importlib._bootstrap>:663(_load_unlocked)
   898/32    0.001    0.000    4.750    0.148 <frozen importlib._bootstrap>:211(_call_with_frames_removed)
   513/61    0.002    0.000    4.118    0.068 {built-in method builtins.__import__}
8657/5866    0.011    0.000    4.109    0.001 <frozen importlib._bootstrap>:1009(_handle_fromlist)
   520/30    0.002    0.000    3.710    0.

In [7]:
%%bash
module load intel_psxe/2020
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
unset I_MPI_PMI_LIBRARY
mpiexec -n 16 python numbampi.py

Heat: 750.0000 | Time: 1.3539 | MPISize: 16


### Copia arquivo com código python para /scratch

* /prj e /scratch possuem o mesmo diretório, criados em uma etapa anterior.
* o nó de execução não enxerga o /prj do nó de login, o que for rodar precisa ser copiado para /scratch, incluindo eventuais bibliotecas ou algum ambiente python que foi criado.
* o .srm (arquivo do slurm) não precisa copiar.

In [8]:
! cp  numbampi.py  /scratch${PWD#/prj}

### Arquivo de lote do Slurm

In [9]:
%%writefile numbampi.srm
#!/bin/bash
#SBATCH --ntasks=96            #Total de tarefas
#SBATCH --job-name numbampi    #Nome do job, 8 caracteres
#SBATCH --partition cpu_dev    #Fila (partition) a ser utilizada
#SBATCH --time=00:01:00        #Tempo max. de execução
#SBATCH --exclusive            #Utilização exclusiva dos nós

echo '- Job ID:' $SLURM_JOB_ID
echo '- Tarefas por no:' $SLURM_NTASKS_PER_NODE
echo '- Qtd. de nos:' $SLURM_JOB_NUM_NODES
echo '- Tot. de tarefas:' $SLURM_NTASKS
echo '- Nos alocados:' $SLURM_JOB_NODELIST
nodeset -e $SLURM_JOB_NODELIST

#Modulos
module load intel_psxe/2020
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh

#Entra no diretório de trabalho
cd /scratch${PWD#/prj}

#Executavel
EXEC='python numbampi.py'

#Dispara a execucao
srun --mpi=pmi2  -n $SLURM_NTASKS  $EXEC

Writing numbampi.srm


## Envia para a fila de execução dev

In [11]:
%%bash
sbatch numbampi.srm
squeue --user $(whoami) -h -r | wc -l
squeue --partition=cpu_dev -h -r | wc -l
squeue --start --name=numbampi --format "%S %.8i %.9P %.5j %.2t %.5M %.5D %.4C"

Submitted batch job 1360854
1
4
START_TIME    JOBID PARTITION  NAME ST  TIME NODES CPUS
N/A  1360854   cpu_dev numba PD  0:00     4   96


Verifica se já executou:

In [12]:
! squeue --start --name=numbampi --format "%S %.8i %.9P %.5j %.2t %.5M %.5D %.4C"

START_TIME    JOBID PARTITION  NAME ST  TIME NODES CPUS


Mostra o arquivo contendo a saída:

In [13]:
! cat /scratch${PWD#/prj}/slurm-1360854.out

- Job ID: 1360854
- Tarefas por no:
- Qtd. de nos: 4
- Tot. de tarefas: 96
- Nos alocados: sdumont[1263-1266]
sdumont1263 sdumont1264 sdumont1265 sdumont1266
Heat: 602.6262 | Time: 3.2346 | MPISize: 96


Neste caso enviamos para fila `cpu_dev` que é uma fila "rápida" para executar testes, e para trabalhos pequenos.