# Numba CPU MPI nó Sequana

In [2]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# Report which Python interpreter is now first on the PATH.
python --version

Python 3.7.7 :: Intel(R) Corporation


Características da CPU do nó Sequana:

In [1]:
! lscpu | head -n 15 | grep "Model \|CPU(s):\|Thre\|Core\|NUMA\|MHz"

CPU(s):                88
Thread(s) per core:    2
Core(s) per socket:    22
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz
CPU MHz:               2101.000


### Malha de 2400 x 2400 pontos e 250 iterações

In [None]:
%%writefile numbampisequana2.py
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads
from mpi4py import MPI   

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations
# The time loop below performs two sweeps per pass (aold -> anew, then
# anew -> aold), so only half as many passes are needed (rounded up).
niters       = (niters+1) // 2
# sources of energy: one (x, y) grid coordinate per row
nsources     = 3
sources      = np.zeros((nsources, 2), np.intc)
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]
# NOTE: no full-size (n+2) x (n+2) work arrays are allocated here; each rank
# allocates only its own patch (plus halo) in the main routine.

# computationally intensive core
@njit('(float64[:,:],float64[:,:])', fastmath=True, parallel=True, nogil=True)
def kernel1(anew, aold) :
    """One stencil sweep: fill the interior of `anew` from `aold`.

    Every interior cell of `anew` becomes half of the matching `aold` cell
    plus one eighth of each of its four axis-aligned neighbours. The outermost
    rows and columns of `anew` are not written; those of `aold` are only read.

    Parameters
    ----------
    anew : float64[:, :]
        Destination array (interior is overwritten in place).
    aold : float64[:, :]
        Source array.
    """
    # Views of the source array, shifted by one cell in each direction.
    centre   = aold[1:-1, 1:-1]
    next_row = aold[2:,   1:-1]
    prev_row = aold[:-2,  1:-1]
    next_col = aold[1:-1, 2:]
    prev_col = aold[1:-1, :-2]
    anew[1:-1, 1:-1] = 1/2.0 * (centre +
            1/4.0 * (next_row + prev_row + next_col + prev_col))

# main routine
comm    = MPI.COMM_WORLD
mpirank = comm.Get_rank()
mpisize = comm.Get_size()

# sources that fall inside this rank's patch (filled in further down)
locnsources = 0
locsources  = np.empty((nsources, 2), np.intc)

# one-element buffers used for the final heat reduction
rheat = np.zeros(1, np.double)
bheat = np.zeros(1, np.double)

# arrange the ranks on a px x py process grid and find my coordinates (rx, ry)
pdims  = MPI.Compute_dims(mpisize, 2)
px, py = pdims[0], pdims[1]
ry, rx = divmod(mpirank, px)

# ranks of my four neighbours; MPI.PROC_NULL where the process grid ends
north = (ry - 1) * px + rx if (ry - 1) >= 0 else MPI.PROC_NULL
south = (ry + 1) * px + rx if (ry + 1) <  py else MPI.PROC_NULL
west  = ry * px + rx - 1   if (rx - 1) >= 0 else MPI.PROC_NULL
east  = ry * px + rx + 1   if (rx + 1) <  px else MPI.PROC_NULL

# decompose the domain
# NOTE(review): integer division drops the remainder, so when n is not a
# multiple of px (or py) the patches cover fewer than n cells per side.
bx   = n // px          # block size in x
by   = n // py          # block size in y
offx = rx * bx + 1      # offset in x
offy = ry * by + 1      # offset in y

# keep the sources whose coordinates, relative to my offsets, fall in my patch
# NOTE(review): the test is inclusive at both ends (0..bx, 0..by), so a source
# lying exactly on a patch edge can be accepted by two neighbouring ranks.
for src in range(nsources) :
    locx = sources[src, 0] - offx
    locy = sources[src, 1] - offy
    if 0 <= locx <= bx and 0 <= locy <= by :
        # stored with a +1 shift; the time loop subtracts 1 again when indexing
        locsources[locnsources, 0] = locx + 1
        locsources[locnsources, 1] = locy + 1
        locnsources += 1

# working arrays with 1-wide halo zones
anew = np.zeros((bx+2, by+2), np.double)
aold = np.zeros_like(anew)

def exchange_halo(field) :
    """Swap the 1-wide halo of `field` with the four neighbouring ranks.

    `field` has shape (bx+2, by+2): axis 0 is the x direction (it is the axis
    indexed by the x coordinate of the heat sources) and axis 1 is the y
    direction. West/east neighbours (rx -/+ 1) therefore exchange the first
    and last interior rows along axis 0, and north/south neighbours
    (ry -/+ 1) exchange the first and last interior columns along axis 1.
    Directions whose neighbour is MPI.PROC_NULL are skipped, which leaves that
    halo edge untouched.

    All receives and sends are posted first and only then completed, and every
    send request is waited on as well as every receive.
    """
    xs = slice(1, bx + 1)   # interior extent along axis 0
    ys = slice(1, by + 1)   # interior extent along axis 1
    # (neighbour rank, interior edge to send, halo edge to fill)
    edges = (
        (north, (xs, 1),  (xs, 0)),
        (south, (xs, by), (xs, by + 1)),
        (east,  (bx, ys), (bx + 1, ys)),
        (west,  (1, ys),  (0, ys)),
    )
    pending = []
    for neighbour, send_edge, halo_edge in edges :
        if neighbour == MPI.PROC_NULL :
            continue
        recv_req = comm.irecv(source=neighbour, tag=1)
        send_req = comm.isend(field[send_edge], dest=neighbour, tag=1)
        pending.append((send_req, recv_req, halo_edge))
    for send_req, recv_req, halo_edge in pending :
        send_req.wait()
        field[halo_edge] = recv_req.wait()

# only rank 0 measures and reports the elapsed time
if not mpirank : t0 = time()

# NOTE(review): kernel1 is compiled with parallel=True and no thread limit is
# set in this script (set_num_threads is imported but never called); with
# several ranks on one node this may oversubscribe the cores -- confirm.
for _ in range(niters) :
    # first sweep: aold -> anew
    exchange_halo(aold)
    kernel1(anew, aold)
    # refresh heat sources
    for i in range(locnsources) :
        anew[locsources[i, 0]-1, locsources[i, 1]-1] += energy

    # second sweep: anew -> aold
    exchange_halo(anew)
    kernel1(aold, anew)
    # refresh heat sources
    for i in range(locnsources) :
        aold[locsources[i, 0]-1, locsources[i, 1]-1] += energy

# get final heat in the system: sum my interior, then reduce onto rank 0
# (Reduce is called with its default operation and root)
bheat[0] = np.sum(aold[1:-1, 1:-1])
comm.Reduce(bheat, rheat)

if not mpirank :
    t0 = time() - t0
    print("Heat: %0.4f | Time: %0.4f | MPISize: %g" % (rheat[0], t0, mpisize) )

## Execução com mpiexec
Número de processos MPI testados: 1, 4, 9, 16 e 36.

### 1 processo

In [41]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# NOTE(review): presumably cleared so mpiexec does not pick up an external PMI library -- confirm.
unset I_MPI_PMI_LIBRARY
# Limit UCX to the listed transports (TODO confirm they match the node's interconnect).
export UCX_TLS=rc,ud,sm,self
# Run the script with 1 MPI process.
mpiexec -n 1 python numbampisequana2.py

Heat: 750.0000 | Time: 0.4033 | MPISize: 1


### 4 processos

In [42]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# NOTE(review): presumably cleared so mpiexec does not pick up an external PMI library -- confirm.
unset I_MPI_PMI_LIBRARY
# Limit UCX to the listed transports (TODO confirm they match the node's interconnect).
export UCX_TLS=rc,ud,sm,self
# Run the script with 4 MPI processes.
mpiexec -n 4 python numbampisequana2.py

Heat: 750.0000 | Time: 0.3033 | MPISize: 4


### 9 processos

In [43]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# NOTE(review): presumably cleared so mpiexec does not pick up an external PMI library -- confirm.
unset I_MPI_PMI_LIBRARY
# Limit UCX to the listed transports (TODO confirm they match the node's interconnect).
export UCX_TLS=rc,ud,sm,self
# Run the script with 9 MPI processes.
mpiexec -n 9 python numbampisequana2.py

Heat: 750.0000 | Time: 0.6694 | MPISize: 9


### 16 processos

In [44]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# NOTE(review): presumably cleared so mpiexec does not pick up an external PMI library -- confirm.
unset I_MPI_PMI_LIBRARY
# Limit UCX to the listed transports (TODO confirm they match the node's interconnect).
export UCX_TLS=rc,ud,sm,self
# Run the script with 16 MPI processes.
mpiexec -n 16 python numbampisequana2.py

Heat: 750.0000 | Time: 0.7669 | MPISize: 16


### 36 processos

In [45]:
%%bash
# Load the Intel Parallel Studio XE 2020 environment module.
module load intel_psxe/2020
# Source the conda profile script shipped with the Intel Python 3 distribution.
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# NOTE(review): presumably cleared so mpiexec does not pick up an external PMI library -- confirm.
unset I_MPI_PMI_LIBRARY
# Limit UCX to the listed transports (TODO confirm they match the node's interconnect).
export UCX_TLS=rc,ud,sm,self
# Run the script with 36 MPI processes.
mpiexec -n 36 python numbampisequana2.py

Heat: 750.0000 | Time: 2.4932 | MPISize: 36
