# Google Colab - Numba CPU MPI

Using mpiexec

In [1]:
! mpiexec --version

mpiexec (OpenRTE) 2.1.1

Report bugs to http://www.open-mpi.org/community/help/


## Código fonte

In [2]:
%%writefile numbampi.py
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads, config
from mpi4py import MPI

# set the threading layer before any parallel target compilation
config.THREADING_LAYER = 'omp'    # OpenMP

# computationally intensive core
@njit('(float64[:,:],float64[:,:])', parallel=True, fastmath=True, nogil=True)
def kernel(anew, aold) :
    anew[1:-1,1:-1] = (aold[1:-1,1:-1]/2.0
        +(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2])/8.0)

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize three heat sources
nsources     = 3    # sources of energy
sources      = np.zeros((nsources, 2), np.int16)
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]

# main routine
comm    = MPI.COMM_WORLD
mpisize = comm.size
mpirank = comm.rank
if not mpirank : t0 = -time()

# determine my coordinates (x,y)
pdims = MPI.Compute_dims(mpisize, 2)
px    = pdims[0]
py    = pdims[1]
rx    = mpirank % px
ry    = mpirank // px

# determine my four neighbors
north = (ry - 1) * px + rx
if (ry - 1) < 0 :
    north = MPI.PROC_NULL
south = (ry + 1) * px + rx
if (ry + 1) >= py :
    south = MPI.PROC_NULL
west = ry * px + rx - 1
if (rx - 1) < 0 :
    west = MPI.PROC_NULL
east = ry * px + rx + 1
if (rx + 1) >= px :
    east = MPI.PROC_NULL

# decompose the domain
bx   = n // px          # block size in x
by   = n // py          # block size in y
offx = rx * bx + 1      # offset in x
offy = ry * by + 1      # offset in y

# sources in my area, local to my rank
locnsources = 0
locsources  = np.empty((nsources, 2), np.int16)

# determine which sources are in my patch
for i in range(nsources) :
    locx = sources[i, 0] - offx
    locy = sources[i, 1] - offy
    if(locx >= 0 and locx <= bx and locy >= 0 and locy <= by) :
        locsources[locnsources, 0] = locx
        locsources[locnsources, 1] = locy
        locnsources += 1

# working arrays with 1-wide halo zones
anew = np.zeros((bx+2, by+2), np.float64)
aold = np.zeros((bx+2, by+2), np.float64)

# system total heat
rheat = np.zeros(1, np.float64)
bheat = np.zeros(1, np.float64)

for _ in range(0, niters, 2) :
    # exchange data with neighbors
    if north != MPI.PROC_NULL :
        r1=comm.irecv(source=north, tag=1)
        s1=comm.isend(aold[1, 1:bx+1], dest=north, tag=1)
    if south != MPI.PROC_NULL :
        r2=comm.irecv(source=south, tag=1)
        s2=comm.isend(aold[bx, 1:bx+1], dest=south, tag=1)
    if east != MPI.PROC_NULL :
        r3 = comm.irecv(source=east, tag=1)
        s3 = comm.isend(aold[1:bx+1, bx], dest=east, tag=1)
    if west != MPI.PROC_NULL :
        r4 = comm.irecv(source=west, tag=1)
        s4 = comm.isend(aold[1:bx+1, 1], dest=west, tag=1)
    # wait
    if north != MPI.PROC_NULL :
        s1.wait()
        aold[0, 1:bx+1] = r1.wait()
    if south != MPI.PROC_NULL :
        s2.wait()
        aold[bx+1, 1:bx+1] = r2.wait()
    if east != MPI.PROC_NULL :
        s3.wait()
        aold[1:bx+1, bx+1] = r3.wait()
    if west != MPI.PROC_NULL :
        s4.wait
        aold[1:bx+1, 0] = r4.wait()

    # update grid
    kernel(anew, aold)

    # refresh heat sources
    anew[locsources[:locnsources, 0], locsources[:locnsources, 1]] += energy

    # exchange data with neighbors
    if north != MPI.PROC_NULL :
        r1=comm.irecv(source=north, tag=1)
        s1=comm.isend(anew[1, 1:bx+1], dest=north, tag=1)
    if south != MPI.PROC_NULL :
        r2=comm.irecv(source=south, tag=1)
        s2=comm.isend(anew[bx, 1:bx+1], dest=south, tag=1)
    if east != MPI.PROC_NULL :
        r3 = comm.irecv(source=east, tag=1)
        s3 = comm.isend(anew[1:bx+1, bx], dest=east, tag=1)
    if west != MPI.PROC_NULL :
        r4 = comm.irecv(source=west, tag=1)
        s4 = comm.isend(anew[1:bx+1, 1], dest=west, tag=1)
    # wait
    if north != MPI.PROC_NULL :
        s1.wait()
        anew[0, 1:bx+1] = r1.wait()
    if south != MPI.PROC_NULL :
        s2.wait()
        anew[bx+1, 1:bx+1] = r2.wait()
    if east != MPI.PROC_NULL :
        s3.wait()
        anew[1:bx+1, bx+1] = r3.wait()
    if west != MPI.PROC_NULL :
        s4.wait
        anew[1:bx+1, 0] = r4.wait()

    # update grid
    kernel(aold, anew)

    # refresh heat sources
    aold[locsources[:locnsources, 0], locsources[:locnsources, 1]] += energy 

# get final heat in the system
bheat[0] = np.sum(aold[1:-1, 1:-1])
comm.Reduce(bheat, rheat)

if not mpirank :
    t0 += time()
    print(f"Heat: {rheat[0]:.4f}", end=" | ")
    print(f"Time: {t0:.4f}", end=" | ")
    print(f"MPISize: {mpisize}")

Overwriting numbampi.py


### Testa a execução

In [3]:
! time mpiexec --allow-run-as-root -n 1 python -m cProfile -s cumtime numbampi.py > numbampi.txt


real	0m6.619s
user	0m8.379s
sys	0m0.591s


In [4]:
! head -20 numbampi.txt

Heat: 750.0000 | Time: 2.7987 | MPISize: 1
         3003038 function calls (2769733 primitive calls) in 5.581 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    904/1    0.007    0.000    5.584    5.584 {built-in method builtins.exec}
        1    0.033    0.033    5.584    5.584 numbampi.py:1(<module>)
      250    2.712    0.011    2.712    0.011 numbampi.py:10(kernel)
    44/39    0.000    0.000    1.743    0.045 decorators.py:188(wrapper)
     50/2    0.000    0.000    1.735    0.868 compiler_lock.py:29(_acquire_compile_lock)
        1    0.000    0.000    1.704    1.704 dispatcher.py:795(compile)
        1    0.000    0.000    1.703    1.703 dispatcher.py:77(compile)
        1    0.000    0.000    1.703    1.703 dispatcher.py:84(_compile_cached)
        1    0.000    0.000    1.703    1.703 dispatcher.py:99(_compile_core)
        1    0.000    0.000    1.703    1.703 compiler.py:601(compile_extra)
        1    0.000

SD:

<div style="font-size:small;">
    
    Heat: 750.0000 | Time: 0.6470 | MPISize: 1
        2638012 function calls (2405034 primitive calls) in 7.454 seconds

    Ordered by: cumulative time

    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    910/1     0.009    0.000    7.457    7.457 {built-in method builtins.exec}
        1     0.033    0.033    7.457    7.457 numbampi.py:1(<module>)
    629/32    0.005    0.000    4.783    0.149 <frozen importlib._bootstrap>:978(_find_and_load)
    629/32    0.004    0.000    4.782    0.149 <frozen importlib._bootstrap>:948(_find_and_load_unlocked)
    605/32    0.004    0.000    4.760    0.149 <frozen importlib._bootstrap>:663(_load_unlocked)
    898/32    0.001    0.000    4.750    0.148 <frozen importlib._bootstrap>:211(_call_with_frames_removed)
    513/61    0.002    0.000    4.118    0.068 {built-in method builtins.__import__}
    8657/5866 0.011    0.000    4.109    0.001 <frozen importlib._bootstrap>:1009(_handle_fromlist)
    520/30    0.002    0.000    3.710    0.124 <frozen importlib._bootstrap_external>:722(exec_module)
        6     0.000    0.000    2.955    0.492 __init__.py:3(<module>)
    44/39     0.000    0.000    2.665    0.068 decorators.py:181(wrapper)
    50/2      0.000    0.000    2.655    1.328 compiler_lock.py:29(_acquire_compile_lock)
        1     0.000    0.000    2.582    2.582 dispatcher.py:770(compile)
        1     0.000    0.000    2.581    2.581 dispatcher.py:76(compile)
    
</div>

In [5]:
! time mpiexec --allow-run-as-root -n 1 python numbampi.py

Heat: 750.0000 | Time: 2.7990 | MPISize: 1

real	0m5.038s
user	0m7.230s
sys	0m0.488s


In [6]:
! time mpiexec --allow-run-as-root -n 2 python numbampi.py

Heat: 613.9639 | Time: 3.6108 | MPISize: 2

real	0m7.474s
user	0m13.194s
sys	0m0.947s
