# Numba GPU nó Sequana (sdumont18)

In [1]:
# Check if Cuda is active
from numba import cuda
print(cuda.gpus)

<Managed Device 0>, <Managed Device 1>, <Managed Device 2>, <Managed Device 3>


In [5]:
! lscpu | head -n 15 | grep "Model \|CPU(s):\|Thre\|Core\|NUMA\|MHz"

CPU(s):                88
Thread(s) per core:    2
Core(s) per socket:    22
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz
CPU MHz:               2101.000


In [4]:
! nvidia-smi

Fri Sep 17 16:09:06 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   50C    P0    41W / 250W |   2089MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|       

In [3]:
import numpy as np, math
from time import time
from numba import cuda

# parameters
n            = 2400    # nxn grid
energy       = 1       # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize the data arrays
anew         = np.zeros((n + 2, n + 2), np.float64)
aold         = np.zeros((n + 2, n + 2), np.float64)
# initialize three heat sources
nsources     = 3       # sources of energy
sources      = np.empty((nsources,2), np.int32)
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]
sources      = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]
heat         = 0       # system total heat sum

# configure blocks & grids
## set the number of threads in a block
threads_per_block = (8, 8)
## calculate the number of thread blocks in the grid
blocks_per_grid_x = math.ceil(aold.shape[0] / threads_per_block[0])
blocks_per_grid_y = math.ceil(aold.shape[1] / threads_per_block[1])
blocks_per_grid   = (blocks_per_grid_x, blocks_per_grid_y)

# computationally intensive core
@cuda.jit
def kernel(a1, a2) :
    n = a1.shape[0] - 1
    i, j = cuda.grid(2)
    if (i > 0 and j > 0) and (i < n and j < n) :
        a1[i,j] = 1/2.0*(a2[i,j]+
                  1/4.0*(a2[i-1,j]+a2[i+1,j]+a2[i,j-1]+a2[i,j+1]))

# insert heat
@cuda.jit
def insert_heat(a, sources, energy) :
    n = a.shape[0] - 1
    i, j = cuda.grid(2)
    if ( (sources[0, 0] == i and sources[0, 1] == j) or
         (sources[1, 0] == i and sources[1, 1] == j) or
         (sources[2, 0] == i and sources[2, 1] == j) ) :
        a[i, j] += energy

# main routine
t0 = time()    # time measure
t1 = 0
t2 = 0

t_ = time()
# copy the arrays to the device
anew_global_mem    = cuda.to_device(anew)
aold_global_mem    = cuda.to_device(aold)
sources_global_mem = cuda.to_device(sources)
t2 += time() - t_

for _ in range(0, niters, 2) :
    t_ = time()
    kernel[blocks_per_grid, threads_per_block](
        anew_global_mem, aold_global_mem)
    insert_heat[blocks_per_grid, threads_per_block](
        anew_global_mem, sources_global_mem, energy)    
    kernel[blocks_per_grid, threads_per_block](
        aold_global_mem, anew_global_mem)
    insert_heat[blocks_per_grid, threads_per_block](
        aold_global_mem, sources_global_mem, energy)
    t1 += time() - t_

t_ = time()
# copy the result back to the host
aold = aold_global_mem.copy_to_host()
t2 += time() - t_

# system total heat
heat = np.sum( aold[1:-1, 1:-1] )

t0 = time() - t0

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f | Memory: %0.4f" %
      (heat, t0, t1, t2) )

Heat: 750.0000 | Time: 0.5087 | Kernel: 0.4278 | Memory: 0.0749


## Comparação com Numba CPU

In [7]:
%%writefile numbacpusequana.py
import numpy as np, sys
from time import time
from numba import njit, set_num_threads, get_num_threads, threading_layer

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations

# other variables
heat         = np.zeros((1), np.float64)     # system total heat
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
sources      = np.empty((3, 2), np.int16)    # sources of energy
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]

# computationally intensive core
@njit('(float64[:,:],float64[:,:])', fastmath=True, parallel=True, nogil=True)
def kernel(anew, aold) :
    anew[1:-1,1:-1]=1/2.0*(aold[1:-1,1:-1]+1/4.0*(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2]))

# main routine
set_num_threads(int(sys.argv[1]))
t2 = 0
t0 = time()    # time measure
for iters in range((niters+1)//2) :
    t3 = time()
    kernel(anew, aold)
    t2 += time() - t3
    anew[sources[:, 0], sources[:, 1]] += energy
    t3 = time()
    kernel(aold, anew)
    t2 += time() - t3
    aold[sources[:, 0], sources[:, 1]] += energy
heat[0] = np.sum( aold[1:-1, 1:-1] )  # system total heat
t1 = time()    # time measure

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f" % (heat[0], t1-t0, t2) )
print("Threading layer chosen: %s | Thread count: %s" % (threading_layer(), get_num_threads()) )

Overwriting numbacpusequana.py


(1, 4, 9, 16, 36, 49, 64, 81, 100)

## 1 thread

In [8]:
! python numbacpusequana.py 1

Heat: 750.0000 | Time: 2.1131 | Kernel: 2.1012
Threading layer chosen: tbb | Thread count: 1
[0m

## 4 threads

In [11]:
! python numbacpusequana.py 4

Heat: 750.0000 | Time: 0.8060 | Kernel: 0.7942
Threading layer chosen: tbb | Thread count: 4
[0m

## 9 threads

In [12]:
! python numbacpusequana.py 9

Heat: 750.0000 | Time: 0.4837 | Kernel: 0.4734
Threading layer chosen: tbb | Thread count: 9
[0m

## 16 threads

In [13]:
! python numbacpusequana.py 16

Heat: 750.0000 | Time: 0.3704 | Kernel: 0.3599
Threading layer chosen: tbb | Thread count: 16
[0m

## 36 threads

In [14]:
! python numbacpusequana.py 36

Heat: 750.0000 | Time: 0.3639 | Kernel: 0.3493
Threading layer chosen: tbb | Thread count: 36
[0m

## 49 threads

In [15]:
! python numbacpusequana.py 49

Heat: 750.0000 | Time: 0.3608 | Kernel: 0.3471
Threading layer chosen: tbb | Thread count: 49
[0m

## 64 threads

In [16]:
! python numbacpusequana.py 64

Heat: 750.0000 | Time: 0.3662 | Kernel: 0.3515
Threading layer chosen: tbb | Thread count: 64
[0m

## 81 threads

In [17]:
! python numbacpusequana.py 81

Heat: 750.0000 | Time: 0.3710 | Kernel: 0.3557
Threading layer chosen: tbb | Thread count: 81
[0m

## 88 threads

In [19]:
! python numbacpusequana.py 88

Heat: 750.0000 | Time: 0.3708 | Kernel: 0.3552
Threading layer chosen: tbb | Thread count: 88
[0m

## 100 threads

In [18]:
! python numbacpusequana.py 100

Traceback (most recent call last):
  File "numbacpusequana.py", line 23, in <module>
    set_num_threads(int(sys.argv[1]))
  File "/scratch/app/anaconda3/2020.11/lib/python3.8/site-packages/numba/np/ufunc/parallel.py", line 577, in set_num_threads
    snt_check(n)
  File "/scratch/app/anaconda3/2020.11/lib/python3.8/site-packages/numba/np/ufunc/parallel.py", line 539, in snt_check
    raise ValueError(msg)
ValueError: The number of threads must be between 1 and 88
[0m