# Numba GPU nó Sequana

In [1]:
# Check if Cuda is active
from numba import cuda
print(cuda.gpus)

<Managed Device 0>, <Managed Device 1>, <Managed Device 2>, <Managed Device 3>


Sequana:

In [2]:
! lscpu | head -n 15 | grep "Model \|CPU(s):\|Thre\|Core\|NUMA\|MHz"

CPU(s):                88
Thread(s) per core:    2
Core(s) per socket:    22
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz
CPU MHz:               2101.000


In [3]:
! nvidia-smi

Thu Sep 23 21:15:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   52C    P0    42W / 250W |   1596MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|       

In [4]:
import numpy as np, math
from time import time
from numba import cuda

# parameters
n            = 2400    # nxn grid
energy       = 1       # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize the data arrays
anew         = np.zeros((n + 2, n + 2), np.float64)
aold         = np.zeros((n + 2, n + 2), np.float64)
# initialize three heat sources
sources      = np.empty((3, 2), np.int16)
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]

# configure blocks & grids
## set the number of threads in a block
threads_per_block = (8, 8)
## calculate the number of thread blocks in the grid
blocks_per_grid_x = math.ceil(aold.shape[0] / threads_per_block[0])
blocks_per_grid_y = math.ceil(aold.shape[1] / threads_per_block[1])
blocks_per_grid   = (blocks_per_grid_x, blocks_per_grid_y)

# computationally intensive core
@cuda.jit
def kernel(a1, a2) :
    n = a1.shape[0] - 1
    i, j = cuda.grid(2)
    if (i > 0 and j > 0) and (i < n and j < n) :
        a1[i,j] = (a2[i,j]/2.0
                   +(a2[i-1,j]+a2[i+1,j]+a2[i,j-1]+a2[i,j+1])/8.0)

# insert heat
@cuda.jit
def insert_heat(a, sources, energy) :
    n = a.shape[0] - 1
    i, j = cuda.grid(2)
    if ( (sources[0, 0] == i and sources[0, 1] == j) or
         (sources[1, 0] == i and sources[1, 1] == j) or
         (sources[2, 0] == i and sources[2, 1] == j) ) :
        a[i, j] += energy

# main routine
t0 = -time()    # time measure
t1 = 0
t2 = 0

t_ = time()
# copy the arrays to the device
anew_global_mem    = cuda.to_device(anew)
aold_global_mem    = cuda.to_device(aold)
sources_global_mem = cuda.to_device(sources)
t2 += time() - t_

for _ in range(0, niters, 2) :
    t_ = time()
    kernel[blocks_per_grid, threads_per_block](
        anew_global_mem, aold_global_mem)
    insert_heat[blocks_per_grid, threads_per_block](
        anew_global_mem, sources_global_mem, energy)    
    kernel[blocks_per_grid, threads_per_block](
        aold_global_mem, anew_global_mem)
    insert_heat[blocks_per_grid, threads_per_block](
        aold_global_mem, sources_global_mem, energy)
    t1 += time() - t_

t_ = time()
# copy the result back to the host
aold = aold_global_mem.copy_to_host()
t2 += time() - t_

# system total heat
heat = np.sum(aold[1:-1, 1:-1])

t0 += time()

# show the result
print(f"Heat: {heat:.4f}", end=" | ")
print(f"Time: {t0:.4f}", end=" | ")
print(f"Kernel: {t1:.4f}", end=" | ")
print(f"Memory: {t2:.4f}")

Heat: 750.0000 | Time: 4.7038 | Kernel: 4.4812 | Memory: 0.2163


## Comparação com Numba CPU

In [2]:
%%writefile numbacpusequana.py
import numpy as np, sys
from time import time
from numba import njit, set_num_threads, get_num_threads, threading_layer

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize the data arrays
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
# initialize three heat sources
sources      = np.empty((3, 2), np.int16)    # sources of energy
sources[:,:] = [ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ]

# computationally intensive core
@njit('(float64[:,:],float64[:,:])', parallel=True, fastmath=True, nogil=True)
def kernel(anew, aold) :
    anew[1:-1,1:-1] = (aold[1:-1,1:-1]/2.0
        +(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2])/8.0)

# main routine
set_num_threads(int(sys.argv[1]))
t0 = -time()    # time measure
t1 = 0
for _ in range(0, niters, 2) :
    t_ = time()
    kernel(anew, aold)
    t1 += time() - t_
    anew[sources[:, 0], sources[:, 1]] += energy
    t_ = time()
    kernel(aold, anew)
    t1 += time() - t_
    aold[sources[:, 0], sources[:, 1]] += energy
heat = np.sum(aold[1:-1, 1:-1])  # system total heat
t0 += time()    # time measure

# show the result
print(f"Heat: {heat:.4f}", end=" | ")
print(f"Time: {t0:.4f}", end=" | ")
print(f"Kernel: {t1:.4f}")
print(f"Threading layer: {threading_layer()}", end=" | ")
print(f"Thread count: {get_num_threads()}")

Overwriting numbacpusequana.py


* 1, 4, 9, 16, 36, 49, 64, 81, 88
* visualizar top em um terminal

## 1 thread

In [29]:
! python numbacpusequana.py 1

Heat: 750.0000 | Time: 2.0726 | Kernel: 2.0615
Threading layer: tbb | Thread count: 1
[0m

## 4 threads

In [30]:
! python numbacpusequana.py 4

Heat: 750.0000 | Time: 0.8354 | Kernel: 0.8231
Threading layer: tbb | Thread count: 4
[0m

## 16 threads

In [31]:
! python numbacpusequana.py 16

Heat: 750.0000 | Time: 0.3493 | Kernel: 0.3396
Threading layer: tbb | Thread count: 16
[0m

## 32 threads

In [32]:
! python numbacpusequana.py 32

Heat: 750.0000 | Time: 0.3143 | Kernel: 0.3025
Threading layer: tbb | Thread count: 32
[0m

## 34 threads

In [9]:
! python numbacpusequana.py 34

Heat: 750.0000 | Time: 0.3081 | Kernel: 0.2964
Threading layer: tbb | Thread count: 34
[0m

## 44 threads

In [33]:
! python numbacpusequana.py 44

Heat: 750.0000 | Time: 0.3731 | Kernel: 0.3582
Threading layer: tbb | Thread count: 44
[0m

## 64 threads

In [34]:
! python numbacpusequana.py 64

Heat: 750.0000 | Time: 0.3641 | Kernel: 0.3485
Threading layer: tbb | Thread count: 64
[0m

## 88 threads

In [35]:
! python numbacpusequana.py 88

Heat: 750.0000 | Time: 0.3708 | Kernel: 0.3543
Threading layer: tbb | Thread count: 88
[0m