In [1]:
%load_ext lab_black

In [2]:
import numpy as np
from numba import cuda

from timing.timing import time_function

In [3]:
# Number of elements drawn for each input vector.
ARRAY_SIZE = 1_000_000

In [4]:
# Fixed seed so both input vectors are reproducible on a fresh kernel.
rng = np.random.default_rng(1234)

# Draw the samples first, then cast each vector down to float32.
x = rng.random(ARRAY_SIZE).astype(np.float32)
y = rng.random(ARRAY_SIZE).astype(np.float32)

In [5]:
# Threads are grouped into blocks, and blocks are arranged in a grid.
# Each block contains many threads.
# Each streaming multiprocessor (SM) can run one or more blocks.
# A kernel launch executes as a grid of blocks.
# Each device can run one or more kernels.

In [6]:
@cuda.jit(device=True)
def cu_device_multiply(x_i, y_i):
    """Device function: return the product of ``x_i`` and ``y_i``."""
    product = x_i * y_i
    return product

In [7]:
@cuda.jit
def _cu_multiply(x, y, z):
    """Kernel: store the elementwise product of ``x`` and ``y`` in ``z``.

    Each thread handles the single index returned by ``cuda.grid(1)``.
    """
    idx = cuda.grid(1)
    # Threads whose index falls outside ``z`` have nothing to write.
    if idx >= z.size:
        return
    z[idx] = cu_device_multiply(x[idx], y[idx])

In [8]:
def cu_multiply(x, y, threadsperblock=32):
    """Multiply ``x`` and ``y`` elementwise by launching ``_cu_multiply``.

    The launch grid is sized from ``x.size`` so that every output element is
    covered by a thread, whatever the length of the input.

    Parameters
    ----------
    x, y : array
        Input vectors. The kernel reads ``x[i]`` and ``y[i]`` for every
        ``i < x.size``; ``y`` is assumed to be at least as long as ``x``
        (not checked here).
    threadsperblock : int, optional
        Number of threads in each block of the launch. Defaults to 32.

    Returns
    -------
    numpy.ndarray
        Array of ``x.size`` elements created with ``np.zeros`` (NumPy's
        default float dtype) holding the products.
    """
    n = x.size

    # Output buffer; kept at np.zeros' default dtype so the return type does
    # not depend on the input dtype.
    z = np.zeros(n)

    # Nothing to compute, and this avoids launching a kernel with zero blocks.
    if n == 0:
        return z

    # Ceiling division: enough blocks for one thread per element.
    blockspergrid = (n + (threadsperblock - 1)) // threadsperblock

    _cu_multiply[blockspergrid, threadsperblock](x, y, z)

    return z

In [9]:
@cuda.reduce
def cu_sum(a, b):
    """Binary addition that ``cuda.reduce`` uses to combine two values."""
    combined = a + b
    return combined

In [10]:
def cu_dot(x, y):
    """Dot product of ``x`` and ``y``: elementwise multiply, then sum-reduce."""
    products = cu_multiply(x, y)
    return cu_sum(products)

In [11]:
# Time cu_dot on the (x, y) inputs. `agg` is an identity function, so the
# timings are presumably returned unaggregated, one entry per run;
# confirm against timing.time_function.
time_function(cu_dot, (x, y), agg=lambda x: x)



[2.148743592999381, 0.3385372689990618, 0.3462544419999176]