# Writing CUDA-Python

# Imports

In [14]:
import numpy as np
from numba import cuda

# Compiling

**A kernel cannot have any return value.**

In [31]:
@cuda.jit
def foo(aryA, aryB):
    """Placeholder kernel: accepts two arrays and does nothing.

    A kernel cannot return a value, so a real implementation would write
    its results into one of the array arguments.
    """
    pass

In [33]:
# Two independent host arrays holding 0, 1, 2 to pass to the kernel
aryA = np.arange(3)
aryB = np.arange(3)

In [35]:
# Launch configuration: grid dimensions (1, 2) and block dimensions (3, 4)
griddim, blockdim = (1, 2), (3, 4)
# kernel[griddim, blockdim](args...) launches the kernel with that configuration
foo[griddim, blockdim](aryA, aryB)

Exception ignored in: <bound method CompilationUnit.__del__ of <numba.cuda.cudadrv.nvvm.CompilationUnit object at 0x7f17b223e358>>
Traceback (most recent call last):
  File "/local_data/env-py3/lib/python3.6/site-packages/numba/cuda/cudadrv/nvvm.py", line 152, in __del__
    driver = NVVM()
  File "/local_data/env-py3/lib/python3.6/site-packages/numba/cuda/cudadrv/nvvm.py", line 116, in __new__
    raise NvvmSupportError(errmsg % e)
numba.cuda.cudadrv.error.NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
library nvvm not found
Exception ignored in: <bound method CompilationUnit.__del__ of <numba.cuda.cudadrv.nvvm.CompilationUnit object at 0x7f17b21fe5c0>>
Traceback (most recent call last):
  File "/local_data/env-py3/lib/python3.6/site-packages/numba/cuda/cudadrv/nvvm.py", line 152, in __del__
    driver = NVVM()
  File "/local_data/env-py3/lib/python3.6/site-packages/numba/cuda/cudadrv/nvvm.py", line 116, in __new__
    raise NvvmSupportError(errmsg % e)
num

NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
library nvvm not found

In [28]:
@cuda.jit
def increment_by_one(an_array):
    """Add 1 in place to the element of ``an_array`` owned by this thread.

    Uses only the x components of the thread/block indices, so it is meant
    for a 1D grid of 1D blocks. Threads whose index falls past the end of
    the array do nothing.
    """
    # Flattened index: thread id within the block + block id * block width
    idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    # Guard against threads beyond the array boundary
    if idx >= an_array.size:
        return
    an_array[idx] += 1

# Matrix Multiplication

In [None]:
@cuda.jit
def matmul(A, B, C):
    """Perform square matrix multiplication of C = A * B.

    Each thread computes one element of ``C``: ``cuda.grid(2)`` supplies the
    (i, j) position, so the kernel expects a 2D launch configuration.
    Threads whose position lies outside ``C`` do nothing.

    Parameters
    ----------
    A : 2D array, left operand; indexed as ``A[i, k]``.
    B : 2D array, right operand; indexed as ``B[k, j]`` for
        ``k < A.shape[1]``.
    C : 2D array that receives the result; written as ``C[i, j]``.
    """
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0
        for k in range(A.shape[1]):
            # Dot product of row i of A with column j of B
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

In [9]:
# Select the GPU with device id 0
cuda.select_device(0)

<weakproxy at 0x7f17ba04bb38 to Device at 0x7f17bb5e7ba8>

In [10]:
# Select the GPU with device id 1
# NOTE(review): the recorded output shows the same Device object as the
# earlier select_device(0) call -- confirm the switch actually took effect.
cuda.select_device(1)

<weakproxy at 0x7f17ba04bb38 to Device at 0x7f17bb5e7ba8>

In [7]:
# Show the list of GPU devices Numba manages
print(cuda.gpus)

<Managed Device 0>, <Managed Device 1>


Kernels cannot explicitly return a value; all result data must be written to an array passed to the function (if computing a scalar, you will probably pass a one-element array).



In [11]:
# NOTE(review): a kernel with this name is also defined earlier in the
# notebook; the later definition shadows the earlier one.
@cuda.jit
def increment_by_one(an_array):
    """Add 1 in place to the element of ``an_array`` owned by this thread.

    Uses only the x components of the thread/block indices, so it is meant
    for a 1D grid of 1D blocks. Threads whose index falls past the end of
    the array do nothing.
    """
    # Flattened index: thread id within the block + block id * block width
    idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    # Guard against threads beyond the array boundary
    if idx >= an_array.size:
        return
    an_array[idx] += 1

In [15]:
# Host array of five float64 ones used as the kernel input
an_array = np.ones(shape=5, dtype=np.float64)
print(an_array)

[1. 1. 1. 1. 1.]


In [25]:
# Number of threads in each block
threadsperblock = 32
# Ceiling division: the smallest block count whose threads cover every element
blockspergrid = -(-an_array.size // threadsperblock)

In [26]:
# Launch the kernel with blockspergrid blocks of threadsperblock threads each
increment_by_one[blockspergrid, threadsperblock](an_array)

NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
library nvvm not found

# Seven Things You Might Not Know about Numba
https://devblogs.nvidia.com/seven-things-numba/

In [36]:
import math

In [37]:
@cuda.jit
def gpu_cos(x, out):
    """Write ``math.cos(x[i])`` into ``out[i]`` along the first axis of ``x``.

    Each thread starts at its own index from ``cuda.grid(1)`` and advances
    by ``cuda.gridsize(1)`` until it passes ``x.shape[0]``, so the whole
    array is covered whatever the launch size.
    """
    # assume 1D array
    idx = cuda.grid(1)
    step = cuda.gridsize(1)
    length = x.shape[0]
    while idx < length:
        out[idx] = math.cos(x[idx])
        idx += step

def do_cos(x):
    """Run ``gpu_cos`` on ``x`` and return the result as a host array.

    The output buffer is allocated on the device with the same layout as
    ``x``, the kernel is launched with 64 blocks of 64 threads, and the
    buffer is then copied back to the host.
    """
    device_result = cuda.device_array_like(x)
    kernel = gpu_cos[64, 64]
    kernel(x, device_result)
    host_result = device_result.copy_to_host()
    return host_result

In [38]:
# Draw the test input from a seeded generator so every run checks the same
# 100 float32 values in [-10, 10)
test_x = np.random.RandomState(0).uniform(-10, 10, 100).astype(np.float32)
result = do_cos(test_x)
# The kernel output must match NumPy's cosine to a relative tolerance of 1e-6
np.testing.assert_allclose(result, np.cos(test_x), rtol=1e-6)

NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
library nvvm not found