# Matrix Multiplication

In [6]:
import math
import numpy as np

import numba.cuda as cuda

In [3]:
@cuda.jit
def matmul(A, B, C):
    """Compute the matrix product of A and B into C, one thread per element.

    Each thread reads its (row, col) position from the 2D launch grid and
    stores the dot product of row ``row`` of A with column ``col`` of B in
    ``C[row, col]``. Threads positioned outside C do nothing, so the grid
    may be larger than C.
    """
    row, col = cuda.grid(2)
    # Guard: the grid is rounded up to whole blocks, so some threads may
    # fall past the edge of C.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0.
    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [12]:
# Random operands for the product: A is 10x2 and B is 2x10.
A = np.random.random((10, 2))
B = np.random.random((2, 10))

In [13]:
threads_per_block = (4, 4)
# The kernel uses one thread per element of the output C = A * B, whose shape
# is (A.shape[0], B.shape[1]). Size the grid from those two dimensions and
# round up so a partially filled last block still covers the edge of C.
# (A.size / B.size are total element counts of the inputs and would launch
# far more threads than C has entries.)
blocks_per_grid = (
    math.ceil(A.shape[0] / threads_per_block[0]),
    math.ceil(B.shape[1] / threads_per_block[1]))

# Threads launched along each axis; each must be >= the matching C dimension.
print(threads_per_block[0] * blocks_per_grid[0])
print(threads_per_block[1] * blocks_per_grid[1])

12
12


In [18]:
# Start from zeros rather than np.empty: if the launch fails to cover part of
# C, the check below then cannot pass by accident on leftover memory contents.
C = np.zeros((A.shape[0], B.shape[1]))
# Run the kernel. Without this call C is never computed and the comparison
# against NumPy's result is meaningless.
matmul[blocks_per_grid, threads_per_block](A, B, C)
np.allclose(np.matmul(A, B), C)

True

--------------

In [25]:
@cuda.jit
def get_i(A, B):
    """Store each thread's first grid index in B.

    A thread at grid position (row, col) writes ``row`` to ``B[row, col]``;
    threads outside B's bounds write nothing. A is accepted but not used.
    """
    row, col = cuda.grid(2)
    if row >= B.shape[0] or col >= B.shape[1]:
        return
    B[row, col] = row
        
@cuda.jit
def get_j(A, B):
    """Store each thread's second grid index in B.

    A thread at grid position (row, col) writes ``col`` to ``B[row, col]``;
    threads outside B's bounds write nothing. A is accepted but not used.
    """
    row, col = cuda.grid(2)
    if row >= B.shape[0] or col >= B.shape[1]:
        return
    B[row, col] = col

In [26]:
# A 10x2 random input, plus an uninitialized output buffer of the same
# shape and dtype for the index kernels to fill.
A = np.random.random((10, 2))
B = np.empty(A.shape, dtype=A.dtype)

In [30]:
threads_per_block = (4, 4)
# Enough blocks along each axis to cover A, rounding partial blocks up.
blocks_per_grid = tuple(
    math.ceil(extent / block_dim)
    for extent, block_dim in zip(A.shape, threads_per_block))

# Total threads the launch provides along each axis, one value per line.
print(*(t * b for t, b in zip(threads_per_block, blocks_per_grid)), sep="\n")

8
4


In [31]:
# np.empty_like allocates B without initializing it, so any element the
# kernel does not write keeps whatever values were already in that memory.
B = np.empty_like(A)
# Every in-range thread stores its first grid index i in B[i, j].
get_i[blocks_per_grid, threads_per_block](A, B)
print(B)

[[0. 0.]
 [1. 1.]
 [2. 2.]
 [3. 3.]
 [4. 4.]
 [5. 5.]
 [6. 6.]
 [7. 7.]
 [0. 1.]
 [0. 1.]]


In [29]:
# np.empty_like allocates B without initializing it, so any element the
# kernel does not write keeps whatever values were already in that memory.
B = np.empty_like(A)
# Every in-range thread stores its second grid index j in B[i, j].
get_j[blocks_per_grid, threads_per_block](A, B)
print(B)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


If the total number of threads is too small to cover the array, the elements that no thread reaches are never written:

In [32]:
threads_per_block = (4, 4)
# Axis 0 is deliberately pinned to 2 blocks instead of
# math.ceil(A.shape[0] / threads_per_block[0]), to show what happens when
# the grid provides too few threads.
blocks_per_grid = (2, math.ceil(A.shape[1] / threads_per_block[1]))

# Total threads the launch provides along each axis, one value per line.
print(*(t * b for t, b in zip(threads_per_block, blocks_per_grid)), sep="\n")

8
4


In [33]:
# B is allocated uninitialized, so rows the kernel never touches show
# whatever was previously in that memory.
B = np.empty_like(A)
# With the undersized grid (2 blocks x 4 threads = 8 threads along axis 0),
# no thread has i = 8 or i = 9, so get_i never writes those two rows of the
# 10-row B.
get_i[blocks_per_grid, threads_per_block](A, B)
print(B)

[[0. 0.]
 [1. 1.]
 [2. 2.]
 [3. 3.]
 [4. 4.]
 [5. 5.]
 [6. 6.]
 [7. 7.]
 [0. 1.]
 [0. 1.]]


In [34]:
# B is allocated uninitialized, so rows the kernel never touches show
# whatever was previously in that memory.
B = np.empty_like(A)
# With the undersized grid (8 threads along axis 0), get_j does not write
# rows 8 and 9 either. NOTE(review): if those rows still look correct in the
# printed output, that is presumably leftover memory that happened to hold
# the same values -- confirm by allocating B with np.zeros_like instead.
get_j[blocks_per_grid, threads_per_block](A, B)
print(B)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
