<a href="https://colab.research.google.com/github/etfrer-yi/Numba-Accelerated-Matrix-Operations/blob/main/Numba_Matrix_Operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from numba import cuda
import numpy as np

In [None]:
def gpu_matrix_wrapper(A, B, fn):
  """Run a Numba CUDA kernel on two host matrices and return the result.

  Copies ``A`` and ``B`` to the device, allocates the output on the device,
  launches ``fn`` over a 2-D grid with (at least) one thread per output
  element, and copies the output back to the host.

  Args:
    A: Left operand, a 2-D host array.
    B: Right operand, a 2-D host array.
    fn: Kernel invoked as ``fn[grid, block](A_device, B_device, C_device)``.
      It is expected to locate its output element with ``cuda.grid(2)``.
      When a dimension is not a multiple of the tile size the grid
      overshoots the matrix, so ``fn`` must ignore out-of-range indices.

  Returns:
    Host copy of the output array, of shape ``(A.shape[0], B.shape[1])``
    (which equals ``A.shape`` for same-shaped or square operands). The
    array uses ``cuda.device_array``'s default dtype.
  """
  rows, cols = A.shape[0], B.shape[1]
  A_device = cuda.to_device(A)
  B_device = cuda.to_device(B)
  C_device = cuda.device_array((rows, cols))

  # The launch must be two-dimensional: the kernels read both coordinates
  # from cuda.grid(2), and in a 1-D launch the second coordinate is always 0.
  # A fixed 16x16 tile (256 threads) also keeps the block size independent
  # of the matrix size, so large matrices do not exceed the per-block limit.
  tile = 16
  threads_per_block = (tile, tile)
  blocks_per_grid = (
      (rows + tile - 1) // tile,  # ceil(rows / tile)
      (cols + tile - 1) // tile,  # ceil(cols / tile)
  )
  fn[blocks_per_grid, threads_per_block](A_device, B_device, C_device)
  cuda.synchronize()
  return C_device.copy_to_host()

In [None]:
# Consider: swapping x and y (may change the memory access pattern of
# neighbouring threads for row-major arrays -- TODO measure before changing).
@cuda.jit
def gpu_matrix_thread_add(A_device, B_device, C_device):
  """CUDA kernel: each thread computes one element of ``C = A + B``.

  Args:
    A_device: Left operand (2-D device array).
    B_device: Right operand (2-D device array).
    C_device: Output (2-D device array), written in place.
  """
  x, y = cuda.grid(2)
  # The launch grid may be larger than the matrix; threads that fall outside
  # it must not touch memory.
  if x < C_device.shape[0] and y < C_device.shape[1]:
    C_device[x, y] = A_device[x, y] + B_device[x, y]

@cuda.jit
def gpu_matrix_thread_sub(A_device, B_device, C_device):
  """CUDA kernel: each thread computes one element of ``C = A - B``.

  Args:
    A_device: Left operand (2-D device array).
    B_device: Right operand (2-D device array).
    C_device: Output (2-D device array), written in place.
  """
  x, y = cuda.grid(2)
  # The launch grid may be larger than the matrix; threads that fall outside
  # it must not touch memory.
  if x < C_device.shape[0] and y < C_device.shape[1]:
    C_device[x, y] = A_device[x, y] - B_device[x, y]

@cuda.jit
def gpu_matrix_thread_mult(A_device, B_device, C_device):
  """CUDA kernel: each thread computes one element of the product ``C = A @ B``.

  Thread ``(x, y)`` accumulates the dot product of row ``x`` of ``A`` with
  column ``y`` of ``B``.

  Args:
    A_device: Left operand (2-D device array).
    B_device: Right operand (2-D device array); its first dimension is
      assumed to match ``A_device.shape[1]``.
    C_device: Output (2-D device array), written in place.
  """
  x, y = cuda.grid(2)
  # The launch grid may be larger than the matrix; threads that fall outside
  # it must not touch memory.
  if x < C_device.shape[0] and y < C_device.shape[1]:
    tmp = 0.
    # Iterate over the shared (inner) dimension, which is A's column count;
    # this equals C_device.shape[0] only when the matrices are square.
    for k in range(A_device.shape[1]):
      tmp += A_device[x, k] * B_device[k, y]
    C_device[x, y] = tmp

def gpu_matrix_add(A, B):
  """Return the element-wise sum of ``A`` and ``B``, computed on the GPU."""
  kernel = gpu_matrix_thread_add
  return gpu_matrix_wrapper(A, B, kernel)

def gpu_matrix_sub(A, B):
  """Return the element-wise difference ``A - B``, computed on the GPU."""
  kernel = gpu_matrix_thread_sub
  return gpu_matrix_wrapper(A, B, kernel)

def gpu_matrix_mult(A, B):
  """Return the matrix product of ``A`` and ``B``, computed on the GPU."""
  kernel = gpu_matrix_thread_mult
  return gpu_matrix_wrapper(A, B, kernel)

In [None]:
# Two 1024x1024 matrices of random integers drawn from [0, 5), used as the
# benchmark operands.
A, B = (np.random.randint(0, 5, size=(1024, 1024)) for _ in range(2))

In [None]:
%timeit cpu_C_add_res = np.add(A, B)
%timeit cpu_C_sub_res = np.subtract(A, B)
%timeit cpu_C_mult_res = np.multiply(A, B)

In [None]:
%timeit cpu_C_add_res = gpu_matrix_add(A, B)
%timeit cpu_C_sub_res = gpu_matrix_sub(A, B)
%timeit cpu_C_mult_res = gpu_matrix_mult(A, B)