<a href="https://colab.research.google.com/github/etfrer-yi/Numba-Accelerated-Matrix-Operations/blob/main/Numba_Matrix_Operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from numba import cuda
import numpy as np

In [None]:
# Upper bound on threads per block along each axis (16 * 16 = 256 threads).
_MAX_THREADS_PER_AXIS = 16


def _largest_divisor_at_most(n, limit):
  """Return the largest divisor of the positive integer `n` that is <= `limit`."""
  for candidate in range(min(n, limit), 0, -1):
    if n % candidate == 0:
      return candidate
  return 1


def gpu_matrix_wrapper(A, B, fn):
  """Run the element-wise CUDA kernel `fn` on two equally shaped matrices.

  Both inputs are copied to the device, `fn` is launched on a 2-D grid with
  one thread per output element, and the result is copied back to the host.

  The block dimensions are chosen as exact divisors of the matrix dimensions,
  so the grid covers the matrix with no surplus threads. A kernel that writes
  `C[y][x]` for `x, y = cuda.grid(2)` therefore stays inside the arrays even
  if it performs no bounds check of its own. The price is small blocks for
  dimensions with no divisor close to `_MAX_THREADS_PER_AXIS` (e.g. primes).

  Args:
    A: 2-D array-like, left operand.
    B: 2-D array-like with the same shape as `A`, right operand.
    fn: CUDA kernel invoked as `fn[blocks, threads](A_device, B_device,
      C_device)`; it must write its result into `C_device`.

  Returns:
    A host NumPy array with the shape of `A` and dtype
    `np.result_type(A, B)` holding what `fn` wrote to `C_device`.

  Raises:
    ValueError: if `A` is not 2-D or `A` and `B` differ in shape.
  """
  A = np.asarray(A)
  B = np.asarray(B)
  if A.ndim != 2 or A.shape != B.shape:
    raise ValueError(
        f"A and B must be 2-D arrays of the same shape, got {A.shape} and {B.shape}")

  rows, cols = A.shape
  # Match NumPy's promotion so integer inputs are not silently turned into
  # float64 (the default dtype of cuda.device_array).
  result_dtype = np.result_type(A, B)
  if rows == 0 or cols == 0:
    # Nothing to compute; avoid asking CUDA for a zero-sized launch.
    return np.empty((rows, cols), dtype=result_dtype)

  A_device = cuda.to_device(A)
  B_device = cuda.to_device(B)
  C_device = cuda.device_array((rows, cols), dtype=result_dtype)

  # cuda.grid(2) yields (x, y); the kernels use x as the column index and y as
  # the row index, so the first launch dimension spans columns.
  threads_per_block = (
      _largest_divisor_at_most(cols, _MAX_THREADS_PER_AXIS),
      _largest_divisor_at_most(rows, _MAX_THREADS_PER_AXIS),
  )
  blocks_per_grid = (
      cols // threads_per_block[0],
      rows // threads_per_block[1],
  )
  fn[blocks_per_grid, threads_per_block](A_device, B_device, C_device)
  # Wait for the kernel to finish before copying the result back.
  cuda.synchronize()
  return C_device.copy_to_host()

In [None]:
@cuda.jit
def gpu_matrix_thread_add(A_device, B_device, C_device):
  """CUDA kernel: store the element-wise sum of A and B in C.

  Each thread starts at its own (row, column) position and advances by the
  total grid size along each axis until it runs past the bounds of
  `C_device`. Every element of `C_device` is therefore written exactly once
  whatever the launch configuration (1-D or 2-D, smaller or larger than the
  matrix), and no thread indexes outside `C_device`.

  Args:
    A_device: 2-D device array, left operand.
    B_device: 2-D device array, right operand.
    C_device: 2-D device array receiving the result.

  Note: `A_device` and `B_device` are indexed with `C_device`'s bounds, so
  they are assumed to be at least as large as `C_device` along both axes.
  """
  x, y = cuda.grid(2)                    # x -> column, y -> row
  stride_x, stride_y = cuda.gridsize(2)  # total threads along each axis
  rows, cols = C_device.shape
  for row in range(y, rows, stride_y):
    for col in range(x, cols, stride_x):
      C_device[row, col] = A_device[row, col] + B_device[row, col]

@cuda.jit
def gpu_matrix_thread_sub(A_device, B_device, C_device):
  """CUDA kernel: store the element-wise difference A - B in C.

  Each thread starts at its own (row, column) position and advances by the
  total grid size along each axis until it runs past the bounds of
  `C_device`. Every element of `C_device` is therefore written exactly once
  whatever the launch configuration (1-D or 2-D, smaller or larger than the
  matrix), and no thread indexes outside `C_device`.

  Args:
    A_device: 2-D device array, minuend.
    B_device: 2-D device array, subtrahend.
    C_device: 2-D device array receiving the result.

  Note: `A_device` and `B_device` are indexed with `C_device`'s bounds, so
  they are assumed to be at least as large as `C_device` along both axes.
  """
  x, y = cuda.grid(2)                    # x -> column, y -> row
  stride_x, stride_y = cuda.gridsize(2)  # total threads along each axis
  rows, cols = C_device.shape
  for row in range(y, rows, stride_y):
    for col in range(x, cols, stride_x):
      C_device[row, col] = A_device[row, col] - B_device[row, col]

def gpu_matrix_add(A, B):
  """Add matrices A and B on the GPU.

  Hands both operands to `gpu_matrix_wrapper` together with the
  `gpu_matrix_thread_add` kernel and returns whatever the wrapper returns.
  """
  kernel = gpu_matrix_thread_add
  result = gpu_matrix_wrapper(A, B, kernel)
  return result

def gpu_matrix_sub(A, B):
  """Subtract matrix B from matrix A on the GPU.

  Hands both operands to `gpu_matrix_wrapper` together with the
  `gpu_matrix_thread_sub` kernel and returns whatever the wrapper returns.
  """
  kernel = gpu_matrix_thread_sub
  result = gpu_matrix_wrapper(A, B, kernel)
  return result

In [None]:
# Benchmark inputs: two square integer matrices with entries in [0, 5).
# A seeded local generator keeps the data identical across Restart & Run All
# without touching NumPy's global random state.
MATRIX_SIZE = 1024
SEED = 0
rng = np.random.default_rng(SEED)
A = rng.integers(0, 5, size=(MATRIX_SIZE, MATRIX_SIZE))
B = rng.integers(0, 5, size=(MATRIX_SIZE, MATRIX_SIZE))

In [None]:
# CPU baseline: time NumPy's element-wise addition and subtraction of A and B.
%timeit A + B
%timeit A - B

1.06 ms ± 30.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
964 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# GPU versions of the same two operations. Each call goes through
# gpu_matrix_wrapper, so the timing covers the host-to-device copies of A and
# B, the kernel launch, the synchronize, and the device-to-host copy of the
# result -- i.e. end-to-end cost, not kernel execution time alone.
%timeit gpu_matrix_add(A, B)
%timeit gpu_matrix_sub(A, B)

8 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.33 ms ± 58.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
