In [1]:
from numba import cuda
import numpy as np
import timeit
import time

In [2]:
@cuda.jit
def multiply_matrix_vector_with_synchronize(matrix, vector, result):
    i, j = cuda.grid(2)
    if i < matrix.shape[0] and j < vector.shape[0]:
        cuda.syncthreads()
        result[i] += matrix[i, j] * vector[j]

matrix = np.random.randint(100, size=(10000, 10000))
vector = np.random.randint(100, size=(10000))
result = np.zeros(10000, dtype=np.float32)

threads_per_block = 3
blocks_per_grid = 3

start = time.time()
multiply_matrix_vector_with_synchronize[blocks_per_grid, threads_per_block](matrix, vector, result)
end = time.time()
print(end-start)

0.6233041286468506


In [3]:
def sequential_matrix_vector(matrix, vector):
    result = np.zeros(10000, dtype=np.float32)
    for i in range(matrix.shape[0]):
        for j in range(vector.shape[0]):
            result[i] += matrix[i, j] * vector[j]
    return result

start = time.time()
sequential_matrix_vector(matrix, vector)
end = time.time()
print(end-start)

256.94266510009766


In [4]:
#speedup
256.94266510009766/0.6233041286468506

412.2267979483821