In [65]:
import numpy as np
from numba import njit, prange
import random

In [66]:
A, B = [], []
rows, cols = 250, 250
def gen_rnd_matrix(rows, cols):
    M = []
    for i in range(rows):
        row = []
        for j in range(cols):
            row.append(random.randint(0, 2 ** 16))
        M.append(row)
    return M
A = gen_rnd_matrix(rows, cols)
B = gen_rnd_matrix(rows, cols)
len(A), len(A[0])

(250, 250)

### python

In [67]:
def dot(a: list, b: list):
    accum = []
    for a_val, b_val in zip(a, b):
        accum.append(a_val * b_val)
    return sum(accum)

In [68]:
assert dot(A[0], B[0]) == np.dot(A[0], B[0])

In [69]:
def matmul(a: list[list], b: list[list]):
    m = len(a)
    p = len(b[0]) 
    c = []
    for i in range(m):
        row = []
        for j in range(p):
            row.append(dot(a[i], [r[j] for r in b]))
        c.append(row)
    return c

In [70]:
assert (matmul(A, B) == np.matmul(A, B)).all()

### jit

In [71]:
@njit()
def jdot(a: np.array, b: np.array):
    accum = 0
    for i in range(len(a)):
        accum += a[i] * b[i]
    return accum

In [72]:
@njit(parallel=True)
def jmatmul(a: np.array, b: np.array):
    c = np.zeros((a.shape[0], b.shape[1]))
    for i in prange(c.shape[0]):
        for j in prange(c.shape[1]):
            c[i, j] = jdot(a[i, :], b[:, j])
    return c

In [73]:
A_npy, B_npy = np.array(A), np.array(B) # jit requires numpy input for nested list

In [74]:
# warmup
_ = jdot(A_npy[0, :], B_npy[0, :])
_ = jmatmul(A_npy, B_npy)

### dot times

In [75]:
%timeit dot(A[0], B[0])  # run once since very slow

22 µs ± 747 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [76]:
%timeit np.dot(A_npy[0, :], B_npy[0, :])

1.64 µs ± 38.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [77]:
%timeit jdot(A_npy[0, :], B_npy[0, :])

753 ns ± 16.2 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


### matmul times

In [78]:
%time _ = matmul(A, B) # run once since very slow

CPU times: user 2.13 s, sys: 4.1 ms, total: 2.14 s
Wall time: 2.14 s


In [81]:
%timeit np.matmul(A_npy, B_npy)

9.33 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [82]:
%timeit jmatmul(A_npy, B_npy)

2.2 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
