In [1]:
import torch
import numpy as np
from numba import njit
from numba import njit,float64,prange
import numba
# Cap Numba's parallel thread pool at 4 workers. Assigning to
# numba.config.NUMBA_DEFAULT_NUM_THREADS after import does not resize the
# pool, so use the runtime API instead. The min() keeps the call valid on
# machines exposing fewer than 4 threads (set_num_threads rejects values
# above NUMBA_NUM_THREADS).
numba.set_num_threads(min(4, numba.config.NUMBA_NUM_THREADS))

In [2]:
@njit
def matrix_dot(m1,m2):
    """Return the matrix product of ``m1`` and ``m2`` using explicit loops.

    Parameters
    ----------
    m1 : 2-D array of shape (n, p)
    m2 : 2-D array of shape (p, q)

    Returns
    -------
    float64 array of shape (n, q).

    Raises
    ------
    ValueError
        If the column count of ``m1`` differs from the row count of ``m2``.
    """
    n_rows, n_inner = m1.shape
    n_cols = m2.shape[1]
    if m2.shape[0] != n_inner:
        raise ValueError("matrix_dot: inner dimensions of m1 and m2 do not match")

    # The result has m1's row count and m2's column count, so rectangular
    # operands are handled as well as square ones.
    m3 = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            for k in range(n_inner):
                m3[i, j] += m1[i,k]*m2[k,j]
    return m3

In [3]:
# Smoke test: multiply a small integer matrix (rows of 1s, 2s and 3s) by itself.
m1 = np.array([[v, v, v] for v in (1, 2, 3)])
m2 = m1  # same array object used as both operands
matrix_dot(m1, m2)

array([[ 6.,  6.,  6.],
       [12., 12., 12.],
       [18., 18., 18.]])

In [4]:
m1 = np.random.rand(100, 100)
m2 = np.random.rand(100, 100)# 前行后列
%timeit matrix_dot(m1,m2)

1.21 ms ± 91.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
@njit
def b(i,k):
    """Return ``sin(pi*(i - 0.5)/200) * cos(pi*k/200)``."""
    sin_term = np.sin(np.pi*(i-0.5)/200)
    cos_term = np.cos(np.pi*k/200)
    return sin_term*cos_term

@njit
def c(k,j):
    """Return ``exp(-(k - 0.5)/200*pi) * sqrt(j*(j - 1)/200**2)``."""
    exp_term = np.exp(-(k-0.5)/200*np.pi)
    sqrt_term = np.sqrt(j*(j-1)/200**2)
    return exp_term*sqrt_term

In [6]:
@njit(nogil=True, parallel=True)
def main():
    """Fill two 200x200 matrices from ``b`` and ``c`` and return their product.

    Entry ``[i, j]`` of each matrix is evaluated at the 1-based indices
    ``(i + 1, j + 1)``; the multiplication is delegated to ``matrix_dot``.
    """
    size = 200
    mat_b = np.zeros((size, size))
    mat_c = np.zeros((size, size))
    # NOTE(review): both loop levels are written as prange; Numba's docs
    # suggest only the outermost one is actually parallelised -- confirm.
    for row in prange(size):
        for col in prange(size):
            mat_b[row, col] = b(row + 1, col + 1)
            mat_c[row, col] = c(row + 1, col + 1)

    product = matrix_dot(mat_b, mat_c)
    return product


In [15]:
%timeit mr = main()
print(mr)

11.4 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
[[0.         0.00182956 0.00316889 ... 0.25550399 0.25679769 0.25809139]
 [0.         0.00548823 0.0095059  ... 0.76644893 0.77032971 0.77421048]
 [0.         0.00914555 0.01584056 ... 1.27720475 1.28367166 1.29013856]
 ...
 [0.         0.00914555 0.01584056 ... 1.27720475 1.28367166 1.29013856]
 [0.         0.00548823 0.0095059  ... 0.76644893 0.77032971 0.77421048]
 [0.         0.00182956 0.00316889 ... 0.25550399 0.25679769 0.25809139]]


In [32]:
# Show a single entry at full precision; the printed matrix is rounded.
mr[0][1]

0.0018295613429145496

In [1]:
import torch

In [13]:
@njit(nogil=True, parallel=True)
def gen_bc():
    """Build the 200x200 ``b`` and ``c`` matrices and return them as a pair.

    Entry ``[i, j]`` of each matrix is evaluated at the 1-based indices
    ``(i + 1, j + 1)``.
    """
    n = 200
    dims = (n, n)
    b_vals = np.zeros(dims)
    c_vals = np.zeros(dims)
    # NOTE(review): the inner prange is presumably run serially by Numba
    # (only the outer loop parallelised) -- confirm against the Numba docs.
    for r in prange(n):
        for s in prange(n):
            b_vals[r, s] = b(r + 1, s + 1)
            c_vals[r, s] = c(r + 1, s + 1)

    return b_vals, c_vals

In [54]:
%%timeit
mb, mc = gen_bc()
tensor_b = torch.from_numpy(mb).cuda()
tensor_c = torch.from_numpy(mc).cuda()

tensor_d = torch.mm(tensor_b, tensor_c)

678 µs ± 18.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
# Raise the tensor print precision to 16 decimals, then show one entry.
torch.set_printoptions(precision=16)
tensor_d[0][1]

tensor(0.0018295613429145, device='cuda:0', dtype=torch.float64)

In [23]:
%%timeit
tb = torch.zeros((200, 200)).cuda()
tc = torch.zeros((200, 200)).cuda()

for i in range(200):
    for j in range(200):
        tb[i,j] = b(i+1,j+1)
        tb[i,j] = c(i+1,j+1)

3.33 s ± 114 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# Time gen_bc() on its own: matrix generation only, no GPU transfer or matmul.
%timeit gen_bc()

408 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
