In [2]:
import torch
import time
# Report which device is visible: the name of CUDA device 0 when CUDA is
# available, otherwise the CPU.
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f"Using {device_name}")

Using GeForce MX150


# Multiple views of a storage

In [3]:
# Start from a 13x13 integer grid of ones and fill the pattern in place with
# indexed assignments.
mat = torch.full((13, 13), 1, dtype=int, device='cpu')

# Columns, then rows, 1, 6 and 11 become the lattice of 2s.
lattice = [1, 6, 11]
mat[:, lattice] = 2
mat[lattice, :] = 2

# 2x2 patches of 3s at every combination of the index bands 3-4 and 8-9.
bands = (slice(3, 5), slice(8, 10))
for row_band in bands:
    for col_band in bands:
        mat[row_band, col_band] = 3
print(mat)

tensor([[1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1],
        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1],
        [1, 2, 1, 3, 3, 1, 2, 1, 3, 3, 1, 2, 1],
        [1, 2, 1, 3, 3, 1, 2, 1, 3, 3, 1, 2, 1],
        [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1],
        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1],
        [1, 2, 1, 3, 3, 1, 2, 1, 3, 3, 1, 2, 1],
        [1, 2, 1, 3, 3, 1, 2, 1, 3, 3, 1, 2, 1],
        [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1],
        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1]])


# Eigendecomposition

In [4]:
# Build a matrix with a known spectrum: mat = M^-1 . diag(1..20) . M is similar
# to D, so its eigenvalues are 1, 2, ..., 20 up to rounding error.
torch.manual_seed(42)
M = torch.empty((20,20), dtype=float).normal_(mean=0., std=1.)
D = torch.diag(torch.arange(1,21, dtype=float))
Minv=torch.inverse(M)
mat = Minv.mm(D).mm(M)

# torch.eig has been removed from PyTorch; torch.linalg.eig is its replacement.
# It returns complex tensors: `eigvals` has shape (20,), `evect` holds one
# eigenvector per column. The name `eigvals` (rather than `eval`) keeps the
# Python builtin `eval` usable in the rest of the kernel session.
eigvals, evect = torch.linalg.eig(mat)

# Display the eigenvalues as (real, imag) columns, the layout torch.eig used.
torch.view_as_real(eigvals)


tensor([[ 1.0000,  0.0000],
        [20.0000,  0.0000],
        [19.0000,  0.0000],
        [18.0000,  0.0000],
        [17.0000,  0.0000],
        [ 2.0000,  0.0000],
        [ 3.0000,  0.0000],
        [16.0000,  0.0000],
        [15.0000,  0.0000],
        [ 4.0000,  0.0000],
        [ 5.0000,  0.0000],
        [14.0000,  0.0000],
        [13.0000,  0.0000],
        [12.0000,  0.0000],
        [ 6.0000,  0.0000],
        [11.0000,  0.0000],
        [10.0000,  0.0000],
        [ 7.0000,  0.0000],
        [ 9.0000,  0.0000],
        [ 8.0000,  0.0000]], dtype=torch.float64)

# Flops per second

In [5]:
N = 5000
# Two N x N float32 operands with standard-normal entries, drawn one after the
# other (M1 first, then M2).
M1, M2 = (torch.empty((N, N), dtype=torch.float32).normal_(0, 1) for _ in range(2))
def timemul(M1, M2):
    """Return the wall-clock time, in seconds, of one ``M1.mm(M2)`` product.

    Args:
        M1: Left operand; a 2-D tensor.
        M2: Right operand; a 2-D tensor on the same device as ``M1``.

    Returns:
        Elapsed seconds as a float, measured with ``time.perf_counter``.

    CUDA kernels are launched asynchronously, so for CUDA inputs the device is
    synchronized before starting and before stopping the clock; otherwise the
    measurement could end before the product has actually been computed.
    """
    on_gpu = M1.is_cuda
    if on_gpu:
        # Drain any pending GPU work so it is not billed to this product.
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    M1.mm(M2)
    if on_gpu:
        # Wait for the matmul kernel to finish before reading the clock.
        torch.cuda.synchronize()
    t2 = time.perf_counter()
    return t2-t1
# N**3 is the number of scalar products in an N x N by N x N multiplication.
print(f"Throughput CPU: {N**3/timemul(M1, M2):e} flops/s")

# .cuda() raises on a machine without a CUDA device, so the GPU measurement is
# only attempted when one is present. The host-to-device copies happen while
# the arguments are evaluated, i.e. before timemul starts its clock.
if torch.cuda.is_available():
    print(f"Throughput GPU: {N**3/timemul(M1.cuda(), M2.cuda()):e} flops/s")
else:
    print("Throughput GPU: skipped (CUDA not available)")


Throughput CPU: 1.137961e+11 flops/s
Throughput GPU: 3.426319e+11 flops/s


#  Playing with strides

In [7]:
def mul_row(mat):
    """Scale each row of a 2-D tensor by its 1-based row index, one element at a time.

    This is the deliberately slow reference implementation: a pure-Python
    double loop over every entry.

    Args:
        mat: A 2-D tensor.

    Returns:
        A new float32 tensor of the same shape where entry ``[i, j]`` equals
        ``mat[i, j] * (i + 1)``.
    """
    height, width = mat.shape
    scaled = torch.empty(mat.shape, dtype=torch.float32)
    for row in range(height):
        factor = row + 1
        for col in range(width):
            scaled[row, col] = mat[row, col] * factor
    return scaled

def mul_row_fast(mat):
    """Scale each row of a 2-D tensor by its 1-based row index using broadcasting.

    Args:
        mat: A 2-D tensor.

    Returns:
        A new tensor of the same shape where row ``i`` equals
        ``mat[i] * (i + 1)``. The result dtype follows PyTorch type promotion
        between ``mat`` and the integer row factors.
    """
    # Create the factors on mat's device so the product also works for tensors
    # that do not live on the CPU.
    row_factors = torch.arange(1, mat.shape[0] + 1, device=mat.device)
    # [:, None] turns the factors into a column that broadcasts across columns.
    return mat * row_factors[:, None]

M = torch.empty((1000,400), dtype=torch.float32).fill_(1)

# Time only the computation. Printing a tensor formats it to text first, and
# that cost would otherwise be billed to the function being measured.
t1 = time.perf_counter()
slow_result = mul_row(M)
t2 = time.perf_counter()
print(slow_result)
print(f"Slow time = {t2-t1}")

t1 = time.perf_counter()
fast_result = mul_row_fast(M)
t2 = time.perf_counter()
print(fast_result)
print(f"Fast time = {t2-t1}")

# The speed comparison is only meaningful if both versions compute the same thing.
assert torch.equal(slow_result, fast_result), "mul_row and mul_row_fast disagree"

tensor([[   1.,    1.,    1.,  ...,    1.,    1.,    1.],
        [   2.,    2.,    2.,  ...,    2.,    2.,    2.],
        [   3.,    3.,    3.,  ...,    3.,    3.,    3.],
        ...,
        [ 998.,  998.,  998.,  ...,  998.,  998.,  998.],
        [ 999.,  999.,  999.,  ...,  999.,  999.,  999.],
        [1000., 1000., 1000.,  ..., 1000., 1000., 1000.]])
Slow time = 7.91157078599997
tensor([[   1.,    1.,    1.,  ...,    1.,    1.,    1.],
        [   2.,    2.,    2.,  ...,    2.,    2.,    2.],
        [   3.,    3.,    3.,  ...,    3.,    3.,    3.],
        ...,
        [ 998.,  998.,  998.,  ...,  998.,  998.,  998.],
        [ 999.,  999.,  999.,  ...,  999.,  999.,  999.],
        [1000., 1000., 1000.,  ..., 1000., 1000., 1000.]])
Fast time = 0.0022343939999700524
