In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from simple_conv_net_train import SimpleConvNet
from simple_conv_net_func import *

### Reproducibility

In [2]:
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Get single batch to analyse performance


In [4]:
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    ),
    batch_size=64,
    shuffle=False,
)

data, target = train_loader.__iter__().__next__()
data, target = data.to(device), target.to(device)

### Initialize torch model


In [5]:
model = SimpleConvNet(device)
model.train()

SimpleConvNet(
  (conv_layer): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc_layer1): Linear(in_features=2880, out_features=500, bias=True)
  (fc_layer2): Linear(in_features=500, out_features=10, bias=True)
)

### Analyse each layer speed and check whether results are correct

### Convolution layer

###### Declare each layer variable as global. Need this hack to access variables from timeit cells

In [6]:
%%timeit
global z_conv

z_conv = model.conv_layer(data)

168 µs ± 6.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
%%timeit
global z_conv_vector

z_conv_vector = conv2d_vector(data, model.conv_layer.weight, model.conv_layer.bias, device)

5.74 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
%%timeit -n 1 -r 1
global z_conv_scalar

# Deactivate autograd here to reduce memory usage (out of memory error otherwise)
with torch.no_grad():
    z_conv_scalar = conv2d_scalar(data, model.conv_layer.weight, model.conv_layer.bias, device)

11min 17s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [9]:
diff_mse(z_conv, z_conv_vector), diff_mse(z_conv, z_conv_scalar)

(5.606953652591154e-15, 3.1910765618603032e-15)

### Maxpooling 2d

In [10]:
%%timeit
global z_pool

z_pool = F.max_pool2d(z_conv, 2, 2)

61.2 µs ± 133 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%%timeit
global z_pool_vector

z_pool_vector = pool2d_vector(z_conv_vector, device)

857 µs ± 2.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
%%timeit
global z_pool_scalar

z_pool_scalar = pool2d_scalar(z_conv_scalar, device)

23.8 s ± 366 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
diff_mse(z_conv, z_conv_vector), diff_mse(z_conv, z_conv_scalar)

(5.606953652591154e-15, 3.1910765618603032e-15)

### Reshape

In [14]:
%%timeit
global z_pool_reshaped

z_pool_reshaped = z_pool.view(-1, 20 * 12 * 12)

2.82 µs ± 11.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [15]:
%%timeit
global z_pool_reshaped_vector

z_pool_reshaped_vector = reshape_vector(z_pool_vector, device)

21.3 µs ± 87.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
%%timeit
global z_pool_reshaped_scalar

z_pool_reshaped_scalar = reshape_scalar(z_pool_scalar, device)

3.66 s ± 20.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
diff_mse(z_pool_reshaped, z_pool_reshaped_vector), diff_mse(z_pool_reshaped, z_pool_reshaped_scalar)

(5.630004807217733e-15, 2.9688413745221397e-15)

### Fully connected layer 1

In [18]:
%%timeit
global z_fc1

z_fc1 = model.fc_layer1(z_pool_reshaped)

272 µs ± 8.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [19]:
%%timeit
global z_fc1_vector

z_fc1_vector = fc_layer_vector(z_pool_reshaped_vector, model.fc_layer1.weight, model.fc_layer1.bias, device)

280 µs ± 22.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
%%timeit -n 1 -r 1
global z_fc1_scalar

# Deactivate autograd here to reduce memory usage (out of memory error otherwise)
with torch.no_grad():
    z_fc1_scalar = fc_layer_scalar(z_pool_reshaped_scalar, model.fc_layer1.weight, model.fc_layer1.bias, device)

1h 16min 13s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
diff_mse(z_fc1, z_fc1_vector), diff_mse(z_fc1, z_fc1_scalar)

(1.0316855368326747e-14, 1.6375784192210197e-13)

### ReLU

In [22]:
%%timeit 
global z_relu

z_relu = F.relu(z_fc1)

15.7 µs ± 55.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [23]:
%%timeit 
global z_relu_vector

z_relu_vector = relu_vector(z_fc1_vector, device)

131 µs ± 6.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
%%timeit
global z_relu_scalar

z_relu_scalar = relu_scalar(z_fc1_scalar, device)

1.74 s ± 44.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
diff_mse(z_relu, z_relu_vector), diff_mse(z_relu, z_relu_scalar)

(5.044605514293709e-15, 8.040198010410976e-14)

### Fully connected layer 2

In [26]:
%%timeit 
global z_fc2

z_fc2 = model.fc_layer2(z_relu)

36.3 µs ± 230 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
%%timeit 
global z_fc2_vector

z_fc2_vector =  fc_layer_vector(z_relu_vector, model.fc_layer2.weight, model.fc_layer2.bias, device)

38.3 µs ± 687 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
%%timeit -r 1 -n 1
global z_fc2_scalar

# Deactivate autograd here to reduce memory usage (out of memory error otherwise)
with torch.no_grad():
    z_fc2_scalar =  fc_layer_scalar(z_relu_scalar, model.fc_layer2.weight, model.fc_layer2.bias, device)

21 s ± 356 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
diff_mse(z_fc2, z_fc2_vector), diff_mse(z_fc2, z_fc2_scalar)

(2.1420055043786445e-15, 2.7987505095048403e-14)

### Measure training time on 1 batch

#### Pytorch step

In [30]:
%%timeit
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
optimizer.zero_grad()
output = model.forward(data)

loss = F.nll_loss(torch.log(output), target)
loss.backward()
optimizer.step()

2.6 ms ± 35.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


##### Vector step

In [31]:
%%timeit
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
optimizer.zero_grad()
output = model.forward_vector(data)

loss = F.nll_loss(torch.log(output), target)
loss.backward()
optimizer.step()

11.8 ms ± 51.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


##### Scalar step

In [32]:
%%timeit -n 1 -r 1
with torch.no_grad():
    output = model.forward_scalar(data)

1h 26min 5s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### We can see here how crucial is vectorization in terms of performance