In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from simple_conv_net_train import SimpleConvNet
from simple_conv_net_func import diff_mse
from simple_conv_net_func import conv2d_scalar, pool2d_scalar, relu_scalar, reshape_scalar, fc_layer_scalar
from simple_conv_net_func import conv2d_vector, pool2d_vector, relu_vector, reshape_vector, fc_layer_vector

### Reproducibility

In [2]:
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

<IPython.core.display.Javascript object>

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

<IPython.core.display.Javascript object>

### Get single batch to analyse performance


In [4]:
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    ),
    batch_size=1,
    shuffle=False,
)

data, target = train_loader.__iter__().__next__()
data, target = data.to(device), target.to(device)

<IPython.core.display.Javascript object>

### Initialize torch model


In [5]:
model = SimpleConvNet(device)
model.train()

SimpleConvNet(
  (conv_layer): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc_layer1): Linear(in_features=2880, out_features=500, bias=True)
  (fc_layer2): Linear(in_features=500, out_features=10, bias=True)
)

<IPython.core.display.Javascript object>

### Analyse each layer speed and check whether results are correct

### Convolution layer

In [6]:
%%timeit
global z_conv
z_conv = model.conv_layer(data)

55 µs ± 1.41 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

In [7]:
%%timeit
global z_conv_vector
z_conv_vector = conv2d_vector(
    data, model.conv_layer.weight, model.conv_layer.bias, device
)

787 µs ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

In [8]:
%%timeit -n 1 -r 1
global z_conv_scalar
z_conv_scalar = conv2d_scalar(
    data, model.conv_layer.weight, model.conv_layer.bias, device
)

11.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<IPython.core.display.Javascript object>

In [9]:
diff_mse(z_conv, z_conv_vector), diff_mse(z_conv, z_conv_scalar)

(5.785013954147638e-15, 3.1626666532929204e-15)

<IPython.core.display.Javascript object>

### Maxpooling 2d

In [10]:
%%timeit
global z_pool
z_pool = F.max_pool2d(z_conv, 2, 2)

18.5 µs ± 51.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


<IPython.core.display.Javascript object>

In [11]:
%%timeit
global z_pool_vector
z_pool_vector = pool2d_vector(z_conv_vector, device)

232 µs ± 3.95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

In [12]:
%%timeit
global z_pool_scalar
z_pool_scalar = pool2d_scalar(z_conv_scalar, device)

422 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<IPython.core.display.Javascript object>

In [13]:
diff_mse(z_conv, z_conv_vector), diff_mse(z_conv, z_conv_scalar)

(5.785013954147638e-15, 3.1626666532929204e-15)

<IPython.core.display.Javascript object>

### Reshape

In [14]:
%%timeit
global z_pool_reshaped
z_pool_reshaped = z_pool.view(-1, 20 * 12 * 12)

2.37 µs ± 22.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


<IPython.core.display.Javascript object>

In [15]:
%%timeit
global z_pool_reshaped_vector
z_pool_reshaped_vector = reshape_vector(z_pool_vector, device)

18.1 µs ± 92.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


<IPython.core.display.Javascript object>

In [16]:
%%timeit
global z_pool_reshaped_scalar
z_pool_reshaped_scalar = reshape_scalar(z_pool_scalar, device)

73.2 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


<IPython.core.display.Javascript object>

In [17]:
diff_mse(z_pool_reshaped, z_pool_reshaped_vector), diff_mse(
    z_pool_reshaped, z_pool_reshaped_scalar
)

(5.834178711021593e-15, 2.9984563992152286e-15)

<IPython.core.display.Javascript object>

### Fully connected layer 1

In [18]:
%%timeit
global z_fc1
z_fc1 = model.fc_layer1(z_pool_reshaped)

87.6 µs ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

In [19]:
%%timeit
global z_fc1_vector
z_fc1_vector = fc_layer_vector(
    z_pool_reshaped_vector, model.fc_layer1.weight, model.fc_layer1.bias, device
)

76.7 µs ± 19.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

In [20]:
%%timeit
global z_fc1_scalar
z_fc1_scalar = fc_layer_scalar(
    z_pool_reshaped_scalar, model.fc_layer1.weight, model.fc_layer1.bias, device
)


KeyboardInterrupt



<IPython.core.display.Javascript object>

In [None]:
diff_mse(z_fc1, z_fc1_vector), diff_mse(z_fc1, z_fc1_scalar)

### ReLU

In [None]:
%%timeit 
global z_relu
z_relu = F.relu(z_fc1)

In [None]:
%%timeit 
global z_relu_vector
z_relu_vector = relu_vector(z_fc1_vector, device)

In [None]:
%%timeit
global z_relu_scalar
z_relu_scalar = relu_scalar(z_fc1_scalar, device)

In [None]:
diff_mse(z_relu, z_relu_vector), diff_mse(z_relu, z_relu_scalar)

### Fully connected layer 2

In [None]:
%%timeit 
global z_fc2
z_fc2 = model.fc_layer2(z_relu)

In [None]:
%%timeit 
global z_fc2_vector
z_fc2_vector =  fc_layer_vector(z_relu_vector, model.fc_layer2.weight, model.fc_layer2.bias, device)

In [None]:
%%timeit 
global z_fc2_scalar
z_fc2_scalar =  fc_layer_scalar(z_relu_scalar, model.fc_layer2.weight, model.fc_layer2.bias, device)

In [None]:
diff_mse(z_fc2, z_fc2_vector), diff_mse(z_fc2, z_fc2_scalar)

###

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)


In [None]:
optimizer.zero_grad()
output = model.forward(data)

loss = F.nll_loss(torch.log(output), target)
loss.backward()
optimizer.step()