In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
# --- Run configuration shared by every training cell below ---
use_cuda = True        # set to False to force CPU even when a GPU is present
log_interval = 512     # train() prints the batch loss every `log_interval` batches
# Use the GPU only when it was requested AND is actually available; otherwise CPU.
device = torch.device('cuda' if (torch.cuda.is_available() and use_cuda) else 'cpu')
epochs = 20            # default number of passes over the training set
lr = 1e-3              # SGD learning rate
momentum = 0.5         # SGD momentum
test_batch_size = 512
batch_size = 64
save_model = False     # when True, each experiment saves its weights with torch.save

# Seed PyTorch's global RNG so weight initialisation (and anything else drawing
# from it) starts from the same state on every Restart & Run All.
seed = 0
torch.manual_seed(seed)

#### A

#### Default architecture from https://github.com/pytorch/examples/blob/master/mnist/main.py

In [3]:
class Def_Net(nn.Module):
    """Baseline CNN: two conv -> ReLU -> max-pool stages followed by two linear layers.

    The network takes single-channel images (``conv1`` has one input channel)
    and produces 10 outputs per sample. The hard-coded ``4*4*50`` flatten size
    matches 28x28 inputs such as MNIST digits.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

Train and test helper functions, defined once and reused by every experiment below

In [4]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    Each batch is moved to ``device``, the negative log-likelihood loss of the
    model output is back-propagated, and ``optimizer`` takes one step. The
    batch loss is printed every ``log_interval`` batches (``log_interval`` is
    the notebook-level setting); ``epoch`` is only used in that log line.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        # Only every `log_interval`-th batch (starting with batch 0) is reported.
        if step % log_interval != 0:
            continue
        samples_seen = step * len(inputs)
        samples_total = len(train_loader.dataset)
        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, samples_total, percent_done, batch_loss.item()))

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [5]:
### Some additional code to configure dataloaders
# Pinned memory / a worker process are only requested when batches will really be
# copied to a GPU. Keying this on `device` (not on the `use_cuda` wish) keeps
# CPU-only machines from asking for pinned memory they cannot use.
kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {}

# Same preprocessing for both splits: convert to tensor, then normalise with the
# mean/std constants used by the PyTorch MNIST example this notebook follows.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./data', train=True, download=True, transform=mnist_transform),
    batch_size=batch_size, shuffle=True, **kwargs)
# Evaluation sums loss and correct counts over the whole split, so sample order
# is irrelevant and shuffling the test set is unnecessary.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./data', train=False, transform=mnist_transform),
    batch_size=test_batch_size, shuffle=False, **kwargs)

In [6]:
# Baseline run: the default ReLU network trained with SGD + momentum.
model = Def_Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model.state_dict(),"SavedWeights/mnist_cnn_relu.pt")


Test set: Average loss: 0.5057, Accuracy: 8741/10000 (87%)


Test set: Average loss: 0.2795, Accuracy: 9192/10000 (92%)


Test set: Average loss: 0.2147, Accuracy: 9378/10000 (94%)


Test set: Average loss: 0.1712, Accuracy: 9517/10000 (95%)


Test set: Average loss: 0.1506, Accuracy: 9580/10000 (96%)


Test set: Average loss: 0.1254, Accuracy: 9639/10000 (96%)


Test set: Average loss: 0.1106, Accuracy: 9682/10000 (97%)


Test set: Average loss: 0.0975, Accuracy: 9717/10000 (97%)


Test set: Average loss: 0.0919, Accuracy: 9742/10000 (97%)


Test set: Average loss: 0.0807, Accuracy: 9769/10000 (98%)


Test set: Average loss: 0.0776, Accuracy: 9769/10000 (98%)


Test set: Average loss: 0.0702, Accuracy: 9802/10000 (98%)


Test set: Average loss: 0.0728, Accuracy: 9792/10000 (98%)


Test set: Average loss: 0.0642, Accuracy: 9819/10000 (98%)


Test set: Average loss: 0.0609, Accuracy: 9823/10000 (98%)


Test set: Average loss: 0.0598, Accuracy: 9820/10000 (98%)


Test set: Average loss:

The above results correspond to the default architecture from the tutorial link. With the default arguments (mentioned in the tutorial) for the hyperparameters in cell 2 and a learning rate of 1e-3, the architecture reached a test accuracy of 98% within 10 epochs.

Below, all the ReLUs are replaced with sigmoids and the network is being trained again to see how it performs

In [7]:
class Sig_Net(nn.Module):
    """Same layout as the baseline CNN, with every ReLU replaced by a sigmoid.

    Takes single-channel images and returns 10 log-probabilities per sample.
    The ``4*4*50`` flatten size matches 28x28 inputs such as MNIST digits.
    """

    def __init__(self):
        super(Sig_Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        # torch.sigmoid is used instead of F.sigmoid: the functional alias is the
        # deprecated spelling (it warns on older PyTorch releases) and both
        # compute exactly the same values.
        x = torch.sigmoid(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = torch.sigmoid(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        # Flatten each sample's 50 feature maps of size 4x4 for the linear layers.
        x = x.view(-1, 4*4*50)
        x = torch.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

#### Training the sigmoid network

In [8]:
# Overrides the notebook-wide `epochs` for this experiment only; the first
# dropout cell further down sets it back to 20.
epochs = 100
model_sig = Sig_Net().to(device)
optimizer = optim.SGD(model_sig.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_sig, device, train_loader, optimizer, epoch)
    test(model_sig, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_sig.state_dict(),"SavedWeights/mnist_cnn_sigm.pt")






Test set: Average loss: 2.3010, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3007, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.3004, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2993, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2994, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2985, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2978, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2976, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2966, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2955, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2960, Accuracy: 1010/10000 (10%)


Test set: Average loss: 2.2938, Accuracy: 2071/10000 (21%)


Test set: Average loss: 2.2918, Accuracy: 1154/10000 (12%)


Test set: Average loss: 2.2904, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2887, Accuracy: 1135/10000 (11%)


Test set: Average loss: 2.2870, Accuracy: 1335/10000 (13%)


Test set: Average loss:


Test set: Average loss: 0.4617, Accuracy: 8685/10000 (87%)


Test set: Average loss: 0.4503, Accuracy: 8714/10000 (87%)


Test set: Average loss: 0.4396, Accuracy: 8750/10000 (88%)


Test set: Average loss: 0.4289, Accuracy: 8776/10000 (88%)


Test set: Average loss: 0.4196, Accuracy: 8807/10000 (88%)


Test set: Average loss: 0.4103, Accuracy: 8841/10000 (88%)


Test set: Average loss: 0.4024, Accuracy: 8847/10000 (88%)


Test set: Average loss: 0.3933, Accuracy: 8877/10000 (89%)


Test set: Average loss: 0.3859, Accuracy: 8895/10000 (89%)


Test set: Average loss: 0.3786, Accuracy: 8926/10000 (89%)


Test set: Average loss: 0.3716, Accuracy: 8941/10000 (89%)


Test set: Average loss: 0.3646, Accuracy: 8953/10000 (90%)


Test set: Average loss: 0.3583, Accuracy: 8972/10000 (90%)


Test set: Average loss: 0.3521, Accuracy: 8998/10000 (90%)


Test set: Average loss: 0.3459, Accuracy: 9016/10000 (90%)


Test set: Average loss: 0.3403, Accuracy: 9020/10000 (90%)


Test set: Average loss:

As we expected, sigmoid activations are subpar when compared to ReLU in this case. This can be explained by the following argument.

Consider two deep networks built only from fully connected layers: one with a ReLU after every layer and one with a sigmoid after every layer.

Because of the activation function, the output of every node in every layer is restricted to the range between 0 and 1 when sigmoid is used. With ReLU, only the negative node outputs are suppressed (set to 0), while positive outputs stay unbounded. Restricting the output to between 0 and 1 can be considered a loss of information, because within that interval values can only be told apart up to a finite precision.

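In the case of images, where the layers perform convolutions, we would like the activations to remain uncapped so that the strength of each filter's response is preserved. In addition, the sigmoid saturates for inputs of large magnitude and its derivative never exceeds 0.25, so the gradients reaching the early layers shrink; this is consistent with the test loss staying near 2.30 for many epochs above. Hence, ReLU clearly outperforms sigmoid.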

#### B

Dropout with probability P applied to the output of the first fully connected layer (after its ReLU), just before the final fully connected layer

In [9]:
class DrO_Net(nn.Module):
    """Baseline CNN with dropout on the hidden fully connected activations.

    Args:
        prob: dropout probability handed to ``nn.Dropout``; the dropout is
            applied to the ReLU output of ``fc1``, before ``fc2``.
    """

    def __init__(self, prob):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.do1 = nn.Dropout(p=prob)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        hidden = self.do1(hidden)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [10]:
### Dropout of 0.25 after the first fully connected layer
# Restore the 20-epoch budget (the sigmoid experiment above raised it to 100).
epochs = 20
model_25 = DrO_Net(prob=0.25).to(device)
optimizer = optim.SGD(model_25.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_25, device, train_loader, optimizer, epoch)
    test(model_25, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_25.state_dict(),"SavedWeights/mnist_cnn_dr_25.pt")


Test set: Average loss: 0.5770, Accuracy: 8617/10000 (86%)


Test set: Average loss: 0.2894, Accuracy: 9187/10000 (92%)


Test set: Average loss: 0.2177, Accuracy: 9376/10000 (94%)


Test set: Average loss: 0.1736, Accuracy: 9509/10000 (95%)


Test set: Average loss: 0.1477, Accuracy: 9576/10000 (96%)


Test set: Average loss: 0.1259, Accuracy: 9640/10000 (96%)


Test set: Average loss: 0.1129, Accuracy: 9671/10000 (97%)


Test set: Average loss: 0.1007, Accuracy: 9714/10000 (97%)


Test set: Average loss: 0.0906, Accuracy: 9733/10000 (97%)


Test set: Average loss: 0.0841, Accuracy: 9762/10000 (98%)


Test set: Average loss: 0.0786, Accuracy: 9770/10000 (98%)


Test set: Average loss: 0.0726, Accuracy: 9789/10000 (98%)


Test set: Average loss: 0.0678, Accuracy: 9805/10000 (98%)


Test set: Average loss: 0.0670, Accuracy: 9803/10000 (98%)


Test set: Average loss: 0.0632, Accuracy: 9803/10000 (98%)


Test set: Average loss: 0.0598, Accuracy: 9820/10000 (98%)


Test set: Average loss:

In [11]:
### Dropout of 0.5 after the first fully connected layer
model_50 = DrO_Net(prob=0.5).to(device)
optimizer = optim.SGD(model_50.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_50, device, train_loader, optimizer, epoch)
    test(model_50, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_50.state_dict(),"SavedWeights/mnist_cnn_dr_50.pt")


Test set: Average loss: 0.5589, Accuracy: 8689/10000 (87%)


Test set: Average loss: 0.2878, Accuracy: 9211/10000 (92%)


Test set: Average loss: 0.2110, Accuracy: 9421/10000 (94%)


Test set: Average loss: 0.1654, Accuracy: 9542/10000 (95%)


Test set: Average loss: 0.1394, Accuracy: 9609/10000 (96%)


Test set: Average loss: 0.1215, Accuracy: 9638/10000 (96%)


Test set: Average loss: 0.1047, Accuracy: 9702/10000 (97%)


Test set: Average loss: 0.0955, Accuracy: 9727/10000 (97%)


Test set: Average loss: 0.0875, Accuracy: 9748/10000 (97%)


Test set: Average loss: 0.0811, Accuracy: 9771/10000 (98%)


Test set: Average loss: 0.0749, Accuracy: 9779/10000 (98%)


Test set: Average loss: 0.0708, Accuracy: 9787/10000 (98%)


Test set: Average loss: 0.0677, Accuracy: 9795/10000 (98%)


Test set: Average loss: 0.0650, Accuracy: 9807/10000 (98%)


Test set: Average loss: 0.0614, Accuracy: 9808/10000 (98%)


Test set: Average loss: 0.0584, Accuracy: 9821/10000 (98%)


Test set: Average loss:

In [12]:
### Dropout of 0.75 after the first fully connected layer
model_75 = DrO_Net(prob=0.75).to(device)
optimizer = optim.SGD(model_75.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_75, device, train_loader, optimizer, epoch)
    test(model_75, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_75.state_dict(),"SavedWeights/mnist_cnn_dr_75.pt")


Test set: Average loss: 0.5469, Accuracy: 8742/10000 (87%)


Test set: Average loss: 0.3003, Accuracy: 9191/10000 (92%)


Test set: Average loss: 0.2224, Accuracy: 9375/10000 (94%)


Test set: Average loss: 0.1792, Accuracy: 9489/10000 (95%)


Test set: Average loss: 0.1531, Accuracy: 9551/10000 (96%)


Test set: Average loss: 0.1352, Accuracy: 9585/10000 (96%)


Test set: Average loss: 0.1233, Accuracy: 9624/10000 (96%)


Test set: Average loss: 0.1095, Accuracy: 9655/10000 (97%)


Test set: Average loss: 0.0990, Accuracy: 9688/10000 (97%)


Test set: Average loss: 0.0921, Accuracy: 9706/10000 (97%)


Test set: Average loss: 0.0872, Accuracy: 9721/10000 (97%)


Test set: Average loss: 0.0812, Accuracy: 9750/10000 (98%)


Test set: Average loss: 0.0758, Accuracy: 9762/10000 (98%)


Test set: Average loss: 0.0720, Accuracy: 9770/10000 (98%)


Test set: Average loss: 0.0690, Accuracy: 9784/10000 (98%)


Test set: Average loss: 0.0660, Accuracy: 9792/10000 (98%)


Test set: Average loss:

In [13]:
### Dropout of 1 (dropout probability 100%) after the first fully connected layer
model_100 = DrO_Net(prob=1).to(device)
optimizer = optim.SGD(model_100.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_100, device, train_loader, optimizer, epoch)
    test(model_100, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_100.state_dict(),"SavedWeights/mnist_cnn_dr_100.pt")


Test set: Average loss: 2.3100, Accuracy: 1149/10000 (11%)


Test set: Average loss: 2.3097, Accuracy: 1148/10000 (11%)


Test set: Average loss: 2.3095, Accuracy: 1147/10000 (11%)


Test set: Average loss: 2.3093, Accuracy: 1241/10000 (12%)


Test set: Average loss: 2.3091, Accuracy: 1291/10000 (13%)


Test set: Average loss: 2.3090, Accuracy: 1340/10000 (13%)


Test set: Average loss: 2.3089, Accuracy: 1383/10000 (14%)


Test set: Average loss: 2.3089, Accuracy: 1401/10000 (14%)


Test set: Average loss: 2.3088, Accuracy: 1416/10000 (14%)


Test set: Average loss: 2.3088, Accuracy: 1438/10000 (14%)


Test set: Average loss: 2.3087, Accuracy: 1430/10000 (14%)


Test set: Average loss: 2.3087, Accuracy: 1444/10000 (14%)


Test set: Average loss: 2.3087, Accuracy: 1450/10000 (14%)


Test set: Average loss: 2.3087, Accuracy: 1445/10000 (14%)


Test set: Average loss: 2.3087, Accuracy: 1438/10000 (14%)


Test set: Average loss: 2.3086, Accuracy: 1437/10000 (14%)


Test set: Average loss:

Among all the dropout settings above, the architecture with 50% dropout performed the best. As expected, a dropout probability of 1 breaks the network: every activation feeding the final layer is dropped during training, so it cannot learn.

#### C
Including Batch Normalization to the architecture and testing the results


In [14]:
class Best_Net(nn.Module):
    """Baseline CNN extended with batch normalisation and dropout.

    Batch norm (20 channels) is applied to the ReLU output of ``conv1`` before
    pooling; dropout is applied to the ReLU output of ``fc1`` before ``fc2``.

    Args:
        prob: dropout probability handed to ``nn.Dropout`` (default 0.5).
    """

    def __init__(self, prob=0.5):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.do1 = nn.Dropout(p=prob)
        self.bn1 = nn.BatchNorm2d(num_features=20)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        normed = self.bn1(F.relu(self.conv1(x)))
        stage1 = F.max_pool2d(normed, kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = self.do1(F.relu(self.fc1(flat)))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)
    
class Best_Net_BN(nn.Module):
    """Baseline CNN with batch normalisation only (no dropout).

    Batch norm (20 channels) is applied to the ReLU output of ``conv1`` before
    the first max-pool; the rest matches the baseline architecture.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.bn1 = nn.BatchNorm2d(num_features=20)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        normed = self.bn1(F.relu(self.conv1(x)))
        stage1 = F.max_pool2d(normed, kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [15]:
### Dropout and BatchNorm model
# Best_Net() is built with its default dropout probability of 0.5.
model_DrBn = Best_Net().to(device)
optimizer = optim.SGD(model_DrBn.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_DrBn, device, train_loader, optimizer, epoch)
    test(model_DrBn, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_DrBn.state_dict(),"SavedWeights/mnist_cnn_drbn.pt")


Test set: Average loss: 0.4399, Accuracy: 9004/10000 (90%)


Test set: Average loss: 0.2303, Accuracy: 9377/10000 (94%)


Test set: Average loss: 0.1683, Accuracy: 9542/10000 (95%)


Test set: Average loss: 0.1351, Accuracy: 9606/10000 (96%)


Test set: Average loss: 0.1146, Accuracy: 9661/10000 (97%)


Test set: Average loss: 0.1003, Accuracy: 9688/10000 (97%)


Test set: Average loss: 0.0890, Accuracy: 9728/10000 (97%)


Test set: Average loss: 0.0814, Accuracy: 9745/10000 (97%)


Test set: Average loss: 0.0748, Accuracy: 9758/10000 (98%)


Test set: Average loss: 0.0689, Accuracy: 9778/10000 (98%)


Test set: Average loss: 0.0641, Accuracy: 9788/10000 (98%)


Test set: Average loss: 0.0606, Accuracy: 9808/10000 (98%)


Test set: Average loss: 0.0567, Accuracy: 9825/10000 (98%)


Test set: Average loss: 0.0543, Accuracy: 9829/10000 (98%)


Test set: Average loss: 0.0516, Accuracy: 9829/10000 (98%)


Test set: Average loss: 0.0497, Accuracy: 9843/10000 (98%)


Test set: Average loss:

In [16]:
### Just BatchNorm model
model_Bn = Best_Net_BN().to(device)
optimizer = optim.SGD(model_Bn.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_Bn, device, train_loader, optimizer, epoch)
    test(model_Bn, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_Bn.state_dict(),"SavedWeights/mnist_cnn_bn.pt")


Test set: Average loss: 0.3387, Accuracy: 9204/10000 (92%)


Test set: Average loss: 0.1951, Accuracy: 9459/10000 (95%)


Test set: Average loss: 0.1512, Accuracy: 9573/10000 (96%)


Test set: Average loss: 0.1262, Accuracy: 9622/10000 (96%)


Test set: Average loss: 0.1059, Accuracy: 9679/10000 (97%)


Test set: Average loss: 0.0950, Accuracy: 9696/10000 (97%)


Test set: Average loss: 0.0868, Accuracy: 9735/10000 (97%)


Test set: Average loss: 0.0798, Accuracy: 9748/10000 (97%)


Test set: Average loss: 0.0749, Accuracy: 9758/10000 (98%)


Test set: Average loss: 0.0688, Accuracy: 9784/10000 (98%)


Test set: Average loss: 0.0654, Accuracy: 9783/10000 (98%)


Test set: Average loss: 0.0613, Accuracy: 9799/10000 (98%)


Test set: Average loss: 0.0585, Accuracy: 9816/10000 (98%)


Test set: Average loss: 0.0559, Accuracy: 9823/10000 (98%)


Test set: Average loss: 0.0533, Accuracy: 9828/10000 (98%)


Test set: Average loss: 0.0525, Accuracy: 9823/10000 (98%)


Test set: Average loss:

### Observations

From the above train/test results, it is clear that the network with a dropout of 50% and a batch normalization layer right after the first convolutional output is the best configuration for this task so far. It outperforms all the previously discussed configurations.

### D

In [17]:
class BN_Xavier(nn.Module):
    """Batch-norm CNN whose conv and linear weights use Xavier-uniform initialisation.

    Only the weight tensors are re-initialised; biases keep the layers'
    default initialisation. The forward pass is the baseline architecture with
    batch norm applied to the ReLU output of ``conv1``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.bn1 = nn.BatchNorm2d(num_features=20)
        # Re-draw the weights layer by layer, in definition order.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        normed = self.bn1(F.relu(self.conv1(x)))
        stage1 = F.max_pool2d(normed, kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


class BN_Kamming(nn.Module):
    """Batch-norm CNN whose conv and linear weights use Kaiming-uniform initialisation.

    Only the weight tensors are re-initialised (with ``kaiming_uniform_`` at its
    default arguments); biases keep the layers' default initialisation. The
    class name keeps its original spelling because a later cell instantiates it.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.bn1 = nn.BatchNorm2d(num_features=20)
        # Re-draw the weights layer by layer, in definition order.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2):
            nn.init.kaiming_uniform_(layer.weight)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        normed = self.bn1(F.relu(self.conv1(x)))
        stage1 = F.max_pool2d(normed, kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [18]:
### Xavier init model
# BatchNorm architecture with Xavier-uniform weights.
model_Xavier = BN_Xavier().to(device)
optimizer = optim.SGD(model_Xavier.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_Xavier, device, train_loader, optimizer, epoch)
    test(model_Xavier, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_Xavier.state_dict(),"SavedWeights/mnist_cnn_xavier.pt")


Test set: Average loss: 0.1933, Accuracy: 9493/10000 (95%)


Test set: Average loss: 0.1250, Accuracy: 9650/10000 (96%)


Test set: Average loss: 0.0989, Accuracy: 9714/10000 (97%)


Test set: Average loss: 0.0848, Accuracy: 9745/10000 (97%)


Test set: Average loss: 0.0746, Accuracy: 9775/10000 (98%)


Test set: Average loss: 0.0674, Accuracy: 9788/10000 (98%)


Test set: Average loss: 0.0619, Accuracy: 9808/10000 (98%)


Test set: Average loss: 0.0571, Accuracy: 9819/10000 (98%)


Test set: Average loss: 0.0561, Accuracy: 9813/10000 (98%)


Test set: Average loss: 0.0525, Accuracy: 9829/10000 (98%)


Test set: Average loss: 0.0501, Accuracy: 9842/10000 (98%)


Test set: Average loss: 0.0485, Accuracy: 9843/10000 (98%)


Test set: Average loss: 0.0456, Accuracy: 9850/10000 (98%)


Test set: Average loss: 0.0449, Accuracy: 9852/10000 (99%)


Test set: Average loss: 0.0439, Accuracy: 9856/10000 (99%)


Test set: Average loss: 0.0416, Accuracy: 9857/10000 (99%)


Test set: Average loss:

In [19]:
### Kaiming init model
# BatchNorm architecture with Kaiming-uniform weights (the class is spelled BN_Kamming).
model_Kaiming = BN_Kamming().to(device)
optimizer = optim.SGD(model_Kaiming.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_Kaiming, device, train_loader, optimizer, epoch)
    test(model_Kaiming, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_Kaiming.state_dict(),"SavedWeights/mnist_cnn_kaiming.pt")


Test set: Average loss: 0.2233, Accuracy: 9379/10000 (94%)


Test set: Average loss: 0.1507, Accuracy: 9571/10000 (96%)


Test set: Average loss: 0.1232, Accuracy: 9660/10000 (97%)


Test set: Average loss: 0.1032, Accuracy: 9698/10000 (97%)


Test set: Average loss: 0.0910, Accuracy: 9733/10000 (97%)


Test set: Average loss: 0.0830, Accuracy: 9749/10000 (97%)


Test set: Average loss: 0.0767, Accuracy: 9767/10000 (98%)


Test set: Average loss: 0.0727, Accuracy: 9779/10000 (98%)


Test set: Average loss: 0.0678, Accuracy: 9784/10000 (98%)


Test set: Average loss: 0.0648, Accuracy: 9803/10000 (98%)


Test set: Average loss: 0.0599, Accuracy: 9808/10000 (98%)


Test set: Average loss: 0.0589, Accuracy: 9800/10000 (98%)


Test set: Average loss: 0.0578, Accuracy: 9809/10000 (98%)


Test set: Average loss: 0.0556, Accuracy: 9814/10000 (98%)


Test set: Average loss: 0.0541, Accuracy: 9821/10000 (98%)


Test set: Average loss: 0.0532, Accuracy: 9820/10000 (98%)


Test set: Average loss:

In [20]:
### Basic architecture with Xavier initialization, no BatchNorm

class TmpXavier(nn.Module):
    """Baseline CNN (no batch norm, no dropout) with Xavier-uniform weight initialisation.

    Only the weight tensors are re-initialised; biases keep the layers'
    default initialisation.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=4 * 4 * 50, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        # Re-draw the weights layer by layer, in definition order.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)
        # Flatten to the width fc1 expects (4*4*50); the batch dimension is inferred.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [21]:
### Basic Xavier init model
# Baseline architecture (no BatchNorm) with Xavier-uniform weights.
model_XavTEmp = TmpXavier().to(device)
optimizer = optim.SGD(model_XavTEmp.parameters(), lr=lr, momentum=momentum)

# One test-set evaluation after every training epoch (epochs are numbered from 1).
for epoch in range(1, epochs + 1):
    train(model_XavTEmp, device, train_loader, optimizer, epoch)
    test(model_XavTEmp, device, test_loader)

# Weights are written only when `save_model` is enabled.
# NOTE(review): assumes the SavedWeights/ directory already exists — confirm.
if (save_model):
    torch.save(model_XavTEmp.state_dict(),"SavedWeights/mnist_cnn_XavTemp.pt")


Test set: Average loss: 0.2906, Accuracy: 9160/10000 (92%)


Test set: Average loss: 0.1933, Accuracy: 9433/10000 (94%)


Test set: Average loss: 0.1549, Accuracy: 9545/10000 (95%)


Test set: Average loss: 0.1262, Accuracy: 9623/10000 (96%)


Test set: Average loss: 0.1102, Accuracy: 9663/10000 (97%)


Test set: Average loss: 0.0944, Accuracy: 9743/10000 (97%)


Test set: Average loss: 0.0891, Accuracy: 9751/10000 (98%)


Test set: Average loss: 0.0819, Accuracy: 9777/10000 (98%)


Test set: Average loss: 0.0824, Accuracy: 9760/10000 (98%)


Test set: Average loss: 0.0741, Accuracy: 9796/10000 (98%)


Test set: Average loss: 0.0686, Accuracy: 9790/10000 (98%)


Test set: Average loss: 0.0657, Accuracy: 9812/10000 (98%)


Test set: Average loss: 0.0647, Accuracy: 9802/10000 (98%)


Test set: Average loss: 0.0594, Accuracy: 9818/10000 (98%)


Test set: Average loss: 0.0559, Accuracy: 9831/10000 (98%)


Test set: Average loss: 0.0570, Accuracy: 9831/10000 (98%)


Test set: Average loss:

From the above few cells we can conclude the following

1. Batch normalisation speeds up training. With batch normalisation, the hidden layers receive inputs whose distribution varies less from batch to batch.

2. A good initialisation can always save time for some particular tasks. In our case, Xavier + BN gave the best result so far within 20 epochs

3. Batch Normalisation reduced the gap between the convergence point of networks after 20 epochs. Roughly all networks gave more than 9830 images correctly classified