# MNIST data set: recognizing handwritten digits

In [0]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

# Seed PyTorch's random number generator so that repeated runs of the notebook
# start from the same random state. The trailing `;` keeps the notebook from
# echoing the value returned by torch.manual_seed.
random_seed = 1
torch.manual_seed(random_seed);


# Preparing the data set

In [0]:
# Mini-batch sizes used by the two loaders.
batch_size_train = 128
batch_size_test = 128

# Both splits live in the same folder and are converted to tensors on load.
data_root = './files/'
to_tensor = torchvision.transforms.ToTensor()

# Training set: reshuffled on every pass so that the mini-batches differ
# from epoch to epoch.
train_dataset = torchvision.datasets.MNIST(data_root, train=True, download=True,
                                           transform=to_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size_train,
                                           shuffle=True)

# Test set: the error rate does not depend on the sample order, so the loader
# is left unshuffled. This keeps the batch order identical between runs, which
# makes it possible to map a misclassified sample back to its dataset index.
test_dataset = torchvision.datasets.MNIST(data_root, train=False, download=True,
                                          transform=to_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size_test,
                                          shuffle=False)

Let us look at some examples.

In [0]:
# Draw a single mini-batch from the training loader so we can inspect it.
examples = enumerate(train_loader)
batch_idx, first_batch = next(examples)
example_data, example_targets = first_batch

# Print every caption next to the value it describes.
summary = (
    ('Shape of one training mini batch', example_data.shape),
    ('Shape of one target mini batch', example_targets.shape),
    ('Example training sample', example_data[1]),
    ('Target values', example_targets[:]),
)
for caption, value in summary:
    print(caption, value)
print(train_loader.dataset)

Shape of one training mini batch torch.Size([128, 1, 28, 28])
Shape of one target mini batch torch.Size([128])
Example training sample tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.00

To get a feeling for the data, we visualize some examples.

In [0]:
import matplotlib.pyplot as plt
# Show the first six digits of the sampled mini-batch in a 2x3 grid,
# each labelled with its target value.
for i in range(6):
    plt.subplot(2,3,i+1)
    plt.imshow(example_data[i][0], cmap='gray', interpolation='none')
    plt.title("Ground Truth: {}".format(example_targets[i]))
    plt.xticks([])
    plt.yticks([])
# Adjust the spacing once, after all panels and titles exist; tight_layout only
# accounts for what is already on the figure when it is called.
plt.tight_layout()


# Defining the Network

Define the network. A few things to keep in mind
<ul>
    <li> Make sure that the network's dimensions are compatible with our data. The network automatically takes care of handling batches (the first dimension of our data). The remaining dimensions must be met by the data. Thus, we first need to convert the images into vectors before they can be passed through the first linear layer. This can be done in the forward computation of the network.</li>
    <li> Each image belongs to one of ten classes. Thus, our output should be 10-dimensional with each output representing one class. To transform the outputs into probabilities, we can use the *softmax* function
    \begin{align}
        \text{softmax}\left(\underline{y}\right)_i = \frac{e^{y_i}}{\sum_{j}e^{y_j}}
    \end{align}
        that transforms a vector of real numbers into a vector of probabilities. The network can then be trained with the cross entropy loss function. For this to work, we need to transform the labels $y_i\in\{0,1,\ldots,9\}$ into *one-hot* labels 
    \begin{align}
    y_i = 3 \quad \rightarrow \quad {\hat{y}}_i = [0,0,0,1,0,0,0,0,0,0]
    \end{align}
    </li>
<li> **However:** PyTorch has the built-in cost function `nn.CrossEntropyLoss` (see https://pytorch.org/docs/stable/nn.html#crossentropyloss) that takes care of all this. That is, we can just define our network with a linear output layer with ten neurons and then pass the outputs as well as the target values (from 0 to 9) to the loss function. For prediction, we can then simply take the largest value of the outputs or apply the `F.softmax` function if we wish to have probabilities. </li>
 </ul>


In [0]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 64 -> 64 -> 32 -> 24 -> 10.

    The four hidden layers are followed by ReLU activations; the output
    layer is purely linear and returns one raw score per class.
    """

    def __init__(self):
        super().__init__()
        # Learnable layers, created in order from input to output.
        self.h1 = nn.Linear(28 * 28, 64)
        self.h2 = nn.Linear(64, 64)
        self.h3 = nn.Linear(64, 32)
        self.h4 = nn.Linear(32, 24)
        self.out = nn.Linear(24, 10)

    def forward(self, x):
        """Return the ten raw class scores for every sample of the batch ``x``."""
        # Keep the batch dimension and merge all remaining ones into one
        # vector per sample (a 1x28x28 image becomes 784 values).
        activations = x.flatten(start_dim=1)
        for hidden in (self.h1, self.h2, self.h3, self.h4):
            activations = F.relu(hidden(activations))
        # No softmax here: nn.CrossEntropyLoss already includes it for training.
        return self.out(activations)


Initialize the network.

In [0]:
# Instantiate the classifier defined above and print its layer structure.
net_mnist = Net()
print(net_mnist)

Net(
  (h1): Linear(in_features=784, out_features=64, bias=True)
  (h2): Linear(in_features=64, out_features=64, bias=True)
  (h3): Linear(in_features=64, out_features=32, bias=True)
  (h4): Linear(in_features=32, out_features=24, bias=True)
  (out): Linear(in_features=24, out_features=10, bias=True)
)


Define the training procedure as before. To judge the training process, it makes sense to print both the loss values as well as the classification error rates.

In [0]:
def train(NeuralNetwork,train_loader,loss_function,num_epochs, learning_rate=0.001, wd=0 ):
    """
    Trains a neural network with the Adam optimizer.

    NeuralNetwork = neural network to be trained
    train_loader = iterable that deals (inputs, labels) mini-batches,
                   e.g. a DataLoader
    loss_function = cost function to be optimized
    num_epochs = number of training epochs
    learning_rate = learning rate (default value 0.001)
    wd = weight decay regularization (default value 0)

    Prints the running mean loss every 100 mini-batches and, after every
    epoch, the classification error rate over the training samples seen in
    that epoch. Returns None.
    """
    optimizer = torch.optim.Adam(NeuralNetwork.parameters(), lr = learning_rate, weight_decay=wd)
    # Make sure mode-dependent layers (dropout, batch norm) are in training mode.
    NeuralNetwork.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        errors = 0    # misclassified samples in this epoch (plain Python int)
        samples = 0   # samples seen in this epoch
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = NeuralNetwork(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # The predicted class is the index of the largest output.
            predicted = outputs.argmax(dim=1)
            # .item() turns the count into a Python int, which also works
            # for tensors that do not live on the CPU.
            errors += (predicted != labels).sum().item()
            samples += labels.numel()
            if (batch_idx % 100) == 0:
                print('Current loss ',running_loss/(batch_idx+1))
        # Divide by the samples actually processed, so the rate stays correct
        # for any batch iterable; an empty epoch reports nan instead of crashing.
        error_rate = 100.0 * errors / samples if samples else float('nan')
        print('Epoch: ',epoch+1,'Error rate on training set:', round(error_rate,2), '%')
    


Train the network. Note that since we have a large dataset, we will train fewer epochs than for the small data set used in the previous exercises.

In [0]:
# Three training stages with a decreasing learning rate, given as
# (number of epochs, learning rate) pairs.
loss_fn = nn.CrossEntropyLoss()
schedule = ((2, 10**-2), (2, 10**-3), (5, 10**-4))
for stage_epochs, stage_lr in schedule:
    train(net_mnist, train_loader, loss_fn, stage_epochs, stage_lr)

Current loss  2.304649591445923
Current loss  0.7189613953970446
Current loss  0.5105771936142622
Current loss  0.42165253130700503
Current loss  0.37492396288604807
Epoch:  1 Error rate on training set: 10.77 %
Current loss  0.24892480671405792
Current loss  0.1738187460468547
Current loss  0.17345916500213135
Current loss  0.16835951015501718
Current loss  0.1673343702686249
Epoch:  2 Error rate on training set: 4.81 %
Current loss  0.1304633617401123
Current loss  0.09535175178310659
Current loss  0.09047527539900582
Current loss  0.08939188388037028
Current loss  0.08834423536337224
Epoch:  1 Error rate on training set: 2.51 %
Current loss  0.08207610994577408
Current loss  0.0668713088599172
Current loss  0.0686448635626121
Current loss  0.0685886718577201
Current loss  0.071328482489587
Epoch:  2 Error rate on training set: 2.08 %
Current loss  0.06343777477741241
Current loss  0.06065725842214162
Current loss  0.057312307188945324
Current loss  0.05675134513267251
Current loss  

In order to evaluate our model properly and detect overfitting, we need to run the network on the test set.
Write a routine that computes the classification error rate on the test data.

In [0]:
# Count the misclassified test samples. Evaluation needs no gradients, so the
# forward passes run under torch.no_grad() to avoid building autograd graphs.
errors_test = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = net_mnist(inputs)
        # The predicted class is the index of the largest output.
        predicted = outputs.argmax(dim=1)
        # .item() keeps the running count a plain Python int.
        errors_test += (predicted != labels).sum().item()
print('Error rate on test set:', round(100.0* errors_test / len(test_loader.dataset),2), '%')
    

Error rate on test set: 2.88 %


Suggestion for further work: 
<ul>
    <li> If your error rates on the training and test set differ significantly, your model is overfitting. What can you do against this? </li>
<li> If you achieve a low error rate on the test set: find the images that are classified incorrectly by the network. Would you classify those correctly? </li>
<li> For comparison of your network's performance, you can take a look at the Wikipedia page: https://en.wikipedia.org/wiki/MNIST_database </li>
</ul>
