In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

This is an example of how to build a convolutional neural network (CNN) to recognize handwritten digits in the MNIST dataset.

There are three main types of layers used to build a CNN: <b>Convolution Layer</b>, <b>Pooling Layer</b>, and <b>Fully-Connected Layer</b>.

### Convolution layer 

The convolution layer extracts features from the input image. It consists of a set of filters. During the forward pass, we slide each filter across the width and height of the input image preserving the depth of the input volume, and compute the dot products. 

Every image can be considered as a matrix of pixel values. Consider a 5x5 image whose pixel values are only 0 and 1 (for a grayscale image, the pixel values range from 0 to 255; the green matrix below is a special case). Also, consider another 3x3 matrix, which is the filter. The convolution of the 5x5 image and the 3x3 filter can be computed as shown in the animation below:  
<img src="https://i.stack.imgur.com/I7DBr.gif" style="width: 400px;">

Three parameters control the size of the output:
<ul>
<li><b>Depth</b>: Depth corresponds to the number of filters used for the convolution operation.

<li><b>Stride</b>: Stride is the number of pixels by which we slide our filter matrix over the input matrix. By default, the <b> stride </b> is 1, which results in the filter sliding by one pixel at a time. When the stride is 2, then the filter jumps two pixels at a time resulting in smaller output volumes.

<li><b>Zero-padding</b>: Zero-padding is used to pad the input volume with zeros around the border. This allows us to control the spatial size of the output volume.
</ul>

### Non-Linear Layer (ReLU)

ReLU (Rectified Linear Unit) is an element-wise activation function that replaces all negative values in the feature map with zero. It implements the function $y = \max(0, x)$, so the input and output sizes of this layer are the same.

<img src="https://www.embedded-vision.com/sites/default/files/technical-articles/CadenceCNN/Figure8.jpg" style="width: 600px;">

### Pooling Layer

This layer reduces the spatial size of the representation. It controls overfitting by reducing the number of parameters and the amount of computation in the network. The most common form of pooling uses the Max operation. The example shown below uses max pooling with a 2x2 window. We slide our window with a stride of 2 and take the maximum value in each region.

<img src="https://qph.ec.quoracdn.net/main-qimg-8afedfb2f82f279781bfefa269bc6a90" style="width: 600px;">

### Fully Connected Layer

This layer is fully connected with the output of the previous layer. This layer performs classification on the features extracted by the convolutional layer and downsampled by the pooling layer by using a weighted sum of the features followed by a bias offset.

<img src="https://cdn-images-1.medium.com/max/1600/1*Kdnux0Kw1yQ4D8dq__mYCA.png" style="width: 300px;">

In [2]:
class Model(nn.Module):
  """Small CNN for 10-class digit classification.

  Layout: two 5x5 convolutions (1 -> 10 -> 20 channels), each followed by 2x2
  max pooling and ReLU, then two fully connected layers (320 -> 50 -> 10).
  For a 1x28x28 input the feature map shrinks 28 -> 24 -> 12 -> 8 -> 4, so the
  flattened size is 20 * 4 * 4 = 320, which is what ``fc1`` expects.
  """

  def __init__(self):
    super(Model, self).__init__()
    self.conv1 = nn.Conv2d(1, 10, 5)
    self.conv2 = nn.Conv2d(10, 20, 5)
    self.fc1 = nn.Linear(320, 50)
    self.fc2 = nn.Linear(50, 10)

  def forward(self, x):
    """Map a batch of images to per-class log-probabilities.

    Args:
      x: Tensor of shape (batch, 1, H, W); H = W = 28 yields the 320 features
        required by ``fc1``.

    Returns:
      Tensor of shape (batch, 10) holding log-probabilities (each row of
      ``exp(output)`` sums to 1).
    """
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2(x), 2))
    # Flatten per sample. Keeping the batch dimension fixed means a wrong input
    # size fails loudly in fc1 instead of being folded into the batch axis.
    x = x.view(x.size(0), -1)
    x = F.relu(self.fc1(x))
    # Functional dropout must be told the mode explicitly; it is a no-op in eval().
    x = F.dropout(x, training=self.training)
    x = self.fc2(x)
    # dim must be explicit: the implicit choice is deprecated and emits a warning.
    return F.log_softmax(x, dim=1)

# Build the network with freshly initialised weights; the bare name on the
# last line makes the notebook display the layer summary.
model = Model()
model

Model (
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear (320 -> 50)
  (fc2): Linear (50 -> 10)
)

In [3]:
# Number of samples per training mini-batch.
batch_size = 50

# Training split: fetched into the local 'data' directory when missing,
# converted to tensors, and reshuffled on every pass.
mnist_train = datasets.MNIST(
    root='data', train=True, download=True, transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(
    dataset=mnist_train, batch_size=batch_size, shuffle=True)

# Test split: read from the same directory (no download requested here) and
# served in fixed order, 1000 images at a time.
mnist_test = datasets.MNIST(
    root='data', train=False, transform=transforms.ToTensor())
test_loader = torch.utils.data.DataLoader(dataset=mnist_test, batch_size=1000)

<b>Optimizer</b>: This updates the parameters based on the computed gradients. Here, we have used Stochastic Gradient Descent (SGD).

In [4]:
# Plain SGD (no momentum) over every parameter of the model.
# NOTE(review): with this small learning rate the logged run below only
# reaches ~75% test accuracy after 15 epochs; consider raising it.
optimizer = optim.SGD(params=model.parameters(), lr=3e-4)

In [5]:
# Model.forward already returns log-probabilities (log_softmax), so the
# matching criterion is the negative log-likelihood loss. CrossEntropyLoss
# would apply log_softmax a second time on top of the model's output.
criterion = nn.NLLLoss()

In [6]:
def train(epoch):
  """Run one optimisation pass over ``train_loader``.

  Uses the notebook-level ``model``, ``train_loader``, ``optimizer`` and
  ``criterion``. Prints the current mini-batch loss every 1000 steps.

  Args:
    epoch: Index of the current epoch. It is only used to label the progress
      line (which prints it under the heading "Train Step").
  """
  model.train()  # training mode: dropout is active
  for step, (data, target) in enumerate(train_loader, start=1):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if step % 1000 == 0:
      # .item() pulls out the Python float; indexing the 0-dim loss tensor
      # with [0] raises on current PyTorch releases.
      print('Train Step: {}\tLoss: {:.3f}'.format(epoch, loss.item()))

In [7]:
def test():
  model.eval()
  correct = 0
  for data, target in test_loader:
    data, target = Variable(data), Variable(target)
    output = model(data)
    prediction = output.data.max(1)[1]
    correct += prediction.eq(target.data).sum()

  print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))

In [8]:
# Alternate one training pass with one evaluation on the test set.
num_epochs = 15
for epoch in range(num_epochs):
  train(epoch)
  test()

Train Step: 0	Loss: 2.294

Test set: Accuracy: 11.67%
Train Step: 1	Loss: 2.308

Test set: Accuracy: 13.01%
Train Step: 2	Loss: 2.269

Test set: Accuracy: 13.99%
Train Step: 3	Loss: 2.275

Test set: Accuracy: 17.20%
Train Step: 4	Loss: 2.260

Test set: Accuracy: 31.70%
Train Step: 5	Loss: 2.224

Test set: Accuracy: 39.61%
Train Step: 6	Loss: 2.236

Test set: Accuracy: 40.84%
Train Step: 7	Loss: 2.191

Test set: Accuracy: 41.44%
Train Step: 8	Loss: 2.148

Test set: Accuracy: 42.54%
Train Step: 9	Loss: 1.974

Test set: Accuracy: 45.87%
Train Step: 10	Loss: 1.896

Test set: Accuracy: 51.00%
Train Step: 11	Loss: 1.891

Test set: Accuracy: 57.07%
Train Step: 12	Loss: 1.695

Test set: Accuracy: 63.78%
Train Step: 13	Loss: 1.343

Test set: Accuracy: 69.35%
Train Step: 14	Loss: 1.511

Test set: Accuracy: 74.77%
