In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

There are three main types of layers used to build a CNN: <b> Convolution Layer</b>, <b>Pooling Layer</b>, and <b>Fully-Connected Layer</b>

### Convolution layer 

The convolution layer extracts features from the input image. It consists of a set of filters. During the forward pass, we slide each filter across the width and height of the input image, preserving the depth of the input volume, and compute the dot products. 

Every image can be considered as a matrix of pixel values. Consider a 5x5 image whose pixel values are only 0 and 1 (for a grayscale image, the pixel values range from 0 to 255, the green matrix below is a special case). Also, consider another 3x3 matrix which is the filter. The convolution of the 5x5 image and the 3x3 filter can computed as shown in the animation below:   
<img src="https://i.stack.imgur.com/I7DBr.gif" style="width: 400px;">

We can also specify the <b> stride </b>, and the <b>zero-padding</b>. The <b> stride </b> by default is 1, which results in the filter sliding by one pixel at a time. When the stride is 2, then the filter jumps two pixels at a time resulting in smaller output volumes. The <b>zero-padding</b> is used to pad the input volume with zeros around the border. This allows us to control the spatial size of the output volume. 

### Non-Linear Layer (ReLU)

ReLU (Rectified Linear Units) is an element-wise activation function, and replaces all negative pixel values in the feature map by zero. It implements the function $y = max(0, x)$, so the input and ouput sizes of this layer are the same.

<img src="https://www.embedded-vision.com/sites/default/files/technical-articles/CadenceCNN/Figure8.jpg" style="width: 600px;">

### Pooling Layer

This layer reduces the spatial size of the representation. It controls overfitting by reducing the amount of parameters and computation in the network. The most common form of pooling uses the Max operation. The example shown below uses max pooling with a 2x2 window. We slide our window with a stride of 2 and take the maximum value in each region.

<img src="https://qph.ec.quoracdn.net/main-qimg-8afedfb2f82f279781bfefa269bc6a90" style="width: 600px;">

### Fully Connected Layer

This layer is fully connected with the output of the previous layer. A weighted sum of the  input features followed by a bias offset is used for classification.

In [1]:
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    self.conv1 = nn.Conv2d(1, 10, 5)
    self.conv2 = nn.Conv2d(10, 20, 5)
    self.fc1 = nn.Linear(320, 50)
    self.fc2 = nn.Linear(50, 10)

  def forward(self, x):
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2(x), 2))
    x = x.view(-1, 320)
    x = F.relu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    x = self.fc2(x)
    return F.log_softmax(x)

model = Model()
model

NameError: name 'nn' is not defined

In [26]:
batch_size = 50
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),
    batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    batch_size=1000)

### Optimizer

This updates the parameters based on the computed gradients. Here, we have used Stochastic Gradient Descent (SGD).

In [27]:
optimizer = optim.SGD(model.parameters(), lr=0.0003)

In [28]:
criterion = nn.CrossEntropyLoss()

In [29]:
def train(epoch):
  model.train()
  i = 1
  for data, target in train_loader:
    data, target = Variable(data), Variable(target)
    optimizer.zero_grad()
    output = model(data)
    # make_dot(output)
    loss = criterion(output, target)
    prediction = output.data.max(1)[1]
    accuracy = prediction.eq(target.data).sum()/batch_size*100
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
      print('Train Step: {}\tLoss: {:.3f}'.format(epoch, loss.data[0]))
    i += 1

In [30]:
def test():
  model.eval()
  correct = 0
  for data, target in test_loader:
    data, target = Variable(data), Variable(target)
    output = model(data)
    prediction = output.data.max(1)[1]
    correct += prediction.eq(target.data).sum()

  print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))

In [31]:
for epoch in range(15):
  train(epoch)
  test()

Train Step: 0	Loss: 2.326

Test set: Accuracy: 10.09%
Train Step: 1	Loss: 2.314

Test set: Accuracy: 10.08%
Train Step: 2	Loss: 2.306

Test set: Accuracy: 10.14%
Train Step: 3	Loss: 2.298

Test set: Accuracy: 10.45%
Train Step: 4	Loss: 2.304

Test set: Accuracy: 11.30%
Train Step: 5	Loss: 2.288

Test set: Accuracy: 13.36%
Train Step: 6	Loss: 2.284

Test set: Accuracy: 17.02%
Train Step: 7	Loss: 2.290

Test set: Accuracy: 19.98%
Train Step: 8	Loss: 2.280

Test set: Accuracy: 22.44%
Train Step: 9	Loss: 2.264

Test set: Accuracy: 24.17%
Train Step: 10	Loss: 2.257

Test set: Accuracy: 28.03%
Train Step: 11	Loss: 2.248

Test set: Accuracy: 33.33%
Train Step: 12	Loss: 2.244

Test set: Accuracy: 43.51%
Train Step: 13	Loss: 2.197

Test set: Accuracy: 52.82%
Train Step: 14	Loss: 2.120

Test set: Accuracy: 60.58%
