<a href="https://colab.research.google.com/github/ganesh3/pytorch-work/blob/master/mnist_nn_resnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader

In [None]:
# Fetch the MNIST training split into ./data (downloaded if missing); every
# image goes through transforms.ToTensor() when it is read.
train_data = datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())

# Randomly hold out 5,000 samples for validation and keep 55,000 for training.
train, val = random_split(train_data, [55000, 5000])

# Reshuffle the training samples at the start of every epoch: without
# shuffle=True the batches would be identical (same content, same order) in
# every epoch, which is a poor fit for SGD. The order of the validation data
# does not affect the reported metrics, so it is left unshuffled.
train_loader = DataLoader(train, batch_size=32, shuffle=True)
valid_loader = DataLoader(val, batch_size=32)

In [None]:
# Baseline model (plain 3-layer MLP) -- kept commented out for reference only; the more flexible model defined in the next cell is used instead
#model = nn.Sequential(
#    nn.Linear(28 * 28, 64),
#    nn.ReLU(),
#    nn.Linear(64, 64),
#    nn.ReLU(),
#    nn.Linear(64, 10)
#)

In [None]:
#Define a more flexible model

class ResNet(nn.Module):
  """Small fully connected classifier with one residual (skip) connection.

  Layers:
    l1: Linear(28 * 28 -> 64)
    l2: Linear(64 -> 64)
    l3: Linear(64 -> 10), produces the output logits
    do: Dropout(p=0.1), applied to the residual sum before l3
  """

  def __init__(self):
    super().__init__()
    self.l1 = nn.Linear(28 * 28, 64)
    self.l2 = nn.Linear(64, 64)
    self.l3 = nn.Linear(64, 10)
    self.do = nn.Dropout(0.1)

  def forward(self, x):
    """Return the raw (un-normalised) logits for `x`.

    Args:
      x: tensor whose last dimension has size 28 * 28 (the caller is
        responsible for flattening the images).

    Returns:
      Tensor whose last dimension has size 10.
    """
    h1 = nn.functional.relu(self.l1(x))
    h2 = nn.functional.relu(self.l2(h1))
    # Residual connection: add the first hidden activation back onto the second
    # one, then apply dropout to the sum.
    do = self.do(h2 + h1)
    logits = self.l3(do)
    return logits

# Use the GPU when one is available and fall back to the CPU otherwise; an
# unconditional .cuda() raises on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet().to(device)

In [None]:
# Optimizer setup for the flexible model: plain SGD over all of the model's
# parameters with a learning rate of 0.01.
params = model.parameters()
optimizer = optim.SGD(params=params, lr=0.01)

In [None]:
# Loss: cross-entropy between the model's logits and the integer class labels,
# averaged over the batch (the default 'mean' reduction, spelled out here).
loss = nn.CrossEntropyLoss(reduction='mean')

In [None]:
# Training & validation loops.
#
# Each epoch makes one optimisation pass over `train_loader`, then one
# gradient-free evaluation pass over `valid_loader`, and prints the loss and
# accuracy of both passes. The printed values are the unweighted mean of the
# per-batch means, so a smaller final batch counts as much as a full one.
no_epochs = 5

# Send every batch to the device the model's parameters already live on, so the
# inputs always match the model instead of unconditionally requiring CUDA.
device = next(model.parameters()).device

for epoch in range(no_epochs):
  # ---------------- training pass ----------------
  # The validation pass below switches the model to eval mode, so training mode
  # (dropout active) has to be re-enabled at the start of every epoch.
  model.train()
  losses = list()
  accuracies = list()
  for batch in train_loader:
    x, y = batch

    # x: batch size * 1 (no. of channels) * 28 (height) * 28 (width);
    # flatten every image into one row, because the model's first layer is
    # Linear(28 * 28, 64).
    b = x.size(0)
    x = x.view(b, -1).to(device)

    # 1 forward
    l = model(x)  # l = logits, i.e. the output from the last layer of the model

    # 2 compute the objective function
    J = loss(l, y.to(device))

    # 3 clear the gradients left over from the previous step
    model.zero_grad()

    # 4 accumulate the partial derivatives of J w.r.t. the parameters
    J.backward()

    # 5 step in the opposite direction of the gradient
    #   (params = params - learning_rate * params.grad)
    optimizer.step()

    losses.append(J.item())
    # y stays on the CPU, so bring the predictions back before comparing.
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean().item())

  print(f'Epoch {epoch + 1}, train loss: {torch.Tensor(losses).mean(): .2f}')
  print(f'Training accuracy: {torch.tensor(accuracies).mean(): .2f}')

  # ---------------- validation pass ----------------
  # Switch dropout off: in training mode it would randomly zero activations and
  # make the validation metrics noisy and pessimistic.
  model.eval()
  losses = list()
  accuracies = list()
  for batch in valid_loader:
    x, y = batch

    # x: batch size * 1 (no. of channels) * 28 (height) * 28 (width)
    b = x.size(0)
    x = x.view(b, -1).to(device)

    # No parameters are updated here, so skip building the autograd graph for
    # both the forward pass and the loss.
    with torch.no_grad():
      # 1 forward
      l = model(x)  # l = logits, i.e. the output from the last layer of the model

      # 2 compute the objective function
      J = loss(l, y.to(device))

    losses.append(J.item())
    accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean().item())

  print(f'Epoch {epoch + 1}, validation loss: {torch.Tensor(losses).mean():.2f}')
  print(f'Validation accuracy: {torch.tensor(accuracies).mean(): .2f}')

Epoch 1, train loss:  1.57
Training accuracy:  0.47
Epoch 1, validation loss: 0.52
Validation accuracy:  0.85
Epoch 2, train loss:  0.45
Training accuracy:  0.87
Epoch 2, validation loss: 0.37
Validation accuracy:  0.90
Epoch 3, train loss:  0.35
Training accuracy:  0.90
Epoch 3, validation loss: 0.30
Validation accuracy:  0.92
Epoch 4, train loss:  0.30
Training accuracy:  0.92
Epoch 4, validation loss: 0.26
Validation accuracy:  0.93
Epoch 5, train loss:  0.26
Training accuracy:  0.93
Epoch 5, validation loss: 0.25
Validation accuracy:  0.93
