First, we can load the dataset and create the validation set as 10% of the training set.

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from torchvision import datasets
from torch.utils.data import DataLoader, Subset


In [None]:
# Mini-batch size shared by the three loaders built at the end of this cell.
batch_size = 128


def _to_normalised_tensor():
  """Build the ToTensor + Normalize((0.5,), (0.5,)) pipeline applied to every image."""
  steps = [
      transforms.ToTensor(),
      transforms.Normalize((0.5,), (0.5,)),
  ]
  return transforms.Compose(steps)


# Train and test images go through the same preprocessing (two separate pipeline objects).
transform_train = _to_normalised_tensor()
transform_test = _to_normalised_tensor()

# MNIST is downloaded into (or read from) the Drive folder.
original_train_set = datasets.MNIST(
    root='./drive/MyDrive', train=True, download=True, transform=transform_train
)
test_set = datasets.MNIST(
    root='./drive/MyDrive', train=False, download=True, transform=transform_test
)

# Hold out 10% of the training indices for validation; the fixed seed keeps the split reproducible.
all_indexes = range(len(original_train_set))
train_indexes, val_indexes = train_test_split(all_indexes, test_size=0.1, random_state=42)

train_set = Subset(original_train_set, train_indexes)
validation_set = Subset(original_train_set, val_indexes)

# Only the training loader reshuffles its samples.
loader_options = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_set, shuffle=True, **loader_options)
validation_loader = DataLoader(validation_set, shuffle=False, **loader_options)
test_loader = DataLoader(test_set, shuffle=False, **loader_options)

Now we define the MLP network, which consists of 2 hidden layers and 1 output layer.

In [None]:
# Number of units in the first hidden layer of the MLP defined below.
n1 = 256
# Number of units in the second hidden layer of the MLP.
n2 = 128
# Number of output logits produced by the MLP's last layer.
N_classes = 10

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
  """Fully connected classifier: two ReLU hidden layers followed by a linear output layer.

  Layer sizes are constructor arguments rather than notebook globals: ``n1`` and
  ``n2`` are rebound further down in this notebook for the CNN, so reading them
  from the global scope would silently build a different network whenever this
  class is instantiated after that cell has run. The defaults reproduce the
  sizes this notebook uses for the MLP, so ``MLP()`` keeps working unchanged.

  Args:
    n1: number of units in the first hidden layer.
    n2: number of units in the second hidden layer.
    n_classes: number of output logits.
    in_features: size of one flattened input sample (28*28 by default).
  """

  def __init__(self, n1=256, n2=128, n_classes=10, in_features=28 * 28):
    super().__init__()
    self.fc1 = nn.Linear(in_features, n1)
    self.fc2 = nn.Linear(n1, n2)
    self.fc3 = nn.Linear(n2, n_classes)

  def forward(self, x):
    """Return the raw logits for batch ``x``; every dimension after the first is flattened."""
    x = torch.flatten(x, 1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    # No activation on the last layer: the caller receives unnormalised logits.
    x = self.fc3(x)
    return x

Now we will train this network.

In [None]:
from torch import optim

# Number of passes over the training set performed by each training run.
EPOCHS = 30

# The three learning rates compared in the MLP experiments, largest first.
lr1, lr2, lr3 = 1, 0.1, 0.01

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Loss applied to the logits returned by the networks.
criterion = nn.CrossEntropyLoss()

In [None]:
def train(model, train_loader, validation_loader, criterion, optimizer,
          version=None, checkpoint_dir='./drive/MyDrive'):
  """Train ``model`` and evaluate it on the validation data after every epoch.

  The number of epochs and the target device are read from the notebook-level
  names ``EPOCHS`` and ``device``.

  Args:
    model: network to optimise; it is moved to ``device`` in place.
    train_loader: iterable of ``(images, labels)`` training batches.
    validation_loader: iterable of ``(images, labels)`` validation batches.
    criterion: loss called as ``criterion(logits, labels)``; its value is
      treated as the mean loss of the batch.
    optimizer: optimiser already bound to ``model``'s parameters.
    version: optional tag. When given, ``model.state_dict()`` is written to
      ``{checkpoint_dir}/model{version}.pth`` every time the validation
      accuracy improves, so the file holds the best epoch. When ``None``
      (the default) nothing is saved.
    checkpoint_dir: directory that receives the checkpoint file.

  Returns:
    ``(train_losses, validation_losses, train_accuracy, validation_accuracy)``:
    four lists with one value per epoch. Losses are per-sample averages and
    accuracies are fractions in [0, 1].
  """
  checkpoint_path = None if version is None else f'{checkpoint_dir}/model{version}.pth'
  best_val_accuracy = float('-inf')

  train_losses = []
  validation_losses = []

  train_accuracy = []
  validation_accuracy = []

  model.to(device)

  for epoch in range(EPOCHS):
    model.train()
    corrects = 0
    losses = 0
    samples = 0
    print(f'Epoch {epoch}')
    for images, labels in train_loader:
      batch_samples = images.shape[0]
      samples += batch_samples
      images, labels = images.to(device), labels.to(device)

      optimizer.zero_grad()

      predictions = model(images)  # logits

      _, label_pred = torch.max(predictions, 1)  # index of the predicted class
      corrects += torch.sum(labels == label_pred).item()

      loss = criterion(predictions, labels)  # mean loss over the batch
      # Re-weight by the batch size so a smaller last batch does not skew the epoch average.
      losses += loss.item() * batch_samples

      loss.backward()
      optimizer.step()
    train_losses.append(losses / samples)
    print(f'Train loss {epoch}: {losses/samples:.4f}')
    train_accuracy.append(1.0 * corrects / float(samples))

    model.eval()
    corrects_val = 0
    losses_val = 0
    samples_val = 0

    with torch.no_grad():
      for images, labels in validation_loader:
        batch_samples = images.shape[0]
        images, labels = images.to(device), labels.to(device)

        samples_val += batch_samples
        predictions = model(images)
        _, label_pred = torch.max(predictions, 1)
        corrects_val += torch.sum(labels == label_pred).item()
        loss = criterion(predictions, labels)
        losses_val += loss.item() * batch_samples

    epoch_val_accuracy = 1.0 * corrects_val / float(samples_val)
    validation_losses.append(losses_val / samples_val)
    print(f'Validation loss {epoch}: {losses_val/samples_val:.4f}')
    validation_accuracy.append(epoch_val_accuracy)

    # The first epoch always beats -inf, so a requested checkpoint file always exists.
    if checkpoint_path is not None and epoch_val_accuracy > best_val_accuracy:
      best_val_accuracy = epoch_val_accuracy
      torch.save(model.state_dict(), checkpoint_path)

  return train_losses, validation_losses, train_accuracy, validation_accuracy

Now we'll run the training with different learning rates and the same number of epochs.

1) Learning Rate = 1

In [None]:
# Experiment 1: MLP trained with SGD + momentum at the largest learning rate (lr1).
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=lr1, momentum=0.9)

# train() returns (train_losses, validation_losses, train_accuracy, validation_accuracy).
# Unpack in that exact order; swapping the first two would plot each loss curve
# under the other one's legend label.
train_losses1, validation_losses1, train_accuracy1, validation_accuracy1 = train(
    net, train_loader, validation_loader, criterion, optimizer
)

# Left panel: loss per epoch. Right panel: accuracy per epoch.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(train_losses1, c='blue', label='train')
ax[0].plot(validation_losses1, c='red', label='validation')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss')
ax[0].legend()

ax[1].plot(train_accuracy1, c='blue', label='train')
ax[1].plot(validation_accuracy1, c='red', label='validation')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy')
ax[1].legend()
plt.show()


Epoch 0


The learning rate is too high so we have gradient explosion.
Now we will train the model with learning rate 0.1

In [None]:
# Experiment 2: same MLP and optimiser settings, learning rate lr2.
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=lr2, momentum=0.9)

# train() returns (train_losses, validation_losses, train_accuracy, validation_accuracy).
# Unpack in that exact order; swapping the first two would plot each loss curve
# under the other one's legend label.
train_losses2, validation_losses2, train_accuracy2, validation_accuracy2 = train(
    net, train_loader, validation_loader, criterion, optimizer
)

# Left panel: loss per epoch. Right panel: accuracy per epoch.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(train_losses2, c='blue', label='train')
ax[0].plot(validation_losses2, c='red', label='validation')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss')
ax[0].legend()

ax[1].plot(train_accuracy2, c='blue', label='train')
ax[1].plot(validation_accuracy2, c='red', label='validation')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy')
ax[1].legend()
plt.show()


It is better than the first model.
Now we will train with learning rate 0.01

In [None]:
# Experiment 3: same MLP and optimiser settings, smallest learning rate (lr3).
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=lr3, momentum=0.9)

# train() returns (train_losses, validation_losses, train_accuracy, validation_accuracy).
# Unpack in that exact order; swapping the first two would plot each loss curve
# under the other one's legend label.
train_losses3, validation_losses3, train_accuracy3, validation_accuracy3 = train(
    net, train_loader, validation_loader, criterion, optimizer
)

# Left panel: loss per epoch. Right panel: accuracy per epoch.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(train_losses3, c='blue', label='train')
ax[0].plot(validation_losses3, c='red', label='validation')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss')
ax[0].legend()

ax[1].plot(train_accuracy3, c='blue', label='train')
ax[1].plot(validation_accuracy3, c='red', label='validation')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy')
ax[1].legend()
plt.show()


It is worse than the second model: the training curve and the validation curve diverge, which means that there is overfitting.

**Now, let's build a CNN**

In [None]:
# Output channels of the CNN's first convolution.
# NOTE: this rebinds the n1 that was set earlier for the MLP's first hidden layer.
n1 = 64
# Output channels of the CNN's second convolution.
n2 = 128

In [None]:
class CNN(nn.Module):
  """Two-stage convolutional classifier ending in a single linear layer with 10 outputs.

  Each stage applies a 3x3 convolution (padding 1), batch normalisation, ReLU,
  dropout (p=0.25) and 2x2 max pooling. The linear layer expects ``n2 * 7 * 7``
  features, i.e. a 7x7 feature map after the two poolings.

  Args:
    n1: output channels of the first convolution.
    n2: output channels of the second convolution.
  """

  def __init__(self, n1, n2):
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=n1, kernel_size=3, padding=1)
    self.norm1 = nn.BatchNorm2d(num_features=n1)
    self.conv2 = nn.Conv2d(in_channels=n1, out_channels=n2, kernel_size=3, padding=1)
    self.norm2 = nn.BatchNorm2d(num_features=n2)
    # The pooling and dropout modules hold no weights, so both stages share them.
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.fc = nn.Linear(in_features=n2 * 7 * 7, out_features=10)
    self.dropout = nn.Dropout(p=0.25)

  def forward(self, x):
    """Run both convolutional stages and return the logits of the final linear layer."""
    stages = ((self.conv1, self.norm1), (self.conv2, self.norm2))
    for conv, norm in stages:
      activated = F.relu(norm(conv(x)))
      x = self.pool(self.dropout(activated))
    return self.fc(x.flatten(1))

In [None]:
# CNN trained with SGD + momentum at learning rate lr2.
net = CNN(n1, n2)
optimizer = optim.SGD(net.parameters(), lr=lr2, momentum=0.9)

# train() returns (train_losses, validation_losses, train_accuracy, validation_accuracy).
# Unpack in that exact order; swapping the first two would plot each loss curve
# under the other one's legend label.
train_losses, validation_losses, train_accuracy, validation_accuracy = train(
    net, train_loader, validation_loader, criterion, optimizer
)

# Left panel: loss per epoch. Right panel: accuracy per epoch.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(train_losses, c='blue', label='train')
ax[0].plot(validation_losses, c='red', label='validation')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss')
ax[0].legend()

ax[1].plot(train_accuracy, c='blue', label='train')
ax[1].plot(validation_accuracy, c='red', label='validation')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy')
ax[1].legend()
plt.show()

TypeError: train() missing 1 required positional argument: 'version'

First, I trained a simple CNN composed of two convolutional layers, a pooling layer and a fully connected layer. It overfitted, so I added a batch-norm layer after each of the two convolutional layers and dropout (applied after the activation of each convolutional block in the code above). The CNN works much better.

**Hyperparameter Optimization**


In [None]:
import random
import numpy as np
from torch.utils.data import DataLoader, Subset

# Random search over the CNN hyper-parameters.
# NOTE: this cell rebinds the notebook-level EPOCHS, train_loader, validation_loader,
# criterion and optimizer.

# Shorter runs than the main experiments: every sampled configuration trains for 10 epochs.
EPOCHS = 10

# Candidate values for each hyper-parameter. 'momentum' is only used by SGD.
parameters = {
    'n1': [32, 64, 128, 256],
    'n2': [32, 64, 128, 256],
    'lr': [0.1, 0.01, 0.001, 0.0001],
    'weight_decay': [0, 0.0001, 0.001, 0.01],
    'batch_size': [64, 128, 256, 512],
    'optimizer': ['SGD', 'Adam', 'RMSprop'],
    'momentum': [0.95, 0.9, 0.8]
}

Niteration = 10
bestAccuracy = 0
bestParameters = {}
results = []

for i in range(Niteration):
  print(f'Iteration {i}')
  # Draw one value per hyper-parameter, independently, in the order the keys are declared.
  params = {name: random.choice(choices) for name, choices in parameters.items()}

  train_loader = DataLoader(train_set, batch_size=params['batch_size'], shuffle=True, num_workers=2, drop_last=True)
  # Keep every validation sample: dropping the last partial batch would score
  # configurations with different batch sizes on different subsets.
  validation_loader = DataLoader(validation_set, batch_size=params['batch_size'], shuffle=False, num_workers=2)

  net = CNN(params['n1'], params['n2'])

  params_model = net.parameters()
  lr = params['lr']

  if params['optimizer'] == 'SGD':
    optimizer = optim.SGD(params_model, lr=lr, momentum=params['momentum'], weight_decay=params['weight_decay'])
  elif params['optimizer'] == 'Adam':
    optimizer = optim.Adam(params_model, lr=lr, weight_decay=params['weight_decay'])
  elif params['optimizer'] == 'RMSprop':
    optimizer = optim.RMSprop(params_model, lr=lr, weight_decay=params['weight_decay'])
  else:
    # Without this, an unknown name would silently reuse the previous iteration's optimizer.
    raise ValueError(f"Unknown optimizer: {params['optimizer']}")

  criterion = nn.CrossEntropyLoss()
  # Model selection must use the validation accuracy, which is the FOURTH value
  # returned by train(); the third one is the training accuracy.
  _, _, _, accuracy = train(net, train_loader, validation_loader, criterion, optimizer)

  results.append({'accuracy': accuracy, 'params': params})
  print(f'Accuracy: {accuracy[-1]}')

  # Compare configurations on the accuracy reached at the last epoch.
  if accuracy[-1] > bestAccuracy:
    bestAccuracy = accuracy[-1]
    bestParameters = params

print('Best accuracy:', bestAccuracy)
for k, v in bestParameters.items():
  print(k, v)


Iteration 0


TypeError: train() missing 1 required positional argument: 'version'

In [None]:
# Re-select the compute device (first CUDA GPU if available, else CPU) and show which one is used.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# NOTE(review): this cell is identical to the "First Model" cell that follows it.

# Run configuration: 30 epochs, RMSprop with momentum, no weight decay.
EPOCHS = 30
n1, n2 = 32, 128
lr, weight_decay, momentum = 0.001, 0, 0.9
batch_size = 64

net1 = CNN(n1, n2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(
    net1.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# NOTE(review): train() as defined earlier in this file declares five parameters; the
# trailing 1 is a sixth positional argument (presumably a model version tag - confirm)
# and only works if train() accepts it.
history = train(net1, train_loader, validation_loader, criterion, optimizer, 1)
trainLosses, valLosses, trainAccuracy, validationAccuracy = history

print(f'Accuracy on validation set: {validationAccuracy[-1]}')

# Left panel: loss curves. Right panel: accuracy curves.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
loss_ax, acc_ax = ax

loss_ax.plot(trainLosses, c='blue', label='train_loss')
loss_ax.plot(valLosses, c='red', label='val_loss')
loss_ax.set_xlabel('num epochs')
loss_ax.set_ylabel('loss')
loss_ax.legend()

acc_ax.plot(trainAccuracy, c='blue', label='train_accuracy')
acc_ax.plot(validationAccuracy, c='red', label='val_accuracy')
acc_ax.set_xlabel('num epochs')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()

plt.show()


**First Model**

In [None]:
# Run configuration for the first model: 30 epochs, RMSprop with momentum, no weight decay.
EPOCHS = 30
n1, n2 = 32, 128
lr, weight_decay, momentum = 0.001, 0, 0.9
batch_size = 64

net1 = CNN(n1, n2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(
    net1.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# NOTE(review): train() as defined earlier in this file declares five parameters; the
# trailing 1 is a sixth positional argument (presumably a model version tag - confirm)
# and only works if train() accepts it.
history = train(net1, train_loader, validation_loader, criterion, optimizer, 1)
trainLosses, valLosses, trainAccuracy, validationAccuracy = history

print(f'Accuracy on validation set: {validationAccuracy[-1]}')

# Left panel: loss curves. Right panel: accuracy curves.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
loss_ax, acc_ax = ax

loss_ax.plot(trainLosses, c='blue', label='train_loss')
loss_ax.plot(valLosses, c='red', label='val_loss')
loss_ax.set_xlabel('num epochs')
loss_ax.set_ylabel('loss')
loss_ax.legend()

acc_ax.plot(trainAccuracy, c='blue', label='train_accuracy')
acc_ax.plot(validationAccuracy, c='red', label='val_accuracy')
acc_ax.set_xlabel('num epochs')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()

plt.show()


Epoch 0


**Second Model**

In [None]:
# Run configuration for the second model: 30 epochs, RMSprop with momentum, no weight decay.
EPOCHS = 30
n1, n2 = 32, 128
lr, weight_decay, momentum = 0.001, 0, 0.9
batch_size = 64

net2 = CNN(n1, n2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(
    net2.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# NOTE(review): train() as defined earlier in this file declares five parameters; the
# trailing 2 is a sixth positional argument (presumably a model version tag - confirm)
# and only works if train() accepts it.
history = train(net2, train_loader, validation_loader, criterion, optimizer, 2)
trainLosses, valLosses, trainAccuracy, validationAccuracy = history

print(f'Accuracy on validation set: {validationAccuracy[-1]}')

# Left panel: loss curves. Right panel: accuracy curves.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
loss_ax, acc_ax = ax

loss_ax.plot(trainLosses, c='blue', label='train_loss')
loss_ax.plot(valLosses, c='red', label='val_loss')
loss_ax.set_xlabel('num epochs')
loss_ax.set_ylabel('loss')
loss_ax.legend()

acc_ax.plot(trainAccuracy, c='blue', label='train_accuracy')
acc_ax.plot(validationAccuracy, c='red', label='val_accuracy')
acc_ax.set_xlabel('num epochs')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()

plt.show()

**Third Model**

In [None]:
# Run configuration for the third model: 30 epochs, RMSprop with momentum, no weight decay.
EPOCHS = 30
n1, n2 = 32, 128
lr, weight_decay, momentum = 0.001, 0, 0.9
batch_size = 64

net3 = CNN(n1, n2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(
    net3.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# NOTE(review): train() as defined earlier in this file declares five parameters; the
# trailing 3 is a sixth positional argument (presumably a model version tag - confirm)
# and only works if train() accepts it.
history = train(net3, train_loader, validation_loader, criterion, optimizer, 3)
trainLosses, valLosses, trainAccuracy, validationAccuracy = history

print(f'Accuracy on validation set: {validationAccuracy[-1]}')

# Left panel: loss curves. Right panel: accuracy curves.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
loss_ax, acc_ax = ax

loss_ax.plot(trainLosses, c='blue', label='train_loss')
loss_ax.plot(valLosses, c='red', label='val_loss')
loss_ax.set_xlabel('num epochs')
loss_ax.set_ylabel('loss')
loss_ax.legend()

acc_ax.plot(trainAccuracy, c='blue', label='train_accuracy')
acc_ax.plot(validationAccuracy, c='red', label='val_accuracy')
acc_ax.set_xlabel('num epochs')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()

plt.show()

Finally, we can test the models

**TEST MODEL 1**

In [None]:
# Evaluate net1 on the test set using the weights stored in model1.pth.
losses = 0  # not used in this cell; kept so the notebook-level name is reset as before
samples = 0
corrects = 0

# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads when only the CPU is available.
state_dict = torch.load('./drive/MyDrive/model1.pth', map_location=device)
net1.load_state_dict(state_dict)

net1.to(device)
net1.eval()
with torch.no_grad():
  for batch in test_loader:
    images, labels = batch
    images, labels = images.to(device), labels.to(device)

    samples += images.shape[0]
    preds = net1(images)

    # Index of the largest logit = predicted class (no need for the legacy .data access).
    _, label_pred = torch.max(preds, 1)
    corrects += torch.sum(labels == label_pred).item()
# Accuracy as a percentage of the test samples seen.
acc = 100 * 1.0 * corrects / float(samples)
print(f'Accuracy on test set 1: {acc}')

FileNotFoundError: [Errno 2] No such file or directory: 'model1.pht'

**TEST MODEL 2**

In [None]:
# Evaluate net2 on the test set using the weights stored in model2.pth.
losses = 0  # not used in this cell; kept so the notebook-level name is reset as before
samples = 0
corrects = 0

# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads when only the CPU is available.
state_dict = torch.load('./drive/MyDrive/model2.pth', map_location=device)
net2.load_state_dict(state_dict)

net2.to(device)
net2.eval()
with torch.no_grad():
  for batch in test_loader:
    images, labels = batch
    images, labels = images.to(device), labels.to(device)

    samples += images.shape[0]
    preds = net2(images)

    # Index of the largest logit = predicted class (no need for the legacy .data access).
    _, label_pred = torch.max(preds, 1)
    corrects += torch.sum(labels == label_pred).item()
# Accuracy as a percentage of the test samples seen.
acc = 100 * 1.0 * corrects / float(samples)
print(f'Accuracy on test set 2: {acc}')

**TEST MODEL 3**

In [None]:
# Evaluate net3 on the test set using the weights stored in model3.pth.
losses = 0  # not used in this cell; kept so the notebook-level name is reset as before
samples = 0
corrects = 0

# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads when only the CPU is available.
state_dict = torch.load('./drive/MyDrive/model3.pth', map_location=device)
net3.load_state_dict(state_dict)

net3.to(device)
net3.eval()
with torch.no_grad():
  for batch in test_loader:
    images, labels = batch
    images, labels = images.to(device), labels.to(device)

    samples += images.shape[0]
    preds = net3(images)

    # Index of the largest logit = predicted class (no need for the legacy .data access).
    _, label_pred = torch.max(preds, 1)
    corrects += torch.sum(labels == label_pred).item()
# Accuracy as a percentage of the test samples seen.
acc = 100 * 1.0 * corrects / float(samples)
print(f'Accuracy on test set 3: {acc}')