### Task 5.1: Multi-digit Recognition on Multi-MNIST Dataset

#### Loading the Images

In [129]:
import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class MultiDigitMNISTDataset(Dataset):
  def __init__(self, root_dir, transform=None):
    self.root_dir = root_dir
    self.transform = transform
    self.folder_names = sorted(os.listdir(root_dir))
    self.image_paths = []
    self.labels = []

    for folder_name in self.folder_names:
      folder_path = os.path.join(root_dir, folder_name)
      if os.path.isdir(folder_path):
        for image_name in os.listdir(folder_path):
          image_path = os.path.join(folder_path, image_name)
          self.image_paths.append(image_path)
          self.labels.append(folder_name)

  def __len__(self):
    return len(self.image_paths)

  def __getitem__(self, idx):
    image_path = self.image_paths[idx]
    image = Image.open(image_path).convert('L')
    label = self.labels[idx]

    if self.transform:
      image = self.transform(image)

    tens_digit = int(label[0])
    units_digit = int(label[1])

    return image, (tens_digit, units_digit)

transform = transforms.Compose([
  transforms.ToTensor(),
])

root = 'double_mnist_seed_123_image_size_64_64'

train_dataset = MultiDigitMNISTDataset(root_dir=root+'/train', transform=transform)
val_dataset = MultiDigitMNISTDataset(root_dir=root+'/val', transform=transform)
test_dataset = MultiDigitMNISTDataset(root_dir=root+'/test', transform=transform)

### Task 5.1.1: MLP Model

In [198]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

class Reshape(nn.Module):
  def __init__(self, shape):
    super(Reshape, self).__init__()
    self.shape = shape

  def forward(self, x):
    return x.view(self.shape)

class MultiDigitMLP(nn.Module):
  def __init__(self, input_size=2048, hidden_size=128, dropout=0.2, learning_rate=0.001, batch_size=64, num_epochs=10):
    super(MultiDigitMLP, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size

    self.hidden_layer = nn.Sequential(
      nn.Linear(input_size, hidden_size),
      nn.ReLU(),
      nn.Dropout(dropout)
    )

    self.output = nn.Sequential(
      nn.Linear(hidden_size, 10)
    )

    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    self.num_epochs = num_epochs
    self.batch_size = batch_size

  def forward(self, x):
    x = x.reshape(x.size(0), -1)
    hidden_output = self.hidden_layer(x)
    outputs = self.output(hidden_output)
    return outputs

  def train_model(self, train_data, val_data):
    train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=self.batch_size, shuffle=True)

    for epoch in range(self.num_epochs):
      self.train()
      train_loss = 0.0
      correct = 0
      total = 0

      val_loss = 0.0
      val_correct = 0
      val_total = 0

      for inputs, (labels_tens, labels_ones) in train_loader:
        self.optimizer.zero_grad()
        outputs_tens = self(inputs[:, :, :, :32])
        outputs_ones = self(inputs[:, :, :, 32:])
        losses_tens = self.criterion(outputs_tens, labels_tens)
        losses_ones = self.criterion(outputs_ones, labels_ones)
        total_loss = losses_tens + losses_ones
        total_loss.backward()

        train_loss += total_loss.item()
        _, predicted_ones = torch.max(outputs_ones.data, 1)
        _, predicted_tens = torch.max(outputs_tens.data, 1)
        total += labels_ones.size(0)
        correct += (torch.logical_and(predicted_ones == labels_ones, predicted_tens == labels_tens)).sum().item()

      self.eval()
      for inputs, (labels_tens, labels_ones) in val_loader:
        self.optimizer.zero_grad()
        outputs_tens, outputs_ones = self(inputs[:, :, :, :32]), self(inputs[:, :, :, 32:])
        losses_tens = self.criterion(outputs_tens, labels_tens)
        losses_ones = self.criterion(outputs_ones, labels_ones)
        total_loss = losses_tens + losses_ones
        total_loss.backward()
        self.optimizer.step()

        val_loss += total_loss.item()
        _, predicted_ones = torch.max(outputs_ones.data, 1)
        _, predicted_tens = torch.max(outputs_tens.data, 1)
        val_total += labels_ones.size(0)
        val_correct += (torch.logical_and(predicted_ones == labels_ones, predicted_tens == labels_tens)).sum().item()

      val_accuracy = val_correct / val_total
      accuracy = correct / total

      print(
        f'Epoch {epoch + 1}/{self.num_epochs}, '
        f'Loss(Train): {train_loss:.4f}, '
        f'Accuracy(Train): {accuracy:.2f}, '
        f'Loss(Val): {val_loss:.4f}, '
        f'Accuracy(Val): {val_accuracy:.2f}'
      )

    return train_loss, val_loss, accuracy, val_accuracy

  def predict(self, pred_dataset):
    pred_loader = DataLoader(pred_dataset, batch_size=self.batch_size, shuffle=False)
    self.eval()
    predictions = []
    labels_true = []

    for inputs, (labels_tens, labels_ones) in pred_loader:
      outputs_tens, outputs_ones = self(inputs[:, :, :, :32]), self(inputs[:, :, :, 32:])
      _, predicted_ones = torch.max(outputs_ones.data, 1)
      _, predicted_tens = torch.max(outputs_tens.data, 1)
      predictions.append((predicted_tens, predicted_ones))
      labels_true.append((labels_tens, labels_ones))
    return predictions, labels_true

In [199]:
multi_mlp = MultiDigitMLP()

multi_mlp.train_model(train_dataset, val_dataset)

Epoch 1/10, Loss(Train): 4607.7942, Accuracy(Train): 0.01, Loss(Val): 782.7547, Accuracy(Val): 0.26
Epoch 2/10, Loss(Train): 2613.5495, Accuracy(Train): 0.32, Loss(Val): 422.0192, Accuracy(Val): 0.58
Epoch 3/10, Loss(Train): 2014.5849, Accuracy(Train): 0.44, Loss(Val): 288.9817, Accuracy(Val): 0.70
Epoch 4/10, Loss(Train): 1931.6163, Accuracy(Train): 0.46, Loss(Val): 223.5257, Accuracy(Val): 0.77
Epoch 5/10, Loss(Train): 1861.3163, Accuracy(Train): 0.49, Loss(Val): 182.6402, Accuracy(Val): 0.81
Epoch 6/10, Loss(Train): 1812.6654, Accuracy(Train): 0.51, Loss(Val): 153.9736, Accuracy(Val): 0.83
Epoch 7/10, Loss(Train): 1851.8488, Accuracy(Train): 0.52, Loss(Val): 131.9802, Accuracy(Val): 0.86
Epoch 8/10, Loss(Train): 1953.2371, Accuracy(Train): 0.51, Loss(Val): 115.3261, Accuracy(Val): 0.88
Epoch 9/10, Loss(Train): 1971.0719, Accuracy(Train): 0.52, Loss(Val): 100.3284, Accuracy(Val): 0.90
Epoch 10/10, Loss(Train): 2017.8935, Accuracy(Train): 0.51, Loss(Val): 89.0828, Accuracy(Val): 0.91


(2017.8935002088547, 89.08284649252892, 0.513828125, 0.9125625)

In [191]:
y_pred, y_true = multi_mlp.predict(test_dataset)
# for accuracy
correct = 0
total = 0
for i in range(len(y_pred)):
  total += len(y_pred[i][0])
  correct += (torch.logical_and(y_pred[i][1] == y_true[i][1], y_pred[i][0] == y_true[i][0])).sum().item()
print(correct / total)

0.75935


### Task 5.1.2: Multi-MNIST Model

In [269]:
import torch
import torch.nn as nn
import torch.optim as optim

class Reshape(nn.Module):
  def __init__(self, shape):
    super(Reshape, self).__init__()
    self.shape = shape

  def forward(self, x):
    return x.view(self.shape)

class MultiDigitCNN(nn.Module):
  def __init__(self, kernel_size=3, stride=1, padding=1, dropout=0.2, learning_rate=0.001, batch_size=64, num_epochs=5):
    super(MultiDigitCNN, self).__init__()

    self.num_epochs = num_epochs
    self.batch_size = batch_size

    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=kernel_size, stride=stride, padding=padding)
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.adjust_shape = Reshape((-1, 8192))
    self.dropout = nn.Dropout(p=dropout)
    self.fc = nn.Linear(8192, 10)

    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    
  def forward(self, x):
    # print(x.shape)
    x = self.conv1(x)
    # print(x.shape)
    x = self.relu(x)
    x = self.pool(x)
    # print(x.shape)
    x = self.dropout(x)
    x = self.adjust_shape(x)
    # print(x.shape)
    x = self.fc(x)
    # print(x.shape)
    return x

  def train_model(self, train_data, val_data):
    train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=self.batch_size, shuffle=True)
    
    train_loss, val_loss, accuracy, val_accuracy = 0, 0, 0, 0

    for epoch in range(self.num_epochs):
      self.train()
      train_loss = 0.0
      correct = 0
      total = 0

      val_loss = 0.0
      val_correct = 0
      val_total = 0

      for inputs, (labels_tens, labels_ones) in train_loader:
        self.optimizer.zero_grad()
        outputs_tens, outputs_ones = self(inputs[:, :, :, :32]), self(inputs[:, :, :, 32:])
        # print(outputs_tens, outputs_ones)
        losses_tens = self.criterion(outputs_tens, labels_tens)
        losses_ones = self.criterion(outputs_ones, labels_ones)
        total_loss = losses_tens + losses_ones
        total_loss.backward()
        self.optimizer.step()

        train_loss += total_loss.item()
        _, predicted_ones = torch.max(outputs_ones.data, 1)
        _, predicted_tens = torch.max(outputs_tens.data, 1)
        total += labels_ones.size(0)
        correct += (torch.logical_and(predicted_ones == labels_ones, predicted_tens == labels_tens)).sum().item()

      self.eval()
      for inputs, (labels_tens, labels_ones) in val_loader:
        self.optimizer.zero_grad()
        outputs_tens, outputs_ones = self(inputs[:, :, :, :32]), self(inputs[:, :, :, 32:])
        losses_tens = self.criterion(outputs_tens, labels_tens)
        losses_ones = self.criterion(outputs_ones, labels_ones)
        total_loss = losses_tens + losses_ones
        total_loss.backward()
        self.optimizer.step()

        val_loss += total_loss.item()
        _, predicted_ones = torch.max(outputs_ones.data, 1)
        _, predicted_tens = torch.max(outputs_tens.data, 1)
        val_total += labels_ones.size(0)
        val_correct += (torch.logical_and(predicted_ones == labels_ones, predicted_tens == labels_tens)).sum().item()
        
      val_accuracy = val_correct / val_total
      accuracy = correct / total

      print(
        f'Epoch {epoch + 1}/{self.num_epochs}, '
        f'Loss(Train): {train_loss:.4f}, '
        f'Accuracy(Train): {accuracy:.2f}, '
        f'Loss(Val): {val_loss:.4f}, '
        f'Accuracy(Val): {val_accuracy:.2f}'
      )
    return train_loss, val_loss, accuracy, val_accuracy

  def predict(self, pred_data):
    pred_loader = DataLoader(pred_data, batch_size=self.batch_size, shuffle=False)
    self.eval()
    predictions = []
    labels_true = []

    for inputs, (labels_tens, labels_ones) in pred_loader:
      outputs_tens, outputs_ones = self(inputs[:, :, :, :32]), self(inputs[:, :, :, 32:])
      _, predicted_ones = torch.max(outputs_ones.data, 1)
      _, predicted_tens = torch.max(outputs_tens.data, 1)
      predictions.append((predicted_tens, predicted_ones))
      labels_true.append((labels_tens, labels_ones))
    return predictions, labels_true

#### Training the Model

In [270]:
multi_cnn = MultiDigitCNN()

multi_cnn.train_model(train_dataset, val_dataset)

Epoch 1/5, Loss(Train): 2154.3533, Accuracy(Train): 0.44, Loss(Val): 322.7100, Accuracy(Val): 0.65
Epoch 2/5, Loss(Train): 1235.0429, Accuracy(Train): 0.65, Loss(Val): 251.4263, Accuracy(Val): 0.72
Epoch 3/5, Loss(Train): 1051.2651, Accuracy(Train): 0.70, Loss(Val): 216.0777, Accuracy(Val): 0.76
Epoch 4/5, Loss(Train): 941.0196, Accuracy(Train): 0.73, Loss(Val): 193.7650, Accuracy(Val): 0.78
Epoch 5/5, Loss(Train): 867.9561, Accuracy(Train): 0.75, Loss(Val): 177.1804, Accuracy(Val): 0.80


(867.9561473727226, 177.18040171265602, 0.74928125, 0.7995)

In [271]:
y_pred, y_true = multi_cnn.predict(test_dataset)

# for accuracy
correct = 0
total = 0
for i in range(len(y_pred)):
  total += len(y_pred[i][0])
  correct += (torch.logical_and(y_pred[i][1] == y_true[i][1], y_pred[i][0] == y_true[i][0])).sum().item()
print(correct / total)

0.76125


### Task 5.1.3: Testing on Single Digit MNIST

In [225]:
from torchvision import datasets, transforms

# load mnist dataset
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

single_digit_mnist_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

In [245]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    image, label = self.data[index]
    tens_digit = label // 5
    ones_digit = label % 5
    return image, (tens_digit, ones_digit)

In [246]:
pseudo_double_digit_mnist = CustomDataset(single_digit_mnist_dataset)

#### Testing on multi mlp

In [None]:
y_pred_mlp, y_true_mlp = multi_mlp.predict(pseudo_double_digit_mnist)

# for accuracy
correct = 0
total = 0
for i in range(len(y_pred_mlp)):
  total += len(y_pred_mlp[i][0])
  correct += (5*y_pred_mlp[i][0] + y_pred_mlp[i][1] == 5*y_true_mlp[i][0] + y_true_mlp[i][1]).sum().item()
  
print(correct / total)

0.009183333333333333


#### Testing on multi cnn

In [272]:
y_pred_cnn, y_true_cnn = multi_cnn.predict(pseudo_double_digit_mnist)

# for accuracy
correct = 0
total = 0
for i in range(len(y_pred_cnn)):
  total += len(y_pred_cnn[i][0])
  correct += (y_pred_cnn[i][1] + 5*y_pred_cnn[i][0] == y_true_cnn[i][1] + 5*y_true_cnn[i][0]).sum().item()
  
print(correct / total)

0.012816666666666667
