In [1]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Normalize
from torchvision.transforms import transforms

In [2]:
# Load MNIST dataset
mnist_train = MNIST('mnist', train=True, download=True, transform=ToTensor())
mnist_test = MNIST('mnist', train=False, download=True, transform=ToTensor())

In [3]:
#### Unnecassary???####
# Determine the mean and standard deviation
# mnist_train_pixels = np.concatenate([np.array(x) for x, y in mnist_train], axis=0)
# mnist_test_pixels = np.concatenate([np.array(x) for x, y in mnist_test], axis=0)
#
# mean_train = np.mean(mnist_train_pixels)
# mean_test = np.mean(mnist_test_pixels)
# print(mean_train, mean_test)
#
# std_train = np.std(mnist_train_pixels)
# std_test = np.std(mnist_test_pixels)
# print(std_train, std_test)
#
# transform_train = transforms.Compose([transforms.ToTensor(),
#                                       Normalize(mean_train, std_train)])
# transform_test = transforms.Compose([transforms.ToTensor(),
#                                         Normalize(mean_test, std_test)])
#
# mnist_train = MNIST('mnist', train=True, download=True, transform=transform_train)
# mnist_test = MNIST('mnist', train=False, download=True, transform=transform_test)

In [4]:
# Split the data into training set (70%) and testing set (30%)
mnist_train_size = int(0.7 * len(mnist_train))
mnist_test_size = len(mnist_train) - mnist_train_size
mnist_train_data, mnist_test_data = torch.utils.data.random_split(mnist_train, [mnist_train_size, mnist_test_size])

In [5]:
# Create data loaders
train_loader = DataLoader(mnist_train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(mnist_test_data, batch_size=32, shuffle=True)

In [6]:
# Define the 2-layer MLP
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, reg_strength):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.reg_strength = reg_strength
        # Using 2 hidden layers
        # self.fc1 = nn.Linear(input_size * input_size, hidden_size)
        # self.fc2 = nn.Linear(hidden_size, hidden_size)
        # Using 3 hidden layers
        # self.fc1 = nn.Linear(input_size * input_size, hidden_size)
        # self.fc2 = nn.Linear(hidden_size, hidden_size)
        # self.fc3 = nn.Linear(hidden_size, hidden_size)
        # self.fc4 = nn.Linear(hidden_size, output_size)
        # Using 4 hidden layers
        self.fc1 = nn.Linear(input_size * input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(-1, self.input_size * self.input_size)
        # Using ReLU
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # Using Tanh
        # x = torch.tanh(self.fc1(x))
        # x = torch.tanh(self.fc2(x))
        # x = torch.tanh(self.fc3(x))
        # x = torch.tanh(self.fc4(x))
        # Using Sigmoid
        # x = torch.sigmoid(self.fc1(x))
        # x = torch.sigmoid(self.fc2(x))
        # x = torch.sigmoid(self.fc3(x))
        # x = torch.sigmoid(self.fc4(x))
        x = self.fc5(x)
        return x

    def l2_regularization_loss(self):
        l2_loss = 0.0
        for param in self.parameters():
            l2_loss += torch.norm(param)
        return self.reg_strength * l2_loss


In [7]:
# Initialize the MLP
mlp = MLP(28, 100, 10, 0.0002)

In [8]:
# Define the loss function and optimizer

# Using CrossEntropyLoss
criterion = nn.CrossEntropyLoss()

# Using MSELoss
# criterion = nn.MSELoss()

# Add regularization to the loss function
def reularized_loss(output, target):
    loss = criterion(output, target)
    reg_loss = mlp.l2_regularization_loss()
    return loss + reg_loss

# Using SGD
# optimizer = optim.SGD(mlp.parameters(), lr=0.0002, momentum=0.9)

# Using Adam
optimizer = optim.Adam(mlp.parameters(), lr=0.0002)

In [9]:
# Train the model for 50 epochs

# (Record start time)

training_start_time = time.time()
losses = []
for epoch in range(50):
    epoch_loss = 0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = mlp(data)
        loss = reularized_loss(output, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    losses.append(epoch_loss)
    print(f'Epoch {epoch + 1} loss: {epoch_loss}')

# (Record end time)
training_end_time = time.time()

# (Calculate total time)
training_total_time = training_end_time - training_start_time

# (Report total time)
print(f'Total training time: {training_total_time}')

Epoch 1 loss: 855.1603162661195
Epoch 2 loss: 406.5071855150163
Epoch 3 loss: 299.40786983817816
Epoch 4 loss: 233.27788413502276
Epoch 5 loss: 194.08659490942955
Epoch 6 loss: 165.03183232992887
Epoch 7 loss: 144.65966725256294
Epoch 8 loss: 128.81449887622148
Epoch 9 loss: 113.39699163008481
Epoch 10 loss: 101.54378873482347
Epoch 11 loss: 91.49456492997706
Epoch 12 loss: 80.060516923666
Epoch 13 loss: 73.87208669818938
Epoch 14 loss: 63.629541067406535
Epoch 15 loss: 59.91077868267894
Epoch 16 loss: 54.51254006661475
Epoch 17 loss: 51.37781218905002
Epoch 18 loss: 44.55264545790851
Epoch 19 loss: 42.29377803392708
Epoch 20 loss: 37.97591426130384
Epoch 21 loss: 35.977053481154144
Epoch 22 loss: 32.51917611062527
Epoch 23 loss: 32.215857402421534
Epoch 24 loss: 32.41206343751401
Epoch 25 loss: 25.910215868614614
Epoch 26 loss: 30.19633192103356
Epoch 27 loss: 26.58585260435939
Epoch 28 loss: 25.48805855680257
Epoch 29 loss: 24.180408499203622
Epoch 30 loss: 25.30983182694763
Epoch 31

In [10]:
# Evaluate the model on the testing set

# (Record start time)
eval_test_start_time = time.time()

with torch.no_grad():
    correct = 0
    total = 0
    for data, target in test_loader:
        output = mlp(data)
        prediction = output.argmax(dim=1)
        correct += (prediction == target).sum().item()
        total += target.shape[0]
    accuracy = correct / total
    print('Accuracy on test set: {:.2f}%'.format(accuracy * 100))

# (Record end time)
eval_test_end_time = time.time()

# (Calculate total time)
eval_test_total_time = eval_test_end_time - eval_test_start_time

# (Report total time)
print(f'Total evaluation time on test set: {eval_test_total_time}')

Accuracy on test set: 97.55%
Total evaluation time on test set: 1.7249670028686523


Results with ReLU:
    Accuracy on test set: 95%

In [11]:
# Evaluate the model on the training set

# (Record start time)
eval_train_start_time = time.time()

with torch.no_grad():
    correct = 0
    total = 0
    for data, target in train_loader:
        output = mlp(data)
        prediction = output.argmax(dim=1)
        correct += (prediction == target).sum().item()
        total += target.shape[0]
    accuracy = correct / total
    print('Accuracy on train set: {:.2f}%'.format(accuracy * 100))

# (Record end time)
eval_train_end_time = time.time()

# (Calculate total time)
eval_train_total_time = eval_train_end_time - eval_train_start_time

# (Report total time)
print(f'Total evaluation time on train set: {eval_train_total_time}')

Accuracy on train set: 99.98%
Total evaluation time on train set: 3.721993923187256
