## Load Data

In [1]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import numpy as np
import random

np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
batch_size = 512
num_epochs = 10

train_dataset = MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)

100%|██████████| 9.91M/9.91M [00:10<00:00, 973kB/s] 
100%|██████████| 28.9k/28.9k [00:00<00:00, 188kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.09MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 4.53MB/s]


In [3]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0.0, std=0.05)
                nn.init.constant_(m.bias, 0.0)


    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = nn.Sigmoid()(x)
        x = self.layer2(x)
        x = nn.Sigmoid()(x)
        x = self.layer3(x)
        x = nn.Sigmoid()(x)
        out = self.output(x)

        return out

Q1

In [4]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters())

In [5]:
for epoch in range(num_epochs):    
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 2.3407, Train_Acc: 0.0978
Epoch 2/10, Train_Loss: 2.3094, Train_Acc: 0.1066
Epoch 3/10, Train_Loss: 2.3030, Train_Acc: 0.1124
Epoch 4/10, Train_Loss: 2.3016, Train_Acc: 0.1124
Epoch 5/10, Train_Loss: 2.3013, Train_Acc: 0.1124
Epoch 6/10, Train_Loss: 2.3012, Train_Acc: 0.1124
Epoch 7/10, Train_Loss: 2.3012, Train_Acc: 0.1124
Epoch 8/10, Train_Loss: 2.3012, Train_Acc: 0.1124
Epoch 9/10, Train_Loss: 2.3011, Train_Acc: 0.1124
Epoch 10/10, Train_Loss: 2.3011, Train_Acc: 0.1124


Q2

In [6]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [7]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 2.3175, Train_Acc: 0.1377
Epoch 2/10, Train_Loss: 1.7754, Train_Acc: 0.4937
Epoch 3/10, Train_Loss: 0.8369, Train_Acc: 0.7633
Epoch 4/10, Train_Loss: 0.5028, Train_Acc: 0.8691
Epoch 5/10, Train_Loss: 0.3605, Train_Acc: 0.9012
Epoch 6/10, Train_Loss: 0.2940, Train_Acc: 0.9173
Epoch 7/10, Train_Loss: 0.2506, Train_Acc: 0.9283
Epoch 8/10, Train_Loss: 0.2181, Train_Acc: 0.9376
Epoch 9/10, Train_Loss: 0.1897, Train_Acc: 0.9455
Epoch 10/10, Train_Loss: 0.1657, Train_Acc: 0.9530


Q3

In [8]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0.0, std=0.05)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer2(x)
        x = nn.LeakyReLU()(x)
        x = self.layer3(x)
        x = nn.LeakyReLU()(x)
        out = self.output(x)

        return out

In [9]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [10]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 1.4149, Train_Acc: 0.6175
Epoch 2/10, Train_Loss: 0.3008, Train_Acc: 0.9136
Epoch 3/10, Train_Loss: 0.2044, Train_Acc: 0.9404
Epoch 4/10, Train_Loss: 0.1498, Train_Acc: 0.9562
Epoch 5/10, Train_Loss: 0.1176, Train_Acc: 0.9657
Epoch 6/10, Train_Loss: 0.0959, Train_Acc: 0.9722
Epoch 7/10, Train_Loss: 0.0784, Train_Acc: 0.9759
Epoch 8/10, Train_Loss: 0.0622, Train_Acc: 0.9815
Epoch 9/10, Train_Loss: 0.0529, Train_Acc: 0.9835
Epoch 10/10, Train_Loss: 0.0429, Train_Acc: 0.9871


Q4

In [11]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer2(x)
        x = nn.LeakyReLU()(x)
        x = self.layer3(x)
        x = nn.LeakyReLU()(x)
        out = self.output(x)

        return out

In [12]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [13]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 1.1696, Train_Acc: 0.6636
Epoch 2/10, Train_Loss: 0.2465, Train_Acc: 0.9284
Epoch 3/10, Train_Loss: 0.1610, Train_Acc: 0.9532
Epoch 4/10, Train_Loss: 0.1173, Train_Acc: 0.9659
Epoch 5/10, Train_Loss: 0.0893, Train_Acc: 0.9735
Epoch 6/10, Train_Loss: 0.0726, Train_Acc: 0.9785
Epoch 7/10, Train_Loss: 0.0576, Train_Acc: 0.9832
Epoch 8/10, Train_Loss: 0.0464, Train_Acc: 0.9863
Epoch 9/10, Train_Loss: 0.0364, Train_Acc: 0.9898
Epoch 10/10, Train_Loss: 0.0289, Train_Acc: 0.9917


Q5

In [14]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)
        self.bn1 = nn.BatchNorm1d(hidden_dims)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer2(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer3(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        out = self.output(x)

        return out

In [15]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [16]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 1.1274, Train_Acc: 0.6554
Epoch 2/10, Train_Loss: 0.2558, Train_Acc: 0.9315
Epoch 3/10, Train_Loss: 0.1504, Train_Acc: 0.9589
Epoch 4/10, Train_Loss: 0.1028, Train_Acc: 0.9723
Epoch 5/10, Train_Loss: 0.0721, Train_Acc: 0.9816
Epoch 6/10, Train_Loss: 0.0520, Train_Acc: 0.9871
Epoch 7/10, Train_Loss: 0.0365, Train_Acc: 0.9916
Epoch 8/10, Train_Loss: 0.0243, Train_Acc: 0.9956
Epoch 9/10, Train_Loss: 0.0178, Train_Acc: 0.9970
Epoch 10/10, Train_Loss: 0.0127, Train_Acc: 0.9985


Q6

In [17]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)
        self.bn1 = nn.BatchNorm1d(hidden_dims)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        identity = x # Skip connection
        
        x = self.layer2(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        
        x = self.layer3(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        
        x += identity  # Skip connection
        out = self.output(x)

        return out

In [18]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [19]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 1.2211, Train_Acc: 0.6288
Epoch 2/10, Train_Loss: 0.2635, Train_Acc: 0.9255
Epoch 3/10, Train_Loss: 0.1601, Train_Acc: 0.9546
Epoch 4/10, Train_Loss: 0.1103, Train_Acc: 0.9695
Epoch 5/10, Train_Loss: 0.0803, Train_Acc: 0.9787
Epoch 6/10, Train_Loss: 0.0588, Train_Acc: 0.9849
Epoch 7/10, Train_Loss: 0.0424, Train_Acc: 0.9898
Epoch 8/10, Train_Loss: 0.0303, Train_Acc: 0.9940
Epoch 9/10, Train_Loss: 0.0220, Train_Acc: 0.9966
Epoch 10/10, Train_Loss: 0.0154, Train_Acc: 0.9979
