In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import copy
import numpy as np

In [2]:
# Build both EMNIST "digits" splits from the local ``data`` directory.
# download=False: no download is attempted here, so the dataset files are
# expected to already be on disk.
training_data, test_data = (
    datasets.EMNIST(
        root="data",
        split="digits",
        train=is_train_split,
        download=False,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)  # training split first, then test split
)

In [3]:
# Mini-batch size used by both loaders.
batch_size = 64

# Wrap each split in a DataLoader; only the batch size is overridden.
train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

# Look at the first test batch only, to confirm the tensor layout and label dtype.
for X, y in test_dataloader:
    print(
        f"Shape of X [N, C, H, W]: {X.shape}",
        f"Shape of y: {y.shape} {y.dtype}",
        sep="\n",
    )
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [4]:
# Pick the compute backend in priority order: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier with a single hidden ReLU layer.

    With the default arguments this is the 784 -> 100 -> 10 network used
    throughout the notebook (flattened 28x28 inputs, ten output logits).

    Args:
        in_features: Number of values per sample after flattening.
        hidden_features: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, in_features=28 * 28, hidden_features=100, num_classes=10):
        super().__init__()
        # Attribute names are kept so existing state_dict keys stay valid.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten each sample and return the raw (unnormalised) class logits."""
        return self.linear_relu_stack(self.flatten(x))

Using cuda device


In [5]:
def train(dataloader, model, loss_fn, optimizer, log_every=100):
    """Run one optimisation pass over ``dataloader`` and sample the batch loss.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable mapping ``(pred, y)`` to a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        log_every: Record the loss of every ``log_every``-th batch, starting
            with batch 0. Must be a positive integer. Defaults to 100.

    Returns:
        1-D ``np.ndarray`` with the recorded batch losses in batch order
        (empty when ``dataloader`` yields no batches).

    Raises:
        ValueError: If ``log_every`` is not positive.
    """
    if log_every <= 0:
        raise ValueError(f"log_every must be a positive integer, got {log_every}")

    model.train()
    sampled_losses = []

    for batch, (X, y) in enumerate(dataloader):
        # ``device`` is the notebook-level backend name selected earlier.
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % log_every == 0:
            # Store a plain Python float; the loss tensor itself is not rebound.
            sampled_losses.append(loss.item())

    return np.array(sampled_losses)

In [6]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return 100*correct, test_loss

In [97]:
# Hyperparameter sweep: for every (beta_2, lr) pair, train Adam, AMSGrad and SGD
# from identical initial weights and record the metrics of each epoch.
#
# Each results dict maps (str(beta2), str(lr), str(epoch)) to the tuple
#     (train_losses, test_accuracy_percent, test_loss)
# in the same positional order as before. A tuple is used because train() returns
# a variable-length array while the other two entries are scalars; packing them
# with np.array([...]) builds a ragged array, which NumPy >= 1.24 rejects with a
# ValueError.
params_Adam = dict()
params_AMS  = dict()
params_SGD  = dict()

beta2_params = np.array([0.99, 0.999])
lr_params    = np.array([1e-2, 1e-3, 1e-4])
epochs = 3

for beta2 in beta2_params:
    print(f"beta_2 = {beta2}")
    for lr in lr_params:
        print(f"lr = {lr}")

        # One fresh initialisation per configuration, deep-copied so that all
        # three optimizers start from exactly the same weights.
        model_Adam = NeuralNetwork().to(device)
        model_AMS = copy.deepcopy(model_Adam)
        model_SGD = copy.deepcopy(model_Adam)

        # Adam and AMSGrad share every setting except the ``amsgrad`` flag.
        adam_options = dict(lr=lr, betas=(0.9, beta2), eps=1e-8)
        loss_fn = nn.CrossEntropyLoss()

        # (label printed each epoch, model, optimizer, results dict), in run order.
        sweep_runs = [
            ("Adam", model_Adam,
             torch.optim.Adam(model_Adam.parameters(), amsgrad=False, **adam_options),
             params_Adam),
            ("AMSGrad", model_AMS,
             torch.optim.Adam(model_AMS.parameters(), amsgrad=True, **adam_options),
             params_AMS),
            ("SGD", model_SGD,
             torch.optim.SGD(model_SGD.parameters(), lr=lr),
             params_SGD),
        ]

        for t in range(epochs):
            print(f"Epoch {t+1}\n-------------------------------")
            for label, model, optimizer, results in sweep_runs:
                print(label)
                train_losses = train(train_dataloader, model, loss_fn, optimizer)
                test_accuracy, test_loss = test(test_dataloader, model, loss_fn)
                results[(str(beta2), str(lr), str(t))] = (train_losses, test_accuracy, test_loss)

print("Done!")

beta_2 = 0.99
lr = 0.01
Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
lr = 0.001
Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
lr = 0.0001
Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
beta_2 = 0.999
lr = 0.01
Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
lr = 0.001
Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
lr = 0.0001
Epoch 1
-------------------------------
Adam
AMSGr

In [7]:
# for key in params_Adam.keys():
#     print(f"{key} {params_Adam[key]}")

# print()

# for key in params_AMS.keys():
#     print(f"{key} {params_AMS[key]}")

# print()

# for key in params_SGD.keys():
#     print(f"{key} {params_SGD[key]}")

# Adam: ('0.999', '0.001', '2') [1.87005438e-02 9.84575000e+01 5.03956466e-02]
# AMS:  ('0.999', '0.001', '2') [3.23241986e-02 9.84100000e+01 5.18268165e-02]
# SGD:  ('0.999', '0.01', '2') [ 0.29259926 94.595       0.19515315]

# Retrain Adam, AMSGrad and SGD with fixed hyperparameters, all three starting
# from the same initial weights, and log per-epoch metrics into the good_* lists:
#   good_train_losses_*    - one array of sampled training losses per epoch (train())
#   good_test_accuracies_* - test accuracy in percent per epoch (test())
#   good_test_losses_*     - mean test loss per epoch (test())
good_model_Adam = NeuralNetwork().to(device)
good_model_AMS = copy.deepcopy(good_model_Adam)
good_model_SGD = copy.deepcopy(good_model_Adam)

good_loss_fn_Adam = nn.CrossEntropyLoss()
good_loss_fn_AMS  = nn.CrossEntropyLoss()
good_loss_fn_SGD  = nn.CrossEntropyLoss()

good_optimizer_Adam = torch.optim.Adam(good_model_Adam.parameters(), lr=1e-3, betas=(0.9, 0.999),
                                       eps=1e-8, amsgrad=False)
good_optimizer_AMS  = torch.optim.Adam(good_model_AMS.parameters(), lr=1e-3, betas=(0.9, 0.999),
                                       eps=1e-8, amsgrad=True)
good_optimizer_SGD  = torch.optim.SGD(good_model_SGD.parameters(), lr=1e-2)

good_train_losses_Adam, good_test_accuracies_Adam, good_test_losses_Adam = [], [], []
good_train_losses_AMS,  good_test_accuracies_AMS,  good_test_losses_AMS  = [], [], []
good_train_losses_SGD,  good_test_accuracies_SGD,  good_test_losses_SGD  = [], [], []

# One row per optimizer, in the order they are run and printed each epoch:
# (label, model, loss function, optimizer, train-loss log, accuracy log, test-loss log)
good_runs = [
    ("Adam", good_model_Adam, good_loss_fn_Adam, good_optimizer_Adam,
     good_train_losses_Adam, good_test_accuracies_Adam, good_test_losses_Adam),
    ("AMSGrad", good_model_AMS, good_loss_fn_AMS, good_optimizer_AMS,
     good_train_losses_AMS, good_test_accuracies_AMS, good_test_losses_AMS),
    ("SGD", good_model_SGD, good_loss_fn_SGD, good_optimizer_SGD,
     good_train_losses_SGD, good_test_accuracies_SGD, good_test_losses_SGD),
]

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    for label, model, loss_fn, optimizer, train_log, accuracy_log, test_loss_log in good_runs:
        print(label)
        train_log.append(train(train_dataloader, model, loss_fn, optimizer))
        epoch_accuracy, epoch_test_loss = test(test_dataloader, model, loss_fn)
        accuracy_log.append(epoch_accuracy)
        test_loss_log.append(epoch_test_loss)

Epoch 1
-------------------------------
Adam
AMSGrad
SGD
Epoch 2
-------------------------------
Adam
AMSGrad
SGD
Epoch 3
-------------------------------
Adam
AMSGrad
SGD
Epoch 4
-------------------------------
Adam
AMSGrad
SGD
Epoch 5
-------------------------------
Adam
AMSGrad
SGD
Epoch 6
-------------------------------
Adam
AMSGrad
SGD
Epoch 7
-------------------------------
Adam
AMSGrad
SGD
Epoch 8
-------------------------------
Adam
AMSGrad
SGD
Epoch 9
-------------------------------
Adam
AMSGrad
SGD
Epoch 10
-------------------------------
Adam
AMSGrad
SGD


In [10]:
# Hyperparameters noted for the final runs:
#   Adam:    beta_2 = 0.999, lr = 10^-3
#   AMSGrad: beta_2 = 0.99,  lr = 10^-2
# NOTE(review): the AMSGrad optimizer built for the 10-epoch run uses
# beta_2 = 0.999 and lr = 10^-3 — confirm which setting is intended.

# Dump the SGD histories, one per line: sampled training losses, test
# accuracies, then test losses.
print(
    good_train_losses_SGD,
    good_test_accuracies_SGD,
    good_test_losses_SGD,
    sep="\n",
)

[array([2.34286952, 2.04196715, 1.71680999, 1.38247991, 1.14689434,
       0.94908231, 0.72385979, 0.58400148, 0.56706083, 0.56488204,
       0.48820007, 0.37965101, 0.43816322, 0.37025988, 0.59125662,
       0.33464909, 0.27333459, 0.60560334, 0.33296308, 0.25460836,
       0.34455279, 0.33494651, 0.31490606, 0.26430351, 0.42113432,
       0.49100289, 0.30978239, 0.20664634, 0.29159144, 0.31199586,
       0.35981828, 0.44043028, 0.26117927, 0.36150742, 0.31666774,
       0.33548129, 0.19195606, 0.19907674]), array([0.30406481, 0.24999826, 0.19818862, 0.2810472 , 0.33935598,
       0.34882039, 0.31055421, 0.28021383, 0.38722619, 0.31210619,
       0.2222635 , 0.16387573, 0.27621138, 0.17609943, 0.3756094 ,
       0.21704771, 0.13946007, 0.49745467, 0.21493199, 0.1414905 ,
       0.2745108 , 0.22462121, 0.20236883, 0.20826869, 0.32155734,
       0.46153012, 0.22611983, 0.16997351, 0.25521633, 0.2337656 ,
       0.29373926, 0.34612051, 0.21001521, 0.28858733, 0.25543985,
       0.2645424