In [17]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
# MNIST splits, converted to tensors on access. download=True lets torchvision
# fetch the files into `mnist_root` when they are not already there.
mnist_root = '/Users/ekinokos2/datasets/MNIST'
to_tensor = torchvision.transforms.ToTensor()

train_dataset = torchvision.datasets.MNIST(root=mnist_root, train=True, transform=to_tensor, download=True)
test_dataset = torchvision.datasets.MNIST(root=mnist_root, train=False, transform=to_tensor, download=True)

# Mini-batches of 64: the training loader shuffles, the test loader keeps a fixed order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=64)

In [14]:
class LeNet(nn.Module):
    """LeNet-style CNN: two 5x5 conv + 2x2 max-pool stages followed by three linear layers.

    The first convolution takes 1 input channel and the last linear layer emits
    10 raw scores (no softmax is applied here). ``fc1`` expects 16 * 4 * 4
    features per sample, which is what the two unpadded conv/pool stages
    produce for 28x28 inputs (28 -> 24 -> 12 -> 8 -> 4).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Map a batch of shape (N, 1, H, W) to class scores of shape (N, 10).

        Raises:
            RuntimeError: if the spatial size of ``x`` does not yield the
                16 * 4 * 4 features per sample that ``fc1`` expects.
        """
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 16 * 4 * 4), this never folds surplus features into the
        # batch dimension: a wrongly sized input fails at fc1 instead of
        # silently returning a different number of rows.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [16]:
# Loss shared by every run: cross-entropy on the raw class scores (logistic loss).
criterion = nn.CrossEntropyLoss()

# Initial network instance.
model = LeNet()

# --- Hyperparameter grids -------------------------------------------------
# Learning rates to sweep over.
learning_rates = [0.001, 0.01, 0.1]

# AdaGrad: candidate learning-rate decay values.
lr_decay = [0.001, 0.01, 0.1]

# Adam: three candidate values for the adaptivity momentum.
momentum = [0.001, 0.01, 0.1]

# Adam: three candidate values for the gradient momentum.
grad_momentum = [0.001, 0.01, 0.1]

# Kaiming initialization
def weights_init(m):
    """Re-draw the weights of a conv or linear layer with Kaiming-uniform values.

    Meant to be passed to ``model.apply``. Modules of any other type are left
    alone, and only ``m.weight`` is touched (biases keep their current values).
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    nn.init.kaiming_uniform_(m.weight)

In [18]:
# Training loop
def train_and_test(model, optimizer, criterion, train_loader, test_loader, epochs, learning_rate):
    """Train ``model`` for ``epochs`` passes over ``train_loader``, then evaluate it.

    Args:
        model: network to optimize; it is put in training mode first.
        optimizer: optimizer already bound to the model's parameters.
        criterion: callable ``criterion(output, target)`` returning the loss.
        train_loader: iterable of ``(data, target)`` training batches.
        test_loader: iterable of batches handed to ``test`` after training.
        epochs: number of passes over ``train_loader``.
        learning_rate: learning rate written into the optimizer before training.

    Returns:
        ``(training_losses, accuracy)``: one mean batch loss per epoch, and
        whatever ``test(model, test_loader)`` returns.
    """
    model.train()
    # Apply the requested rate to every parameter group; writing only to
    # param_groups[0] would leave any additional group at its old rate.
    for group in optimizer.param_groups:
        group['lr'] = learning_rate

    training_losses = []
    for _ in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Counting batches (rather than calling len()) also works for loaders
        # without a length; an epoch that saw no batch is recorded as NaN
        # instead of dividing by zero.
        training_losses.append(running_loss / num_batches if num_batches else float('nan'))

    accuracy = test(model, test_loader)
    return training_losses, accuracy

def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    accuracy = 100 * correct / total
    return accuracy

# Run every (optimizer, learning rate) combination and collect
# results[(name, lr)] = (per-epoch training losses, test accuracy).
#
# Optimizers are built through factories so each combination gets its own
# freshly constructed model. Reusing one model across optimizers would bias
# the comparison: weights_init only re-draws the weights, so later runs would
# start from biases already trained by the earlier ones.
#
# NOTE(review): "BatchGD" is configured exactly like "SGD" (momentum=0.0 and
# nesterov=False are the SGD defaults) and is fed the same train_loader, so it
# is not full-batch gradient descent - confirm what was intended.
optimizer_factories = {
    "SGD": lambda params, rate: optim.SGD(params, lr=rate),
    "BatchGD": lambda params, rate: optim.SGD(params, lr=rate, momentum=0.0, nesterov=False),
    "AdaGrad": lambda params, rate: optim.Adagrad(params, lr=rate),
    "Adam": lambda params, rate: optim.Adam(params, lr=rate),
}

results = {}
for lr in learning_rates:
    for name, make_optimizer in optimizer_factories.items():
        model = LeNet()
        model.apply(weights_init)
        optimizer = make_optimizer(model.parameters(), lr)
        training_losses, accuracy = train_and_test(model, optimizer, criterion, train_loader, test_loader, epochs=10, learning_rate=lr)
        print("Optimizer: {}, Learning Rate: {}, Accuracy: {}".format(name, lr, accuracy))
        results[(name, lr)] = (training_losses, accuracy)


# Plot the training losses for every optimizer / learning-rate combination.
# `results` is keyed by (optimizer name, learning rate), so iterate over its
# items; looking it up by learning rate alone raises KeyError. Iterating also
# copes with a sweep that stopped before every combination finished.
plt.figure(figsize=(10, 6))
for (name, lr), (losses, _) in results.items():
    # Derive the x axis from the recorded losses rather than assuming 10 epochs.
    plt.plot(range(1, len(losses) + 1), losses, label=f'{name}, LR={lr}')
plt.xlabel('Epochs')
plt.ylabel('Training Loss')
plt.legend()
plt.title('Training Loss for Different Learning Rates')
plt.show()

# Print accuracy results for every optimizer / learning-rate combination
for (name, lr), (_, accuracy) in results.items():
    print(f'Accuracy for {name}, LR={lr}: {accuracy:.2f}%')

Optimizer: SGD, Learning Rate: 0.001, Accuracy: 94.71


KeyboardInterrupt: 

In [None]:
# Interactive IPython help lookup: the trailing `?` displays the docstring of
# optim.Adagrad. Scratch cell only - it produces nothing the experiment uses.
optim.Adagrad?