In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
# Transformations to apply to the data
transform = transforms.Compose([
    transforms.ToTensor() # Convert image to PyTorch Tensor
])

# Download and load the training data
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2)

# Download and load the test data
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=1000, shuffle=False, num_workers=2)


100.0%
100.0%
100.0%
100.0%


In [3]:
class SimpleMLP(nn.Module):
    def __init__(self):
        super(SimpleMLP, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28*28, 128) # Input layer -> Hidden layer 1
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)   # Hidden layer 1 -> Hidden layer 2
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 10)    # Hidden layer 2 -> Output layer

    def forward(self, x):
        x = self.flatten(x)    # Flatten the image
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)        # Raw scores (logits)
        return x

# Instantiate the model and move it to the appropriate device (GPU or CPU)
model = SimpleMLP().to(device)
print(model)

SimpleMLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=10, bias=True)
)


In [4]:
# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001) # Learning rate = 0.001

In [5]:
num_epochs = 10 # Number of times to iterate over the training dataset
training_losses = [] # To store loss values for plotting

print("Starting Training...")
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # Get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if (i + 1) % 200 == 0: # Print every 200 mini-batches
            avg_loss = running_loss / 200
            print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{i + 1}/{len(trainloader)}], Loss: {avg_loss:.4f}')
            training_losses.append({"epoch": epoch + (i+1)/len(trainloader), "loss": avg_loss})
            running_loss = 0.0

print('Finished Training')

Starting Training...
Epoch [1/10], Batch [200/938], Loss: 0.7158
Epoch [1/10], Batch [400/938], Loss: 0.3005
Epoch [1/10], Batch [600/938], Loss: 0.2483
Epoch [1/10], Batch [800/938], Loss: 0.2110
Epoch [2/10], Batch [200/938], Loss: 0.1654
Epoch [2/10], Batch [400/938], Loss: 0.1458
Epoch [2/10], Batch [600/938], Loss: 0.1398
Epoch [2/10], Batch [800/938], Loss: 0.1257
Epoch [3/10], Batch [200/938], Loss: 0.1010
Epoch [3/10], Batch [400/938], Loss: 0.0997
Epoch [3/10], Batch [600/938], Loss: 0.1004
Epoch [3/10], Batch [800/938], Loss: 0.0924
Epoch [4/10], Batch [200/938], Loss: 0.0731
Epoch [4/10], Batch [400/938], Loss: 0.0722
Epoch [4/10], Batch [600/938], Loss: 0.0704
Epoch [4/10], Batch [800/938], Loss: 0.0763
Epoch [5/10], Batch [200/938], Loss: 0.0571
Epoch [5/10], Batch [400/938], Loss: 0.0571
Epoch [5/10], Batch [600/938], Loss: 0.0539
Epoch [5/10], Batch [800/938], Loss: 0.0612
Epoch [6/10], Batch [200/938], Loss: 0.0381
Epoch [6/10], Batch [400/938], Loss: 0.0457
Epoch [6/10

In [6]:
correct = 0
total = 0
# Since we're not training, we don't need to calculate gradients
with torch.no_grad():
    model.eval() # Set model to evaluation mode
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # Calculate outputs by running images through the network
        outputs = model(images)
        # The class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the network on the 10000 test images: {accuracy:.2f} %')


Accuracy of the network on the 10000 test images: 97.62 %
