In [None]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn as nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.optim.lr_scheduler import CyclicLR
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import KFold
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split

In [None]:
# Hyper-parameters 
input_size = 784 # 28x28
hidden_size = 500 
num_classes = 10
num_epochs = 12
batch_size = 96
learning_rate = 0.001
LR_DROP = 0.5
LR_EPOCHS_DROP = 10

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load MNIST dataset
dataset = datasets.MNIST(root='data/', train=True, download=True, transform=transforms.ToTensor())

train = datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test = datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor(), download=True)

# Split dataset into train, validation, and test sets
train_size = int(0.8 * len(train))
val_size = int(0.2 * len(train))

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders for each set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 101875035.21it/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 27694488.76it/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 26655045.29it/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 13444268.71it/s]


Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



In [None]:
def lr_schedule(epoch, initial_lr, drop, epochs_drop):
    learning_rate = initial_lr * (drop ** (epoch // epochs_drop))

In [None]:
# with a custom learning rate function

num_epochs = 9

class NeuralNetworkRelu(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkRelu, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size) 
    self.relu = nn.ReLU()
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.relu(out)
    out = self.l2(out)
    return out
    

modelRelu = NeuralNetworkRelu(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelRelu.parameters(), lr=learning_rate)

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelRelu(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelRelu.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()
        lr_schedule(epoch, learning_rate, LR_DROP, LR_EPOCHS_DROP)

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
                
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelRelu(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/9], Step [100/500], Loss: 0.2913
Epoch [1/9], Step [200/500], Loss: 0.2715
Epoch [1/9], Step [300/500], Loss: 0.1768
Epoch [1/9], Step [400/500], Loss: 0.1611
Epoch [1/9], Step [500/500], Loss: 0.1748
Epoch [1/9], Train Loss: 0.3212, Val Loss: 0.1771, Train Acc: 91.13%, Val Acc: 94.77%
Epoch [2/9], Step [100/500], Loss: 0.1025
Epoch [2/9], Step [200/500], Loss: 0.1953
Epoch [2/9], Step [300/500], Loss: 0.0960
Epoch [2/9], Step [400/500], Loss: 0.2054
Epoch [2/9], Step [500/500], Loss: 0.0780
Epoch [2/9], Train Loss: 0.1291, Val Loss: 0.1274, Train Acc: 96.28%, Val Acc: 96.31%
Epoch [3/9], Step [100/500], Loss: 0.0882
Epoch [3/9], Step [200/500], Loss: 0.0404
Epoch [3/9], Step [300/500], Loss: 0.0236
Epoch [3/9], Step [400/500], Loss: 0.1003
Epoch [3/9], Step [500/500], Loss: 0.1092
Epoch [3/9], Train Loss: 0.0853, Val Loss: 0.1007, Train Acc: 97.51%, Val Acc: 96.79%
Epoch [4/9], Step [100/500], Loss: 0.0246
Epoch [4/9], Step [200/500], Loss: 0.0723
Epoch [4/9], Step [300/500],

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelRelu(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 97.85000000000001 %


In [None]:
# With cosine annealing learning rate function

num_epochs = 9

class NeuralNetworkRelu(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkRelu, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size) 
    self.relu = nn.ReLU()
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.relu(out)
    out = self.l2(out)
    return out

modelRelu = NeuralNetworkRelu(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelRelu.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer,
                              T_max = 32, # Maximum number of iterations.
                             eta_min = 1e-4) # Minimum learning rate.

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelRelu(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelRelu.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelRelu(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/9], Step [100/500], Loss: 0.3348
Epoch [1/9], Step [200/500], Loss: 0.2457
Epoch [1/9], Step [300/500], Loss: 0.3087
Epoch [1/9], Step [400/500], Loss: 0.1656
Epoch [1/9], Step [500/500], Loss: 0.2191
Epoch [1/9], Train Loss: 0.3924, Val Loss: 0.2435, Train Acc: 89.68%, Val Acc: 93.10%
Epoch [2/9], Step [100/500], Loss: 0.3000
Epoch [2/9], Step [200/500], Loss: 0.1059
Epoch [2/9], Step [300/500], Loss: 0.1673
Epoch [2/9], Step [400/500], Loss: 0.2012
Epoch [2/9], Step [500/500], Loss: 0.2507
Epoch [2/9], Train Loss: 0.1782, Val Loss: 0.1662, Train Acc: 94.92%, Val Acc: 94.98%
Epoch [3/9], Step [100/500], Loss: 0.2204
Epoch [3/9], Step [200/500], Loss: 0.1674
Epoch [3/9], Step [300/500], Loss: 0.1229
Epoch [3/9], Step [400/500], Loss: 0.1656
Epoch [3/9], Step [500/500], Loss: 0.1196
Epoch [3/9], Train Loss: 0.1251, Val Loss: 0.1292, Train Acc: 96.37%, Val Acc: 96.05%
Epoch [4/9], Step [100/500], Loss: 0.1151
Epoch [4/9], Step [200/500], Loss: 0.0640
Epoch [4/9], Step [300/500],

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelRelu(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 97.61 %


In [None]:
# With CyclicLR learning rate function

num_epochs = 12
learning_rate = 0.1

class NeuralNetworkRelu(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkRelu, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size) 
    self.relu = nn.ReLU()
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.relu(out)
    out = self.l2(out)
    return out

modelRelu = NeuralNetworkRelu(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

#Stochastic gradient descend optimizer
optimizer = torch.optim.SGD(modelRelu.parameters(), lr=learning_rate, momentum=0.9)
scheduler = CyclicLR(optimizer, 
                     base_lr = learning_rate, # Initial learning rate which is the lower boundary in the cycle for each parameter group
                     max_lr = 1e-1, # Upper learning rate boundaries in the cycle for each parameter group
                     step_size_up = 4, # Number of training iterations in the increasing half of a cycle
                     mode = "triangular")

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelRelu(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelRelu.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelRelu(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/12], Step [100/500], Loss: 0.3540
Epoch [1/12], Step [200/500], Loss: 0.3389
Epoch [1/12], Step [300/500], Loss: 0.1977
Epoch [1/12], Step [400/500], Loss: 0.0890
Epoch [1/12], Step [500/500], Loss: 0.1945
Epoch [1/12], Train Loss: 0.2976, Val Loss: 0.1533, Train Acc: 91.29%, Val Acc: 95.49%
Epoch [2/12], Step [100/500], Loss: 0.1413
Epoch [2/12], Step [200/500], Loss: 0.1078
Epoch [2/12], Step [300/500], Loss: 0.0688
Epoch [2/12], Step [400/500], Loss: 0.0767
Epoch [2/12], Step [500/500], Loss: 0.1204
Epoch [2/12], Train Loss: 0.1119, Val Loss: 0.1140, Train Acc: 96.72%, Val Acc: 96.78%
Epoch [3/12], Step [100/500], Loss: 0.1300
Epoch [3/12], Step [200/500], Loss: 0.0213
Epoch [3/12], Step [300/500], Loss: 0.0340
Epoch [3/12], Step [400/500], Loss: 0.0629
Epoch [3/12], Step [500/500], Loss: 0.0508
Epoch [3/12], Train Loss: 0.0734, Val Loss: 0.0990, Train Acc: 97.86%, Val Acc: 96.97%
Epoch [4/12], Step [100/500], Loss: 0.0218
Epoch [4/12], Step [200/500], Loss: 0.1129
Epoch [4

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelRelu(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 98.19 %


In [None]:
num_epochs = 12
learning_rate = 0.001

class NeuralNetworkSigmoid(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkSigmoid, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size) 
    self.sigmoid = nn.Sigmoid()
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.sigmoid(out)
    out = self.l2(out)
    return out

modelSigmoid = NeuralNetworkSigmoid(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelSigmoid.parameters(), lr=learning_rate)

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelSigmoid(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelSigmoid.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelSigmoid(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/12], Step [100/500], Loss: 0.6597
Epoch [1/12], Step [200/500], Loss: 0.4014
Epoch [1/12], Step [300/500], Loss: 0.3262
Epoch [1/12], Step [400/500], Loss: 0.3082
Epoch [1/12], Step [500/500], Loss: 0.1448
Epoch [1/12], Train Loss: 0.5217, Val Loss: 0.2950, Train Acc: 86.46%, Val Acc: 91.41%
Epoch [2/12], Step [100/500], Loss: 0.2304
Epoch [2/12], Step [200/500], Loss: 0.2254
Epoch [2/12], Step [300/500], Loss: 0.2599
Epoch [2/12], Step [400/500], Loss: 0.1437
Epoch [2/12], Step [500/500], Loss: 0.1048
Epoch [2/12], Train Loss: 0.2420, Val Loss: 0.2404, Train Acc: 93.01%, Val Acc: 92.78%
Epoch [3/12], Step [100/500], Loss: 0.2670
Epoch [3/12], Step [200/500], Loss: 0.1831
Epoch [3/12], Step [300/500], Loss: 0.1892
Epoch [3/12], Step [400/500], Loss: 0.2039
Epoch [3/12], Step [500/500], Loss: 0.0958
Epoch [3/12], Train Loss: 0.1857, Val Loss: 0.1941, Train Acc: 94.68%, Val Acc: 94.28%
Epoch [4/12], Step [100/500], Loss: 0.0855
Epoch [4/12], Step [200/500], Loss: 0.1606
Epoch [4

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelSigmoid(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 97.68 %


In [None]:
class NeuralNetworkTanh(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkTanh, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size) 
    self.tanh = nn.Tanh()
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.tanh(out)
    out = self.l2(out)
    return out

modelTanh = NeuralNetworkTanh(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(modelTanh.parameters(), lr=learning_rate)

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelTanh(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelTanh.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelTanh(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/12], Step [100/500], Loss: 0.3936
Epoch [1/12], Step [200/500], Loss: 0.3190
Epoch [1/12], Step [300/500], Loss: 0.2793
Epoch [1/12], Step [400/500], Loss: 0.4455
Epoch [1/12], Step [500/500], Loss: 0.2655
Epoch [1/12], Train Loss: 0.3489, Val Loss: 0.2579, Train Acc: 90.02%, Val Acc: 92.26%
Epoch [2/12], Step [100/500], Loss: 0.0909
Epoch [2/12], Step [200/500], Loss: 0.1204
Epoch [2/12], Step [300/500], Loss: 0.1389
Epoch [2/12], Step [400/500], Loss: 0.0587
Epoch [2/12], Step [500/500], Loss: 0.3987
Epoch [2/12], Train Loss: 0.1844, Val Loss: 0.1704, Train Acc: 94.76%, Val Acc: 95.03%
Epoch [3/12], Step [100/500], Loss: 0.0539
Epoch [3/12], Step [200/500], Loss: 0.1732
Epoch [3/12], Step [300/500], Loss: 0.1107
Epoch [3/12], Step [400/500], Loss: 0.2103
Epoch [3/12], Step [500/500], Loss: 0.0464
Epoch [3/12], Train Loss: 0.1251, Val Loss: 0.1358, Train Acc: 96.33%, Val Acc: 95.81%
Epoch [4/12], Step [100/500], Loss: 0.0964
Epoch [4/12], Step [200/500], Loss: 0.0412
Epoch [4

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelTanh(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 97.77 %


In [None]:
class NeuralNetworkTanh(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(NeuralNetworkTanh, self).__init__()
    self.l1 = nn.Linear(input_size, hidden_size)
    #batch normalisation
    self.bn1 = nn.BatchNorm1d(hidden_size)
    self.tanh = nn.Tanh()
    self.bn2 = nn.BatchNorm1d(hidden_size)
    self.l2 = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.bn1(out)
    out = self.tanh(out)
    out = self.bn2(out)
    out = self.l2(out)
    return out

modelTanh = NeuralNetworkTanh(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

# weightdecay for L2 regularization
optimizer = torch.optim.Adam(modelTanh.parameters(), lr=learning_rate, weight_decay=1e-5)

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelTanh(images)
        loss = criterion(outputs, labels)
        # L1 regularization
        l1_reg = torch.tensor(0.)
        for name, param in modelTanh.named_parameters():
            if 'weight' in name:
                l1_reg += torch.norm(param, 1)
        loss += l1_reg * 0.0005
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelTanh.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelTanh(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/12], Step [100/500], Loss: 1.6814
Epoch [1/12], Step [200/500], Loss: 1.3085
Epoch [1/12], Step [300/500], Loss: 1.2635
Epoch [1/12], Step [400/500], Loss: 1.1058
Epoch [1/12], Step [500/500], Loss: 0.9718
Epoch [1/12], Train Loss: 1.4895, Val Loss: 0.3774, Train Acc: 88.63%, Val Acc: 88.24%
Epoch [2/12], Step [100/500], Loss: 0.9862
Epoch [2/12], Step [200/500], Loss: 1.3098
Epoch [2/12], Step [300/500], Loss: 0.9048
Epoch [2/12], Step [400/500], Loss: 0.8066
Epoch [2/12], Step [500/500], Loss: 0.7367
Epoch [2/12], Train Loss: 0.9518, Val Loss: 0.3177, Train Acc: 90.36%, Val Acc: 90.61%
Epoch [3/12], Step [100/500], Loss: 0.8100
Epoch [3/12], Step [200/500], Loss: 0.7652
Epoch [3/12], Step [300/500], Loss: 0.8474
Epoch [3/12], Step [400/500], Loss: 0.5802
Epoch [3/12], Step [500/500], Loss: 0.5454
Epoch [3/12], Train Loss: 0.7156, Val Loss: 0.2523, Train Acc: 92.30%, Val Acc: 92.83%
Epoch [4/12], Step [100/500], Loss: 0.5628
Epoch [4/12], Step [200/500], Loss: 0.4951
Epoch [4

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelTanh(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 96.33 %


In [None]:
# Define a neural network with ELU activation
class NeuralNetworkELU(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetworkELU, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.elu = nn.ELU()
        self.l2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.l1(x)
        out = self.elu(out)
        out = self.l2(out)
        return out

# Create an instance of the network
modelELU = NeuralNetworkELU(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(modelELU.parameters(), lr=learning_rate)

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for i, (images, labels) in enumerate(train_loader):  
        # origin shape: [100, 1, 28, 28]
        # resized: [100, 784]
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward pass and loss calculation
        outputs = modelELU(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        # clip gradients 
        torch.nn.utils.clip_grad_norm_(modelELU.parameters(), 5)
        optimizer.step()
        optimizer.zero_grad()

        # Update the training loss and accuracy
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation set
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = modelELU(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Compute the average training and validation loss and accuracy for the epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    if train_total > 0:
        train_acc = 100 * train_correct / train_total
    else:
        train_acc = 0

    val_acc = 100 * val_correct / val_total

    # Print the training and validation loss and accuracy for the epoch
    print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

    # Record the training and validation loss and accuracy for the epoch
    train_loss_history.append(train_loss)
    val_loss_history.append

Epoch [1/12], Step [100/500], Loss: 0.2169
Epoch [1/12], Step [200/500], Loss: 0.2572
Epoch [1/12], Step [300/500], Loss: 0.1297
Epoch [1/12], Step [400/500], Loss: 0.2457
Epoch [1/12], Step [500/500], Loss: 0.1730
Epoch [1/12], Train Loss: 0.3562, Val Loss: 0.2559, Train Acc: 89.83%, Val Acc: 92.72%
Epoch [2/12], Step [100/500], Loss: 0.2025
Epoch [2/12], Step [200/500], Loss: 0.1819
Epoch [2/12], Step [300/500], Loss: 0.1394
Epoch [2/12], Step [400/500], Loss: 0.1661
Epoch [2/12], Step [500/500], Loss: 0.1083
Epoch [2/12], Train Loss: 0.1891, Val Loss: 0.1709, Train Acc: 94.46%, Val Acc: 94.85%
Epoch [3/12], Step [100/500], Loss: 0.1123
Epoch [3/12], Step [200/500], Loss: 0.1219
Epoch [3/12], Step [300/500], Loss: 0.1191
Epoch [3/12], Step [400/500], Loss: 0.1054
Epoch [3/12], Step [500/500], Loss: 0.0845
Epoch [3/12], Train Loss: 0.1265, Val Loss: 0.1390, Train Acc: 96.29%, Val Acc: 95.81%
Epoch [4/12], Step [100/500], Loss: 0.1227
Epoch [4/12], Step [200/500], Loss: 0.0730
Epoch [4

In [None]:
# Test the model: we don't need to compute gradients
with torch.no_grad():
    n_correct = 0
    n_examples = len(test_loader.dataset)

    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        outputs = modelELU(images)

        # max returns (output_value ,index)
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == labels).sum().item()

    acc = n_correct / n_examples
    print(f'Accuracy of the network on the {n_examples} test images: {100*acc} %')

Accuracy of the network on the 10000 test images: 97.49 %


In [None]:
# param_grid = [{
#     'hidden_size': [64, 128, 256],
#     'learning_rate': [0.001, 0.01, 0.1],
#     'weight_decay': [1e-8, 1e-7, 1e-6],
#     'batch_size': [32, 64, 128]
# }]

# class TorchClassifier(BaseEstimator, ClassifierMixin):
#     def __init__(self, model, criterion, optimizer):
#         self.model = model
#         self.criterion = criterion
#         self.optimizer = optimizer

#     def fit(self, images, labels):
#         batch_size = len(images) 
#         for epoch in range(num_epochs):
#             for i in range(batch_size):
#                 start_idx = i * batch_size
#                 end_idx = start_idx + batch_size
#                 batch_images = images[start_idx:end_idx].reshape(-1, 28*28).to(device)
#                 batch_labels = labels[start_idx:end_idx].to(device)

#                 # Forward pass and loss calculation
#                 outputs = self.model(batch_images)
#                 loss = self.criterion(outputs, batch_labels)
#                 l1_reg = torch.tensor(0.)
#                 for name, param in self.model.named_parameters():
#                     if 'weight' in name:
#                         l1_reg += torch.norm(param, 1)
#                 loss += l1_reg * 0.00005

#                 # Backward and optimize
#                 loss.backward()
#                 # clip gradients 
#                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5)
#                 self.optimizer.step()
#                 self.optimizer.zero_grad()

#                 if (i+1) % 100 == 0:
#                     print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')


# # Define a neural network with ELU activation
# class NeuralNetworkELU(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super(NeuralNetworkELU, self).__init__()
#         self.l1 = nn.Linear(input_size, hidden_size)
#         self.bn1 = nn.BatchNorm1d(hidden_size)
#         self.elu = nn.ELU()
#         self.bn2 = nn.BatchNorm1d(hidden_size)
#         self.l2 = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         out = self.l1(x)
#         out = self.bn1(out)
#         out = self.elu(out)
#         out = self.bn2(out)
#         out = self.l2(out)
#         return out

# # Create an instance of the network
# modelELU = NeuralNetworkELU(input_size, hidden_size, num_classes).to(device)

# criterion = nn.CrossEntropyLoss()

# optimizer = torch.optim.Adam(modelELU.parameters(), lr=learning_rate, weight_decay=1e-8)

# classifier = TorchClassifier(modelELU, criterion, optimizer)

# grid_search = GridSearchCV(classifier, param_grid=param_grid, scoring='accuracy', cv=2)
# images, labels = next(iter(train_loader))
# grid_search.fit(images, labels)


# print("Best Hyperparameters: ", grid_search.best_params_)
# print("Best Score: ", grid_search.best_score_)



In [None]:
# Define a neural network with ELU activation
class NeuralNetworkELU(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetworkELU, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        #Batch normalisation
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.elu = nn.ELU()
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.l2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.l1(x)
        out = self.bn1(out)
        out = self.elu(out)
        out = self.bn2(out)
        out = self.l2(out)
        return out

#K folds
k = 5
kf = KFold(n_splits=k, shuffle=True)
eval_scores = []

# Create an instance of the network
modelELU = NeuralNetworkELU(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

#L2 regularisation
optimizer = torch.optim.Adam(modelELU.parameters(), lr=learning_rate, weight_decay=1e-5)

#Cosine annealing scheduler
scheduler = CosineAnnealingLR(optimizer,
                              T_max = 32, # Maximum number of iterations.
                             eta_min = 1e-4) # Minimum learning rate.

# Track the training and validation loss and accuracy for each epoch
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

for fold, (train_idx, test_idx) in enumerate(kf.split(train)):
  train_loader_fold = torch.utils.data.DataLoader(train, batch_size=batch_size, sampler=torch.utils.data.SubsetRandomSampler(train_idx))
  test_loader_fold = torch.utils.data.DataLoader(train, batch_size=batch_size, sampler=torch.utils.data.SubsetRandomSampler(test_idx))
  
  n_total_steps = len(train_loader)
  for epoch in range(num_epochs):
      train_loss = 0.0
      train_correct = 0
      train_total = 0
      for i, (images, labels) in enumerate(train_loader_fold):  
          # origin shape: [100, 1, 28, 28]
          # resized: [100, 784]
          images = images.reshape(-1, 28*28).to(device)
          labels = labels.to(device)
          
          # Forward pass and loss calculation
          outputs = modelELU(images)
          loss = criterion(outputs, labels)
          #L1 regularisation
          l1_reg = torch.tensor(0.)
          for name, param in modelELU.named_parameters():
              if 'weight' in name:
                  l1_reg += torch.norm(param, 1)
          loss += l1_reg * 0.0005
          
          # Backward and optimize
          loss.backward()
          # clip gradients 
          torch.nn.utils.clip_grad_norm_(modelELU.parameters(), 5)
          optimizer.step()
          optimizer.zero_grad()
          scheduler.step()
          
          # Update the training loss and accuracy
          train_loss += loss.item() * images.size(0)
          _, predicted = torch.max(outputs.data, 1)
          train_correct += (predicted == labels).sum().item()
          train_total += labels.size(0)

          # Update the training loss and accuracy
          train_loss += loss.item() * images.size(0)
          _, predicted = torch.max(outputs.data, 1)
          train_correct += (predicted == labels).sum().item()
          train_total += labels.size(0)
          
          if (i+1) % 100 == 0:
             print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

      # Evaluate the model on the validation set
      val_loss = 0.0
      val_correct = 0
      val_total = 0
      with torch.no_grad():
          for images, labels in val_loader:
              images = images.reshape(-1, 28*28).to(device)
              labels = labels.to(device)

              outputs = modelELU(images)
              loss = criterion(outputs, labels)

              val_loss += loss.item() * images.size(0)
              _, predicted = torch.max(outputs.data, 1)
              val_correct += (predicted == labels).sum().item()
              val_total += labels.size(0)

      # Compute the average training and validation loss and accuracy for the epoch
      train_loss /= len(train_loader.dataset)
      val_loss /= len(val_loader.dataset)
      
      if train_total > 0:
          train_acc = 100 * train_correct / train_total
      else:
          train_acc = 0

      val_acc = 100 * val_correct / val_total

      # Print the training and validation loss and accuracy for the epoch
      print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

      # Record the training and validation loss and accuracy for the epoch
      train_loss_history.append(train_loss)
      val_loss_history.append

  correct = 0
  total = 0
  with torch.no_grad():
      for images, labels in test_loader_fold:
          images = images.reshape(-1, 28*28).to(device)
          labels = labels.to(device)
          outputs = modelELU(images)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()
  accuracy = 100 * correct / total
  eval_scores.append(accuracy)

for fold, score in enumerate(eval_scores):
    print(f'Fold {fold+1} score: {score:.2f}%')
print(f'Average score: {sum(eval_scores)/len(eval_scores):.2f}%')

Epoch [1/12], Step [100/500], Loss: 1.8940
Epoch [1/12], Step [200/500], Loss: 1.6033
Epoch [1/12], Step [300/500], Loss: 1.1439
Epoch [1/12], Step [400/500], Loss: 1.1662
Epoch [1/12], Step [500/500], Loss: 1.1000
Epoch [1/12], Train Loss: 3.3135, Val Loss: 0.2945, Train Acc: 89.40%, Val Acc: 91.37%
Epoch [2/12], Step [100/500], Loss: 1.0802
Epoch [2/12], Step [200/500], Loss: 1.0954
Epoch [2/12], Step [300/500], Loss: 1.1155
Epoch [2/12], Step [400/500], Loss: 1.0262
Epoch [2/12], Step [500/500], Loss: 0.7875
Epoch [2/12], Train Loss: 2.0507, Val Loss: 0.2611, Train Acc: 91.03%, Val Acc: 92.66%
Epoch [3/12], Step [100/500], Loss: 0.9673
Epoch [3/12], Step [200/500], Loss: 0.8164
Epoch [3/12], Step [300/500], Loss: 0.9836
Epoch [3/12], Step [400/500], Loss: 0.8596
Epoch [3/12], Step [500/500], Loss: 0.9090
Epoch [3/12], Train Loss: 1.7833, Val Loss: 0.2609, Train Acc: 91.87%, Val Acc: 92.52%
Epoch [4/12], Step [100/500], Loss: 0.8808
Epoch [4/12], Step [200/500], Loss: 0.8861
Epoch [4