In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
from torch.optim import Adam
import wandb


# Preprocessing applied to every image: resize to 224, convert to a tensor,
# then normalize each of the three channels with the constants below.
NORM_MEAN = [0.4914, 0.4822, 0.4465]
NORM_STD = [0.2023, 0.1994, 0.2010]

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

# CIFAR10 train / test splits share everything except the `train` flag.
dataset_kwargs = dict(root='./data', download=True, transform=transform)
trainset = torchvision.datasets.CIFAR10(train=True, **dataset_kwargs)
testset = torchvision.datasets.CIFAR10(train=False, **dataset_kwargs)

# Both loaders use the same batch size and worker count; only the training
# loader shuffles. The test split is what the rest of the notebook validates on.
loader_kwargs = dict(batch_size=128, num_workers=2)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_kwargs)
valloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_kwargs)




Files already downloaded and verified
Files already downloaded and verified


In [3]:
def log_metrics(epoch, train_loss, val_loss, val_top1_error, val_top5_error):
    """Send one epoch's training and validation metrics to wandb.

    Args:
        epoch: Zero-based epoch index; it is logged as ``epoch + 1``.
        train_loss: Training loss for the epoch.
        val_loss: Validation loss for the epoch.
        val_top1_error: Validation top-1 error.
        val_top5_error: Validation top-5 error.
    """
    metric_names = ("train_loss", "val_loss", "val_top1_error", "val_top5_error")
    metric_values = (train_loss, val_loss, val_top1_error, val_top5_error)

    # Epoch goes first so the logged dict keeps the same key order as before.
    payload = {"epoch": epoch + 1}
    payload.update(zip(metric_names, metric_values))
    wandb.log(payload)

def validate_model(model, valloader, criterion, device):
    """Evaluate ``model`` on ``valloader`` and report loss and top-k error rates.

    The model is switched to eval mode and left in eval mode on return;
    gradients are disabled for the duration of the pass.

    Args:
        model: Module called as ``model(images)``; its output is ranked along
            dimension 1 to obtain class predictions.
        valloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(val_loss, top1_error, top5_error)`` where ``val_loss`` is the
        mean of the per-batch criterion values and the two errors are
        percentages in ``[0, 100]``. When the model emits fewer than five
        classes, the "top-5" error is computed over all available classes.

    Raises:
        ValueError: If ``valloader`` yields no samples.
    """
    model.eval()
    correct_top1 = 0
    correct_top5 = 0
    total = 0
    num_batches = 0
    val_loss = 0.0

    with torch.no_grad():
        for images, labels in valloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)

            val_loss += criterion(outputs, labels).item()
            num_batches += 1

            # topk raises if k exceeds the class dimension, so clamp k for
            # models with fewer than five outputs.
            k = min(5, outputs.size(1))
            _, predicted = outputs.topk(k, 1, largest=True, sorted=True)
            total += labels.size(0)
            # Column 0 holds the highest-scoring class (sorted=True).
            correct_top1 += (predicted[:, 0] == labels).sum().item()
            # A label can match at most one of the k distinct predicted indices.
            correct_top5 += labels.unsqueeze(1).eq(predicted).sum().item()

    if total == 0:
        raise ValueError("valloader yielded no samples; cannot compute validation metrics")

    # Count batches ourselves rather than trusting len(valloader), which is
    # unavailable or approximate for some iterables.
    val_loss /= num_batches
    top1_error = 100. * (1 - correct_top1 / total)
    top5_error = 100. * (1 - correct_top5 / total)

    return val_loss, top1_error, top5_error

def train_model(model, trainloader, valloader, criterion, optimizer, device, num_epochs=10, patience=5):
    """Train ``model`` with per-epoch validation, wandb logging and early stopping.

    Each epoch runs one pass over ``trainloader``, evaluates with
    ``validate_model``, logs the results through ``log_metrics`` and prints a
    summary line. Training stops early once the validation loss has failed to
    improve on its best value for ``patience`` consecutive epochs. The best
    weights are not restored: the model keeps the parameters of the last epoch
    that ran. ``wandb.finish()`` is called when training ends normally.

    Args:
        model: Module to train; moved to ``device`` in place.
        trainloader: Iterable yielding ``(images, labels)`` training batches.
        valloader: Iterable yielding ``(images, labels)`` validation batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor that supports ``backward()``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device (string or ``torch.device``) to train on.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping early.

    Raises:
        ValueError: If ``trainloader`` yields no batches.
    """
    model.to(device)

    # One-off run banner. Report the device that was actually requested rather
    # than whichever GPU happens to be visible to the process.
    num_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {num_params}")
    target = torch.device(device)
    if target.type == 'cuda':
        device_label = torch.cuda.get_device_name(target)
    elif target.type == 'cpu':
        device_label = 'CPU'
    else:
        device_label = str(target)
    print(f"Training model on {device_label}")

    best_val_loss = float('inf')
    counter = 0  # consecutive epochs without a new best validation loss

    for epoch in range(num_epochs):
        model.train()  # validate_model leaves the model in eval mode
        running_loss = 0.0
        num_batches = 0

        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("trainloader yielded no batches; nothing to train on")

        # Mean of the per-batch losses for this epoch.
        epoch_loss = running_loss / num_batches

        # Validation
        val_loss, top1_error, top5_error = validate_model(model, valloader, criterion, device)

        # Log metrics
        log_metrics(epoch, epoch_loss, val_loss, top1_error, top5_error)

        # Print epoch results
        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Train Loss: {epoch_loss:.4f}, '
              f'Val Loss: {val_loss:.4f}, '
              f'Val Top-1 Error: {top1_error:.2f}%, '
              f'Val Top-5 Error: {top5_error:.2f}%')

        # Early stopping: only a strict improvement resets the counter.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f'Early stopping after {epoch+1} epochs')
                break

    print('Finished Training')
    wandb.finish()  # Disconnect from wandb at the end of training


In [4]:
from krizhevsky2012imagenet.alexnet import AlexNet

# Single source of truth for this run's hyperparameters. The same values are
# logged to wandb and passed to the optimizer and training loop below, so the
# recorded config cannot drift from what was actually run. The batch size is
# read from the loader instead of being copied by hand.
run_config = {
    "learning_rate": 0.001,
    "epochs": 50,
    "batch_size": trainloader.batch_size,
    "model": "AlexNet",
    "seed": 0,
}

wandb.init(project="cifar10-alexnet", name="alexnet-baseline")

# Log hyperparameters
wandb.config.update(run_config)

# Seed torch's RNG before building the model so weight initialization starts
# from a known state every time this cell is run.
torch.manual_seed(run_config["seed"])

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AlexNet(out_dim=10)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=run_config["learning_rate"])

train_model(model, trainloader, valloader, criterion, optimizer, device, num_epochs=run_config["epochs"])

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdieplstks[0m. Use [1m`wandb login --relogin`[0m to force relogin


Number of parameters: 46787978
Training model on NVIDIA GeForce RTX 4090
Epoch [1/50], Train Loss: 1.8248, Val Loss: 1.4903, Val Top-1 Error: 54.90%, Val Top-5 Error: 8.24%
Epoch [2/50], Train Loss: 1.4263, Val Loss: 1.3505, Val Top-1 Error: 49.52%, Val Top-5 Error: 6.43%
Epoch [3/50], Train Loss: 1.2680, Val Loss: 1.2491, Val Top-1 Error: 44.51%, Val Top-5 Error: 5.79%
Epoch [4/50], Train Loss: 1.1521, Val Loss: 1.1281, Val Top-1 Error: 40.04%, Val Top-5 Error: 4.42%
Epoch [5/50], Train Loss: 1.0693, Val Loss: 1.0958, Val Top-1 Error: 38.35%, Val Top-5 Error: 4.09%
Epoch [6/50], Train Loss: 1.0020, Val Loss: 1.0396, Val Top-1 Error: 36.21%, Val Top-5 Error: 3.92%
Epoch [7/50], Train Loss: 0.9354, Val Loss: 1.0069, Val Top-1 Error: 34.98%, Val Top-5 Error: 3.45%
Epoch [8/50], Train Loss: 0.8777, Val Loss: 1.0174, Val Top-1 Error: 35.15%, Val Top-5 Error: 3.63%
Epoch [9/50], Train Loss: 0.8202, Val Loss: 1.0063, Val Top-1 Error: 34.24%, Val Top-5 Error: 3.76%
Epoch [10/50], Train Loss: 

0,1
epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train_loss,█▆▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁
val_loss,█▆▅▃▃▂▂▂▂▂▁▁▁▂▂▂▂
val_top1_error,█▆▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁
val_top5_error,█▆▅▃▂▂▂▂▂▂▁▁▁▁▁▂▁

0,1
epoch,17.0
train_loss,0.47404
val_loss,1.06381
val_top1_error,32.88
val_top5_error,3.35


In [5]:
from hu2019squeezeexcitation.alexnet_se import AlexNetSE

# Single source of truth for this run's hyperparameters. The same values are
# logged to wandb and passed to the optimizer and training loop below, so the
# recorded config cannot drift from what was actually run. The batch size is
# read from the loader instead of being copied by hand.
run_config = {
    "learning_rate": 0.001,
    "epochs": 50,
    "batch_size": trainloader.batch_size,
    "model": "AlexNetSE",
    "seed": 0,
}

wandb.init(project="cifar10-alexnet", name="alexnet-se")

# Log hyperparameters
wandb.config.update(run_config)

# Seed torch's RNG before building the model so weight initialization starts
# from a known state every time this cell is run.
torch.manual_seed(run_config["seed"])

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AlexNetSE(out_dim=10)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=run_config["learning_rate"])

train_model(model, trainloader, valloader, criterion, optimizer, device, num_epochs=run_config["epochs"])

Number of parameters: 46842378
Training model on NVIDIA GeForce RTX 4090
Epoch [1/50], Train Loss: 1.8100, Val Loss: 1.4146, Val Top-1 Error: 52.54%, Val Top-5 Error: 6.79%
Epoch [2/50], Train Loss: 1.3065, Val Loss: 1.1621, Val Top-1 Error: 41.36%, Val Top-5 Error: 4.98%
Epoch [3/50], Train Loss: 1.0663, Val Loss: 0.9810, Val Top-1 Error: 34.77%, Val Top-5 Error: 3.24%
Epoch [4/50], Train Loss: 0.9201, Val Loss: 0.8732, Val Top-1 Error: 30.50%, Val Top-5 Error: 2.55%
Epoch [5/50], Train Loss: 0.8008, Val Loss: 0.8224, Val Top-1 Error: 28.88%, Val Top-5 Error: 2.37%
Epoch [6/50], Train Loss: 0.7014, Val Loss: 0.7907, Val Top-1 Error: 27.02%, Val Top-5 Error: 2.41%
Epoch [7/50], Train Loss: 0.6249, Val Loss: 0.7570, Val Top-1 Error: 26.08%, Val Top-5 Error: 2.00%
Epoch [8/50], Train Loss: 0.5589, Val Loss: 0.7643, Val Top-1 Error: 25.25%, Val Top-5 Error: 1.94%
Epoch [9/50], Train Loss: 0.4868, Val Loss: 0.7595, Val Top-1 Error: 25.33%, Val Top-5 Error: 2.11%
Epoch [10/50], Train Loss: 

0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█
train_loss,█▆▄▄▃▃▂▂▂▁▁▁
val_loss,█▅▃▂▂▁▁▁▁▁▁▂
val_top1_error,█▅▄▂▂▂▁▁▁▁▁▁
val_top5_error,█▅▃▂▂▂▁▁▁▁▁▁

0,1
epoch,12.0
train_loss,0.34203
val_loss,0.84849
val_top1_error,24.99
val_top5_error,2.2
