# Baseline setup

## Load Data

## ResNet (or another model that might be a better base for hierarchical classification)

### One classification head first
Start with a single classification head that tries to predict all 3 (or 4?) classes at once.

### N classification heads

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, models, transforms
import os
import time
import copy

In [27]:
# 1. Configuration
# Seed PyTorch's global RNG once, up front, so that a fresh
# "Restart Kernel -> Run All" produces the same random draws each time.
SEED = 42
torch.manual_seed(SEED)

# The Stanford Cars dataset has 196 classes
NUM_CLASSES = 196
BATCH_SIZE = 32
NUM_EPOCHS = 15
LEARNING_RATE = 0.001

# Standard ImageNet normalization parameters for ResNet
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
IMAGE_SIZE = 224  # Standard input size for ResNet

# IMPORTANT: Set your dataset root path here!
DATA_ROOT = './data/'

# Device selection, in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available and device is set")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available and device is set")
else:
    device = torch.device("cpu")
    print("No device is available device set to cpu")

print(f"Using device: {device}")

MPS is available and device is set
Using device: mps


In [47]:
# --- 2. DATA PREPROCESSING ---

# Deterministic preprocessing only (no random augmentation): resize,
# center-crop to the ResNet input size, convert to tensor, ImageNet-normalize.
data_transforms = transforms.Compose([
    transforms.Resize(256),              # 1. Standardize image size
    transforms.CenterCrop(IMAGE_SIZE),   # 2. Crop to ResNet's input size (224x224)
    transforms.ToTensor(),               # 3. Convert to PyTorch Tensor
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD) # 4. ImageNet Normalization
])

try:
    # Official training split; part of it is held out for validation below.
    full_train_dataset = datasets.StanfordCars(
        root=DATA_ROOT, split="train", download=False, transform=data_transforms
    )
    # Official test split. The split must be requested explicitly: the default
    # is "train", which would make the "test" set a copy of the training data.
    test_dataset = datasets.StanfordCars(
        root=DATA_ROOT, split="test", download=False, transform=data_transforms
    )
except (FileNotFoundError, RuntimeError):
    # torchvision reports a missing dataset with RuntimeError, so catch both.
    # Re-raise after the hint: continuing would only defer the failure to a
    # confusing NameError in a later cell.
    print(f"\n[ERROR] Data not found. Please check your DATA_ROOT path: {DATA_ROOT}")
    print(
        f"Ensure '{os.path.join(DATA_ROOT, 'stanford_cars')}' contains 'cars_train', "
        "'cars_test', 'devkit' and 'cars_test_annos_withlabels.mat'."
    )
    raise

# Split the original training data into training and validation sets (80/20).
# A dedicated, seeded generator keeps the split identical across runs.
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Lookup tables keyed by phase name, consumed by the training loop.
dataloaders = {'train': train_loader, 'val': val_loader}
dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}

In [None]:
# --- 3. MODEL SETUP (ResNet-50 Feature Extractor) ---

def setup_resnet_baseline(num_classes):
    """Build a ResNet-50 feature extractor with a fresh linear classifier.

    The network is loaded with the default pretrained weights, every existing
    parameter is frozen, and the final fully connected layer is replaced by a
    new ``nn.Linear`` sized for ``num_classes``. Because the replacement layer
    is created after freezing, it is the only part left trainable.

    Args:
        num_classes: Number of output logits for the new classification head.

    Returns:
        The modified ResNet-50 model.
    """
    backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

    # Freeze everything that came with the pretrained network.
    backbone.requires_grad_(False)

    # Swap in a new head that keeps the original head's input width.
    head_in_features = backbone.fc.in_features
    backbone.fc = nn.Linear(head_in_features, num_classes)

    return backbone

# Instantiate the baseline network and place it on the selected device.
model_baseline = setup_resnet_baseline(num_classes=NUM_CLASSES)
if device == torch.device("mps"):
    # Cast to float32 first -- presumably because MPS lacks float64 support; TODO confirm.
    model_baseline = model_baseline.float()
model_baseline = model_baseline.to(device)

# Loss function for the classification task.
criterion = nn.CrossEntropyLoss()

# The optimizer is handed only the parameters of the replaced 'fc' head.
optimizer = optim.Adam(params=model_baseline.fc.parameters(), lr=LEARNING_RATE)

HELLO WORLD


In [57]:
# --- 4. TRAINING LOOP ---

def train_model(model, criterion, optimizer, num_epochs=NUM_EPOCHS):
    """Train ``model`` and return it loaded with its best-validation weights.

    Each epoch runs a training pass followed by a validation pass. Whenever
    validation accuracy improves, the state dict is snapshotted, and the best
    snapshot is restored before returning.

    Reads the notebook-level globals ``dataloaders`` and ``dataset_sizes``
    (both keyed by ``'train'`` / ``'val'``) and ``device``.

    Args:
        model: Network to train; it is modified in place.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of train + validation epochs to run.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch
        that achieved the highest validation accuracy.
    """
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    print("Starting Training (Fixed Feature Extractor Baseline)...")

    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 20)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # NOTE(review): training mode also affects any BatchNorm layers in
            # the model, whose running statistics update even when their
            # weights are frozen -- confirm this is intended for a "fixed"
            # feature extractor.
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0  # plain Python int, independent of device

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Track gradients only while training
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only in training phase
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # Statistics. loss.item() is scaled by batch size so the epoch
                # total can be divided by the dataset size below.
                running_loss += loss.item() * inputs.size(0)
                # .item() yields a Python int, so the accuracy is an ordinary
                # float on every backend (no float64 tensor needed).
                running_corrects += torch.sum(preds == labels).item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase:5} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Deep copy the model if it is the best validation accuracy
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best validation Acc: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [58]:

# Launch training of the baseline model with the configured epoch count.
final_baseline_model = train_model(
    model=model_baseline,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=NUM_EPOCHS,
)

Starting Training (Fixed Feature Extractor Baseline)...

Epoch 1/15
--------------------


train Loss: 4.7412 Acc: 0.0936
val   Loss: 4.1932 Acc: 0.1835

Epoch 2/15
--------------------
train Loss: 3.2999 Acc: 0.4645
val   Loss: 3.6619 Acc: 0.2462

Epoch 3/15
--------------------
train Loss: 2.4667 Acc: 0.6698
val   Loss: 3.3168 Acc: 0.3088

Epoch 4/15
--------------------
train Loss: 1.8938 Acc: 0.7896
val   Loss: 3.1212 Acc: 0.3364

Epoch 5/15
--------------------
train Loss: 1.4792 Acc: 0.8665
val   Loss: 2.9928 Acc: 0.3573

Epoch 6/15
--------------------
train Loss: 1.1594 Acc: 0.9154
val   Loss: 2.9102 Acc: 0.3671

Epoch 7/15
--------------------
train Loss: 0.9286 Acc: 0.9483
val   Loss: 2.7928 Acc: 0.3788

Epoch 8/15
--------------------
train Loss: 0.7469 Acc: 0.9664
val   Loss: 2.7555 Acc: 0.3843

Epoch 9/15
--------------------
train Loss: 0.6074 Acc: 0.9819
val   Loss: 2.6879 Acc: 0.4082

Epoch 10/15
--------------------
train Loss: 0.4925 Acc: 0.9889
val   Loss: 2.6401 Acc: 0.3984

Epoch 11/15
--------------------
train Loss: 0.4103 Acc: 0.9949
val   Loss: 2.625