# Baseline setup

## Load Data

## Resnet or some other model that might be a better base for hierarchical classification

### One classification head first
Start with one classification head that tries to predict all 3(4?) classes at once

### N classification heads

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import time
import copy
import numpy as np



In [13]:
# Loadin saved model
model_path = "models/your_model_name_model.pth"  # replace with actual name

# Load the full model
model = torch.load(model_path)

# Set model to evaluation mode (important for inference)
model.eval()

FileNotFoundError: [Errno 2] No such file or directory: 'models/your_model_name_model.pth'

In [15]:
# Configuration
# The Stanford Cars dataset has 196 classes
NUM_CLASSES = 196
BATCH_SIZE = 32
NUM_EPOCHS = 2
LEARNING_RATE = 0.001

# Standard ImageNet normalization parameters for ResNet
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
IMAGE_SIZE = 224 # Standard input size for ResNet

# IMPORTANT: Set your dataset root path here!
DATA_ROOT = './data'

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available and device is set")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available and device is set")
else:
    device = torch.device("cpu")
    print("No device is available device set to cpu")

print(f"Using device: {device}")

No device is available device set to cpu
Using device: cpu


In [16]:
# --- DATA PREPROCESSING ---

# Only resizing and ImageNet normalization, no random operations.
data_transforms = transforms.Compose([
    transforms.Resize(256),              # 1. Standardize image size
    transforms.CenterCrop(IMAGE_SIZE),   # 2. Crop to ResNet's input size (224x224)
    transforms.ToTensor(),               # 3. Convert to PyTorch Tensor
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD) # 4. ImageNet Normalization
])

try:
    # Load the full training dataset
    full_train_dataset = datasets.StanfordCars(root=DATA_ROOT, download=False, split="train", transform=data_transforms)
    test_dataset = datasets.StanfordCars(root=DATA_ROOT, download=False, split="test", transform=data_transforms)
    #full_train_dataset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'cars_train'), data_transforms)
    #test_dataset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'cars_test'), data_transforms)
    
    # Split the original training data into training and validation sets (e.g., 80/20)
    train_size = int(0.8 * len(full_train_dataset))
    val_size = len(full_train_dataset) - train_size
    train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    dataloaders = {'train': train_loader, 'val': val_loader}
    dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}

    class_names = full_train_dataset.classes
    idx_to_class = {i: class_name for i, class_name in enumerate(class_names)}

except FileNotFoundError as e:
    print(f"\n[ERROR] Data not found. Please check your DATA_ROOT path: {DATA_ROOT}")
    print("Ensure you have 'train' and 'test' subdirectories with images inside.")

In [17]:
# Extract the make (coarse class) from full class name
def get_make(class_name):
    return class_name.split()[0]

In [18]:
# --- MODEL SETUP (ResNet-50 Feature Extractor) ---

def setup_resnet_baseline(num_classes):
    # 1. Load Pretrained ResNet-50
    # Use the default ImageNet weights
    model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

    # 2. Freeze all convolutional layers (Feature Extraction)
    # This is the "Transfer Learning" step. Only the final classification head will learn.
    for param in model.parameters():
        param.requires_grad = False

    # 3. Replace the final Fully Connected layer (the "Head")
    # Get the input feature size of the original FC layer (2048 for ResNet-50)
    num_ftrs = model.fc.in_features

    # Map the features to the new number of classes (196)
    model.fc = nn.Linear(num_ftrs, num_classes)
    
    # NOTE: Only model.fc's parameters are now set to requires_grad=True
    return model

model_baseline = setup_resnet_baseline(NUM_CLASSES)
if device == torch.device("mps"):
    model_baseline.float()
model_baseline.to(device)

# Define Loss function and Optimizer
criterion = nn.CrossEntropyLoss()

# Only pass the parameters of the un-frozen layers (the new 'fc' layer)
optimizer = optim.Adam(model_baseline.fc.parameters(), lr=LEARNING_RATE)

In [19]:
from experiment_logging import ExperimentLogger

logger = ExperimentLogger(
    log_file="experiment_log.md",
    active=True,         # set False to disable logging
    show_console=True    # preview logs in notebook
)

# Base info for this experiment
logger.set(
    name="Name of experiment",
    changes="Changes",
    reason="Improve generalization and convergence speed"
)

In [20]:
def accuracy_topk(outputs, targets, topk=(1,)):
    """Compute the top-k accuracies."""
    maxk = max(topk)
    batch_size = targets.size(0)

    _, pred = outputs.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(targets.view(1, -1).expand_as(pred))

    results = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        acc = correct_k * (100.0 / batch_size)
        results.append(acc.item())
    return results

In [24]:
def train_model(model, criterion, optimizer, num_epochs=NUM_EPOCHS):

    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc= 0.0 # best top-1 accuracy

    # Store epoch metrics
    train_losses, val_losses = [], []
    train_top1s, val_top1s = [], []
    train_top5s, val_top5s = [], []

    print("Starting Training...")

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print("-" * 20)

        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_top1 = 0.0
            running_top5 = 0.0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # ---- TOP-1 / TOP-5 ----
                    top1, top5 = accuracy_topk(outputs, labels, topk=(1, 5))

                    # Backprop only in training
                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_top1 += top1 * batch_size
                running_top5 += top5 * batch_size

            # Compute epoch metrics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_top1 = running_top1 / dataset_sizes[phase]
            epoch_top5 = running_top5 / dataset_sizes[phase]

            if device.type == "mps":
                epoch_top1 = epoch_top1.float()
                epoch_top5 = epoch_top5.float()
            
            epoch_top1_val = epoch_top1.item() if isinstance(epoch_top1, torch.Tensor) else float(epoch_top1)
            epoch_top5_val = epoch_top5.item() if isinstance(epoch_top5, torch.Tensor) else float(epoch_top5)

            print(f"{phase:5} Loss: {epoch_loss:.4f}  Top-1: {epoch_top1_val:.2f}%  Top-5: {epoch_top5_val:.2f}%")

            # Store metrics for plotting/logging
            if phase == "train":
                train_losses.append(epoch_loss)
                train_top1s.append(epoch_top1_val)
                train_top5s.append(epoch_top5_val)
            else:
                val_losses.append(epoch_loss)
                val_top1s.append(epoch_top1_val)
                val_top5s.append(epoch_top5_val)
            
                # Log to experiment logger
                logger.record_metrics(
                    epoch=epoch + 1,
                    train_loss=train_losses[-1],
                    train_top1=train_top1s[-1],
                    train_top5=train_top5s[-1],
                    val_loss=val_losses[-1],
                    val_top1=val_top1s[-1],
                    val_top5=val_top5s[-1]
                )

                # Update best model
                if epoch_top1_val > best_acc:
                    best_acc = epoch_top1_val
                    best_model_wts = copy.deepcopy(model.state_dict())

    # ------------------------------------------------------
    # END OF TRAINING LOOP
    # ------------------------------------------------------

    time_elapsed = time.time() - since
    print(f"\nTraining complete in {time_elapsed//60:.0f}m {time_elapsed%60:.0f}s")
    print(f"Best validation Top-1 Acc: {best_acc:.2f}%")


    model.load_state_dict(best_model_wts)

    # ------------------------------------------------------
    # Loss curve plot
    # ------------------------------------------------------
    fig, ax = plt.subplots()
    ax.plot(train_losses, label="Train Loss")
    ax.plot(val_losses, label="Val Loss")
    ax.set_title("Loss Curve")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.legend()
    logger.add_plot(fig, name="loss_curve")

    # ------------------------------------------------------
    # Accuracy curve plot
    # ------------------------------------------------------
    fig2, ax2 = plt.subplots()
    ax2.plot(train_top1s, label="Train Top-1")
    ax2.plot(val_top1s, label="Val Top-1")
    ax2.plot(train_top5s, label="Train Top-5")
    ax2.plot(val_top5s, label="Val Top-5")
    ax2.set_title("Accuracy Curve")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Accuracy (%)")
    ax2.legend()
    logger.add_plot(fig2, name="accuracy_curve")

    # ------------------------------------------------------
    # Confusion matrix
    # ------------------------------------------------------
    all_preds = []
    val_labels = [label for _, label in val_dataset]

    model.eval()
    with torch.no_grad():
        for inputs, _ in dataloaders["val"]:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
    
    # ------------------------------------------------------
    # Compute hierarchical metrics
    # ------------------------------------------------------
    idx_to_class = {i: name for i, name in enumerate(full_train_dataset.classes)}
    pred_class_names = [idx_to_class[p] for p in all_preds]
    true_class_names = [idx_to_class[t] for t in val_labels]

    pred_makes = [get_make(c) for c in pred_class_names]
    true_makes = [get_make(c) for c in true_class_names]

    # Hierarchical consistency
    hier_consistency = np.mean([p == t for p, t in zip(pred_makes, true_makes)])
    print(f"Hierarchical Consistency: {hier_consistency:.4f}")

    # Per-make accuracy
    from collections import defaultdict
    make_correct = defaultdict(int)
    make_total = defaultdict(int)
    for p_make, t_make in zip(pred_makes, true_makes):
        make_total[t_make] += 1
        if p_make == t_make:
            make_correct[t_make] += 1
    per_make_acc = {make: make_correct[make] / make_total[make] for make in make_total}
    print("Per-make Accuracy:", per_make_acc)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(val_labels, all_preds)
    logger.add_confusion_matrix(cm, class_names=full_train_dataset.classes)
    logger.add_scalar("hierarchical_consistency", hier_consistency)
    for make, acc in per_make_acc.items():
        logger.add_scalar(f"accuracy_{make}", acc)


    # ------------------------------------------------------
    # Save sample validation image
    # ------------------------------------------------------
    sample_img, _ = next(iter(dataloaders["val"]))
    logger.add_image(sample_img[0], name="sample_val_image")

    # ------------------------------------------------------
    # Write final log summary
    # ------------------------------------------------------
    logger.results = f"Best Top-1 Accuracy = {best_acc:.2f}%"
    logger.results += f", Hierarchical Consistency = {hier_consistency:.4f}"
    logger.notes = "Training finished. Logged curves, confusion matrix, sample image."
    logger.commit()

    # ------------------------------------------------------
    # Save trained model
    # ------------------------------------------------------
    model_path = f"models/{logger.name}_model.pth"
    os.makedirs("models", exist_ok=True)
    torch.save(model, model_path)
    print(f"Model saved to {model_path}")

    return model


In [25]:
def check_setup():
    print("\n--- Running Setup Health Check ---")
    
    # 1. Device check (already done, but reconfirm)
    print(f"Model is on: {next(model_baseline.parameters()).device}")
    
    # 2. Data Loader Check (load a single batch)
    try:
        inputs, labels = next(iter(train_loader))
        
        # 3. Check for float32 enforcement and device transfer
        inputs = inputs.to(device, dtype=torch.float32)
        labels = labels.to(device)
        
        # Check input tensor properties
        print(f"Input batch shape: {inputs.shape}")
        print(f"Input tensor dtype: {inputs.dtype}")
        print(f"Input tensor device: {inputs.device}")
        
    except Exception as e:
        print(f"[CRITICAL ERROR] Failed to load or transfer batch. Fix this before training.")
        print(f"Error details: {e}")
        return

    # 4. Model Forward Pass Check
    try:
        model_baseline.eval() # Set to eval mode for the check
        with torch.no_grad():
            outputs = model_baseline(inputs)
            
        print(f"Output shape (Batch x Classes): {outputs.shape}")
        print("Setup check PASSED! Ready for full training.")
        
    except Exception as e:
        print(f"[CRITICAL ERROR] Failed during forward pass.")
        print(f"Error details: {e}")

# check_setup() # Run this once before calling train_model

In [26]:

# To run the training:
check_setup()
final_baseline_model = train_model(model_baseline, criterion, optimizer, num_epochs=NUM_EPOCHS)


--- Running Setup Health Check ---
Model is on: cpu
Input batch shape: torch.Size([32, 3, 224, 224])
Input tensor dtype: torch.float32
Input tensor device: cpu
Output shape (Batch x Classes): torch.Size([32, 196])
Setup check PASSED! Ready for full training.
Starting Training...

Epoch 1/2
--------------------
train Loss: 3.3020  Top-1: 45.96%  Top-5: 75.75%
val   Loss: 3.6568  Top-1: 26.40%  Top-5: 52.85%

Epoch 2/2
--------------------
train Loss: 2.4792  Top-1: 67.20%  Top-5: 88.76%
val   Loss: 3.3185  Top-1: 33.76%  Top-5: 58.01%

Training complete in 26m 54s
Best validation Top-1 Acc: 33.76%
[Plot Saved] plots/Name of experiment_loss_curve.png
[Plot Saved] plots/Name of experiment_accuracy_curve.png
Hierarchical Consistency: 0.4481
Per-make Accuracy: {'Chrysler': 0.3333333333333333, 'Ford': 0.32653061224489793, 'Hyundai': 0.4819277108433735, 'GMC': 0.5853658536585366, 'Toyota': 0.40625, 'Chevrolet': 0.5691489361702128, 'smart': 0.625, 'Suzuki': 0.34146341463414637, 'Bentley': 0.4

In [None]:
def evaluate_model(model, loader, dataset_size, device):
    print("\n--- Starting Test Set Evaluation ---")
    
    model.eval() # Set model to evaluation mode
    running_corrects = 0
    
    # Top-5 accuracy tracking
    top5_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in loader:
            
            # Ensure consistent device and dtype (float32)
            inputs = inputs.to(device, dtype=torch.float32)
            labels = labels.to(device)
            
            outputs = model(inputs)
            
            # --- Top-1 Accuracy ---
            _, preds = torch.max(outputs, 1)
            running_corrects += torch.sum(preds == labels.data)
            
            # --- Top-5 Accuracy (Good for Fine-Grained Tasks) ---
            # Get the top 5 predicted classes
            _, top5_preds = torch.topk(outputs, 5, dim=1)
            
            # Check if the true label is in the top 5 predictions
            labels_reshaped = labels.view(-1, 1) # Shape [Batch, 1]
            top5_corrects += torch.sum(top5_preds.eq(labels_reshaped)).item()
            total_samples += inputs.size(0)

    # Calculate final metrics
    if device != torch.device("mps"):
        top1_acc = running_corrects.double() / dataset_size
    else:
        top1_acc = running_corrects.float() / dataset_size
    top5_acc = top5_corrects / total_samples
    
    print(f'Test Set Size: {dataset_size}')
    print(f'Final Baseline Top-1 Accuracy: {top1_acc.item():.4f}')
    print(f'Final Baseline Top-5 Accuracy: {top5_acc:.4f}')
    
    return top1_acc.item(), top5_acc

# --- Example of running the test ---

# 1. Load the best weights saved during training
best_model_test = setup_resnet_baseline(NUM_CLASSES)
best_model_test.to(device).float() # Ensure float32 and device transfer
# best_model_test.load_state_dict(torch.load('best_baseline_weights.pth', map_location=device))

# 2. Run the evaluation
test_top1_acc, test_top5_acc = evaluate_model(final_baseline_model, test_loader, len(test_dataset), device)


--- Starting Test Set Evaluation ---
Test Set Size: 8041
Final Baseline Top-1 Accuracy: 0.4279
Final Baseline Top-5 Accuracy: 0.6933


In [None]:
import torch
import numpy as np

def get_predictions_and_labels(model, loader, device):
    """
    Runs the model on the data loader and collects all predictions, 
    true labels, and output logits.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_outputs = []

    with torch.no_grad():
        for inputs, labels in loader:
            # Ensure consistent device and dtype (float32)
            inputs = inputs.to(device, dtype=torch.float32)
            labels = labels.to(device)

            outputs = model(inputs)
            
            # Get Top-1 prediction index
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(outputs.cpu().numpy())

    return np.array(all_preds), np.array(all_labels), np.array(all_outputs)

# Example usage (assuming 'best_model_test', 'test_loader', 'device' are defined):
test_preds, test_labels, test_outputs = get_predictions_and_labels(
    final_baseline_model, test_loader, device)


In [35]:
class_names = full_train_dataset.classes
idx_to_class = {i: class_name for i, class_name in enumerate(class_names)}

In [None]:
# 1. Find indices where prediction does not match label
incorrect_indices = np.where(test_preds != test_labels)[0]

# 2. Total errors should match (1 - Accuracy)
num_incorrect = len(incorrect_indices)
print(f"Total Incorrect Predictions: {num_incorrect}")
print(f"Total Test Samples: {len(test_labels)}")
print(f"Error Rate (Expected): {1 - 0.8832:.4f} vs Actual: {num_incorrect / len(test_labels):.4f}")

# 3. Print a few specific errors to check for patterns
# Print a few specific errors to check for patterns
for i, idx in enumerate(incorrect_indices[:5]):
    predicted_class_idx = test_preds[idx]
    true_class_idx = test_labels[idx]
    
    # Map the indices to their human-readable names
    predicted_class_name = idx_to_class[predicted_class_idx]
    true_class_name = idx_to_class[true_class_idx]
    
    print(f"\nError #{i+1} (Index {idx}):")
    print(f"  Predicted Class: **{predicted_class_name}**")
    print(f"  True Class:      **{true_class_name}**")

Total Incorrect Predictions: 4600
Total Test Samples: 8041
Error Rate (Expected): 0.1168 vs Actual: 0.5721

Error #1 (Index 0):
  Predicted Class: **BMW X6 SUV 2012**
  True Class:      **Suzuki Aerio Sedan 2007**

Error #2 (Index 4):
  Predicted Class: **Spyker C8 Convertible 2009**
  True Class:      **Tesla Model S Sedan 2012**

Error #3 (Index 5):
  Predicted Class: **Ford Expedition EL SUV 2009**
  True Class:      **Chrysler Town and Country Minivan 2012**

Error #4 (Index 6):
  Predicted Class: **Rolls-Royce Ghost Sedan 2012**
  True Class:      **GMC Terrain SUV 2012**

Error #5 (Index 7):
  Predicted Class: **Hyundai Genesis Sedan 2012**
  True Class:      **Mercedes-Benz S-Class Sedan 2012**
