In [5]:
import os
import opendatasets as od
import pandas as pd
import numpy as np
import random
import csv
import matplotlib.pyplot as plt

import shutil
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.utils.data import DataLoader
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image

In [6]:
SEED = 1

# Seed every random number generator the notebook uses so reruns are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Pick the best available accelerator instead of hard-coding CUDA, so the
# notebook also runs on Apple-silicon (MPS) and CPU-only machines.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

cuda


In [7]:
# Side length (in pixels) used for the square resize applied to every image.
image_sz = 512

# Locations of the original, unprocessed dataset files.
annotations_file = "2024-fall-ml-3-hw-4-wheres-waldo/annotations.csv"  # original annotations
train_folder = "2024-fall-ml-3-hw-4-wheres-waldo/train/train"  # original train images
test_folder = "2024-fall-ml-3-hw-4-wheres-waldo/test/test"  # original test images

In [8]:
# Define the dataset (Train and Test Loaders)
class WaldoDataset(torch.utils.data.Dataset):
    """Single-box detection dataset backed by a CSV annotation file.

    Every CSV row holds an image file name in its first column followed by the
    box coordinates. The coordinates are treated as
    ``[x_min, y_min, x_max, y_max]``; the area computation relies on that order.

    Args:
        annotations_file: Path to the CSV file with one annotated box per row.
        img_dir: Directory containing the image files named in the CSV.
        transforms: Optional callable applied to the RGB PIL image.
        rescale_boxes: When True (default), the box is scaled by the same
            factors as the image whenever ``transforms`` changes the image
            size (e.g. a ``Resize``), so the target stays aligned with the
            pixels the model actually sees. Pass False if the annotations are
            already expressed in the transformed coordinate system.
    """

    def __init__(self, annotations_file, img_dir, transforms=None, rescale_boxes=True):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transforms = transforms
        self.rescale_boxes = rescale_boxes

    def __len__(self):
        """Return the number of annotated rows."""
        return len(self.img_labels)

    @staticmethod
    def _image_size(image):
        """Return ``(width, height)`` of a tensor or PIL image, or None if unknown."""
        if torch.is_tensor(image):
            # Tensors are laid out as (..., H, W).
            return int(image.shape[-1]), int(image.shape[-2])
        size = getattr(image, "size", None)
        if isinstance(size, tuple) and len(size) == 2:
            return size
        return None

    @staticmethod
    def _scale_boxes(boxes, old_size, new_size):
        """Map boxes from an image of ``old_size`` to one of ``new_size``.

        Both sizes are ``(width, height)``. A new tensor is returned when
        scaling is needed; the input tensor is never modified.
        """
        old_w, old_h = old_size
        new_w, new_h = new_size
        if (old_w, old_h) == (new_w, new_h) or old_w == 0 or old_h == 0:
            return boxes
        scaled = boxes.clone()
        scaled[:, [0, 2]] *= new_w / old_w  # x coordinates
        scaled[:, [1, 3]] *= new_h / old_h  # y coordinates
        return scaled

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row ``idx``.

        ``target`` is a dict with the keys ``boxes`` (float32, one row),
        ``labels`` (always 1), ``image_id``, ``area`` and ``iscrowd``.
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        # The context manager closes the file handle once the pixels are copied.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        original_size = image.size  # PIL reports (width, height)

        box_data = self.img_labels.iloc[idx, 1:].values
        boxes = torch.as_tensor([[float(item) for item in box_data]], dtype=torch.float32)

        if self.transforms is not None:
            image = self.transforms(image)

        if self.rescale_boxes:
            new_size = self._image_size(image)
            if new_size is not None:
                boxes = self._scale_boxes(boxes, original_size, new_size)

        target = {
            "boxes": boxes,
            "labels": torch.ones((1,), dtype=torch.int64),
            "image_id": torch.tensor([idx]),
            # Computed after rescaling so the area matches the returned box.
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((1,), dtype=torch.int64),
        }

        return image, target


# Set up the dataset and data loaders
BATCH_SIZE = 16


def _build_transforms(size):
    """Build the preprocessing pipeline shared by the train and validation sets.

    Images are resized to a ``size`` x ``size`` square, converted to a tensor
    and normalised per channel with mean 0.5 and std 0.5.
    """
    return torchvision.transforms.Compose([
        torchvision.transforms.Resize((size, size)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])


def collate_batch(batch):
    """Transpose ``[(image, target), ...]`` into ``[(images...), (targets...)]``.

    Defined as a named module-level function (rather than a lambda) so that it
    can be pickled if the loaders are later given ``num_workers > 0``.
    """
    return list(zip(*batch))


train_dataset = WaldoDataset(
    annotations_file="2024-fall-ml-3-hw-4-wheres-waldo/train_annotations.csv",
    img_dir="2024-fall-ml-3-hw-4-wheres-waldo/train/chunks",
    transforms=_build_transforms(image_sz),
)

val_dataset = WaldoDataset(
    annotations_file="2024-fall-ml-3-hw-4-wheres-waldo/test_annotations.csv",
    img_dir="2024-fall-ml-3-hw-4-wheres-waldo/train/val",
    transforms=_build_transforms(image_sz),
)

# Only the training data is shuffled; validation order stays fixed.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)

val_data_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleBBoxModel(nn.Module):
    """Small CNN that regresses one bounding box (4 values) per image.

    Three 3x3 convolution + ReLU + 2x2 max-pool stages are followed by two
    fully connected layers. Each pooling stage halves the spatial size, so a
    square input of side ``image_size`` reaches the first linear layer as
    64 feature maps of side ``image_size // 8``.

    Args:
        image_size: Side length of the square input images. The default of
            512 reproduces the original ``64 * 64 * 64`` flattened size.

    Raises:
        ValueError: If ``image_size`` is smaller than 8, which would leave no
            spatial positions after the three pooling stages.
    """

    def __init__(self, image_size=512):
        super().__init__()
        if image_size < 8:
            raise ValueError("image_size must be at least 8")

        # Define a simple CNN architecture
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)

        # Three 2x2 poolings shrink each spatial side by a factor of 8.
        feature_side = image_size // 8
        self.fc1 = nn.Linear(64 * feature_side * feature_side, 128)
        self.fc2 = nn.Linear(128, 4)  # Outputting the 4 bounding box coordinates

        # Define loss function (L1 Loss for bounding box regression)
        self.criterion = nn.L1Loss()

    @staticmethod
    def _targets_as_tensor(targets, like):
        """Return targets as a ``(batch, 4)`` tensor.

        Tensors are passed through untouched. A sequence of target dicts (the
        per-sample format with a ``"boxes"`` entry) is reduced to the first
        four box values of each sample and moved to the device and dtype of
        ``like``.
        """
        if torch.is_tensor(targets):
            return targets
        rows = [t["boxes"].reshape(-1)[:4] for t in targets]
        return torch.stack(rows).to(device=like.device, dtype=like.dtype)

    def forward(self, x, targets=None):
        """Predict boxes for ``x`` and optionally compute the L1 loss.

        Args:
            x: Batch of images, shape ``(batch, 3, image_size, image_size)``.
            targets: Optional ground truth, either a ``(batch, 4)`` tensor or
                a sequence of dicts each holding a ``"boxes"`` tensor.

        Returns:
            ``bbox`` of shape ``(batch, 4)`` when ``targets`` is None,
            otherwise the tuple ``(bbox, loss)``.
        """
        # Convolution -> ReLU -> pooling, three times.
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)

        # Flatten everything except the batch dimension for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        bbox = self.fc2(x)

        if targets is None:
            return bbox

        loss = self.criterion(bbox, self._targets_as_tensor(targets, bbox))
        return bbox, loss

# Example usage: a quick smoke test of both forward-pass modes.
model = SimpleBBoxModel()

# Input a batch of 512x512 RGB images (batch_size=1)
images = torch.randn(1, 3, 512, 512)  # Random images for demonstration
targets = torch.tensor([[50.0, 60.0, 200.0, 220.0]])  # Example target bounding box for batch

# Forward pass with targets (training mode): returns predictions and the loss
output, loss = model(images, targets)
print("Bounding box predictions: ", output)
print("Loss: ", loss.item())

# Forward pass without targets (inference mode); gradients are not needed
# here, so autograd tracking is switched off for this call.
with torch.no_grad():
    output = model(images)
print("Bounding box predictions (inference): ", output)


Bounding box predictions:  tensor([[ 0.0912, -0.0641,  0.0536, -0.0539]], grad_fn=<AddmmBackward0>)
Loss:  132.49331665039062
Bounding box predictions (inference):  tensor([[ 0.0912, -0.0641,  0.0536, -0.0539]], grad_fn=<AddmmBackward0>)


In [1]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.ops import box_iou

# Define IoU computation
def calculate_iou(pred_boxes, target_boxes):
    """
    Computes pairwise IoU between predicted and target boxes.

    Thin wrapper around ``torchvision.ops.box_iou``. The function is called
    through the ``box_iou`` name imported at the top of this cell; no ``ops``
    module is imported in this notebook.

    Args:
        pred_boxes (Tensor): Predicted boxes, shape (N, 4).
        target_boxes (Tensor): Target boxes, shape (M, 4).
    Returns:
        IoU scores: Tensor of shape (N, M).
    """
    return box_iou(pred_boxes, target_boxes)

# Training function
def _total_loss(output):
    """Reduce a training-mode model output to one scalar loss tensor.

    Supported forms: a dict of loss components (summed), a tuple/list whose
    last element is the loss tensor (e.g. ``(predictions, loss)``), or a bare
    loss tensor.
    """
    if isinstance(output, dict):
        return sum(output.values())
    if isinstance(output, (tuple, list)) and len(output) > 0 and torch.is_tensor(output[-1]):
        return output[-1]
    if torch.is_tensor(output):
        return output
    raise ValueError(
        "Expected the model to return a dict of losses, a (predictions, loss) tuple, or a loss tensor"
    )


def train_one_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(images, targets)`` in training mode.
        data_loader: Iterable of ``(images, targets)`` batches, where
            ``images`` is a sequence of equally sized tensors and ``targets``
            a sequence of dicts of tensors.
        optimizer: Optimizer updating the model parameters.
        device: Device the batch is moved to before the forward pass.

    Returns:
        Mean loss per batch as a float, or 0.0 if the loader yields no batches.

    Raises:
        ValueError: If the model output is not one of the supported loss forms.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0

    for images, targets in data_loader:
        images = torch.stack([img.to(device) for img in images])
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()

        # Forward pass; accept both dict-of-losses and (predictions, loss) models.
        total_loss = _total_loss(model(images, targets))

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        running_loss += total_loss.item()
        num_batches += 1

    # Counting batches avoids a ZeroDivisionError on an empty loader.
    if num_batches == 0:
        return 0.0
    return running_loss / num_batches



# Evaluation function
def evaluate(model, data_loader, device):
    """Compute the mean IoU of the model's predictions over ``data_loader``.

    Two prediction formats are supported per sample: a dict with a ``"boxes"``
    tensor (detection-style output) or a plain tensor of box coordinates
    (regression-style output such as one row of a ``(batch, 4)`` tensor).

    Args:
        model: Module called as ``model(images)`` in evaluation mode.
        data_loader: Iterable of ``(images, targets)`` batches; every target
            is a dict with a ``"boxes"`` tensor.
        device: Device the batch is moved to before the forward pass.

    Returns:
        Average over samples of the mean pairwise IoU between predicted and
        target boxes. Samples without predictions or targets count as 0.
        Returns 0.0 when no samples were seen.
    """
    model.eval()  # Set the model to evaluation mode
    total_iou = 0.0
    num_samples = 0

    with torch.no_grad():
        for images, targets in tqdm(data_loader, desc="Validation", leave=False):
            # Move data to device
            images = torch.stack([img.to(device) for img in images])
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Generate predictions
            outputs = model(images)

            for output, target in zip(outputs, targets):
                if isinstance(output, dict):
                    pred_boxes = output["boxes"].detach()
                else:
                    # Plain tensor output: interpret it as rows of 4 coordinates.
                    pred_boxes = output.detach().reshape(-1, 4)
                target_boxes = target["boxes"]

                if len(pred_boxes) > 0 and len(target_boxes) > 0:
                    total_iou += box_iou(pred_boxes, target_boxes).mean().item()
                # else: no predictions or no targets contributes an IoU of 0

                num_samples += 1

    # Calculate average IoU
    return total_iou / num_samples if num_samples > 0 else 0.0


# Main training loop
def train_model(model, train_loader, val_loader, num_epochs, device, optimizer, scheduler, checkpoint_path):
    """Train for ``num_epochs`` epochs and checkpoint the best model.

    After every epoch the model is validated and its ``state_dict`` is written
    to ``checkpoint_path`` whenever the validation IoU improves. The first
    epoch always writes a checkpoint, so the file exists after any run of at
    least one epoch, even if the IoU never rises above 0.

    Args:
        model: Model to train.
        train_loader: Batches passed to ``train_one_epoch``.
        val_loader: Batches passed to ``evaluate``.
        num_epochs: Number of epochs to run.
        device: Device used for training and evaluation.
        optimizer: Optimizer passed to ``train_one_epoch``.
        scheduler: Learning-rate scheduler stepped once per epoch.
        checkpoint_path: Destination file for the best ``state_dict``.
    """
    best_iou = None  # None until the first validation result is available

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Train the model
        train_loss = train_one_epoch(model, train_loader, optimizer, device)

        # Validate the model
        val_iou = evaluate(model, val_loader, device)

        # Save the best model
        if best_iou is None or val_iou > best_iou:
            best_iou = val_iou
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Best model saved with IoU: {best_iou:.4f}")

        # Update the learning rate
        scheduler.step()

        print(f"Training Loss: {train_loss:.4f}, Validation IoU: {val_iou:.4f}")

    final_iou = 0.0 if best_iou is None else best_iou
    print(f"Training complete. Best Validation IoU: {final_iou:.4f}")


# Set up optimizer, scheduler, and device
#model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2).to(device)
# The training and evaluation loops move every batch to `device`, so the model
# has to be placed on the same device before the optimizer is built.
model = SimpleBBoxModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.1 every 5 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
checkpoint_path = "best_model.pth"
num_epochs = 10

# Train and evaluate the model
train_model(
    model,
    train_data_loader,
    val_data_loader,
    num_epochs,
    device,
    optimizer,
    scheduler,
    checkpoint_path
)


NameError: name 'SimpleBBoxModel' is not defined