# Assignment 4: Wheres Waldo?
### Name: Eileanor LaRocco
In this assignment, you will develop an object detection algorithm to locate Waldo in a set of images. You will develop a model to detect the bounding box around Waldo. Your final task is to submit your predictions on Kaggle for evaluation.

### Process/Issues
- Double-checked that the images we were given were correctly bounded (did this by visualizing the boxes on the images - they look good!)
- Complication: Originally when I creating augmented images, the bounding box labels did not also augment. I also had to try out a few types of augmentation to see what made sense for waldo. The augmented images may still not be as different from one another as they could be which could allow the model to favor the training images that occur more frequently.
- Complication: Similarly, when resizing the images, ensuring the bounding boxes not only are also adjusted if necessary, but ensuring they do not get cut off and the image is not stretched/shrunk too much.

### Imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import torch
from torchvision.io import read_image
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from tqdm import tqdm
import csv
import opendatasets as od
import cv2
import albumentations as A
import random
import shutil
from sklearn.model_selection import train_test_split
from ultralytics import YOLO
import torch
import torch.nn as nn

  data = fetch_version_info()


In [2]:
SEED = 1

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = device = torch.device("mps")
print(device)

mps


### Download Data

In [3]:
od.download('https://www.kaggle.com/competitions/2024-fall-ml-3-hw-4-wheres-waldo/data')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Downloading 2024-fall-ml-3-hw-4-wheres-waldo.zip to ./2024-fall-ml-3-hw-4-wheres-waldo


100%|██████████| 38.2M/38.2M [00:01<00:00, 20.8MB/s]



Extracting archive ./2024-fall-ml-3-hw-4-wheres-waldo/2024-fall-ml-3-hw-4-wheres-waldo.zip to ./2024-fall-ml-3-hw-4-wheres-waldo


### Paths

In [4]:
train_folder = "2024-fall-ml-3-hw-4-wheres-waldo/train/train" # Original Train Images
test_folder = "2024-fall-ml-3-hw-4-wheres-waldo/test/test" # Original Test Images
annotations_file = "2024-fall-ml-3-hw-4-wheres-waldo/annotations.csv" # Original Annotations File

# Preprocess Images (Crop/Augment)

In [5]:
# Split training data into train and validation sets
annotations = pd.read_csv(annotations_file)
image_files = annotations["filename"].unique()
train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

In [6]:
def filter_csv_by_column(input_csv, output_csv, column_name, values_list):
    """
    Filters rows in a CSV file and keeps only those where the specified column's value is in a given list.

    Parameters:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to save the filtered CSV file.
        column_name (str): Column to filter on.
        values_list (list): List of values to keep.
    """
    # Load the CSV into a DataFrame
    df = pd.read_csv(input_csv)

    # Filter the DataFrame
    filtered_df = df[df[column_name].isin(values_list)]

    # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(output_csv, index=False)

In [8]:
#split annotations into train and val
values_list = list(train_images)

# Example usage
output_csv = "2024-fall-ml-3-hw-4-wheres-waldo/train_annotations.csv"  # Replace with your output file path
column_name = "filename"  # Replace with the column you want to filter

filter_csv_by_column(annotations_file, output_csv, column_name, values_list)




values_list = list(val_images)

# Example usage
output_csv = "2024-fall-ml-3-hw-4-wheres-waldo/test_annotations.csv"  # Replace with your output file path
column_name = "filename"  # Replace with the column you want to filter

filter_csv_by_column(annotations_file, output_csv, column_name, values_list)



In [9]:
import shutil

def split_directory(source_dir, target_dir, file_list):
    """Splits files from source_dir to target_dir based on file_list."""

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for file_name in file_list:
        source_path = os.path.join(source_dir, file_name)
        target_path = os.path.join(target_dir, file_name)

        if os.path.exists(source_path):
            shutil.move(source_path, target_path)
            print(f"Moved: {file_name}")
        else:
            print(f"File not found: {file_name}")

if __name__ == "__main__":
    source_dir = train_folder
    target_dir = "2024-fall-ml-3-hw-4-wheres-waldo/train/val"
    file_list = list(val_images)

    split_directory(source_dir, target_dir, file_list)

Moved: 18.jpg
Moved: 5.jpg
Moved: 19.jpg
Moved: 22.jpg
Moved: 1.jpg
Moved: 3.jpg


In [65]:
import torchvision

class WaldoDataset(torch.utils.data.Dataset):
    def __init__(self, annotations_file, img_dir, transforms=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        # Load image
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = Image.open(img_path).convert("RGB")
        image = F.to_tensor(image)
        
        # Read bounding box data, ensuring all are converted to float
        box_data = self.img_labels.iloc[idx, 4:8].values
        boxes = []
        for item in box_data:
            try:
                boxes.append(float(item))
            except ValueError as e:
                raise ValueError(f"Error converting bounding box data to float: {e}")

        # Create tensors
        boxes = torch.as_tensor([boxes], dtype=torch.float32)
        labels = torch.ones((1,), dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((1,), dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image = self.transforms(image)
            image = np.array(image)
            target = np.array(target)

        return image, target


# Example usage:
# Create the dataset
train_dataset = WaldoDataset(annotations_file= "2024-fall-ml-3-hw-4-wheres-waldo/train_annotations.csv"
                             , img_dir=train_folder
                             , transforms = torchvision.transforms.Compose([
                                            torchvision.transforms.Resize((224, 224)),
                                            #torchvision.transforms.ToTensor(),
                                            #torchvision.transforms.ToPILImage()
                                        ]))
val_dataset = WaldoDataset(annotations_file= "2024-fall-ml-3-hw-4-wheres-waldo/test_annotations.csv"
                           , img_dir="2024-fall-ml-3-hw-4-wheres-waldo/train/val"
                        , transforms = torchvision.transforms.Compose([
                                        torchvision.transforms.Resize((224, 224)),
                                        #torchvision.transforms.ToTensor(),
                                        #torchvision.transforms.ToPILImage()
                                    ]))

# Now, you can use this dataset with a DataLoader to train your model
from torch.utils.data import DataLoader

train_data_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=lambda x: zip(*x)
)

val_data_loader = DataLoader(
    val_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=lambda x: zip(*x)
)

# Model

### Architecture

In [67]:
import torch
import torch.nn as nn

class SimpleYOLOv3(nn.Module):
    def __init__(self, num_classes):
        super(SimpleYOLOv3, self).__init__()

        # Backbone: Feature extractor (simplified)
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # Add more convolutional layers as needed
        )

        # Detection head (simplified)
        self.head = nn.Sequential(
            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, (5 + num_classes) * 3, 1),  # 3 bounding boxes per cell
        )

    def forward(self, x):
        #print(f"Input shape: {x.shape}")
        
        x = self.backbone(x)
        #print(f"After backbone: {x.shape}")
        
        x = self.head(x)
        #print(f"After head: {x.shape}")
        
        return x 

# Instantiate and check the model
model = SimpleYOLOv3(num_classes=1)
input_image = torch.randn(1, 3, 1500, 1000)  # Example batch
output = model(input_image)
print(f"Output shape: {output.shape}")


Output shape: torch.Size([1, 18, 750, 500])


In [None]:
import torch
import torch.nn as nn

class SimpleYOLOv3(nn.Module):
    def __init__(self, num_classes):
        super(SimpleYOLOv3, self).__init__()

        # Backbone: Feature extractor (simplified)
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # Add more convolutional layers as needed
        )

        # Detection head (simplified)
        self.head = nn.Sequential(
            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, (5 + num_classes) * 3, 1),  # 3 bounding boxes per cell
        )

    def forward(self, x):
        x = self.backbone(x)
        print(x.size())
        x = self.head(x)
        print(x.size())
        return x 

# Instantiate and check the model
model = SimpleYOLOv3(num_classes=1)
input_image = torch.randn(1, 3, 1500, 1000)  # Example batch
output = model(input_image)
print(output.shape)


torch.Size([1, 16, 750, 500])
torch.Size([1, 18, 750, 500])
torch.Size([1, 18, 750, 500])


In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random

model = SimpleYOLOv3(num_classes=1)

# IoU calculation
def compute_iou(pred_boxes, true_boxes):
    # pred_boxes and true_boxes should be in (x_min, y_min, x_max, y_max)
    inter_xmin = torch.max(pred_boxes[:, 0], true_boxes[:, 0])
    inter_ymin = torch.max(pred_boxes[:, 1], true_boxes[:, 1])
    inter_xmax = torch.min(pred_boxes[:, 2], true_boxes[:, 2])
    inter_ymax = torch.min(pred_boxes[:, 3], true_boxes[:, 3])

    inter_area = torch.clamp(inter_xmax - inter_xmin, min=0) * torch.clamp(inter_ymax - inter_ymin, min=0)
    pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
    true_area = (true_boxes[:, 2] - true_boxes[:, 0]) * (true_boxes[:, 3] - true_boxes[:, 1])

    union_area = pred_area + true_area - inter_area
    iou = inter_area / union_area
    return iou

# Simple IoU loss function
def iou_loss(pred_boxes, true_boxes):
    iou = compute_iou(pred_boxes, true_boxes)
    return 1 - iou.mean()  # We want to maximize IoU, so minimize 1 - IoU

# Custom YOLOv3 training loop
def train(model, train_data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for images, targets in train_data_loader:
        #print(images.shape())
        #print(images)
        #print(targets.shape())
        images = images#.to(device)
        targets = targets#.to(device)

        # Forward pass
        predictions = model(images)

        # Extract predicted boxes and target boxes (for simplicity, assuming one grid cell)
        pred_boxes = predictions[:, :4]  # first 4 are bounding box coordinates
        pred_conf = predictions[:, 4]    # 5th is objectness confidence
        pred_class = predictions[:, 5:]  # remaining are class predictions

        true_boxes = targets[:, :4]  # Ground truth boxes
        true_conf = targets[:, 4]    # Objectness confidence
        true_class = targets[:, 5:]  # Ground truth class

        # Losses
        loss_loc = iou_loss(pred_boxes, true_boxes)  # IoU loss
        loss_conf = torch.nn.BCEWithLogitsLoss()(pred_conf, true_conf)  # Confidence loss
        loss_class = torch.nn.BCEWithLogitsLoss()(pred_class, true_class)  # Classification loss

        # Total loss (sum or weighted sum)
        loss = loss_loc + loss_conf + loss_class
        total_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_data_loader)
    print(f"Training loss: {avg_loss}")

# Evaluation (testing) function
def evaluate(model, val_data_loader, device):
    model.eval()
    total_iou = 0
    with torch.no_grad():
        for images, targets in val_data_loader:
            images = images#.to(device)
            targets = targets#.to(device)

            predictions = model(images)

            # Extract predicted boxes and target boxes
            pred_boxes = predictions[:, :4]
            true_boxes = targets[:, :4]

            # Calculate IoU for the batch
            iou = compute_iou(pred_boxes, true_boxes)
            total_iou += iou.mean().item()

    avg_iou = total_iou / len(val_data_loader)
    print(f"Average IoU on test set: {avg_iou}")

# Initialize model, optimizer, and device
model = SimpleYOLOv3(num_classes=1)#.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Train the model
epochs = 10
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_data_loader, optimizer, device)
    evaluate(model, val_data_loader, device)


Epoch 1/10


TypeError: conv2d() received an invalid combination of arguments - got (tuple, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias = None, tuple of ints stride = 1, tuple of ints padding = 0, tuple of ints dilation = 1, int groups = 1)
      didn't match because some of the arguments have invalid types: (!tuple of (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)!, !Parameter!, !Parameter!, !tuple of (int, int)!, !tuple of (int, int)!, !tuple of (int, int)!, !int!)
 * (Tensor input, Tensor weight, Tensor bias = None, tuple of ints stride = 1, str padding = "valid", tuple of ints dilation = 1, int groups = 1)
      didn't match because some of the arguments have invalid types: (!tuple of (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)!, !Parameter!, !Parameter!, !tuple of (int, int)!, !tuple of (int, int)!, !tuple of (int, int)!, !int!)


### Training

In [11]:
# Train YOLO model
#model = YOLO("yolov5su.pt")  # Load pretrained weights
#model.train(data="yolo.yaml", epochs=15, imgsz=640, pretrained=True, augment=True,)

# Submission File 

In [None]:
test_folder = "2024-fall-ml-3-hw-4-wheres-waldo/test/test"

# Predict on test images
test_images = [os.path.join(test_folder, img) for img in os.listdir(test_folder) if img.endswith(".jpg")]
results = model.predict(source=test_images, save=True, save_txt=True, project="yolo_test_predictions")

# Prepare to save the predictions
output_csv_path = os.path.join("yolo_test_predictions", "predictions.csv")
predictions = []

# Process results
for result in results:
    image_name = os.path.basename(result.path)  # Get the image name
    if result.boxes is not None and len(result.boxes) > 0:  # Check if there are predictions
        # Convert result.boxes to tensor for easier access
        boxes = result.boxes.xyxy.cpu().numpy()  # Convert bounding boxes to array
        confidences = result.boxes.conf.cpu().numpy()  # Convert confidence scores to array

        # Find the index of the box with the highest confidence
        best_idx = confidences.argmax()
        best_box = boxes[best_idx]
        conf = confidences[best_idx]

        # Extract bounding box coordinates
        x_min, y_min, x_max, y_max = best_box
        predictions.append([image_name, x_min, y_min, x_max, y_max, conf])
    else:
        # No predictions for this image
        predictions.append([image_name, None, None, None, None, None])

# Save predictions to CSV
df = pd.DataFrame(predictions, columns=["filename", "xmin", "ymin", "xmax", "ymax", "confidence"])
df.to_csv(output_csv_path, index=False)

print(f"Predictions saved to {output_csv_path}")



0: 640x640 (no detections), 210.5ms
1: 640x640 (no detections), 210.5ms
2: 640x640 (no detections), 210.5ms
3: 640x640 (no detections), 210.5ms
4: 640x640 (no detections), 210.5ms
5: 640x640 (no detections), 210.5ms
6: 640x640 (no detections), 210.5ms
7: 640x640 (no detections), 210.5ms
8: 640x640 (no detections), 210.5ms
Speed: 1.5ms preprocess, 210.5ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1myolo_test_predictions/train2[0m
0 label saved to yolo_test_predictions/train2/labels
Predictions saved to yolo_test_predictions/predictions.csv
