# Assignment 4: Wheres Waldo?
### Name: Eileanor LaRocco
In this assignment, you will develop an object detection algorithm to locate Waldo in a set of images. You will develop a model to detect the bounding box around Waldo. Your final task is to submit your predictions on Kaggle for evaluation.

### Imports

In [2]:
import os
import pandas as pd
from PIL import Image
import torch
from torchvision.io import read_image
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from tqdm import tqdm
import csv
import opendatasets as od

### Download Data

In [3]:
od.download('https://www.kaggle.com/competitions/2024-fall-ml-3-hw-4-wheres-waldo/data')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Downloading 2024-fall-ml-3-hw-4-wheres-waldo.zip to ./2024-fall-ml-3-hw-4-wheres-waldo


100%|██████████| 38.2M/38.2M [00:01<00:00, 23.7MB/s]


Extracting archive ./2024-fall-ml-3-hw-4-wheres-waldo/2024-fall-ml-3-hw-4-wheres-waldo.zip to ./2024-fall-ml-3-hw-4-wheres-waldo





# Preprocessing

In [23]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from ultralytics import YOLO
import cv2

# Paths
train_folder = "2024-fall-ml-3-hw-4-wheres-waldo/train/train"
annotations_file = "2024-fall-ml-3-hw-4-wheres-waldo/annotations.csv"
yolo_train_dir = "datasets/yolo_dataset/train"
yolo_val_dir = "datasets/yolo_dataset/val"

#Saved Predictions
yolo_test_dir = "yolo_test_predictions"

# Create necessary folders
os.makedirs(yolo_train_dir, exist_ok=True)
os.makedirs(yolo_val_dir, exist_ok=True)
os.makedirs(yolo_test_dir, exist_ok=True)

# Load annotations
annotations = pd.read_csv(annotations_file)

# Function to convert annotations to YOLO format
def convert_to_yolo_format(row, img_width, img_height):
    x_center = (row["xmin"] + row["xmax"]) / 2 / img_width
    y_center = (row["ymin"] + row["ymax"]) / 2 / img_height
    width = (row["xmax"] - row["xmin"]) / img_width
    height = (row["ymax"] - row["ymin"]) / img_height
    return f"0 {x_center} {y_center} {width} {height}"

# Split training data into train and validation sets
image_files = annotations["filename"].unique()
train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

# Function to prepare YOLO format data
def prepare_yolo_data(image_list, output_dir):
    for img_name in image_list:
        img_path = os.path.join(train_folder, img_name)
        img = cv2.imread(img_path)
        if img is None:
            continue
        img_height, img_width, _ = img.shape

        # Filter annotations for this image
        image_annotations = annotations[annotations["filename"] == img_name]

        # YOLO annotations file
        yolo_annotations = []
        for _, row in image_annotations.iterrows():
            yolo_line = convert_to_yolo_format(row, img_width, img_height)
            yolo_annotations.append(yolo_line)

        # Save image and annotation
        base_name = os.path.splitext(img_name)[0]
        shutil.copy(img_path, os.path.join(output_dir, f"{base_name}.jpg"))
        with open(os.path.join(output_dir, f"{base_name}.txt"), "w") as f:
            f.write("\n".join(yolo_annotations))

# Prepare training and validation data
prepare_yolo_data(train_images, yolo_train_dir)
prepare_yolo_data(val_images, yolo_val_dir)


# Create your model here 

In [24]:
# Train YOLO model
model = YOLO("yolov5su.pt")  # Load pretrained weights
model.train(data="yolo.yaml", epochs=3, imgsz=640)

Ultralytics 8.3.40 🚀 Python-3.12.5 torch-2.4.1 CPU (Apple M3)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov5su.pt, data=yolo.yaml, epochs=3, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, 

[34m[1mtrain: [0mScanning /Users/elarocco/Desktop/uva_phd_2024/wheres_waldo/datasets/yolo_dataset/train... 21 images, 0 backgrounds, 0 corrupt: 100%|██████████| 21/21 [00:00<00:00, 1557.29it/s]

[34m[1mtrain: [0mNew cache created: /Users/elarocco/Desktop/uva_phd_2024/wheres_waldo/datasets/yolo_dataset/train.cache



[34m[1mval: [0mScanning /Users/elarocco/Desktop/uva_phd_2024/wheres_waldo/datasets/yolo_dataset/val... 6 images, 0 backgrounds, 0 corrupt: 100%|██████████| 6/6 [00:00<00:00, 3232.60it/s]

[34m[1mval: [0mNew cache created: /Users/elarocco/Desktop/uva_phd_2024/wheres_waldo/datasets/yolo_dataset/val.cache
Plotting labels to runs/detect/train/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 69 weight(decay=0.0), 76 weight(decay=0.0005), 75 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 3 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/3         0G      1.611      8.393      1.391         18        640: 100%|██████████| 2/2 [00:12<00:00,  6.24s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.52it/s]

                   all          6          6   0.000556      0.167   0.000382   0.000191






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/3         0G      1.457      5.494      1.466         12        640: 100%|██████████| 2/2 [00:10<00:00,  5.38s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]

                   all          6          6   0.000556      0.167   0.000452   0.000226






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/3         0G      1.537      4.057      1.436          9        640: 100%|██████████| 2/2 [00:11<00:00,  5.73s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]

                   all          6          6   0.000556      0.167   0.000726   0.000502






3 epochs completed in 0.011 hours.
Optimizer stripped from runs/detect/train/weights/last.pt, 18.5MB
Optimizer stripped from runs/detect/train/weights/best.pt, 18.5MB

Validating runs/detect/train/weights/best.pt...
Ultralytics 8.3.40 🚀 Python-3.12.5 torch-2.4.1 CPU (Apple M3)
YOLOv5s summary (fused): 193 layers, 9,111,923 parameters, 0 gradients, 23.8 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


                   all          6          6   0.000556      0.167   0.000722   0.000497
Speed: 0.5ms preprocess, 88.4ms inference, 0.0ms loss, 2.6ms postprocess per image
Results saved to [1mruns/detect/train[0m


ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x2c055f770>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0.048048,    

# Submission File 

In [25]:
import os
import pandas as pd

test_folder = "2024-fall-ml-3-hw-4-wheres-waldo/test/test"

# Predict on test images
test_images = [os.path.join(test_folder, img) for img in os.listdir(test_folder) if img.endswith(".jpg")]
results = model.predict(source=test_images, save=True, save_txt=True, project="yolo_test_predictions")

# Prepare to save the predictions
output_csv_path = os.path.join("yolo_test_predictions", "predictions.csv")
predictions = []

# Process results
for result in results:
    image_name = os.path.basename(result.path)  # Get the image name
    if result.boxes is not None and len(result.boxes) > 0:  # Check if there are predictions
        # Convert result.boxes to tensor for easier access
        boxes = result.boxes.xyxy.cpu().numpy()  # Convert bounding boxes to array
        confidences = result.boxes.conf.cpu().numpy()  # Convert confidence scores to array

        # Find the index of the box with the highest confidence
        best_idx = confidences.argmax()
        best_box = boxes[best_idx]
        conf = confidences[best_idx]

        # Extract bounding box coordinates
        x_min, y_min, x_max, y_max = best_box
        predictions.append([image_name, x_min, y_min, x_max, y_max, conf])
    else:
        # No predictions for this image
        predictions.append([image_name, None, None, None, None, None])

# Save predictions to CSV
df = pd.DataFrame(predictions, columns=["filename", "xmin", "ymin", "xmax", "ymax", "confidence"])
df.to_csv(output_csv_path, index=False)

print(f"Predictions saved to {output_csv_path}")



0: 640x640 (no detections), 95.6ms
1: 640x640 (no detections), 95.6ms
2: 640x640 (no detections), 95.6ms
3: 640x640 (no detections), 95.6ms
4: 640x640 (no detections), 95.6ms
5: 640x640 (no detections), 95.6ms
6: 640x640 (no detections), 95.6ms
7: 640x640 (no detections), 95.6ms
8: 640x640 (no detections), 95.6ms
Speed: 1.5ms preprocess, 95.6ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1myolo_test_predictions/train[0m
0 label saved to yolo_test_predictions/train/labels
Predictions saved to yolo_test_predictions/predictions.csv
