In [2]:

import os
import torch
import torchvision.transforms as T
import matplotlib.pyplot as plt
from collections import defaultdict
from PIL import Image
from tqdm import tqdm
from ultralytics import YOLO  
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import random


# --- CONFIG ---
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {DEVICE}") 
BASE_DATA_ROOT = "../soccernet_data/tracking"
GT_FILENAME = "gt.txt"
IMAGE_FOLDER = "img1"
IMAGE_EXTS = ['.jpg', '.png']
NUM_VISUALS = 10
SCORE_THRESH = 0.8
IOU_THRESH = 0.5
SAMPLE_PER_SEQ = 30

Using device: mps


In [3]:

# --- LOAD MODEL ---
model_yolo = YOLO("yolo11n.pt") 
model_yolo.to(DEVICE)

model_frcnn = fasterrcnn_resnet50_fpn(pretrained=True)
model_frcnn.to(DEVICE)


transform = T.ToTensor()

def load_gt_boxes(gt_path):
    gt_dict = defaultdict(list)
    if not os.path.exists(gt_path):
        return gt_dict
    with open(gt_path, 'r') as f:
        for line in f:
            parts = line.strip().split(',')
            frame, _, x, y, w, h, cls, _, _ = map(int, parts[:9])
            gt_dict[frame].append(torch.tensor([x, y, x + w, y + h], device=DEVICE))
    return gt_dict

def compute_iou(box1, box2):
    if box1.size(0) == 0 or box2.size(0) == 0:
        return torch.zeros((box1.size(0), box2.size(0)), device=box1.device)
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    lt = torch.max(box1[:, None, :2], box2[:, :2])
    rb = torch.min(box1[:, None, 2:], box2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, :, 0] * wh[:, :, 1]
    union = area1[:, None] + area2 - inter
    iou = inter / union
    return iou

def plot_gt_and_detections(image_tensor, detections, gt_boxes):
    from torchvision.utils import draw_bounding_boxes
    all_boxes = []
    labels = []
    colors = []

    for box in detections:
        all_boxes.append(box)
        labels.append("pred")
        colors.append("red")

    for box in gt_boxes:
        all_boxes.append(box)
        labels.append("gt")
        colors.append("green")

    if not all_boxes:
        return T.ToPILImage()(image_tensor)
    
    boxes_tensor = torch.stack(all_boxes).cpu()

    
    x1 = torch.min(boxes_tensor[:, 0], boxes_tensor[:, 2])
    y1 = torch.min(boxes_tensor[:, 1], boxes_tensor[:, 3])
    x2 = torch.max(boxes_tensor[:, 0], boxes_tensor[:, 2])
    y2 = torch.max(boxes_tensor[:, 1], boxes_tensor[:, 3])
    boxes_tensor = torch.stack([x1, y1, x2, y2], dim=1).to(torch.int)
    img_uint8 = (image_tensor * 255).byte().cpu()
    drawn = draw_bounding_boxes(img_uint8, boxes_tensor, labels=labels, colors=colors, width=2)
    return T.ToPILImage()(drawn)



In [None]:

import json
from collections import Counter

# --- LOAD MODELS ---
model_yolo   = YOLO("yolo11n.pt").to(DEVICE).eval()
model_frcnn  = fasterrcnn_resnet50_fpn(pretrained=True).to(DEVICE).eval()

transform = T.ToTensor()

def load_gt_boxes(gt_path):
    """
    Load ground‐truth boxes and classes per frame.
    Returns dict: frame_id -> list of dict {'box':Tensor[x1,y1,x2,y2], 'class':int}
    """
    gt_dict = defaultdict(list)
    if not os.path.exists(gt_path):
        return gt_dict
    with open(gt_path, 'r') as f:
        for line in f:
            parts = line.strip().split(',')
            frame, _, x, y, w, h, cls, _, _ = map(int, parts[:9])
            box = torch.tensor([x, y, x + w, y + h], device=DEVICE)
            gt_dict[frame].append({'box': box, 'class': cls})
    return gt_dict

def compute_iou(box1, box2):
    if box1.size(0)==0 or box2.size(0)==0:
        return torch.zeros((box1.size(0), box2.size(0)), device=box1.device)
    area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1])
    area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1])
    lt = torch.max(box1[:,None,:2], box2[:,:2])
    rb = torch.min(box1[:,None,2:], box2[:,2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:,:,0] * wh[:,:,1]
    union = area1[:,None] + area2 - inter
    return inter / (union + 1e-6)

def plot_detections(image_tensor, yolo_boxes, frcnn_boxes, gt_entries):
    from torchvision.utils import draw_bounding_boxes
    all_boxes, labels, colors = [], [], []
    # YOLO in purple
    for b in yolo_boxes:
        all_boxes.append(b); labels.append("yolo");   colors.append("purple")
    # FRCNN in blue
    for b in frcnn_boxes:
        all_boxes.append(b); labels.append("frcnn");  colors.append("blue")
    # GT in red
    for e in gt_entries:
        all_boxes.append(e['box']); labels.append(f"gt:{e['class']}"); colors.append("red")
    if not all_boxes:
        return T.ToPILImage()(image_tensor)
    boxes = torch.stack(all_boxes).cpu()
    x1 = torch.min(boxes[:,0], boxes[:,2])
    y1 = torch.min(boxes[:,1], boxes[:,3])
    x2 = torch.max(boxes[:,0], boxes[:,2])
    y2 = torch.max(boxes[:,1], boxes[:,3])
    boxes_int = torch.stack([x1, y1, x2, y2], dim=1).to(torch.int)
    img_uint8 = (image_tensor * 255).byte().cpu()
    drawn = draw_bounding_boxes(img_uint8, boxes_int, labels=labels, colors=colors, width=2)
    return T.ToPILImage()(drawn)

# --- BENCHMARK SETUP ---
metrics_yolo  = []
metrics_frcnn = []
sample_frames = []

# gather sequences
seq_dirs = []
for split in ["train","test"]:
    root = os.path.join(BASE_DATA_ROOT, split)
    if not os.path.isdir(root): continue
    for d in sorted(os.listdir(root)):
        if os.path.isdir(os.path.join(root,d)):
            seq_dirs.append((split, d))

print("Using device:", DEVICE)
print("Processing sequences...")

# loop over sequences
for split, seq_id in tqdm(seq_dirs, desc="Sequences", dynamic_ncols=True):
    seq_path = os.path.join(BASE_DATA_ROOT, split, seq_id)
    img_dir  = os.path.join(seq_path, IMAGE_FOLDER)
    gt_path  = os.path.join(seq_path, "gt", GT_FILENAME)
    gt_dict  = load_gt_boxes(gt_path)
    if not os.path.isdir(img_dir):
        continue

    img_files = sorted([os.path.join(img_dir,f)
                        for f in os.listdir(img_dir)
                        if any(f.lower().endswith(ext) for ext in IMAGE_EXTS)])
    random.shuffle(img_files)
    img_files = img_files[:SAMPLE_PER_SEQ]

    # per‐sequence accumulators
    accs_y, tp_y, fp_y, fn_y = [], 0,0,0
    accs_f, tp_f, fp_f, fn_f = [], 0,0,0
    gt_counter = Counter()

    for path in img_files:
        try:
            img = Image.open(path).convert("RGB")
        except:
            continue
        img_frcnn = transform(img).to(DEVICE)
        filename   = os.path.basename(path)
        try:
            frame_id = int(filename.split('.')[0])
        except:
            continue

        gt_entries = gt_dict.get(frame_id, [])
        gt_boxes   = [e['box'] for e in gt_entries]
        for e in gt_entries:
            gt_counter[e['class']] += 1

        # RUN YOLO
        with torch.no_grad():
            yolo_results = model_yolo(img)       # returns list of Results
        y_res = yolo_results[0]
        preds_y = (y_res.boxes.data.to(DEVICE)
                   if getattr(y_res, 'boxes', None) is not None
                   else torch.empty((0,6), device=DEVICE))
        y_boxes = preds_y[:,:4][preds_y[:,4] > SCORE_THRESH] if preds_y.numel() else torch.empty((0,4), device=DEVICE)

        # RUN FRCNN
        with torch.no_grad():
            fr_out = model_frcnn([img_frcnn])[0]
        fr_scores = fr_out['scores']
        fr_boxes  = fr_out['boxes'].to(DEVICE)
        mask_f    = fr_scores > SCORE_THRESH
        f_boxes   = fr_boxes[mask_f]

        # ACCURACY YOLO
        if gt_boxes:
            gt_t = torch.stack(gt_boxes)
            if y_boxes.size(0):
                ious = compute_iou(y_boxes, gt_t)
                m    = ious.max(1)[0]
                acc_y = float((m > IOU_THRESH).float().mean())
            else:
                acc_y = 0.0
        else:
            acc_y = 1.0 if y_boxes.size(0)==0 else 0.0
        accs_y.append(acc_y)

        # ACCURACY FRCNN
        if gt_boxes:
            if f_boxes.size(0):
                ious = compute_iou(f_boxes, gt_t)
                m    = ious.max(1)[0]
                acc_f = float((m > IOU_THRESH).float().mean())
            else:
                acc_f = 0.0
        else:
            acc_f = 1.0 if f_boxes.size(0)==0 else 0.0
        accs_f.append(acc_f)

        # PREC/RECALL YOLO
        matched = set(); tp=0; fp=0
        if y_boxes.size(0) and gt_boxes:
            ious = compute_iou(y_boxes, gt_t)
            for i in range(y_boxes.size(0)):
                mi, gi = ious[i].max(0)
                if mi > IOU_THRESH and gi.item() not in matched:
                    tp += 1; matched.add(gi.item())
                else:
                    fp += 1
        else:
            fp = y_boxes.size(0)
        fn = len(gt_boxes) - len(matched)
        tp_y += tp; fp_y += fp; fn_y += fn

        # PREC/RECALL FRCNN
        matched = set(); tp=0; fp=0
        if f_boxes.size(0) and gt_boxes:
            ious = compute_iou(f_boxes, gt_t)
            for i in range(f_boxes.size(0)):
                mi, gi = ious[i].max(0)
                if mi > IOU_THRESH and gi.item() not in matched:
                    tp += 1; matched.add(gi.item())
                else:
                    fp += 1
        else:
            fp = f_boxes.size(0)
        fn = len(gt_boxes) - len(matched)
        tp_f += tp; fp_f += fp; fn_f += fn

        # VISUALIZE A FEW
        if len(sample_frames) < NUM_VISUALS and seq_id not in [s[0] for s in sample_frames]:
            vis = plot_detections(img_frcnn, y_boxes, f_boxes, gt_entries)
            sample_frames.append((seq_id, filename, vis))

    # SEQ‐LEVEL METRICS YOLO
    avg_y = sum(accs_y)/len(accs_y) if accs_y else 0.0
    prec_y = tp_y/(tp_y+fp_y+1e-6)
    rec_y  = tp_y/(tp_y+fn_y+1e-6)
    metrics_yolo.append({
        'split': split, 'seq': seq_id,
        'avg_acc': avg_y, 'precision': prec_y, 'recall': rec_y,
        'gt_counts': dict(gt_counter)
    })

    # SEQ‐LEVEL METRICS FRCNN
    avg_f = sum(accs_f)/len(accs_f) if accs_f else 0.0
    prec_f = tp_f/(tp_f+fp_f+1e-6)
    rec_f  = tp_f/(tp_f+fn_f+1e-6)
    metrics_frcnn.append({
        'split': split, 'seq': seq_id,
        'avg_acc': avg_f, 'precision': prec_f, 'recall': rec_f,
        'gt_counts': dict(gt_counter)
    })

# SAVE METRICS TO LOCAL FILES
with open("metrics_yolo.json", "w") as f:
    json.dump(metrics_yolo, f, indent=2)
with open("metrics_frcnn.json", "w") as f:
    json.dump(metrics_frcnn, f, indent=2)

# Optionally show some examples
for seq_id, filename, img in sample_frames:
    plt.imshow(img)
    plt.title(f"Seq {seq_id}, Frame {filename}\nPurple=YOLO, Blue=FRCNN, Red=GT")
    plt.axis("off")
    plt.show()


Using device: mps
Processing sequences...


Sequences:   0%|          | 0/106 [00:00<?, ?it/s]


0: 384x640 18 persons, 44.1ms
Speed: 1.9ms preprocess, 44.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x385d6ab60>
    label = <none> 
    device = <AGXG13XDevice: 0x145729800>
        name = Apple M1 Max 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x1065a6e00>
        label = <none> 
        device = <AGXG13XDevice: 0x145729800>
            name = Apple M1 Max 
    retainedReferences = 1
Sequences:   0%|          | 0/106 [00:09<?, ?it/s]


KeyboardInterrupt: 