In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import pickle as pkl
import sys

import warnings
# Suppress every warning for the whole session. NOTE(review): this also hides
# deprecation and numerical warnings from pandas/torch/cv2 in later cells.
warnings.simplefilter("ignore")

# Display all columns of a DataFrame and let pandas pick the output width,
# so wide frames are not truncated in cell output.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# Render matplotlib figures inline, and reload imported modules (e.g. the
# project's `src` package) before each cell runs so edits are picked up
# without restarting the kernel.
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import cv2

# Load YOLOv5 through torch.hub using the repository's "custom" entry point,
# pointing it at a local weights file (path is relative to the working directory).
yolov5 = torch.hub.load(
    "ultralytics/yolov5", 
    "custom", 
    "yolov5/weights/yolov5x.pt",
)

# Location of the raw video that the rest of the notebook reads frames from.
videos_dir = "data/raw"
filename = "video_1.mp4"
filepath = os.path.join(videos_dir, filename)

Using cache found in /home/clifford/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-1-11 Python-3.10.16 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4060, 7933MiB)

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients, 205.5 GFLOPs
Adding AutoShape... 


The pretrained YOLOv5 model is not accurate enough to pick up the position of the ball reliably, so it needs to be fine-tuned on our own dataset.

In [3]:
from src.processing import get_ball_locations

# The detection pass over the video is left commented out; the notebook reads
# its saved result from CSV instead. Uncomment the three lines below to
# regenerate the CSV (presumably this re-runs the model over the whole video —
# confirm in src.processing).
# capture = cv2.VideoCapture(filepath)
# ball_locations = get_ball_locations(yolov5, capture, batch_size = 100)

# ball_locations.to_csv("data/processed/ball_locations.csv")

# Load the cached detections; the first CSV column is the saved DataFrame index.
ball_locations = pd.read_csv("data/processed/ball_locations.csv", index_col = 0)
print(ball_locations.shape)
ball_locations.head()

(5313, 8)


Unnamed: 0,xmin,ymin,xmax,ymax,confidence,class,name,frame_num
0,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,1
1,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,2
2,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,3
3,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,4
4,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,5


In [4]:
# Number of detections recorded for each frame.
frame_counts = ball_locations.groupby("frame_num").size()

# Frames where the detector reported more than one ball.
multiple_annots = frame_counts.loc[frame_counts.gt(1)]

print(multiple_annots.shape)
multiple_annots.head()

(183,)


frame_num
769     2
2346    2
2347    2
2348    2
2349    2
dtype: int64

In [5]:
# Only keep the highest confidence annot for each frame.
#
# Each row is ranked inside its frame by descending confidence, so rank 1 is
# the best detection of that frame:
#   * method="first" breaks ties by row order, which guarantees exactly one
#     surviving row per frame even when two detections share the top score;
#   * na_option="bottom" ranks missing confidences last, so they are only kept
#     when a frame has no valid confidence at all.
# The filter is applied positionally (via .to_numpy()), so it does not rely on
# the index labels of `ball_locations` being unique.
confidence_rank = ball_locations.groupby("frame_num")["confidence"].rank(
    method = "first",
    ascending = False,
    na_option = "bottom",
)
is_best = confidence_rank.to_numpy() == 1

# Index labels of the discarded rows, kept under the original name for inspection.
to_drop = ball_locations.index[~is_best].tolist()

single_annots = ball_locations[is_best]
print(single_annots.shape)
single_annots.head()

(5123, 8)


Unnamed: 0,xmin,ymin,xmax,ymax,confidence,class,name,frame_num
0,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,1
1,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,2
2,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,3
3,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,4
4,193.207718,240.91951,198.226913,246.77951,0.591428,32,sports ball,5


In [6]:
from src.utils import plot_image
from tqdm import tqdm

# Detections below this confidence are not trusted enough to become training labels.
min_confidence = 0.60

capture = cv2.VideoCapture(filepath)
total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)

label_cutouts = []
cutout_frames = []
# Sorted ascending because the video is consumed in one forward pass.
label_frames = np.sort(single_annots["frame_num"].unique())

# Frame numbers are 1-based: `frame` always holds video frame `frame_num`.
frame_num = 1
ret, frame = capture.read()
try:
    # groupby yields the frames in ascending order together with their rows,
    # which avoids re-filtering the whole table for every labelled frame.
    annots_by_frame = single_annots.groupby("frame_num")
    for label_frame_num, locations in tqdm(annots_by_frame, total = len(label_frames)):
        # Seek forward to the labelled frame; stop as soon as the video ends
        # instead of spinning forever on failed reads.
        while ret and frame_num < label_frame_num:
            ret, frame = capture.read()
            frame_num += 1

        if not ret:
            # Video ended before this label; the remaining labels cannot be matched.
            break
        if frame_num != label_frame_num:
            # Label refers to a frame before the current position (e.g. frame 0).
            continue

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        for _, row in locations.iterrows():
            if row["confidence"] < min_confidence:
                continue

            # Box corners are truncated to whole pixels for slicing.
            x1, y1, x2, y2 = row[["xmin", "ymin", "xmax", "ymax"]].astype(int).values
            cutout = frame_rgb[y1: y2, x1: x2]
            label_cutouts.append(cutout)
            cutout_frames.append(label_frame_num)
finally:
    # Always free the decoder handle, even if a frame fails to convert.
    capture.release()

len(label_cutouts), len(cutout_frames)

100%|██████████| 5123/5123 [00:05<00:00, 933.17it/s] 


(2119, 2119)

In [7]:
# Prepare train val split

# 70% train, 30% val
np.random.seed(42)

# Shuffle a copy so the order of `cutout_frames` itself is left untouched.
shuffled_idx = list(cutout_frames)
np.random.shuffle(shuffled_idx)

# First 70% of the shuffled frame numbers go to train, the remainder to val.
train_stop = int(len(shuffled_idx) * 0.70)

shuffled_arr = np.asarray(shuffled_idx)
train_frames = shuffled_arr[:train_stop]
val_frames = shuffled_arr[train_stop:]
print(train_frames.size, val_frames.size)

# A frame number must never appear on both sides of the split.
overlap = np.intersect1d(train_frames, val_frames)
assert overlap.size == 0

1483 636


In [8]:
# Store the frames needed as an image file and create the .txt labels for each image.
#
# Label lines use the YOLO text format: "<class> <x_center> <y_center> <width> <height>",
# with all four box values normalised by the image size.

img_dir = "data/retraining/images"
label_dir = "data/retraining/labels"

# cv2.imwrite fails silently and open() raises when the target directory is
# missing, so make sure both splits exist up front.
for split_name in ("train", "val"):
    os.makedirs(os.path.join(img_dir, split_name), exist_ok = True)
    os.makedirs(os.path.join(label_dir, split_name), exist_ok = True)

# Sets give constant-time membership tests; these are checked once per video frame.
wanted_frames = {int(f) for f in cutout_frames}
train_frame_set = {int(f) for f in train_frames}

capture = cv2.VideoCapture(filepath)
# Frame numbers are 1-based, matching the `frame_num` column of the detections.
frame_num = 1

try:
    while capture.isOpened():
        ret, frame = capture.read()
        if not ret:
            break

        if frame_num in wanted_frames:
            sub_dir = "train" if frame_num in train_frame_set else "val"
            img_name = f"frame_{frame_num}"
            img_path = os.path.join(
                img_dir,
                sub_dir,
                f"{img_name}.jpg"
            )
            # imwrite reports failure through its return value, not an exception.
            if not cv2.imwrite(img_path, frame):
                raise OSError(f"Could not write image to {img_path}")

            # Normalise by the size of the frame actually written to disk
            # rather than an assumed resolution.
            img_height, img_width = frame.shape[:2]

            locations = single_annots[single_annots["frame_num"] == frame_num]
            if len(locations) > 0:
                label_path = os.path.join(
                    label_dir,
                    sub_dir,
                    f"{img_name}.txt"
                )
                # "w" (not "a") so re-running the cell does not duplicate labels;
                # the context manager closes the file even if a row fails.
                with open(label_path, "w") as label_file:
                    for _, row in locations.iterrows():
                        # Keep sub-pixel precision: the ball is only a few
                        # pixels wide, so integer rounding shifts the box noticeably.
                        x1, y1, x2, y2 = row[["xmin", "ymin", "xmax", "ymax"]].astype(float)
                        width = x2 - x1
                        height = y2 - y1

                        norm_x_center = (x1 + width / 2) / img_width
                        norm_y_center = (y1 + height / 2) / img_height
                        norm_width = width / img_width
                        norm_height = height / img_height

                        # Class id 32 is the "sports ball" class of the detections table.
                        # NOTE(review): confirm it matches the class index in the
                        # retraining dataset config.
                        label_file.write(
                            f"32 {norm_x_center:.6f} {norm_y_center:.6f} {norm_width:.6f} {norm_height:.6f}\n"
                        )

        frame_num += 1
finally:
    # Always free the decoder handle, even if writing fails part-way through.
    capture.release()