In [2]:
import os

# Environment setup -- in a shell, first run:
#   conda env create -f yolo_env.yml
# then interact with this notebook within that environment (e.g. select it as the interpreter in VS Code).
# Print the current working directory so the relative paths used below can be checked.
!pwd

# Do you have the data? If not, download it with gdown.
# The check and the download both happen in the parent directory, so we step up one level first.
%cd ..
if not os.path.exists('data'):
    # Fetch the archive from Google Drive, unpack it, then delete the archive.
    # NOTE(review): presumably the archive unpacks into a `data/` directory -- confirm.
    !gdown 'https://drive.google.com/uc?id=1wH7yAYCNVi-64YKsnw9M-0GwkiqkDZaZ'
    !unzip data.zip
    !rm data.zip
# Step back into the notebook directory; this assumes it is named `CropVideo`.
%cd CropVideo

/home/charlie/ATPIL/CropVideo
/home/charlie/ATPIL
/home/charlie/ATPIL/CropVideo


# Video Cropping
This notebook demonstrates how videos are cropped to include only the player bounding boxes for further processing.
For this purpose, we use the [**YOLOv8**](https://ultralytics.com) object detection model to detect and track the players in the video frames. For each clip, the largest bounding-box width and height of the selected player across all frames (plus a small padding) define a fixed crop size, and every frame is cropped to a window of that size centered on the player. The cropped video frames are then stitched together to form the final video for each clip.

In [3]:
# Imports
from ultralytics import YOLO
import numpy as np
import os
import cv2
import json

In [4]:
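# Load the model: the YOLO model is instantiated inside the per-video loop in the next cell, so that the tracker state is reset for every clip.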


In [5]:

data_dir = '../data/full_clips'
out_dir = '../data/cropped_clips'

PERSON_CLASS = 0      # class id treated as "person" (index 0 in the COCO labels used by yolov8s.pt)
PADDING = 0.05        # fractional margin added to the crop width and height
MIN_PRESENCE = 0.9    # the chosen player must be tracked in at least this fraction of the frames


def _window_start(center, size, limit):
    """Return the start of a `size`-long window centered on `center`, shifted to stay inside [0, limit]."""
    return max(0, min(center - size // 2, limit - size))


os.makedirs(out_dir, exist_ok=True)

# Run on m1 mac
for v in os.listdir(data_dir):
    video = os.path.join(data_dir, v)
    print(video)

    cap = cv2.VideoCapture(video)
    if not cap.isOpened():
        # os.listdir also returns entries that are not videos (hidden files, sub-directories, ...)
        print(f'Could not open {v} as a video, skipping...')
        cap.release()
        continue
    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the playback speed
    fps = cap.get(cv2.CAP_PROP_FPS)

    # reset model trackers: a fresh model per video keeps tracking IDs from leaking between clips
    model = YOLO('yolov8s.pt')

    # ------------------------------------------------------------------
    # Pass 1: track every person and remember their box in each frame
    # ------------------------------------------------------------------
    # the player we want will be the human who spends the most time in the bottom middle half of the frame
    # NOTE: This is a very simple heuristic, and WILL PROBABLY NOT WORK for anything other than broadcast video
    detections = {}   # tracking id -> {frame index: normalized (x1, y1, x2, y2)}
    frameId = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        results = model.track(frame, persist=True, verbose=False)
        for r in results:
            # Move the boxes to host memory so they can be stored in NumPy arrays on any device
            for b in r.boxes.cpu().numpy():
                # each b is a detection; the tracker may not have assigned an id yet
                if b.id is None or int(b.cls[0]) != PERSON_CLASS:
                    continue
                track_id = int(b.id[0])
                detections.setdefault(track_id, {})[frameId] = b.xyxyn[0]
        frameId += 1
    cap.release()

    # Count the frames actually decoded instead of trusting CAP_PROP_FRAME_COUNT,
    # which is only container metadata and may not match the real number of frames
    num_frames = frameId
    if num_frames == 0 or not detections:
        print(f'No people detected in {v}, skipping...')
        continue

    # Dense per-person arrays (x1, y1, x2, y2); an all-zero row means "not detected in this frame"
    positions = {}
    for track_id, per_frame in detections.items():
        track_boxes = np.zeros((num_frames, 4))
        for frame_index, box in per_frame.items():
            track_boxes[frame_index] = box
        positions[track_id] = track_boxes

    # Average box center of each person over the frames in which they were detected
    average_positions = {}
    for k, track_boxes in positions.items():
        mean_box = np.mean(track_boxes[np.sum(track_boxes, axis=1) != 0], axis=0)
        average_positions[k] = ((mean_box[0] + mean_box[2]) / 2, (mean_box[1] + mean_box[3]) / 2)

    # likely candidate will be closest to (0.5, 0.75)
    target_x, target_y = 0.5, 0.75
    if 'front' in v:
        target_y = 0.25 # grab top player if front in video
    # candidate ids ordered by squared distance of their average center to the target point
    ids = sorted(
        average_positions,
        key=lambda k: (average_positions[k][0] - target_x) ** 2 + (average_positions[k][1] - target_y) ** 2,
    )

    # we need at least 90% of the frames to have a bounding box for the person or we skip the video
    person = None           # bounding box in each frame for the person we want
    person_present = None   # boolean mask: frames in which that person was detected
    for best_id, candidate in enumerate(ids):
        present = np.sum(positions[candidate], axis=1) != 0
        num_frames_appear = int(np.sum(present))
        if num_frames_appear >= MIN_PRESENCE * num_frames:
            person = positions[candidate]
            person_present = present
            break
        print(f"person {int(candidate)} does not appear in enough frames ({num_frames_appear} / {num_frames}), trying next person...")
    if person is None:
        print("No more people in the video")
        print(f'Skipping {v}')
        continue

    # find maximum height and width of the bounding box
    # (boxes are normalized, so heights scale by the frame height and widths by the frame width)
    bbox_max_h = np.max((person[:, 3] - person[:, 1]) * video_height)
    bbox_max_w = np.max((person[:, 2] - person[:, 0]) * video_width)

    # Add the padding, but never ask for a crop that is larger than the frame itself
    bbox_max_h = min(int(bbox_max_h * (1 + PADDING)), video_height)
    bbox_max_w = min(int(bbox_max_w * (1 + PADDING)), video_width)

    # ------------------------------------------------------------------
    # Pass 2: crop every frame around the chosen person
    # ------------------------------------------------------------------
    # for each frame in the video, crop to a max_height x max_width box centered on the person
    # and save to a new video in '../data/cropped_clips'
    # also save a json file with the actual bounding box for each frame of the video
    # in the same directory
    out_video = os.path.join(out_dir, v)
    # splitext keeps the JSON path distinct from the video path whatever the video extension is
    out_json = os.path.splitext(out_video)[0] + '.json'

    print("Cropped Video Dims: ", bbox_max_w, bbox_max_h)
    # Video writer (mp4v codec)
    out_cap = cv2.VideoWriter(out_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (bbox_max_w, bbox_max_h))
    # JSON writer
    json_data = []

    cap = cv2.VideoCapture(video)
    try:
        # Iterating over the frame index keeps it in step with the decoded frames,
        # including the frames that are skipped because the person is missing
        for idx in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                break

            if not person_present[idx]:
                print(f'No person in frame {idx}')
                continue

            x1, y1, x2, y2 = person[idx] # Normalized bounding box
            # Denormalize
            x1 = int(x1 * video_width)
            x2 = int(x2 * video_width)
            y1 = int(y1 * video_height)
            y2 = int(y2 * video_height)
            # Save the bounding box (video coords); 'frame' is the index in the source video
            json_data.append({'frame': idx, 'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})

            # Get the center of the bounding box
            center_x, center_y = ((x1 + x2) // 2, (y1 + y2) // 2)

            # Get the crop window: exactly bbox_max_w x bbox_max_h, centered on the person
            # and shifted inwards when it would cross a frame edge
            frame_h, frame_w = frame.shape[:2]
            x1_crop = _window_start(center_x, bbox_max_w, frame_w)
            y1_crop = _window_start(center_y, bbox_max_h, frame_h)
            cropped = frame[y1_crop:y1_crop + bbox_max_h, x1_crop:x1_crop + bbox_max_w]

            # The writer only accepts frames of the declared size; resize as a fallback
            # in case the decoded frame is smaller than the reported video dimensions
            if cropped.shape[0] != bbox_max_h or cropped.shape[1] != bbox_max_w:
                cropped = cv2.resize(cropped, (bbox_max_w, bbox_max_h))

            out_cap.write(cropped)
    finally:
        # Always release the reader and finalize the output file, even if a frame fails
        cap.release()
        out_cap.release()

    with open(out_json, 'w') as f:
        json.dump(json_data, f)

    print(f'Finished {v}')


Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:03<00:00, 7.19MB/s]


../data/full_clips/forehand.mp4
[31m[1mrequirements:[0m Ultralytics requirement ['lapx>=0.5.2'] not found, attempting AutoUpdate...
Collecting lapx>=0.5.2
  Downloading lapx-0.5.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.9 kB)
Downloading lapx-0.5.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m[31m2.0 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: lapx
Successfully installed lapx-0.5.9

[31m[1mrequirements:[0m AutoUpdate success ✅ 3.3s, installed 1 package: ['lapx>=0.5.2']
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

Cropped Video Dims:  274 307
Finished forehand.mp4
../data/full_clips/run_backhand.mp4
Cropped Video Dims:  226 292
Finished run_backhand.mp4
