# Differentiate players between teams

## Before you start

Let's make sure that we have access to a GPU. We can use the `nvidia-smi` command to check.

In [None]:
from ai.config import PROJ_ROOT
!nvidia-smi

## Install dependencies


Let's install the PyTorch library, but make sure you're installing a version compatible with your environment: https://pytorch.org/get-started/locally/

In [None]:
!python -m pip install torch torchvision torchaudio

Let's install the ultralytics library.

In [None]:
!python -m pip install ultralytics

Let's make sure we have the latest features in the supervision library by installing version `0.23.0` (the command below pins this exact version; change `==` to `>=` if you want a newer release).

In [None]:
!python -m pip install supervision==0.23.0

Let's install the transformers and sentencepiece libraries.

In [None]:
!python -m pip install 'transformers[torch]'
!python -m pip install sentencepiece

Let's install the numpy and more-itertools libraries.

In [None]:
!python -m pip install numpy
!python -m pip install more-itertools

Let's install the scikit-learn and umap-learn libraries.

In [None]:
!python -m pip install -U scikit-learn
!python -m pip install umap-learn

## Load player detection model

In [8]:
import os
import torch
from pathlib import Path
from ultralytics import YOLO


# Directory the notebook process is running from (current working directory).
NOTEBOOK_PATH = Path(os.getcwd())

# Pick the torch device: CUDA if available, otherwise Apple MPS, otherwise CPU.
# The MPS check goes through `torch.backends.mps`, the documented availability
# API; `torch.mps.is_available` is not present in every PyTorch release and
# would raise AttributeError on machines without CUDA.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

# Path is relative to the working directory the notebook is run from.
PLAYER_DETECTION_MODEL_PATH = "../models/player_inference.pt"
PLAYER_DETECTION_MODEL = YOLO(PLAYER_DETECTION_MODEL_PATH).to(DEVICE)

## Split players into teams

### Gathering training data

To gather training data, we'll sample one frame out of every 30 (one frame per second for a 30 fps video), detect players within those frames, and then crop them out.

In [9]:
import supervision as sv
from tqdm import tqdm


# Class id used to keep only player detections (compared against detections.class_id).
PLAYER_ID = 1
# Frame stride passed to the video frame generator: every 30th frame is processed.
STRIDE = 30

def extract_crops(source_video_path: str) -> list:
    """Collect an image crop for every player detected in a strided pass over a video.

    Frames are read with a stride of ``STRIDE``; each one is run through
    ``PLAYER_DETECTION_MODEL`` (confidence 0.3), class-agnostic NMS (threshold
    0.5) is applied, and only detections whose class id equals ``PLAYER_ID``
    are kept.

    Args:
        source_video_path: Path of the video to sample frames from.

    Returns:
        A flat list of cropped player images, in frame order.
    """
    crops = []
    frames = sv.get_video_frames_generator(source_path=source_video_path, stride=STRIDE)

    for frame in tqdm(frames, desc="collecting crops"):
        prediction = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]

        # NMS runs across all classes before the player filter is applied.
        kept = sv.Detections.from_ultralytics(prediction).with_nms(
            threshold=0.5, class_agnostic=True
        )
        player_detections = kept[kept.class_id == PLAYER_ID]

        crops.extend(sv.crop_image(frame, box) for box in player_detections.xyxy)

    return crops

In [None]:
SOURCE_VIDEO_PATH = "../data/test.mp4"
crops = extract_crops(SOURCE_VIDEO_PATH)

In [None]:
len(crops)

In [None]:
sv.plot_images_grid(crops[:100], grid_size=(10, 10))

### Calculating embeddings for each of the crops using SigLIP

Let's load the SigLIP model first.

In [None]:
from transformers import AutoProcessor, SiglipVisionModel


def load_siglip_model():
    """Load the SigLIP vision model and its processor from the same checkpoint.

    Returns:
        A ``(model, processor)`` tuple: the ``SiglipVisionModel`` moved to
        ``DEVICE`` and the ``AutoProcessor`` for the same checkpoint.
    """
    checkpoint = 'google/siglip-base-patch16-224'

    model = SiglipVisionModel.from_pretrained(checkpoint).to(DEVICE)
    processor = AutoProcessor.from_pretrained(checkpoint)

    return model, processor

In [14]:
EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR = load_siglip_model()

Let's run the SigLIP model on the crops.

In [15]:
import numpy as np
from more_itertools import chunked

BATCH_SIZE = 32

def extract_features(crops: list, embedding_model, embedding_processor) -> np.ndarray:
    """Embed image crops and mean-pool each crop's hidden states into one vector.

    Crops are processed in batches of ``BATCH_SIZE`` under ``torch.no_grad()``.
    For each batch the processor output is moved to ``DEVICE``, the model is
    called, and ``last_hidden_state`` is averaged over ``dim=1``.

    Args:
        crops: Images accepted by ``sv.cv2_to_pillow``.
        embedding_model: Model called as ``embedding_model(**inputs)`` whose
            output exposes ``last_hidden_state`` (e.g. ``SiglipVisionModel``).
        embedding_processor: Processor called with ``images=...`` and
            ``return_tensors="pt"``.

    Returns:
        A 2-D array with one row per crop. For empty ``crops`` an empty array
        of shape ``(0, hidden_size)`` is returned, where ``hidden_size`` is
        read from ``embedding_model.config.hidden_size`` when present and is
        0 otherwise.
    """
    if not crops:
        # np.concatenate raises ValueError on an empty list, which happens
        # whenever no crops were collected; return an empty result instead.
        config = getattr(embedding_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    data = []
    with torch.no_grad():
        for batch in tqdm(chunked(crops, BATCH_SIZE), desc='embedding extraction'):
            # Convert per batch so only one batch of PIL copies is alive at a time.
            images = [sv.cv2_to_pillow(crop) for crop in batch]
            inputs = embedding_processor(images=images, return_tensors="pt").to(DEVICE)
            outputs = embedding_model(**inputs)

            # Average over dim=1 (presumably the token/patch axis) to get one
            # vector per image.
            embeddings = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
            data.append(embeddings)

    return np.concatenate(data)

In [None]:
data = extract_features(crops, EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR)

In [None]:
data.shape

### Projecting our embeddings from (N, 768) to (N, 3) using UMAP and performing a two-cluster division using KMeans

In [None]:
import umap
from sklearn.cluster import KMeans


# UMAP reducer that projects the embeddings down to 3 components.
REDUCER = umap.UMAP(n_components=3)
# KMeans with two clusters, matching the two-cluster (two-team) split.
# NOTE(review): no random_state is passed to either model, so results may
# differ between runs — confirm whether reproducibility matters here.
CLUSTERING_MODEL = KMeans(n_clusters=2)

# Fit the reducer on the crop embeddings and keep the projected points.
projections = REDUCER.fit_transform(data)
# Fit KMeans on the projections; `clusters` holds one cluster index per crop.
clusters = CLUSTERING_MODEL.fit_predict(projections)

In [None]:
projections.shape

In [None]:
clusters[:10]

In [21]:
# Crops whose cluster index is 0; `crops` and `clusters` are paired by position.
team_0 = [image for image, label in zip(crops, clusters) if label == 0]

In [None]:
sv.plot_images_grid(team_0[:100], grid_size=(10, 10))

### Assigning goalkeepers to teams

In [23]:
import numpy as np
import supervision as sv

def resolve_goalkeepers_team_id(players: sv.Detections, goalkeepers: sv.Detections) -> np.ndarray:
    """Assign each goalkeeper to the team whose players' centroid is closest.

    Positions are the bottom-centre anchors of the detection boxes. Team
    centroids are the mean anchor of the players whose ``class_id`` is 0 and 1
    respectively.

    Args:
        players: Player detections whose ``class_id`` holds the team index (0 or 1).
        goalkeepers: Goalkeeper detections to assign.

    Returns:
        An integer array with one team index (0 or 1) per goalkeeper, in the
        same order as ``goalkeepers``. Empty when there are no goalkeepers.
    """
    goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)

    goalkeeper_count = len(goalkeepers_xy)
    if goalkeeper_count == 0:
        return np.array([], dtype=int)

    team_0_xy = players_xy[players.class_id == 0]
    team_1_xy = players_xy[players.class_id == 1]

    # The mean of an empty selection is NaN and every comparison with NaN is
    # False, so an absent team must be handled explicitly. With no team-0
    # players (or no players at all) the result is team 1, as before.
    if len(team_0_xy) == 0:
        return np.ones(goalkeeper_count, dtype=int)
    # With no team-1 players the only team present is team 0; the NaN
    # comparison previously sent every goalkeeper to the absent team 1.
    if len(team_1_xy) == 0:
        return np.zeros(goalkeeper_count, dtype=int)

    team_0_centroid = team_0_xy.mean(axis=0)
    team_1_centroid = team_1_xy.mean(axis=0)

    # Distance from every goalkeeper to each centroid, computed in one pass.
    dist_0 = np.linalg.norm(goalkeepers_xy - team_0_centroid, axis=1)
    dist_1 = np.linalg.norm(goalkeepers_xy - team_1_centroid, axis=1)

    # Ties go to team 1, matching the strict `<` comparison.
    return np.where(dist_0 < dist_1, 0, 1)

## Show results

In [None]:
# Collect player crops from the source video.
crops = extract_crops(SOURCE_VIDEO_PATH)

# Fit: load the embedding model, embed the crops, then fit new UMAP and
# KMeans models on those embeddings.
EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR = load_siglip_model()
data = extract_features(crops, EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR)

REDUCER = umap.UMAP(n_components=3)
CLUSTERING_MODEL = KMeans(n_clusters=2)

# fit_transform fits the reducer and returns the projections; KMeans is only
# fitted here (no labels are kept). The fitted REDUCER and CLUSTERING_MODEL
# are reused at prediction time via transform() / predict().
projections = REDUCER.fit_transform(data)
CLUSTERING_MODEL.fit(projections)

In [None]:
import supervision as sv

# Detect, team-assign and annotate the first frame of the source video.
SOURCE_VIDEO_PATH = "../data/test.mp4"
# Class ids used below to split the detections by role.
GOALKEEPER_ID = 0
PLAYER_ID = 1
REFEREE_ID = 2

# Both annotators share the same three-colour palette.
# NOTE(review): presumably colours are looked up by class_id (0/1 = the two
# clusters, 2 = referees, which keep REFEREE_ID) — confirm against the
# supervision annotators' colour lookup default.
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    thickness=2
)

label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER
)

# Tracker that supplies the tracker_id values used for the labels.
tracker = sv.ByteTrack()
tracker.reset()

# Only the first frame of the video is used in this cell.
frame_generator = sv.get_video_frames_generator(SOURCE_VIDEO_PATH)
frame = next(frame_generator)

result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]
detections = sv.Detections.from_ultralytics(result)

# Class-agnostic NMS first, then the tracker update.
all_detections = detections.with_nms(threshold=0.5, class_agnostic=True)
all_detections = tracker.update_with_detections(detections=all_detections)

# Split by role using the detector's class ids.
goalkeepers_detections = all_detections[all_detections.class_id == GOALKEEPER_ID]
players_detections = all_detections[all_detections.class_id == PLAYER_ID]
referees_detections = all_detections[all_detections.class_id == REFEREE_ID]

players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]

# Predict: embed the player crops, project them with the already-fitted
# REDUCER, and overwrite each player's class_id with its cluster index.
# NOTE(review): assumes the frame contains at least one player detection —
# confirm how the empty case should be handled.
data = extract_features(players_crops, EMBEDDINGS_MODEL, EMBEDDINGS_PROCESSOR)
projections = REDUCER.transform(data)
players_detections.class_id = CLUSTERING_MODEL.predict(projections)

# Goalkeepers get a team index from the player positions, so this must run
# after the players' class_id has been replaced above.
goalkeepers_detections.class_id = resolve_goalkeepers_team_id(players_detections, goalkeepers_detections)

all_detections = sv.Detections.merge([players_detections, goalkeepers_detections, referees_detections])

# One "#<tracker_id>" label per merged detection, in merged order.
labels = [
    f"#{tracker_id}"
    for tracker_id
    in all_detections.tracker_id
]

# Cast class_id to int after the merge of the three groups.
all_detections.class_id = all_detections.class_id.astype(int)

# Draw on a copy so `frame` itself is left untouched.
annotated_frame = frame.copy()

annotated_frame = ellipse_annotator.annotate(
    scene=annotated_frame,
    detections=all_detections)

annotated_frame = label_annotator.annotate(
    scene=annotated_frame,
    detections=all_detections,
    labels=labels)

sv.plot_image(annotated_frame)