# Building NN to Identify Individual Players #

## Motivation ##

Player performance analysis requires a strong tracking mechanism. Position-based trackers such as ByteTrack can mistake identities when multiple detections overlap. Pre-trained identification algorithms like SigLip also perform very poorly, in part due to the very low resultion of the detection crops (which can be as small as 10 x 30 pixels).

We will attempt at training a model specific for our purposes, leveraging  tracklet-based self-supervision to create a triplet ([A]nchor, [P]ositive and [N]egative) data set to be fed into a siemese NN.

## Common Elements ##

In [None]:
import sys
import os

# Set this to the absolute path of your project root
project_root = "/Users/fernandomousinho/Documents/Learning_to_Code/LaxAI"
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from typing import Optional, List, Dict, Tuple
import torch
import supervision as sv
from tqdm import tqdm
from collections import deque, defaultdict
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from PIL import Image
import umap
import json

from modules.detection import DetectionModel
from tools.store_driver import Store
from modules.custom_tracker import AffineAwareByteTrack

W0708 20:38:07.787000 94918 .venv312/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [13]:
input_video = "/Users/fernandomousinho/Library/CloudStorage/GoogleDrive-fmousinho76@gmail.com/My Drive/Colab_Notebooks/FCA_Upstate_NY_003.mp4"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
store = Store()
debug_max_frames = None

## Curate Training Data ##

In [3]:
RESULT_JSON_FILE_PATH = "detections.json"

In [None]:
video_info = sv.VideoInfo.from_video_path(video_path=input_video)
generator_params = {
    "source_path": input_video,
    "end": debug_max_frames if debug_max_frames else video_info.total_frames,
}
frames_generator = sv.get_video_frames_generator(**generator_params)
model = DetectionModel(store=store, device=device)

tracker = AffineAwareByteTrack(id_type='external', maintain_separate_track_obj=False)

frame_target = debug_max_frames if debug_max_frames else video_info.total_frames

frame_generator = sv.get_video_frames_generator(stride=1, **generator_params)

json_sink = sv.JSONSink(RESULT_JSON_FILE_PATH)

previous_frame: Optional[np.ndarray] = None
frame_id = 0

with json_sink as sink:
    for frame in tqdm(frame_generator, desc="Processing frames", total=frame_target):
        all_detections = model.generate_detections(frame)
        all_detections = all_detections.with_nms(threshold=0.4, class_agnostic=False)

        if previous_frame is not None:
            affine_matrix = tracker.calculate_affine_transform(previous_frame, frame)
        else:
            affine_matrix = tracker.get_identity_affine_matrix()
        previous_frame = frame.copy()

        all_detections = tracker.update_with_transform(
            detections=all_detections,
            frame=frame,
            affine_matrix=affine_matrix
        )
        sink.append(all_detections, custom_data={"frame_id": frame_id})
        frame_id += 1


In [4]:
def json_to_detections(json_file: str) -> List[sv.Detections]:
    rows_by_frame_number = defaultdict(list)
    with open(json_file, "r") as f:
        data = json.load(f)
    for row in data:
        frame_number = int(row["frame_id"])
        rows_by_frame_number[frame_number].append(row)

    detections_list = []
    for frame_number, rows in rows_by_frame_number.items():
        xyxy = []
        class_id = []
        confidence = []
        tracker_id = []
        custom_data = defaultdict(list)

        for row in rows:
            xyxy.append([row[key] for key in ["x_min", "y_min", "x_max", "y_max"]])
            class_id.append(row["class_id"])
            confidence.append(row["confidence"])
            tracker_id.append(row["tracker_id"])

            for custom_key in row.keys():
                if custom_key in ["x_min", "y_min", "x_max", "y_max", "class_id", "confidence", "tracker_id"]:
                    continue
                custom_data[custom_key].append(row[custom_key])

        if all([val == "" for val in class_id]):
            class_id = None
        if all([val == "" for val in confidence]):
            confidence = None
        if all([val == "" for val in tracker_id]):
            tracker_id = None

        detections_list.append(
            sv.Detections(
                xyxy=np.array(xyxy, dtype=np.float32),
                class_id=np.array(class_id, dtype=int),
                confidence=np.array(confidence, dtype=np.float32),
                tracker_id=np.array(tracker_id, dtype=int),
                data=dict(custom_data)
            )
        )
    
    return detections_list

In [5]:
all_detections = json_to_detections(RESULT_JSON_FILE_PATH)

In [9]:
all_detections[100].xyxy.shape, all_detections[0].class_id.shape, all_detections[0].confidence.shape, all_detections[0].tracker_id.shape

((11, 4), (6,), (6,), (6,))

In [None]:
import shutil
import random
import os

random.seed(42)  # For reproducibility

src_data_dir = "data"
train_dir = "data/train"
val_dir = "data/val"

# Create train and val directories
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# For each tracker_id directory in data/
for track_id in os.listdir(src_data_dir):
    track_path = os.path.join(src_data_dir, track_id)
    if not os.path.isdir(track_path):
        continue

    # List all crop files for this track
    crop_files = [f for f in os.listdir(track_path) if f.endswith('.jpg')]
    random.shuffle(crop_files)

    split_idx = int(0.8 * len(crop_files))
    train_files = crop_files[:split_idx]
    val_files = crop_files[split_idx:]

    # Create per-track folders in train/ and val/
    train_track_dir = os.path.join(train_dir, track_id)
    val_track_dir = os.path.join(val_dir, track_id)
    os.makedirs(train_track_dir, exist_ok=True)
    os.makedirs(val_track_dir, exist_ok=True)

    # Copy files
    for fname in train_files:
        src = os.path.join(track_path, fname)
        dst = os.path.join(train_track_dir, fname)
        shutil.copy2(src, dst)

    for fname in val_files:
        src = os.path.join(track_path, fname)
        dst = os.path.join(val_track_dir, fname)
        shutil.copy2(src, dst)

print(f"Done! Crops split into '{train_dir}' and '{val_dir}' with per-track structure.")

In [None]:
import os
from tqdm import tqdm
import cv2
from collections import deque

# Create the main data directory
os.makedirs("data", exist_ok=True)

# Build a mapping from track_id to list of (frame_id, bbox, confidence)
track_detections = {}

frame_generator = sv.get_video_frames_generator(stride=1, source_path=input_video)
frame_idx = 0
next_detected_frame = all_detections[0].data["frame_id"][0] if all_detections else 0
all_detections_dq = deque(all_detections)

for frame in tqdm(frame_generator, desc="Processing frames for crop extraction"):
    if frame_idx != next_detected_frame:
        frame_idx += 1
        continue
    
    detections = all_detections_dq.popleft()
    
    # Extract crops for each detection in this frame
    for i in range(len(detections.xyxy)):
        frame_id = detections.data["frame_id"][i]
        bbox = detections.xyxy[i]  # [x1, y1, x2, y2]
        tracker_id = detections.tracker_id[i]
        confidence = detections.confidence[i]
        
        if tracker_id is None:
            continue
            
        # Create folder for this tracker_id if it doesn't exist
        track_folder = os.path.join("data", str(tracker_id))
        os.makedirs(track_folder, exist_ok=True)
        
        # Extract crop from frame
        x1, y1, x2, y2 = map(int, bbox)
        crop = frame[y1:y2, x1:x2]
        
        # Save crop with filename: frame_id_confidence.jpg
        crop_filename = f"{frame_id}_{confidence:.3f}.jpg"
        crop_path = os.path.join(track_folder, crop_filename)
        cv2.imwrite(crop_path, crop)
    
    # Update for next frame
    if len(all_detections_dq) > 0:
        next_detected_frame = all_detections_dq[0].data["frame_id"][0]
    else:
        break
    frame_idx += 1

print(f"Crop extraction complete! Check the 'data' directory for organized crops.")



In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os
import random
import numpy as np

class LacrossePlayerDataset(Dataset):
    """
    Custom Dataset for loading lacrosse player crops for triplet loss.
    Each player's crops are expected to be in a separate folder.
    """
    def __init__(self, image_dir, transform=None, min_images_per_player=3):
        self.image_dir = image_dir
        self.transform = transform
        self.min_images_per_player = min_images_per_player
        
        # Get all player directories
        all_players = [d for d in os.listdir(image_dir) if os.path.isdir(os.path.join(image_dir, d))]
        
        # Filter players with sufficient images
        self.players = []
        self.player_to_images = {}
        
        for player in all_players:
            player_images = [os.path.join(image_dir, player, img) 
                           for img in os.listdir(os.path.join(image_dir, player))
                           if img.lower().endswith(('.jpg', '.png', '.jpeg'))]
            
            # Only include players with enough images for triplet sampling
            if len(player_images) >= self.min_images_per_player:
                self.players.append(player)
                self.player_to_images[player] = player_images
        
        if len(self.players) < 2:
            raise ValueError(f"Need at least 2 players with {min_images_per_player}+ images each. Found {len(self.players)} valid players.")
        
        # Create list of all valid images
        self.all_images = []
        for player in self.players:
            self.all_images.extend(self.player_to_images[player])
        
        self.player_indices = {player: i for i, player in enumerate(self.players)}
        
        print(f"Dataset initialized with {len(self.players)} players and {len(self.all_images)} total images")
        for player in self.players:
            print(f"  Player {player}: {len(self.player_to_images[player])} images")

    def __len__(self):
        return len(self.all_images)

    def __getitem__(self, index):
        # Anchor image
        anchor_path = self.all_images[index]
        anchor_player = os.path.basename(os.path.dirname(anchor_path))
        anchor_label = self.player_indices[anchor_player]
        
        try:
            anchor_img = Image.open(anchor_path).convert('RGB')
        except Exception as e:
            print(f"Error loading anchor image {anchor_path}: {e}")
            # Return the first valid image as fallback
            anchor_path = self.all_images[0]
            anchor_player = os.path.basename(os.path.dirname(anchor_path))
            anchor_label = self.player_indices[anchor_player]
            anchor_img = Image.open(anchor_path).convert('RGB')

        # Select a positive image (different image of the same player)
        positive_list = self.player_to_images[anchor_player]
        if len(positive_list) < 2:
            # If only one image, use the same image (shouldn't happen due to filtering)
            positive_path = anchor_path
        else:
            # Ensure positive is different from anchor
            positive_candidates = [p for p in positive_list if p != anchor_path]
            if positive_candidates:
                positive_path = random.choice(positive_candidates)
            else:
                positive_path = random.choice(positive_list)  # Fallback
        
        try:
            positive_img = Image.open(positive_path).convert('RGB')
        except Exception as e:
            print(f"Error loading positive image {positive_path}: {e}")
            positive_img = anchor_img  # Use anchor as fallback
        
        # Select a negative image (image of a different player)
        negative_candidates = [p for p in self.players if p != anchor_player]
        if not negative_candidates:
            # This shouldn't happen if we have at least 2 players
            negative_player = anchor_player
        else:
            negative_player = random.choice(negative_candidates)
        
        negative_path = random.choice(self.player_to_images[negative_player])
        
        try:
            negative_img = Image.open(negative_path).convert('RGB')
        except Exception as e:
            print(f"Error loading negative image {negative_path}: {e}")
            negative_img = anchor_img  # Use anchor as fallback

        # Apply transformations
        if self.transform:
            try:
                anchor_img = self.transform(anchor_img)
                positive_img = self.transform(positive_img)
                negative_img = self.transform(negative_img)
            except Exception as e:
                print(f"Error applying transforms: {e}")
                # Return tensors without transforms as fallback
                anchor_img = transforms.ToTensor()(anchor_img)
                positive_img = transforms.ToTensor()(positive_img)
                negative_img = transforms.ToTensor()(negative_img)

        return anchor_img, positive_img, negative_img, torch.tensor(anchor_label)

# Define data augmentations
# Note: The input size (80, 40) matches your model expectations
data_transforms = transforms.Compose([
    transforms.Resize((80, 40)), # Height, Width
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [16]:
import torch.nn as nn
import torchvision.models as models

class SiameseNet(nn.Module):
    """
    A Siamese network that uses a pre-trained ResNet as a backbone
    to generate feature embeddings for player crops.
    """
    def __init__(self, embedding_dim=128):
        super(SiameseNet, self).__init__()
        # Use a pre-trained ResNet, but remove its final classification layer
        self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        
        # Modify the first conv layer to be more suitable for small images if needed
        # For example, smaller kernel and stride
        self.backbone.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        
        # Get the number of features from the backbone's output
        num_ftrs = self.backbone.fc.in_features
        
        # Replace the final layer with our embedding layer
        self.backbone.fc = nn.Linear(num_ftrs, embedding_dim)

    def forward(self, x):
        # The forward pass returns the embedding vector
        embedding = self.backbone(x)
        # L2-normalize the embedding
        embedding = nn.functional.normalize(embedding, p=2, dim=1)
        return embedding

    def forward_triplet(self, anchor, positive, negative):
        # Helper function to compute embeddings for a triplet
        emb_anchor = self.forward(anchor)
        emb_positive = self.forward(positive)
        emb_negative = self.forward(negative)
        return emb_anchor, emb_positive, emb_negative

In [17]:
def train_model(model, dataloader, optimizer, loss_fn, num_epochs=20):
    """The main training loop"""
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        batch_count = 0
        
        for i, (anchor, positive, negative, _) in enumerate(dataloader):
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            emb_anchor, emb_positive, emb_negative = model.forward_triplet(anchor, positive, negative)
            
            # Calculate loss
            loss = loss_fn(emb_anchor, emb_positive, emb_negative)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            batch_count += 1
            if (i + 1) % 10 == 0:  # Reduced frequency for notebook
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")
        
        if batch_count > 0:
            epoch_loss = running_loss / batch_count
        else:
            epoch_loss = 0.0
        print(f"--- Epoch {epoch+1} Summary ---")
        print(f"Average Loss: {epoch_loss:.4f}\n")
        
    print("Finished Training")
    return model

# --- Main training block for notebook ---
# Hyperparameters
TRAIN_DIR = 'data/train'  # Fixed path
EMBEDDING_DIM = 128
LEARNING_RATE = 0.001
BATCH_SIZE = 16  # Reduced batch size for stability
NUM_EPOCHS = 10   # Reduced epochs for testing
MARGIN = 0.5 # Margin for the triplet loss

# Check if train directory exists and has data
if not os.path.exists(TRAIN_DIR):
    print(f"Error: {TRAIN_DIR} directory does not exist!")
    print("Available directories:", [d for d in os.listdir('.') if os.path.isdir(d)])
else:
    train_folders = [d for d in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, d))]
    print(f"Found {len(train_folders)} player folders in {TRAIN_DIR}")
    
    if len(train_folders) < 2:
        print("Error: Need at least 2 player folders for triplet loss training!")
    else:
        # 1. Setup Dataset and DataLoader
        train_dataset = LacrossePlayerDataset(image_dir=TRAIN_DIR, transform=data_transforms)
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)  # Fixed num_workers
        
        print(f"Dataset size: {len(train_dataset)} images")
        print(f"Number of batches: {len(train_dataloader)}")
        
        # 2. Initialize Model, Loss, and Optimizer
        siamese_model = SiameseNet(embedding_dim=EMBEDDING_DIM)
        triplet_loss = nn.TripletMarginLoss(margin=MARGIN, p=2)
        optimizer = torch.optim.Adam(siamese_model.parameters(), lr=LEARNING_RATE)
        
        # 3. Start Training
        trained_model = train_model(siamese_model, train_dataloader, optimizer, triplet_loss, num_epochs=NUM_EPOCHS)
        
        # 4. Save the trained model
        torch.save(trained_model.state_dict(), 'lacrosse_reid_model.pth')
        print("Model saved to lacrosse_reid_model.pth")

Found 61 player folders in data/train
Dataset initialized with 59 players and 10072 total images
  Player 59: 4 images
  Player 50: 108 images
  Player 57: 48 images
  Player 32: 66 images
  Player 35: 323 images
  Player 56: 55 images
  Player 51: 54 images
  Player 58: 50 images
  Player 34: 302 images
  Player 33: 240 images
  Player 20: 590 images
  Player 18: 122 images
  Player 27: 225 images
  Player 9: 20 images
  Player 11: 25 images
  Player 7: 180 images
  Player 29: 66 images
  Player 16: 103 images
  Player 42: 44 images
  Player 45: 71 images
  Player 6: 578 images
  Player 28: 58 images
  Player 17: 29 images
  Player 1: 232 images
  Player 10: 185 images
  Player 19: 81 images
  Player 26: 278 images
  Player 8: 156 images
  Player 21: 48 images
  Player 44: 85 images
  Player 43: 35 images
  Player 38: 125 images
  Player 36: 110 images
  Player 31: 325 images
  Player 54: 102 images
  Player 53: 139 images
  Player 30: 329 images
  Player 37: 112 images
  Player 39: 1

In [18]:
import os
import shutil

src_data_dir = "data/orig"
all_crops_dir = os.path.join("data", "all_unfiltered_crops")
os.makedirs(all_crops_dir, exist_ok=True)

for track_id in os.listdir(src_data_dir):
    track_path = os.path.join(src_data_dir, track_id)
    if not os.path.isdir(track_path) or track_id == "all_unfiltered_crops":
        continue
    for fname in os.listdir(track_path):
        if fname.lower().endswith(('.jpg', '.png')):
            src = os.path.join(track_path, fname)
            dst = os.path.join(all_crops_dir, fname)
            # If duplicate filenames exist, add track_id as prefix
            if os.path.exists(dst):
                dst = os.path.join(all_crops_dir, f"{track_id}_{fname}")
            shutil.copy2(src, dst)

print(f"All crops copied to '{all_crops_dir}' (flat, ungrouped).")

All crops copied to 'data/all_unfiltered_crops' (flat, ungrouped).


In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
from sklearn.cluster import DBSCAN
import os
import shutil
from collections import Counter
from tqdm import tqdm
import time

# --- CONFIGURATION ---
# Paths
INITIAL_MODEL_PATH = 'lacrosse_reid_model.pth'
ALL_CROPS_DIR = 'data/all_unfiltered_crops/'
CLUSTERED_DATA_DIR = 'data/clustered_train/'

# Model & Data Loader Params
EMBEDDING_DIM = 128
BATCH_SIZE = 128 # Use a larger batch size for fast inference

# DBSCAN Hyperparameters (IMPORTANT: You MUST tune these)
# eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.
# This is the most critical parameter to tune. Start around 0.5-0.7 and adjust.
DBSCAN_EPS = 0.05 
# min_samples: The number of samples in a neighborhood for a point to be considered as a core point.
DBSCAN_MIN_SAMPLES = 20

# --- STEP 1: HELPER DATASET FOR INFERENCE ---
# A simple dataset to load individual images for generating embeddings
class InferenceDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return img, img_path

# --- STEP 2: GENERATE EMBEDDINGS ---
def generate_all_embeddings(model, dataloader, device):
    """Uses the model to generate embeddings for all images in the dataloader."""
    model.eval() # Set model to evaluation mode
    all_embeddings = []
    all_paths = []

    print(f"🚀 Generating embeddings for {len(dataloader.dataset)} images...")
    start_time = time.time()
    
    with torch.no_grad():
        for i, (images, paths) in enumerate(tqdm(dataloader, desc="Generating embeddings")):
            images = images.to(device)
            embeddings = model(images)
            all_embeddings.append(embeddings.cpu().numpy())
            all_paths.extend(paths)

    elapsed_time = time.time() - start_time
    print(f"✅ Embedding generation complete! ({elapsed_time:.2f}s, {len(all_paths)} images)")
    return np.vstack(all_embeddings), all_paths

# --- STEP 3: CLUSTER EMBEDDINGS ---
def cluster_embeddings(embeddings, eps, min_samples):
    """Performs DBSCAN clustering on the embedding vectors."""
    print(f"\n🧠 Starting DBSCAN clustering on {embeddings.shape[0]} embeddings...")
    print(f"   📊 Embedding dimensions: {embeddings.shape[1]}")
    print(f"   ⚙️  Parameters: eps={eps}, min_samples={min_samples}")
    
    start_time = time.time()
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean', n_jobs=-1)
    
    print("   🔄 Running DBSCAN algorithm...")
    cluster_labels = clustering.fit_predict(embeddings)
    
    elapsed_time = time.time() - start_time
    
    # Print clustering summary
    unique_labels = set(cluster_labels)
    num_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
    num_noise = np.sum(cluster_labels == -1)
    
    print(f"✨ Clustering complete! ({elapsed_time:.2f}s)")
    print(f"   🎯 Found {num_clusters} unique players (clusters)")
    print(f"   🗑️  {num_noise} images classified as noise ({num_noise/len(cluster_labels)*100:.1f}%)")
    
    # Show cluster size distribution
    if num_clusters > 0:
        cluster_counts = Counter(cluster_labels)
        if -1 in cluster_counts:
            del cluster_counts[-1]  # Remove noise count
        
        print(f"   📈 Cluster sizes: min={min(cluster_counts.values())}, max={max(cluster_counts.values())}, avg={np.mean(list(cluster_counts.values())):.1f}")
    
    return cluster_labels

# --- STEP 4: REORGANIZE DATASET BASED ON CLUSTERS ---
def reorganize_data_into_clusters(image_paths, cluster_labels, output_dir):
    """Copies image files into new folders named after their cluster ID."""
    print(f"\n📁 Reorganizing data into '{output_dir}'...")
    
    if os.path.exists(output_dir):
        print("   🧹 Cleaning up existing directory...")
        shutil.rmtree(output_dir) # Clean up old directory
    os.makedirs(output_dir)
    
    # Create a mapping from image path to its cluster label
    path_to_label = dict(zip(image_paths, cluster_labels))
    
    # Count clusters and create directories
    valid_labels = [label for label in cluster_labels if label != -1]
    unique_clusters = set(valid_labels)
    
    print(f"   📂 Creating {len(unique_clusters)} cluster directories...")
    for label in unique_clusters:
        cluster_dir = os.path.join(output_dir, f"player_{label:04d}")
        os.makedirs(cluster_dir, exist_ok=True)
    
    copied_count = 0
    print("   📋 Copying files to cluster directories...")
    
    for path, label in tqdm(path_to_label.items(), desc="Copying files"):
        # Ignore noise points (label -1)
        if label == -1:
            continue
            
        # Create a directory for the new cluster ID if it doesn't exist
        cluster_dir = os.path.join(output_dir, f"player_{label:04d}")
        
        # Copy the file
        filename = os.path.basename(path)
        dst_path = os.path.join(cluster_dir, filename)
        shutil.copy2(path, dst_path)
        copied_count += 1
        
    print(f"✅ Successfully copied {copied_count} images into {len(unique_clusters)} cluster folders")
    
    # Show final statistics
    if len(unique_clusters) > 0:
        print(f"   📊 Average images per cluster: {copied_count/len(unique_clusters):.1f}")


# --- MAIN EXECUTION SCRIPT ---

print("🎯 Starting Player Re-identification Clustering Pipeline")
print("=" * 60)

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"🖥️  Using device: {device}")

# Check if required files exist
if not os.path.exists(INITIAL_MODEL_PATH):
    print(f"❌ Error: Model file '{INITIAL_MODEL_PATH}' not found!")
    print("   Please ensure the model has been trained and saved first.")
else:
    # Create all_unfiltered_crops directory if it doesn't exist
    if not os.path.exists(ALL_CROPS_DIR):
        print(f"📂 Creating '{ALL_CROPS_DIR}' directory...")
        os.makedirs(ALL_CROPS_DIR, exist_ok=True)
        
        # Copy all crops from individual tracker folders to the all_crops directory
        data_dir = "data"
        if os.path.exists(data_dir):
            print("   🔄 Copying crops from individual tracker folders...")
            for item in os.listdir(data_dir):
                item_path = os.path.join(data_dir, item)
                if os.path.isdir(item_path) and item.isdigit():  # Only process numbered tracker folders
                    for crop_file in os.listdir(item_path):
                        if crop_file.lower().endswith(('.jpg', '.png', '.jpeg')):
                            src = os.path.join(item_path, crop_file)
                            dst = os.path.join(ALL_CROPS_DIR, f"{item}_{crop_file}")  # Prefix with tracker_id
                            shutil.copy2(src, dst)
    
    # Check if we have images to process
    if not os.path.exists(ALL_CROPS_DIR) or len(os.listdir(ALL_CROPS_DIR)) == 0:
        print(f"❌ Error: No images found in '{ALL_CROPS_DIR}'!")
        print("   Please ensure crop extraction has been completed first.")
    else:
        num_images = len([f for f in os.listdir(ALL_CROPS_DIR) if f.lower().endswith(('.jpg', '.png', '.jpeg'))])
        print(f"📊 Found {num_images} images to process")
        
        # 1. Load initial model
        print("\n🔄 Loading trained model...")
        model = SiameseNet(embedding_dim=EMBEDDING_DIM)
        model.load_state_dict(torch.load(INITIAL_MODEL_PATH, map_location=device))
        model.to(device)
        print("✅ Model loaded successfully!")

        # 2. Setup inference pipeline
        print("\n⚙️  Setting up inference pipeline...")
        # Use the same transforms as your validation/test set
        inference_transforms = transforms.Compose([
            transforms.Resize((80, 40)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
        inference_dataset = InferenceDataset(image_dir=ALL_CROPS_DIR, transform=inference_transforms)
        inference_dataloader = DataLoader(inference_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
        
        embeddings_array, image_paths = generate_all_embeddings(model, inference_dataloader, device)

        # 3. Cluster the embeddings
        cluster_labels = cluster_embeddings(embeddings_array, eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
        
        # 4. Reorganize files into a new, clean training directory
        reorganize_data_into_clusters(image_paths, cluster_labels, CLUSTERED_DATA_DIR)
        
        print("\n🎉 All steps complete!")
        print(f"📁 A new, clean dataset is ready for retraining in '{CLUSTERED_DATA_DIR}'")
        print("=" * 60)

🎯 Starting Player Re-identification Clustering Pipeline
🖥️  Using device: mps
📊 Found 12618 images to process

🔄 Loading trained model...
✅ Model loaded successfully!

⚙️  Setting up inference pipeline...
🚀 Generating embeddings for 12618 images...
✅ Model loaded successfully!

⚙️  Setting up inference pipeline...
🚀 Generating embeddings for 12618 images...


Generating embeddings: 100%|██████████| 99/99 [00:09<00:00, 10.40it/s]



✅ Embedding generation complete! (9.52s, 12618 images)

🧠 Starting DBSCAN clustering on 12618 embeddings...
   📊 Embedding dimensions: 128
   ⚙️  Parameters: eps=0.05, min_samples=20
   🔄 Running DBSCAN algorithm...
✨ Clustering complete! (0.12s)
   🎯 Found 20 unique players (clusters)
   🗑️  5677 images classified as noise (45.0%)
   📈 Cluster sizes: min=20, max=1063, avg=347.1

📁 Reorganizing data into 'data/clustered_train/'...
   🧹 Cleaning up existing directory...
   📂 Creating 20 cluster directories...
   📋 Copying files to cluster directories...
   📂 Creating 20 cluster directories...
   📋 Copying files to cluster directories...


Copying files: 100%|██████████| 12618/12618 [00:01<00:00, 9781.58it/s] 

✅ Successfully copied 6941 images into 20 cluster folders
   📊 Average images per cluster: 347.1

🎉 All steps complete!
📁 A new, clean dataset is ready for retraining in 'data/clustered_train/'





In [22]:
import shutil
import random
import os

random.seed(42)  # For reproducibility

src_data_dir = "data/clustered_train"
train_dir = "data/clustered_train/train"
val_dir = "data/clustered_train/val"

# Create train and val directories
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# For each tracker_id directory in data/
for track_id in os.listdir(src_data_dir):
    track_path = os.path.join(src_data_dir, track_id)
    if not os.path.isdir(track_path):
        continue

    # List all crop files for this track
    crop_files = [f for f in os.listdir(track_path) if f.endswith('.jpg')]
    random.shuffle(crop_files)

    split_idx = int(0.8 * len(crop_files))
    train_files = crop_files[:split_idx]
    val_files = crop_files[split_idx:]

    # Create per-track folders in train/ and val/
    train_track_dir = os.path.join(train_dir, track_id)
    val_track_dir = os.path.join(val_dir, track_id)
    os.makedirs(train_track_dir, exist_ok=True)
    os.makedirs(val_track_dir, exist_ok=True)

    # Copy files
    for fname in train_files:
        src = os.path.join(track_path, fname)
        dst = os.path.join(train_track_dir, fname)
        shutil.copy2(src, dst)

    for fname in val_files:
        src = os.path.join(track_path, fname)
        dst = os.path.join(val_track_dir, fname)
        shutil.copy2(src, dst)

print(f"Done! Crops split into '{train_dir}' and '{val_dir}' with per-track structure.")

Done! Crops split into 'data/clustered_train/train' and 'data/clustered_train/val' with per-track structure.


In [23]:
def train_model(model, dataloader, optimizer, loss_fn, num_epochs=20):
    """The main training loop"""
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        batch_count = 0
        
        for i, (anchor, positive, negative, _) in enumerate(dataloader):
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            emb_anchor, emb_positive, emb_negative = model.forward_triplet(anchor, positive, negative)
            
            # Calculate loss
            loss = loss_fn(emb_anchor, emb_positive, emb_negative)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            batch_count += 1
            if (i + 1) % 10 == 0:  # Reduced frequency for notebook
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")
        
        if batch_count > 0:
            epoch_loss = running_loss / batch_count
        else:
            epoch_loss = 0.0
        print(f"--- Epoch {epoch+1} Summary ---")
        print(f"Average Loss: {epoch_loss:.4f}\n")
        
    print("Finished Training")
    return model

# --- Main training block for notebook ---
# Hyperparameters
TRAIN_DIR = 'data/clustered_train/train'  # Fixed path
EMBEDDING_DIM = 128
LEARNING_RATE = 0.001
BATCH_SIZE = 16  # Reduced batch size for stability
NUM_EPOCHS = 10   # Reduced epochs for testing
MARGIN = 0.5 # Margin for the triplet loss

# Check if train directory exists and has data
if not os.path.exists(TRAIN_DIR):
    print(f"Error: {TRAIN_DIR} directory does not exist!")
    print("Available directories:", [d for d in os.listdir('.') if os.path.isdir(d)])
else:
    train_folders = [d for d in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, d))]
    print(f"Found {len(train_folders)} player folders in {TRAIN_DIR}")
    
    if len(train_folders) < 2:
        print("Error: Need at least 2 player folders for triplet loss training!")
    else:
        # 1. Setup Dataset and DataLoader
        train_dataset = LacrossePlayerDataset(image_dir=TRAIN_DIR, transform=data_transforms)
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)  # Fixed num_workers
        
        print(f"Dataset size: {len(train_dataset)} images")
        print(f"Number of batches: {len(train_dataloader)}")
        
        # 2. Initialize Model, Loss, and Optimizer
        siamese_model = SiameseNet(embedding_dim=EMBEDDING_DIM)
        triplet_loss = nn.TripletMarginLoss(margin=MARGIN, p=2)
        optimizer = torch.optim.Adam(siamese_model.parameters(), lr=LEARNING_RATE)
        
        # 3. Start Training
        trained_model = train_model(siamese_model, train_dataloader, optimizer, triplet_loss, num_epochs=NUM_EPOCHS)
        
        # 4. Save the trained model
        torch.save(trained_model.state_dict(), 'lacrosse_reid_player_model.pth')
        print("Model saved to lacrosse_reid_player_model.pth")

Found 22 player folders in data/clustered_train/train
Dataset initialized with 20 players and 5546 total images
  Player player_0008: 100 images
  Player player_0006: 529 images
  Player player_0001: 350 images
  Player player_0000: 788 images
  Player player_0007: 215 images
  Player player_0009: 324 images
  Player player_0014: 86 images
  Player player_0013: 128 images
  Player player_0012: 90 images
  Player player_0015: 45 images
  Player player_0002: 486 images
  Player player_0005: 323 images
  Player player_0004: 850 images
  Player player_0003: 828 images
  Player player_0010: 105 images
  Player player_0017: 42 images
  Player player_0019: 16 images
  Player player_0018: 29 images
  Player player_0016: 16 images
  Player player_0011: 196 images
Dataset size: 5546 images
Number of batches: 347
Epoch [1/10], Step [10/347], Loss: 0.2367
Epoch [1/10], Step [10/347], Loss: 0.2367
Epoch [1/10], Step [20/347], Loss: 0.3118
Epoch [1/10], Step [20/347], Loss: 0.3118
Epoch [1/10], Step