In [113]:
# Q: how can we represent players as feature vectors?
import pandas as pd
import os
import shutil
import json
from glob import glob

raw_features_fp = '/playpen-storage/levlevi/player-re-id/src/data/raw_features.json'
raw_features_df_fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'
with open(raw_features_fp, 'r') as f:
    raw_features = json.load(f)
raw_features_df = pd.read_csv(raw_features_df_fp)

In [114]:
# which features are redundant?
df_redundant_features_dropped = raw_features_df[['team_id', 'player_id', 'team_colors', 'position', 'jersey_number', 'race']]
# which features can be one-hot encoded?
df_one_hot_team_id = pd.get_dummies(df_redundant_features_dropped, columns=['team_id', 'race',])
# segment positions (i.e. multi-category)
positions_segmented = []
for pos in df_one_hot_team_id['position']:
    if len(pos) == 2:
        positions_segmented.append([pos])
    else:
        positions_segmented.append(pos.split('/'))
df_positions_segmented = df_one_hot_team_id.copy()         
df_positions_segmented['position'] = positions_segmented

In [115]:
# how do we encode positions
unique_positions = set(pos for sublist in df_positions_segmented['position'] for pos in sublist)
# create columns for each unique position
for pos in unique_positions:
    df_positions_segmented[f'position_{pos}'] = df_positions_segmented['position'].apply(lambda x: 1 if pos in x else 0)
# drop the original position column
df_positions_segmented.drop('position', axis=1, inplace=True)

In [116]:
# drop team colors for now
df_no_team_colors = df_positions_segmented.copy()
df_no_team_colors.drop('team_colors', axis=1, inplace=True)

### Convert Ground-Truth Features to Embeddings (1x45)
***

In [117]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

df = df_no_team_colors.copy()
# prepare data for FastText embedding
jersey_numbers = df['jersey_number'].astype(str).tolist()
sentences = [[char for char in number] for number in jersey_numbers]
# train a FastText model
model = FastText(sentences, vector_size=30, window=3, min_count=1, sg=1)
# get embeddings for jersey numbers
def get_embedding(number):
    return model.wv[number]
df['jersey_embedding'] = df['jersey_number'].apply(lambda x: get_embedding(x))
# expand the embeddings into separate columns
embeddings = pd.DataFrame(df['jersey_embedding'].tolist(), index=df.index)
df = df.drop(columns=['jersey_embedding', 'jersey_number']).join(embeddings)
# boolean columns
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)
# convert all columns to float
df = df.astype(float)
# feature columns
feature_columns = [col for col in df.columns if col != 'player_id']
X = df[feature_columns]
num_cols = len(X.columns)
# remove column names
X_no_columns = X.copy()
X_no_columns.columns = np.arange(num_cols)
# scale all features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_no_columns)
# target column (labels)
y = df['player_id']


### Convert Predictions to Embeddings
***

In [118]:
# how can we convert model predictions to feature vectors?
# human labels for 50 tracklet test set
with open('/playpen-storage/levlevi/player-re-id/src/testing/ocr_analysis/_50_game_reid_benchmark_/annotations.json') as f:
    annotations = json.load(f)
# raw mini-cpm predictions
with open('/playpen-storage/levlevi/player-re-id/src/testing/ocr_analysis/predictions.json') as f:
    predictions = json.load(f)
# roster metadata
with open("/playpen-storage/levlevi/player-re-id/src/data/raw_features.json", 'r') as f:
    rosters = json.load(f)

In [119]:
import random
from typing import List
from collections import Counter

# get most common race from list
def get_maj_race(races):
    if len(races) == 0:
        return None
    counter = Counter(races)
    return counter.most_common(1)[0][0]
# get most common player postion from list
def get_maj_position(positions):
    if len(positions) == 0:
        return None
    counter = Counter(positions)
    return counter.most_common(1)[0][0]
# get most common jersey number from list
def get_maj_jersey_number(jersey_numbers):
    if len(jersey_numbers) == 0:
        return None
    counter = Counter(jersey_numbers)
    return counter.most_common(1)[0][0]

In [120]:
# create dataframe for all tracklet predictions
predictions_df = pd.DataFrame()
# all rows in predictions df
rows = []
for tracklet_fp, raw_predictions in predictions.items():
    # get team ids from file path
    team_one_name = tracklet_fp.split('/')[-2].split('_')[3].replace(" ", "_")
    team_two_name = tracklet_fp.split('/')[-2].split('_')[5].replace(" ", "_")
    team_one_id = rosters[team_one_name]['team_id']
    team_two_id = rosters[team_two_name]['team_id']
    predicted_races = []
    predicted_positions = []
    predicted_jersey_numbers = []
    potential_team_ids = [team_one_id, team_two_id]
    for pred in raw_predictions:
        temp_race = pred.get('race')
        temp_position = pred.get('position')
        temp_jersey_number = pred.get('jersey_number')
        if temp_jersey_number:
            predicted_jersey_numbers.append(temp_jersey_number)
        if temp_race:
            predicted_races.append(temp_race)
        if temp_position:
            predicted_positions.append(temp_position)
    maj_race = get_maj_race(predicted_races)
    maj_pos = get_maj_position(predicted_positions)
    maj_jersey_number = get_maj_jersey_number(predicted_jersey_numbers)
    temp_row = [tracklet_fp, maj_race, maj_pos, maj_jersey_number, potential_team_ids]
    rows.append(temp_row)
# add all rows to df
predictions_df = pd.DataFrame(rows, columns=['tracklet_file_path', 'race', 'position', 'jersey_number', 'potential_team_ids'])

In [121]:
import math
import sys

TEAM_COL_START_IDX = 0
TEAM_COL_END_IDX = 30

RACE_START_IDX = 30
RACE_END_IDX = 34

POSITION_START_IDX = 34
POSITION_END_IDX = 41

JERSEY_NUM_START_IDX = 41

# all prediction embeddings
all_prediction_embeddings = []
# team_ids_column_names
team_ids_column_names = list(X.columns)[TEAM_COL_START_IDX:TEAM_COL_END_IDX]
# race_column_names
race_column_names = list(X.columns)[RACE_START_IDX:RACE_END_IDX]
# position_column_names
position_column_names = list(X.columns)[POSITION_START_IDX:POSITION_END_IDX]
# jersey_column_names
jersey_column_names = list(X.columns)[JERSEY_NUM_START_IDX: ]
# tracklet file paths
tracklet_file_paths = list(predictions_df['tracklet_file_path'])
for idx, player_features in predictions_df.iterrows():
    # blank feature vector
    blank_feature = np.zeros(len(X.columns))
    # feature vector index place holder
    index = 0
    # set one-hot encoded team ids indices to 1
    for row_ix, team_str in enumerate(team_ids_column_names):
        team_id = int(team_str.split('_')[-1])
        potential_team_ids = set(player_features['potential_team_ids'])
        if team_id in potential_team_ids:
            blank_feature[row_ix] = 1
        index += 1
    # get player race
    predicted_race = player_features['race']
    for row_ix, race_str in enumerate(race_column_names):
        race_column_label = race_str.split('_')[-1]
        if race_column_label == predicted_race:
            blank_feature[row_ix + index] = 1
        index += 1
    # get player position
    predicted_position = player_features['position']
    if not predicted_position:
        predicted_position = ''
    for row_ix, position_str in enumerate(position_column_names):
        position_column_label = position_str.split('_')[-1]
        if position_column_label in predicted_position:
            blank_feature[row_ix + index] = 1
        index += 1
    # get player jersey number
    predicted_jersey_number = player_features['jersey_number']
    if not predicted_jersey_number:
        predicted_jersey_number = -sys.maxsize + 1
    # generate embedding
    jersey_number_encoding = get_embedding(predicted_jersey_number)
    for row_ix, val in enumerate(jersey_number_encoding):
        blank_feature[row_ix + index] = val
    # add feature vector to list
    all_prediction_embeddings.append(blank_feature)

# convert to array
all_prediction_embeddings = np.array(all_prediction_embeddings)
# scale features
scaler = StandardScaler()
all_prediction_embeddings_scaled = scaler.fit_transform(all_prediction_embeddings)

In [122]:
# euclidean Distance
def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(vec1, vec2)))

# cosine Similarity
def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    dot_product = sum(x * y for x, y in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(x ** 2 for x in vec1))
    magnitude2 = math.sqrt(sum(y ** 2 for y in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Avoid division by zero
    return dot_product / (magnitude1 * magnitude2)

def pearson_correlation(vec1: List[float], vec2: List[float]) -> float:
    n = len(vec1)
    sum1 = sum(vec1)
    sum2 = sum(vec2)
    sum1_sq = sum(x ** 2 for x in vec1)
    sum2_sq = sum(y ** 2 for y in vec2)
    product_sum = sum(x * y for x, y in zip(vec1, vec2))
    
    numerator = product_sum - (sum1 * sum2 / n)
    denominator = math.sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if denominator == 0:
        return 0.0  # Avoid division by zero
    return numerator / denominator

In [123]:
# # ONLY USE JERSEY NUM EMBEDDINGS
# all_prediction_embeddings_scaled = scaler.fit_transform([x[JERSEY_NUM_START_IDX: ] for x in all_prediction_embeddings_scaled])
# X_scaled = scaler.fit_transform([x[JERSEY_NUM_START_IDX: ] for x in X_scaled])

player_ids_gt = y.copy()
human_labels = []

for fp in tracklet_file_paths:
    video_name = fp.split('/')[-2]
    subtrack = fp.split('/')[-1]
    human_label = annotations[video_name]['tracks'][subtrack]['human_annotation']
    human_labels.append(human_label)
    
predictions_from_features = [np.nan] * len(human_labels)
for pred_fv_idx, feature_vector in enumerate(all_prediction_embeddings_scaled):
    best_score = sys.maxsize - 1
    best_match_idx = np.nan
    for gt_fv_idx, gt_feature_vector in enumerate(X_scaled):
        # OPTIONAL: truncate embeddings
        euclidean_dist = euclidean_distance(feature_vector, gt_feature_vector)
        if euclidean_dist < best_score:
            best_score = euclidean_dist
            best_match_idx = gt_fv_idx
            
    predictions_from_features[pred_fv_idx] = player_ids_gt[best_match_idx]

In [124]:
# create a dataframe with all predictions
predictions_computed = pd.DataFrame({'tracklet_file_path': tracklet_file_paths, 'human_label': human_labels, 'prediction': predictions_from_features})

In [125]:
# find all matching predictions
matched_mask = predictions_computed['human_label'] == predictions_computed['prediction']
predictions_computed_matched = predictions_computed[matched_mask]

In [126]:
# convert all ids to floats
predictions_computed_no_na = predictions_computed_matched.dropna()
predictions_computed_no_na['human_label'] = predictions_computed_no_na['human_label'].astype(int)
predictions_computed_no_na['prediction'] = predictions_computed_no_na['prediction'].astype(int)
# find all matching predictions
matched_mask = predictions_computed_no_na['human_label'] == predictions_computed_no_na['prediction']
predictions_no_na_matched = predictions_computed_no_na[matched_mask]

In [128]:
predictions_no_na_matched.__len__()

22