In [None]:
import pandas as pd
import json

# q: how can we represent players as feature vectors?
raw_features_fp = '/playpen-storage/levlevi/player-re-id/src/data/raw_features.json'
raw_features_df_fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'
with open(raw_features_fp, 'r') as f:
    raw_features = json.load(f)
raw_features_df = pd.read_csv(raw_features_df_fp)

In [None]:
# which features are redundant?
df_redundant_features_dropped = raw_features_df[['team_id', 'player_id', 'team_colors', 'jersey_number']]

# did a player play in this game-period?
# encode each player's player_id using a one hot encoding
df_redundant_features_dropped['player_id_one_hot'] = df_redundant_features_dropped['player_id']

# which features can be one-hot encoded?
df_one_hot_team_id = pd.get_dummies(df_redundant_features_dropped, columns=['team_id', 'player_id_one_hot'])

In [None]:
# drop team colors for now
df_no_team_colors = df_one_hot_team_id.copy()
df_no_team_colors.drop('team_colors', axis=1, inplace=True)

### Convert Ground-Truth Features to Embeddings (1x45)
***

In [None]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

JERSEY_VECTOR_SIZE = 500

df = df_no_team_colors.copy()

# prepare data for FastText embedding
jersey_numbers = df['jersey_number'].astype(str).tolist()

sentences = [[char for char in number] for number in jersey_numbers]
# train a FastText model
model = FastText(sentences, vector_size=JERSEY_VECTOR_SIZE, window=1, min_count=1, sg=1)

# get embeddings for jersey numbers
def get_embedding(number):
    return model.wv[number]
df['jersey_embedding'] = df['jersey_number'].apply(lambda x: get_embedding(x))
# expand the embeddings into separate columns
embeddings = pd.DataFrame(df['jersey_embedding'].tolist(), index=df.index)

# add jersey embeddings
df = df.drop(columns=['jersey_embedding', 'jersey_number']).join(embeddings)

# boolean columns
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)
# convert all columns to float
df = df.astype(float)

# select feature columns
feature_columns = [col for col in df.columns if col != 'player_id']
X = df[feature_columns]

num_cols = len(X.columns)
# remove column names
X_no_columns = X.copy()
X_no_columns.columns = np.arange(num_cols)

# normalize all features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_no_columns)
# target column (labels)
y = df['player_id']

### Convert Predictions to Embeddings
***

In [None]:
import ast

# how can we convert model predictions to feature vectors?
# human labels for 50 tracklet test set
with open('/playpen-storage/levlevi/player-re-id/src/data/_50_game_reid_benchmark_/annotations.json') as f:
    annotations = json.load(f)
    
# convert raw florence preditions to dict
florence_predictions_fp = '/playpen-storage/levlevi/player-re-id/src/data/florence_100_track_bm_results.json'
def read_florence_predictions(fp):
    with open(fp, 'r') as f:
        florence_predictions = f.readlines()
    predictions = {}
    for p in florence_predictions:
        key = p.split(':')[0]
        values = ast.literal_eval('{' + '{'.join(p.split('{')[1: ]))
        predictions[key] = values
    return predictions
predictions = read_florence_predictions(florence_predictions_fp)

# roster metadata
with open("/playpen-storage/levlevi/player-re-id/src/data/raw_features.json", 'r') as f:
    rosters = json.load(f)

In [None]:
from typing import List
from collections import Counter

# get most common race from list
def get_maj_race(races):
    if len(races) == 0:
        return None
    counter = Counter(races)
    return counter.most_common(1)[0][0]

# get most common player postion from list
def get_maj_position(positions):
    if len(positions) == 0:
        return None
    counter = Counter(positions)
    return counter.most_common(1)[0][0]

# get most common jersey number from list
def get_maj_jersey_number(jersey_numbers):
    if len(jersey_numbers) == 0:
        return None
    counter = Counter(jersey_numbers)
    return counter.most_common(1)[0][0]

In [295]:
import re
import os
from glob import glob

HUDL_GAME_LOGS_DIR = '/mnt/sun/levlevi/nba-plus-statvu-dataset/hudl-game-logs'
# id: fp
game_logs_map = {
    f.split('.')[-2]: os.path.join(HUDL_GAME_LOGS_DIR, f) for f in os.listdir(HUDL_GAME_LOGS_DIR)
}

def get_candidate_player_ids(tracklet_fp: str) -> List[str]:
    game_id = tracklet_fp.split('/')[-2].split('_')[0]
    period = tracklet_fp.split('/')[-2].split('_')[-1].split('period')[1][0]
    period = int(period)
    df = pd.read_csv(game_logs_map[game_id], delimiter=';')
    df['is_period'] = df['half'].apply(lambda x: x == period)
    df_period_matched = df[df['is_period'] == True]
    unqiue_players = set(df_period_matched['player_id'].unique())
    unique_players_no_na = {p for p in unqiue_players if p > 0}
    return list(unique_players_no_na)

def get_candidate_team_ids(tracklet_fp: str):
    team_one_name = tracklet_fp.split('/')[-2].split('_')[3].replace(" ", "_")
    team_two_name = tracklet_fp.split('/')[-2].split('_')[5].replace(" ", "_")
    team_one_id = rosters[team_one_name]['team_id']
    team_two_id = rosters[team_two_name]['team_id']
    return [team_one_id, team_two_id]

# create dataframe for all tracklet predictions
predictions_df = pd.DataFrame()
# all rows in predictions df
rows = []
for tracklet_fp, raw_predictions in predictions.items():
    # get team ids from file path
    predicted_jersey_numbers = []
    candidate_team_ids = get_candidate_team_ids(tracklet_fp)
    candidate_players_ids = get_candidate_player_ids(tracklet_fp)
    # get jersey numbers
    for frame_idx in raw_predictions:
        # match all valid predictions
        temp_jersey_number_arr = re.findall(r'\d+', raw_predictions[frame_idx].get('<OCR>'))
        for n in temp_jersey_number_arr:
            predicted_jersey_numbers.append(n)
    # find most common jersey number
    maj_jersey_number = get_maj_jersey_number(predicted_jersey_numbers)
    temp_row = [tracklet_fp, maj_jersey_number, candidate_team_ids, candidate_players_ids]
    rows.append(temp_row)
    
# add all rows to df
predictions_df = pd.DataFrame(rows, columns=['tracklet_file_path', 'jersey_number', 'potential_team_ids', 'candidate_player_ids'])
predictions_df

Unnamed: 0,tracklet_file_path,jersey_number,potential_team_ids,candidate_player_ids
0,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,33.0,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
1,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
2,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
3,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,22.0,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
4,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,2.0,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
5,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
6,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,12.0,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."
7,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,28.0,"[1610612747, 1610612754]","[5313.0, 98.0, 5315.0, 99.0, 100.0, 5318.0, 10..."
8,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,3.0,"[1610612747, 1610612754]","[5313.0, 98.0, 5315.0, 99.0, 100.0, 5318.0, 10..."
9,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,4.0,"[1610612749, 1610612746]","[1804.0, 1805.0, 99345.0, 370973.0, 95270.0, 1..."


In [None]:
import math
import sys

TEAM_COL_START_IDX = 0
TEAM_COL_END_IDX = 30

# RACE_START_IDX = TEAM_COL_END_IDX
# RACE_END_IDX = RACE_START_IDX + 4

# POSITION_START_IDX = RACE_END_IDX
# POSITION_END_IDX = RACE_END_IDX + 7

POSITION_END_IDX = 30
JERSEY_NUM_START_IDX = POSITION_END_IDX

# all prediction embeddings
all_prediction_embeddings = []
# team_ids_column_names
team_ids_column_names = list(X.columns)[TEAM_COL_START_IDX:TEAM_COL_END_IDX]
jersey_column_names = list(X.columns)[JERSEY_NUM_START_IDX: ]
# tracklet file paths
tracklet_file_paths = list(predictions_df['tracklet_file_path'])
for idx, player_features in predictions_df.iterrows():
    # blank feature vector
    blank_feature = np.zeros(len(X.columns))
    # feature vector index place holder
    index = 0
    # set one-hot encoded team ids indices to 1
    for row_ix, team_str in enumerate(team_ids_column_names):
        # print(team_ids_column_names)
        team_id = int(team_str.split('_')[-1])
        potential_team_ids = set(player_features['potential_team_ids'])
        if team_id in potential_team_ids:
            blank_feature[row_ix] = 1
        index += 1
    
    # get player jersey number
    predicted_jersey_number = player_features['jersey_number']
    if not predicted_jersey_number:
        predicted_jersey_number = -sys.maxsize + 1
    # generate embedding
    jersey_number_encoding = get_embedding(predicted_jersey_number)
    for row_ix, val in enumerate(jersey_number_encoding):
        blank_feature[row_ix + index] = val
    # add feature vector to list
    all_prediction_embeddings.append(blank_feature)

# convert to array
all_prediction_embeddings = np.array(all_prediction_embeddings)

# scale features
scaler = StandardScaler()
all_prediction_embeddings_scaled = scaler.fit_transform(all_prediction_embeddings)

In [None]:
# euclidean Distance
def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(vec1, vec2)))

# cosine Similarity
def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    dot_product = sum(x * y for x, y in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(x ** 2 for x in vec1))
    magnitude2 = math.sqrt(sum(y ** 2 for y in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Avoid division by zero
    return dot_product / (magnitude1 * magnitude2)

def pearson_correlation(vec1: List[float], vec2: List[float]) -> float:
    n = len(vec1)
    sum1 = sum(vec1)
    sum2 = sum(vec2)
    sum1_sq = sum(x ** 2 for x in vec1)
    sum2_sq = sum(y ** 2 for y in vec2)
    product_sum = sum(x * y for x, y in zip(vec1, vec2))
    
    numerator = product_sum - (sum1 * sum2 / n)
    denominator = math.sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if denominator == 0:
        return 0.0  # Avoid division by zero
    return numerator / denominator

In [None]:
# ONLY USE TEAM AND JERSEY NUMBER EMBEDDINGS
# all_prediction_embeddings_scaled = scaler.fit_transform([np.concatenate([x[TEAM_COL_START_IDX: TEAM_COL_END_IDX], x[JERSEY_NUM_START_IDX: ]]) for x in all_prediction_embeddings_scaled])
# X_scaled = scaler.fit_transform([np.concatenate([x[TEAM_COL_START_IDX: TEAM_COL_END_IDX], x[JERSEY_NUM_START_IDX: ]]) for x in X_scaled])

player_ids_gt = y.copy()
human_labels = []

for fp in tracklet_file_paths:
    video_name = fp.split('/')[-2]
    subtrack = fp.split('/')[-1]
    human_label = annotations[video_name]['tracks'][subtrack]['human_annotation']
    human_labels.append(human_label)
    
predictions_from_features = [np.nan] * len(human_labels)
for pred_fv_idx, feature_vector in enumerate(all_prediction_embeddings_scaled):
    best_score = sys.maxsize - 1
    best_match_idx = np.nan 
    for gt_fv_idx, gt_feature_vector in enumerate(X_scaled):
        euclidean_dist = euclidean_distance(feature_vector, gt_feature_vector)
        if euclidean_dist < best_score:
            best_score = euclidean_dist
            best_match_idx = gt_fv_idx
            
    predictions_from_features[pred_fv_idx] = player_ids_gt[best_match_idx]

In [None]:
# create a dataframe with all predictions
predictions_computed = pd.DataFrame({'tracklet_file_path': tracklet_file_paths, 'human_label': human_labels, 'prediction': predictions_from_features})

In [None]:
# find all matching predictions
matched_mask = predictions_computed['human_label'] == predictions_computed['prediction']
predictions_computed_matched = predictions_computed[matched_mask]

In [None]:
# convert all ids to floats
predictions_computed_no_na = predictions_computed_matched.dropna()
predictions_computed_no_na['human_label'] = predictions_computed_no_na['human_label'].astype(int)
predictions_computed_no_na['prediction'] = predictions_computed_no_na['prediction'].astype(int)

# find all matching predictions
matched_mask = abs(predictions_computed_no_na['human_label'] - predictions_computed_no_na['prediction']) < 0.5
predictions_no_na_matched = predictions_computed_no_na[matched_mask]