In [59]:
import pandas as pd
import json

# q: how can we represent players as feature vectors?
raw_features_fp = '/playpen-storage/levlevi/player-re-id/src/data/raw_features.json'
raw_features_df_fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'
with open(raw_features_fp, 'r') as f:
    raw_features = json.load(f)
raw_features_df = pd.read_csv(raw_features_df_fp)

In [60]:
# which features are redundant?
df_redundant_features_dropped = raw_features_df[['team_id', 'player_id', 'jersey_number']]

df_redundant_features_dropped['player_in_game'] = [False] * len(df_redundant_features_dropped)

# which features can be one-hot encoded?
df_one_hot_team_id = pd.get_dummies(df_redundant_features_dropped, columns=['team_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_redundant_features_dropped['player_in_game'] = [False] * len(df_redundant_features_dropped)


### Convert Ground-Truth Features to Embeddings (1x45)
***

In [61]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

JERSEY_VECTOR_SIZE = 30
df = df_one_hot_team_id.copy()

# prepare data for FastText embedding
jersey_numbers = df['jersey_number'].astype(str).tolist()

sentences = [[char for char in number] for number in jersey_numbers]
# train a FastText model
model = FastText(sentences, vector_size=JERSEY_VECTOR_SIZE, window=1, min_count=1, sg=1)

# get embeddings for jersey numbers
def get_embedding(number):
    return model.wv[number]
df['jersey_embedding'] = df['jersey_number'].apply(lambda x: get_embedding(x))
# expand the embeddings into separate columns
embeddings = pd.DataFrame(df['jersey_embedding'].tolist(), index=df.index)

# add jersey embeddings
df = df.drop(columns=['jersey_embedding', 'jersey_number']).join(embeddings)

# boolean columns
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)
# convert all columns to float
df = df.astype(float)

# select feature columns
feature_columns = [col for col in df.columns if col != 'player_id']
X = df[feature_columns]

# remove column indexes
X_no_columns = X.copy()
X_no_columns.columns = np.arange(len(X.columns))

# X
player_ground_truth_embeddings = X_no_columns

# target column (labels)
labels = df['player_id']

### Convert Predictions to Embeddings
***

In [62]:
import ast

# how can we convert model predictions to feature vectors?
# human labels for 50 tracklet test set
with open('/playpen-storage/levlevi/player-re-id/src/data/_50_game_reid_benchmark_/annotations.json') as f:
    annotations = json.load(f)
    
# convert raw florence preditions to dict
florence_predictions_fp = '/playpen-storage/levlevi/player-re-id/src/data/florence_100_track_bm_results.json'
def read_florence_predictions(fp):
    with open(fp, 'r') as f:
        florence_predictions = f.readlines()
    predictions = {}
    for p in florence_predictions:
        key = p.split(':')[0]
        values = ast.literal_eval('{' + '{'.join(p.split('{')[1: ]))
        predictions[key] = values
    return predictions
predictions = read_florence_predictions(florence_predictions_fp)

# roster metadata
with open("/playpen-storage/levlevi/player-re-id/src/data/raw_features.json", 'r') as f:
    rosters = json.load(f)

In [63]:
from typing import List
from collections import Counter

# get most common race from list
def get_maj_race(races):
    if len(races) == 0:
        return None
    counter = Counter(races)
    return counter.most_common(1)[0][0]

# get most common player postion from list
def get_maj_position(positions):
    if len(positions) == 0:
        return None
    counter = Counter(positions)
    return counter.most_common(1)[0][0]

# get most common jersey number from list
def get_maj_jersey_number(jersey_numbers):
    if len(jersey_numbers) == 0:
        return None
    counter = Counter(jersey_numbers)
    return counter.most_common(1)[0][0]

In [64]:
import re
import os
from glob import glob
from difflib import SequenceMatcher

HUDL_GAME_LOGS_DIR = '/mnt/sun/levlevi/nba-plus-statvu-dataset/hudl-game-logs'
# id: fp
game_logs_map = {
    f.split('.')[-2]: os.path.join(HUDL_GAME_LOGS_DIR, f) for f in os.listdir(HUDL_GAME_LOGS_DIR)
}

def calculate_string_similarity_score(string_one, string_two):
    return SequenceMatcher(None, string_one, string_two).ratio()

def find_closest_player_id(player_name: str) -> str:
    best_similarity = -100
    best_player_id_match = None
    best_match_name = ''
    for team in raw_features:
        for player in raw_features[team]['players']:
            cand_full_name = ''
            if ',' not in player:
                cand_full_name = player
            else:
                cand_full_name = player.split(',')[1] + ' ' + player.split(',')[0]
            player_id = raw_features[team]['players'][player]['player_id']
            similarity = calculate_string_similarity_score(cand_full_name, player_name)
            if similarity > best_similarity:
                best_similarity = similarity
                best_player_id_match = player_id
                best_match_name = cand_full_name
    # sanity check
    # print(player_name, best_match_name, best_similarity)
    return int(best_player_id_match)

def get_candidate_player_ids(tracklet_fp: str) -> List[str]:
    game_id = tracklet_fp.split('/')[-2].split('_')[0]
    period = tracklet_fp.split('/')[-2].split('_')[-1].split('period')[1][0]
    period = int(period)
    df = pd.read_csv(game_logs_map[game_id], delimiter=';')
    df['is_period'] = df['half'].apply(lambda x: x == period)
    df_period_matched = df[df['is_period'] == True]
    
    unique_player_ids = set()
    unique_player_names = set(df['player_name'].unique())
    unique_player_names = set(df_period_matched['player_name'].unique())
    unique_player_names_no_nan = {p for p in unique_player_names if not pd.isna(p)}
    for pn in unique_player_names_no_nan:
        unique_player_ids.add(find_closest_player_id(pn))
    return unique_player_ids

def get_candidate_team_ids(tracklet_fp: str):
    team_one_name = tracklet_fp.split('/')[-2].split('_')[3].replace(" ", "_")
    team_two_name = tracklet_fp.split('/')[-2].split('_')[5].replace(" ", "_")
    team_one_id = rosters[team_one_name]['team_id']
    team_two_id = rosters[team_two_name]['team_id']
    return [team_one_id, team_two_id]

# all rows in predictions df
rows = []
for tracklet_fp, raw_predictions in predictions.items():
    # get team ids from file path
    predicted_jersey_numbers = []
    candidate_team_ids = get_candidate_team_ids(tracklet_fp)
    candidate_players_ids = get_candidate_player_ids(tracklet_fp)
    # get jersey numbers
    for frame_idx in raw_predictions:
        # match all valid predictions
        temp_jersey_number_arr = re.findall(r'\d+', raw_predictions[frame_idx].get('<OCR>'))
        for n in temp_jersey_number_arr:
            predicted_jersey_numbers.append(n)
    # find most common jersey number
    maj_jersey_number = get_maj_jersey_number(predicted_jersey_numbers)
    temp_row = [tracklet_fp, maj_jersey_number, candidate_team_ids, candidate_players_ids]
    rows.append(temp_row)
    
# add all rows to df
all_tracklet_features = pd.DataFrame(rows, columns=['tracklet_file_path', 'jersey_number', 'candidate_team_ids', 'candidate_player_ids'])

In [66]:
import math
import sys

# features: player_in_game, team_id, jersey_number, 
PLAYER_IN_GAME_IDX = 0

TEAM_COL_START_IDX = 1
TEAM_COL_END_IDX = TEAM_COL_START_IDX + JERSEY_VECTOR_SIZE

JERSEY_NUM_START_IDX = TEAM_COL_END_IDX

# embeddings representing all extracted features for a tracklet
tracklet_embeddings = []

# team_ids column names
team_ids_column_names = list(X.columns)[TEAM_COL_START_IDX:TEAM_COL_END_IDX]

# jersey_number column names
jersey_column_names = list(X.columns)[JERSEY_NUM_START_IDX: ]

# tracklet file paths
tracklet_file_paths = list(all_tracklet_features['tracklet_file_path'])

for idx, tracklet_features in all_tracklet_features.iterrows():
    # blank feature vector
    blank_embedding = np.zeros(len(X.columns))
    
    # feature vector index place holder
    index = 0
    # skip player in game feature
    index += 1
        
    # set one-hot encoded team ids indices to 1
    potential_team_ids = set(tracklet_features['candidate_team_ids'])
    
    for row_idx, team_str in enumerate(team_ids_column_names):
        team_id = int(team_str.split('_')[-1])
        if team_id in potential_team_ids:
            blank_embedding[row_idx] = 1
        index += 1
    
    # get player jersey number
    predicted_jersey_number = tracklet_features['jersey_number']
    if not predicted_jersey_number:
        predicted_jersey_number = -sys.maxsize + 1
    # generate embedding
    jersey_number_encoding = get_embedding(predicted_jersey_number)
    for row_idx, val in enumerate(jersey_number_encoding):
        blank_embedding[row_idx + index] = val
    
    # add feature vector to list
    tracklet_embeddings.append(blank_embedding)

# convert to array
tracklet_embeddings = np.array(tracklet_embeddings)

In [67]:
# euclidean Distance
def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(vec1, vec2)))

# cosine Similarity
def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    dot_product = sum(x * y for x, y in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(x ** 2 for x in vec1))
    magnitude2 = math.sqrt(sum(y ** 2 for y in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Avoid division by zero
    return dot_product / (magnitude1 * magnitude2)

def pearson_correlation(vec1: List[float], vec2: List[float]) -> float:
    n = len(vec1)
    sum1 = sum(vec1)
    sum2 = sum(vec2)
    sum1_sq = sum(x ** 2 for x in vec1)
    sum2_sq = sum(y ** 2 for y in vec2)
    product_sum = sum(x * y for x, y in zip(vec1, vec2))
    
    numerator = product_sum - (sum1 * sum2 / n)
    denominator = math.sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if denominator == 0:
        return 0.0  # Avoid division by zero
    return numerator / denominator

In [89]:
# ONLY USE TEAM AND JERSEY NUMBER EMBEDDINGS
# all_prediction_embeddings_scaled = scaler.fit_transform([np.concatenate([x[TEAM_COL_START_IDX: TEAM_COL_END_IDX], x[JERSEY_NUM_START_IDX: ]]) for x in all_prediction_embeddings_scaled])
# X_scaled = scaler.fit_transform([np.concatenate([x[TEAM_COL_START_IDX: TEAM_COL_END_IDX], x[JERSEY_NUM_START_IDX: ]]) for x in X_scaled])

def normalize_vector(arr):
    mean = np.mean(arr)
    std = np.std(arr)
    return (arr - mean) / std

human_labels = []

for fp in tracklet_file_paths:
    video_name = fp.split('/')[-2]
    subtrack = fp.split('/')[-1]
    human_label = annotations[video_name]['tracks'][subtrack]['human_annotation']
    human_labels.append(human_label)
    
# for each tracklet, generate a feature vector
reidentified_tracklet_player_ids = [np.nan] * len(human_labels)
for tracklet_index, tracklet_embedding in enumerate(tracklet_embeddings):
    best_score = sys.maxsize - 1
    best_match_idx = np.nan
    for index, row in enumerate(player_ground_truth_embeddings.itertuples()):
        candidate_player_embedding = np.array(row)
        # normalized vectors
        # v1 = normalize_vector(tracklet_embedding)[1: ]
        # v2 = normalize_vector(candidate_player_embedding)[1: ]
        
        # find dist
        euclidean_dist = (tracklet_embedding[1: ], candidate_player_embedding[1: ])
        print(euclidean_dist)
        
        # if index % 50 == 0:
        #     print(tracklet_embedding)
        #     print(candidate_player_embedding)
        #     assert False
            
        euclidean_dist = euclidean_distance(candidate_player_embedding, tracklet_embedding)
        if euclidean_dist < best_score:
            best_score = euclidean_dist
            best_match_idx = index
            
    # print(best_match_idx)
    reidentified_tracklet_player_ids[tracklet_index] = labels[best_match_idx]
    
    break

(array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.01598771, -0.00961207,  0.00804735, -0.01401704,  0.0275057 ,
        0.00609295,  0.01959442, -0.00785966,  0.01306538,  0.00504062,
        0.00391874, -0.00218122,  0.00808902,  0.0127568 ,  0.00133242,
        0.00623575, -0.01195749, -0.00752794, -0.02476759,  0.00243054,
        0.00668655, -0.00344423,  0.01228664, -0.01208394,  0.00251345,
       -0.01293537, -0.01172553, -0.01219346,  0.00822715, -0.00169341]), array([ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, 

In [88]:
# create a dataframe with all predictions
predictions_computed = pd.DataFrame({'tracklet_file_path': tracklet_file_paths, 'human_label': human_labels, 'prediction': reidentified_tracklet_player_ids})
predictions_computed

Unnamed: 0,tracklet_file_path,human_label,prediction
0,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,202340.0
1,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,
2,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,200755.0,
3,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,
4,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,
5,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,201933.0,
6,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,201933.0,
7,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,977.0,
8,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,201588.0,
9,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,200755.0,


In [None]:
# find all matching predictions
matched_mask = predictions_computed['human_label'] == predictions_computed['prediction']
predictions_computed_matched = predictions_computed[matched_mask]

In [None]:
# convert all ids to floats
predictions_computed_no_na = predictions_computed_matched.dropna()
predictions_computed_no_na['human_label'] = predictions_computed_no_na['human_label'].astype(int)
predictions_computed_no_na['prediction'] = predictions_computed_no_na['prediction'].astype(int)

# find all matching predictions
matched_mask = abs(predictions_computed_no_na['human_label'] - predictions_computed_no_na['prediction']) < 0.5
predictions_no_na_matched = predictions_computed_no_na[matched_mask]

In [None]:
len(predictions_no_na_matched)

0

In [None]:
v1 = np.array([1, 0, 0, 0, 1, 0, 1]).astype(np.float32)
v2 = np.array([0, 1, 0, 1, 0, 1, 0]).astype(np.float32)
v3 = np.array([0, 0, 1, 0, 1, 1, 0]).astype(np.float32)
c1 = np.array([1, 1, 0, 1, 1, 0, 1]).astype(np.float32)

print(euclidean_distance(c1, v1), euclidean_distance(c1, v2), euclidean_distance(c1, v3))