In [1]:
import pandas as pd
import json

ground_truth_player_features_fp = '/playpen-storage/levlevi/player-re-id/src/data/raw_features.json'
ground_truth_player_features_df_fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'
with open(ground_truth_player_features_fp, 'r') as f:
    ground_truth_player_features_dict = json.load(f)
ground_truth_player_features_df = pd.read_csv(ground_truth_player_features_df_fp)

In [2]:
# which features are redundant?
ground_truth_player_features_subset_df = ground_truth_player_features_df[['team_id', 'player_id', 'jersey_number']]

# add feature to indicate if player is in game
ground_truth_player_features_subset_df['player_in_game'] = [False] * len(ground_truth_player_features_subset_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth_player_features_subset_df['player_in_game'] = [False] * len(ground_truth_player_features_subset_df)


### **Format Tracklet Predicted Features**
***

In [3]:
import ast

# how can we convert model predictions to feature vectors?
# human labels for 50 tracklet test set
with open('/playpen-storage/levlevi/player-re-id/src/data/_50_game_reid_benchmark_/annotations.json') as f:
    tracklet_human_annotations_dict = json.load(f)
    
# convert raw florence preditions to dict
florence_predictions_fp = '/playpen-storage/levlevi/player-re-id/src/data/florence_100_track_bm_results.json'
def read_florence_predictions(fp):
    with open(fp, 'r') as f:
        florence_predictions = f.readlines()
    predictions = {}
    for p in florence_predictions:
        key = p.split(':')[0]
        values = ast.literal_eval('{' + '{'.join(p.split('{')[1: ]))
        predictions[key] = values
    return predictions
predicted_tracklet_features_florence_dict = read_florence_predictions(florence_predictions_fp)

# roster metadata
with open("/playpen-storage/levlevi/player-re-id/src/data/raw_features.json", 'r') as f:
    ground_truth_player_features_dict = json.load(f)

In [4]:
from typing import List
from collections import Counter

# get most common race from list
def get_maj_race(races):
    if len(races) == 0:
        return None
    counter = Counter(races)
    return counter.most_common(1)[0][0]

# get most common player postion from list
def get_maj_position(positions):
    if len(positions) == 0:
        return None
    counter = Counter(positions)
    return counter.most_common(1)[0][0]

# get most common jersey number from list
def get_maj_jersey_number(jersey_numbers):
    if len(jersey_numbers) == 0:
        return None
    counter = Counter(jersey_numbers)
    return counter.most_common(1)[0][0]

In [5]:
import re
import os
from glob import glob
from difflib import SequenceMatcher

HUDL_GAME_LOGS_DIR = '/mnt/sun/levlevi/nba-plus-statvu-dataset/hudl-game-logs'
# id: fp
game_logs_map = {
    f.split('.')[-2]: os.path.join(HUDL_GAME_LOGS_DIR, f) for f in os.listdir(HUDL_GAME_LOGS_DIR)
}

def calculate_string_similarity_score(string_one, string_two):
    return SequenceMatcher(None, string_one, string_two).ratio()

def find_closest_player_id(player_name: str) -> str:
    best_similarity = -100
    best_player_id_match = None
    best_match_name = ''
    for team in ground_truth_player_features_dict:
        for player in ground_truth_player_features_dict[team]['players']:
            cand_full_name = ''
            if ',' not in player:
                cand_full_name = player
            else:
                cand_full_name = player.split(',')[1] + ' ' + player.split(',')[0]
            player_id = ground_truth_player_features_dict[team]['players'][player]['player_id']
            similarity = calculate_string_similarity_score(cand_full_name, player_name)
            if similarity > best_similarity:
                best_similarity = similarity
                best_player_id_match = player_id
                best_match_name = cand_full_name
    # sanity check
    # print(player_name, best_match_name, best_similarity)
    return int(best_player_id_match)

def get_candidate_player_ids(tracklet_fp: str) -> List[str]:
    game_id = tracklet_fp.split('/')[-2].split('_')[0]
    period = tracklet_fp.split('/')[-2].split('_')[-1].split('period')[1][0]
    period = int(period)
    df = pd.read_csv(game_logs_map[game_id], delimiter=';')
    df['is_period'] = df['half'].apply(lambda x: x == period)
    df_period_matched = df[df['is_period'] == True]
    
    unique_player_ids = set()
    unique_player_names = set(df['player_name'].unique())
    unique_player_names = set(df_period_matched['player_name'].unique())
    unique_player_names_no_nan = {p for p in unique_player_names if not pd.isna(p)}
    for pn in unique_player_names_no_nan:
        unique_player_ids.add(find_closest_player_id(pn))
    return unique_player_ids

def get_candidate_team_ids(tracklet_fp: str) -> List[str]:
    team_one_name = tracklet_fp.split('/')[-2].split('_')[3].replace(" ", "_")
    team_two_name = tracklet_fp.split('/')[-2].split('_')[5].replace(" ", "_")
    team_one_id = ground_truth_player_features_dict[team_one_name]['team_id']
    team_two_id = ground_truth_player_features_dict[team_two_name]['team_id']
    return [team_one_id, team_two_id]

# all rows in predictions df
rows = []
for tracklet_fp, raw_predictions in predicted_tracklet_features_florence_dict.items():
    # get team ids from file path
    predicted_jersey_numbers = []
    candidate_team_ids = get_candidate_team_ids(tracklet_fp)
    candidate_players_ids = get_candidate_player_ids(tracklet_fp)
    # get jersey numbers
    for frame_idx in raw_predictions:
        # match all valid predictions
        temp_jersey_number_arr = re.findall(r'\d+', raw_predictions[frame_idx].get('<OCR>'))
        for n in temp_jersey_number_arr:
            predicted_jersey_numbers.append(n)
    # find most common jersey number
    maj_jersey_number = get_maj_jersey_number(predicted_jersey_numbers)
    temp_row = [tracklet_fp, maj_jersey_number, candidate_team_ids, candidate_players_ids]
    rows.append(temp_row)
    
# add all rows to df
all_tracklet_features = pd.DataFrame(rows, columns=['tracklet_file_path', 'predicted_jersey_number', 'candidate_team_ids', 'candidate_player_ids'])

In [6]:
human_labels = []
for file_path in all_tracklet_features['tracklet_file_path']:
    video_name = file_path.split('/')[-2]
    subtrack = file_path.split('/')[-1]
    human_label = tracklet_human_annotations_dict[video_name]['tracks'][subtrack]['human_annotation']
    human_labels.append(human_label)
    
all_tracklet_features['human_label'] = human_labels

In [29]:
import math
import numpy as np
import sys
from gensim.models import FastText

JERSEY_VECTOR_SIZE = 300

# 1. generate ground truth player embeddings
ground_truth_player_embeddings = []

jersey_numbers = list(set(ground_truth_player_features_subset_df['jersey_number'].astype(str)))
sentences = [[char for char in number] for number in jersey_numbers]
model = FastText(sentences, vector_size=JERSEY_VECTOR_SIZE, window=1, min_count=1, sg=1)

In [30]:
unique_team_ids = list(ground_truth_player_features_dict[k]['team_id'] for k in ground_truth_player_features_dict)
unique_team_ids_idx_map = {str(int(team_id)): idx for idx, team_id in enumerate(unique_team_ids)}

player_ids_arr = ground_truth_player_features_subset_df['player_id'].unique()
player_ids_idx_map = {player_id: idx for idx, player_id in enumerate(player_ids_arr)}

In [31]:
def get_jersey_number_embed(number: str):
    if not number:
        return [-sys.maxsize + 1] * JERSEY_VECTOR_SIZE
    return model.wv[number]

def get_team_id_embed(team_ids: List[str]):
    embedding = np.zeros(len(unique_team_ids_idx_map))
    for team_id in team_ids:
        team_idx = unique_team_ids_idx_map[str(team_id)]
        embedding[team_idx] = 1
    return embedding
    
def get_player_id_embed(player_ids: List[str]):
    embedding = np.zeros(len(player_ids_idx_map))
    for player_id in player_ids:
        player_idx = player_ids_idx_map[player_id]
        embedding[player_idx] = 1
    return embedding
    
for row in ground_truth_player_features_subset_df.itertuples():
    jersey_number = row.jersey_number
    # a. get jersey number embed
    jersey_number_embedding = get_jersey_number_embed(jersey_number)
    # b. get team id embed
    team_id = [row.team_id]
    team_id_embedding = get_team_id_embed(team_id)
    # c. get player id embeds
    player_id = [row.player_id]
    player_id_embedding = get_player_id_embed(player_id)
    # d. get concatinated embedding
    concatinated_embedding = np.concatenate([jersey_number_embedding, team_id_embedding, player_id_embedding])
    ground_truth_player_embeddings.append(concatinated_embedding)
    
ground_truth_player_features_subset_df['embedding'] = ground_truth_player_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth_player_features_subset_df['embedding'] = ground_truth_player_embeddings


In [36]:
# 2. generated tracklet embeddings
tracklet_embeddings = []
for row in all_tracklet_features.itertuples():
    jersey_number = row.predicted_jersey_number
    team_ids = row.candidate_team_ids
    candidate_player_ids = row.candidate_player_ids
    # a. get jersey number embed
    jersey_number_embedding = get_jersey_number_embed(jersey_number)
    # b. get team id embed
    team_id_embedding = get_team_id_embed(team_ids)
    # c. get player id embeds
    player_ids_embedding = get_player_id_embed(list(candidate_player_ids))
    # d. get concatinated embedding
    concatinated_embedding = np.concatenate([jersey_number_embedding, team_id_embedding, player_ids_embedding])
    tracklet_embeddings.append(concatinated_embedding)
    
all_tracklet_features['embedding'] = tracklet_embeddings
all_tracklet_features.head()

Unnamed: 0,tracklet_file_path,predicted_jersey_number,candidate_team_ids,candidate_player_ids,human_label,embedding
0,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,33.0,"[1610612749, 1610612746]","{201601, 203948, 203953, 200755, 1718, 2746, 1...",,"[0.015987707301974297, -0.009612071327865124, ..."
1,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,"[1610612749, 1610612746]","{201601, 203948, 203953, 200755, 1718, 2746, 1...",203114.0,"[-9.223372036854776e+18, -9.223372036854776e+1..."
2,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,"[1610612749, 1610612746]","{201601, 203948, 203953, 200755, 1718, 2746, 1...",200755.0,"[-9.223372036854776e+18, -9.223372036854776e+1..."
3,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,22.0,"[1610612749, 1610612746]","{201601, 203948, 203953, 200755, 1718, 2746, 1...",203114.0,"[0.007096998859196901, 0.013843805529177189, -..."
4,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,2.0,"[1610612749, 1610612746]","{201601, 203948, 203953, 200755, 1718, 2746, 1...",203114.0,"[0.004819462541490793, -0.014067944139242172, ..."


In [37]:
# euclidean Distance
def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(vec1, vec2)))

# cosine Similarity
def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    dot_product = sum(x * y for x, y in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(x ** 2 for x in vec1))
    magnitude2 = math.sqrt(sum(y ** 2 for y in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Avoid division by zero
    return dot_product / (magnitude1 * magnitude2)

def pearson_correlation(vec1: List[float], vec2: List[float]) -> float:
    n = len(vec1)
    sum1 = sum(vec1)
    sum2 = sum(vec2)
    sum1_sq = sum(x ** 2 for x in vec1)
    sum2_sq = sum(y ** 2 for y in vec2)
    product_sum = sum(x * y for x, y in zip(vec1, vec2))
    
    numerator = product_sum - (sum1 * sum2 / n)
    denominator = math.sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if denominator == 0:
        return 0.0  # Avoid division by zero
    return numerator / denominator

# **Match Tracklet Embeddings to Candidates**
***

In [38]:
def normalize_vector(arr):
    mean = np.mean(arr)
    std = np.std(arr)
    return (arr - mean) / std
    
# for each tracklet, generate a feature vector
tracklet_index = 0
reidentified_tracklet_player_ids = [np.nan] * len(human_labels)
for tracklet_row in all_tracklet_features.itertuples():
    tracklet_embedding = np.array(tracklet_row.embedding)
    best_score = - sys.maxsize + 1
    best_match_idx = np.nan
    for row in ground_truth_player_features_subset_df.itertuples():
        candidate_player_embedding = np.array(row.embedding)
        # normalize
        v1 = normalize_vector(tracklet_embedding)
        v2 = normalize_vector(candidate_player_embedding)
        # compute similarity
        score = cosine_similarity(tracklet_embedding, candidate_player_embedding)
        if score > best_score:
            best_score = score
            best_match_player_id = row.player_id
            
    # print(best_match_player_id)
    reidentified_tracklet_player_ids[tracklet_index] = best_match_player_id
    tracklet_index += 1

In [39]:
# create a dataframe with all predictions
predictions_computed = pd.DataFrame({'tracklet_file_path': all_tracklet_features['tracklet_file_path'], 'human_label': human_labels, 'prediction': reidentified_tracklet_player_ids})
predictions_computed.head()

Unnamed: 0,tracklet_file_path,human_label,prediction
0,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,,202325
1,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,202340
2,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,200755.0,202340
3,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,203114
4,/mnt/opr/levlevi/player-re-id/src/data/_50_gam...,203114.0,2037


In [40]:
# find all matching predictions
matched_mask = predictions_computed['human_label'] == predictions_computed['prediction']
predictions_computed_matched = predictions_computed[matched_mask]

In [41]:
# convert all ids to floats
predictions_computed_no_na = predictions_computed_matched.dropna()
predictions_computed_no_na['human_label'] = predictions_computed_no_na['human_label'].astype(int)
predictions_computed_no_na['prediction'] = predictions_computed_no_na['prediction'].astype(int)

# find all matching predictions
matched_mask = abs(predictions_computed_no_na['human_label'] - predictions_computed_no_na['prediction']) < 0.5
predictions_no_na_matched = predictions_computed_no_na[matched_mask]

In [42]:
len(predictions_no_na_matched)

22

In [14]:
v1 = np.array([1, 0, 0, 0, 1, 0, 1]).astype(np.float32)
v2 = np.array([0, 1, 0, 1, 0, 1, 0]).astype(np.float32)
v3 = np.array([0, 0, 1, 0, 1, 1, 0]).astype(np.float32)
c1 = np.array([1, 1, 0, 1, 1, 0, 1]).astype(np.float32)

print(euclidean_distance(c1, v1), euclidean_distance(c1, v2), euclidean_distance(c1, v3))

1.4142135623730951 2.0 2.449489742783178
