# In [1]:
# Required Libraries
import numpy as np
import pandas as pd
from typing import List
from scipy.spatial.distance import cdist
import json
from collections import Counter
from difflib import SequenceMatcher
import os
from glob import glob

# Load Data
def load_data(embedding_fp: str, human_labels_fp: str, ground_truth_fp: str):
    """Read the three input files and return their contents as a tuple.

    Args:
        embedding_fp: Path to a pickle file, read with ``pd.read_pickle``.
        human_labels_fp: Path to a CSV file, read with ``pd.read_csv``.
        ground_truth_fp: Path to a JSON file.

    Returns:
        A 3-tuple ``(tracklet features, human labels, ground truth)`` where
        the first element is whatever object was pickled, the second is a
        DataFrame, and the third is the decoded JSON document.

    Note:
        Unpickling executes arbitrary code; only load pickle files that come
        from a trusted source.
    """
    tracklet_features = pd.read_pickle(embedding_fp)
    labels_table = pd.read_csv(human_labels_fp)
    with open(ground_truth_fp, 'r') as handle:
        ground_truth = json.loads(handle.read())
    return tracklet_features, labels_table, ground_truth

# Generate Feature Vector
def generate_feature_vector(
    all_tracklet_features: pd.DataFrame,
    ground_truth_player_features_subset_df: pd.DataFrame,
    human_labels: List[float]
) -> List[float]:
    """Assign each tracklet the player ID of its nearest ground-truth embedding.

    Every tracklet embedding is flattened to one row and compared against all
    ground-truth embeddings using Euclidean distance; the ``player_id`` of the
    closest ground-truth row is the prediction for that tracklet.

    Args:
        all_tracklet_features: DataFrame with an ``embedding`` column, one
            row per tracklet.
        ground_truth_player_features_subset_df: DataFrame with ``embedding``
            and ``player_id`` columns, one row per known player.
        human_labels: Only its length is used: the result is padded with NaN
            up to ``len(human_labels)`` when there are fewer tracklets than
            labels.

    Returns:
        A list whose first ``len(all_tracklet_features)`` entries are the
        matched player IDs, in tracklet row order, followed by NaN padding if
        ``human_labels`` is longer.
    """
    ground_truth_embeddings = np.vstack(ground_truth_player_features_subset_df['embedding'].values)
    ground_truth_player_ids = ground_truth_player_features_subset_df['player_id'].values

    num_tracklets = len(all_tracklet_features)
    # Size the output by whichever is longer so that having more tracklets
    # than labels no longer overruns the list (previously an IndexError).
    reidentified_tracklet_player_ids = [np.nan] * max(len(human_labels), num_tracklets)
    if num_tracklets == 0:
        return reidentified_tracklet_player_ids

    # Flatten each embedding to 1-D, matching the former per-row reshape(1, -1).
    tracklet_embeddings = np.stack([
        np.asarray(embedding).reshape(-1)
        for embedding in all_tracklet_features['embedding']
    ])

    # One distance matrix (tracklets x players) instead of one cdist call per row.
    distances = cdist(tracklet_embeddings, ground_truth_embeddings, metric='euclidean')
    best_match_indices = np.argmin(distances, axis=1)
    reidentified_tracklet_player_ids[:num_tracklets] = list(
        ground_truth_player_ids[best_match_indices]
    )

    return reidentified_tracklet_player_ids

# Utility Functions for Majority Voting
def get_majority_element(elements: List[str]) -> str:
    """Return the most frequent item in ``elements``.

    Returns ``None`` when ``elements`` is empty. Ties are resolved by
    ``Counter.most_common``.
    """
    if elements:
        (winner, _occurrences), = Counter(elements).most_common(1)
        return winner
    return None

# Calculate String Similarity
def calculate_string_similarity_score(string_one: str, string_two: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of the two strings.

    The ratio is a float in [0, 1]; 1.0 means the strings are identical.
    No junk heuristic function is supplied to the matcher.
    """
    matcher = SequenceMatcher(a=string_one, b=string_two)
    return matcher.ratio()

def find_closest_player_id(player_name: str, ground_truth_dict: dict) -> str:
    """Return the ``player_id`` of the ground-truth player whose name best matches.

    Args:
        player_name: Name to look up.
        ground_truth_dict: Mapping of team -> ``{'players': {name: {...}}}``
            where each player entry has a ``'player_id'`` key. Player names
            containing a comma are treated as comma-separated parts and
            compared in reversed order (e.g. "Last, First" -> "First Last").

    Returns:
        The ``player_id`` of the candidate with the highest similarity score
        (the first such candidate on ties), or ``None`` if there are no
        candidates.
    """
    best_similarity = -1
    best_player_id_match = None
    for team_data in ground_truth_dict.values():
        for player, player_data in team_data['players'].items():
            if ',' in player:
                # Strip each part so the space that follows the comma does not
                # end up as leading whitespace (" First Last"), which would
                # lower the similarity score for the correct candidate.
                cand_full_name = ' '.join(
                    part.strip() for part in reversed(player.split(','))
                )
            else:
                cand_full_name = player
            player_id = player_data['player_id']
            similarity = calculate_string_similarity_score(player_name, cand_full_name)
            # Strict '>' keeps the first candidate when scores tie.
            if similarity > best_similarity:
                best_similarity = similarity
                best_player_id_match = player_id
    return best_player_id_match

# Input locations. These are placeholder paths and must be replaced with real
# files before the cell can run.
EMBEDDING_FP = '/path/to/embedding.pkl'
HUMAN_LABELS_FP = '/path/to/human_labels.csv'
GROUND_TRUTH_FP = '/path/to/ground_truth.json'

all_tracklet_features, human_labels, ground_truth_player_features_dict = load_data(EMBEDDING_FP, HUMAN_LABELS_FP, GROUND_TRUTH_FP)

# Convert ground truth dictionary to dataframe: one row per player, with the
# player's key in the 'players' mapping stored in the 'player_id' column.
ground_truth_player_features_list = []
for team in ground_truth_player_features_dict:
    for player in ground_truth_player_features_dict[team]['players']:
        # Build a copy instead of writing into the loaded dictionary, so any
        # 'player_id' already stored in the ground-truth data is not overwritten.
        player_data = dict(ground_truth_player_features_dict[team]['players'][player])
        player_data['player_id'] = player
        ground_truth_player_features_list.append(player_data)
ground_truth_player_features_subset_df = pd.DataFrame(ground_truth_player_features_list)

# Generate feature vector (nearest ground-truth player per tracklet)
reidentified_tracklet_player_ids = generate_feature_vector(all_tracklet_features, ground_truth_player_features_subset_df, human_labels)

# Prepare final DataFrame
# NOTE(review): human_labels is the whole DataFrame returned by pd.read_csv;
# a single label column was probably intended here — confirm which one.
predictions_computed = pd.DataFrame({'tracklet_file_path': all_tracklet_features['tracklet_file_path'], 'human_label': human_labels, 'prediction': reidentified_tracklet_player_ids})
print(predictions_computed.head())

# Output: FileNotFoundError: [Errno 2] No such file or directory: '/path/to/embedding.pkl'