In [None]:
# Q: how can we represent players as feature vectors?
import pandas as pd
import os
import shutil
import json
from glob import glob

# Input locations: a JSON dump of raw features and a CSV table
# (named team_rosters_df.csv on disk).
raw_features_fp = '/playpen-storage/levlevi/player-re-id/src/data/raw_features.json'
raw_features_df_fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'

# Parse the JSON file into plain Python objects.
with open(raw_features_fp, mode='r') as json_file:
    raw_features = json.loads(json_file.read())

# Load the CSV table into a DataFrame; the cells below build features from it.
raw_features_df = pd.read_csv(filepath_or_buffer=raw_features_df_fp)

In [None]:
# Keep only the columns that feed the feature vector; every other column of
# raw_features_df is treated as redundant and left behind.
df_redundant_features_dropped = raw_features_df[
    ['team_id', 'player_id', 'team_colors', 'position', 'jersey_number', 'race']
]

# 'team_id' and 'race' are single-valued categoricals, so they are one-hot encoded.
df_one_hot_team_id = pd.get_dummies(
    df_redundant_features_dropped,
    columns=['team_id', 'race'],
)

# A player may hold several positions. Turn every position string into a list:
# two-character labels are kept whole, anything else is split on '/'.
positions_segmented = [
    [label] if len(label) == 2 else label.split('/')
    for label in df_one_hot_team_id['position']
]

# New frame whose 'position' column holds the per-player position lists.
df_positions_segmented = df_one_hot_team_id.assign(position=positions_segmented)

In [None]:
# Multi-hot encode positions: one 0/1 indicator column per distinct position.
# The distinct labels are sorted before use. Iterating over a bare set of strings
# yields an order that can change between interpreter launches (string hash
# randomization), which would silently reorder the feature columns.
unique_positions = sorted(
    {pos for sublist in df_positions_segmented['position'] for pos in sublist}
)
# create columns for each unique position (1 if the player holds it, else 0)
for pos in unique_positions:
    df_positions_segmented[f'position_{pos}'] = df_positions_segmented['position'].apply(
        lambda held: int(pos in held)
    )
# drop the original list-valued position column; it is fully represented by the
# indicator columns. NOTE: this cell consumes 'position', so re-running it
# requires re-running the cell that builds df_positions_segmented first.
df_positions_segmented = df_positions_segmented.drop(columns='position')

In [None]:
# Team colors are left out of the feature vector for now. `drop` returns a new
# frame, so df_positions_segmented itself is left untouched.
df_no_team_colors = df_positions_segmented.drop(columns=['team_colors'])

### Convert Ground-Truth Features to Embeddings (1x45)
***

In [69]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler

# Work on a copy so this cell can be re-run without altering df_no_team_colors.
df = df_no_team_colors.copy()
# prepare data for FastText embedding: every jersey number is rendered as text
# and becomes one "sentence" whose tokens are its individual characters
jersey_numbers = df['jersey_number'].astype(str).tolist()
sentences = [list(number) for number in jersey_numbers]
# train a FastText model (skip-gram, 10-dimensional vectors).
# workers=1 removes the thread-scheduling jitter that gensim documents for
# multi-threaded training; gensim also notes that reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be fixed.
# NOTE(review): training tokens are single characters while lookups below use the
# whole number, so multi-character numbers are resolved through FastText's
# character n-grams rather than a trained vocabulary entry -- confirm intended.
model = FastText(sentences, vector_size=10, window=3, min_count=1, sg=1, workers=1)


# get embeddings for jersey numbers
def get_embedding(number):
    """Return the FastText vector for one jersey number.

    The number is converted with ``str`` before the lookup so that it matches
    the text form the model was trained on (``astype(str)`` above); a raw
    numeric value is not a valid FastText key.
    """
    return model.wv[str(number)]


df['jersey_embedding'] = df['jersey_number'].apply(get_embedding)
# expand the embeddings into separate columns (one column per vector component,
# labelled with the integer component index)
embeddings = pd.DataFrame(df['jersey_embedding'].tolist(), index=df.index)
df = df.drop(columns=['jersey_embedding', 'jersey_number']).join(embeddings)
# boolean columns -> 0/1 integers
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)
# convert all columns to float (this includes 'player_id')
df = df.astype(float)
# feature columns: everything except the label
feature_columns = [col for col in df.columns if col != 'player_id']
# explicit copy so relabelling X's columns cannot affect df
X = df[feature_columns].copy()
num_cols = len(X.columns)
# Replace the mixed string/integer column labels with plain positions
# 0..num_cols-1 (presumably so the scaler sees uniformly typed labels).
X.columns = np.arange(num_cols)
# scale all features to zero mean / unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# target column (labels); float-valued because of the astype(float) above
y = df['player_id']


### Convert Predictions to Embeddings
***

In [None]:
# how can we convert model predictions to feature vectors?