## embed player skill

In [1]:
import pandas as pd 
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Emory/NFL Lab/')

In [2]:
df = pd.read_csv('data/input_2023_w07.csv')
df_2 = pd.read_csv('data/output_2023_w07.csv')

#### feature extraction

In [3]:
# need to change his position as he was the only one labeled under S
df.loc[df['player_name'] == 'Myles Bryant', 'player_position'] = 'FS'

average time to 90th / 50th percentile speed, 90th percentile orentation change, 90th percentile acceleration, change in dir, decell

In [4]:
df_bio = df[['nfl_id', 'player_name', 'player_weight', 'player_height', 'player_position']].drop_duplicates()

In [5]:
df['change_in_dir'] = df.groupby(['nfl_id', 'play_id'])['dir'].diff(1)
df['change_in_o'] = df.groupby(['nfl_id', 'play_id'])['o'].diff(1)
df['change_in_dir_3'] = df.groupby(['nfl_id', 'play_id'])['change_in_dir'].rolling(window=3, min_periods=2).mean().reset_index(level=[0,1], drop=True)
df['change_in_o_3'] = df.groupby(['nfl_id', 'play_id'])['change_in_o'].rolling(window=3, min_periods=2).mean().reset_index(level=[0,1], drop=True)

In [6]:
df['change_in_dir_3'] = abs(df['change_in_dir_3'])
df['change_in_o_3'] = abs(df['change_in_o_3'])
df['change_in_dir'] = abs(df['change_in_dir'])
df['change_in_o'] = abs(df['change_in_o'])

In [7]:
df_count = df.groupby('nfl_id')['play_id'].nunique().reset_index().rename(columns={'play_id':'play_count'})
df_bio = df_bio.merge(df_count, on=['nfl_id'], how='left')

position avg normalized for weight

In [8]:
df_avgs = df.dropna()
df_avgs = df_avgs[(df_avgs['change_in_dir_3'] < 40)]
df_avgs = df_avgs[(df_avgs['change_in_o_3'] < 40)]

In [9]:
features = df_bio[['player_position', 'player_weight']]
features = pd.get_dummies(features, columns=['player_position'])

agg = df_avgs.groupby('nfl_id').agg({
    's': [lambda x: np.percentile(x, 90), lambda x: np.percentile(x, 50)],
    'change_in_dir_3': [lambda x: np.percentile(x.dropna(), 90), lambda x: np.percentile(x.dropna(), 50)],
    'change_in_o_3': [lambda x: np.percentile(x.dropna(), 90), lambda x: np.percentile(x.dropna(), 50)],
    'a': [lambda x: np.percentile(x, 90), lambda x: np.percentile(x, 50)]
})
agg.columns = ['s_90', 's_50', 'dir_90', 'dir_50', 'o_90', 'o_50', 'a_90', 'a_50']
agg = agg.reset_index()


data = df_bio.merge(agg, on='nfl_id')

In [10]:
target_cols = ['s_90', 's_50', 'dir_90', 'dir_50', 'o_90', 'o_50', 'a_90', 'a_50']
models = {}
predictions = {}

for target in target_cols:
    X = features.values
    y = data[target].values
    model = LinearRegression()
    model.fit(X, y)
    models[target] = model
    predictions[target] = model.predict(X)


for target in target_cols:
    data[f'pred_{target}'] = predictions[target]

print(r2_score(data['o_50'], data['pred_o_50']))

0.2525084969254503


regressed mean

In [11]:
def regressed_means(df, pos, mean_col):

    pos_df = df[df['player_position'] == pos]
    var_of_means = np.var(pos_df[mean_col])
    avg_n = np.mean(pos_df['play_count'])

    sigma_sq = var_of_means * 0.5
    tau_sq = max(0, var_of_means - (sigma_sq / avg_n))

    df.loc[df['player_position'] == pos, f'shrink_b{mean_col}'] = (sigma_sq / df.loc[df['player_position'] == pos, 'play_count']) / ((sigma_sq / df.loc[df['player_position'] == pos, 'play_count']) + tau_sq)
    
    df.loc[df['player_position'] == pos, f'reg_mean{mean_col}'] = (df.loc[df['player_position'] == pos, f'shrink_b{mean_col}'] * df.loc[df['player_position'] == pos, f'pred_{mean_col}']) + \
        ((1 - df.loc[df['player_position'] == pos, f'shrink_b{mean_col}']) * df.loc[df['player_position'] == pos, mean_col])
    
    return df


In [12]:
cols = ['s_90', 's_50', 'dir_90', 'dir_50', 'o_90', 'o_50', 'a_90', 'a_50']

for col in cols:
    data[f'shrink_b{col}'] = np.nan
    data[f'reg_mean{col}'] = np.nan

for pos in data['player_position'].unique():
    for col in cols:
        df_regressed = regressed_means(data, pos=pos, mean_col=col)


In [13]:
cols_to_keep = [col for col in df_regressed.columns if col.startswith('reg_mean') or col.startswith('play') or col.startswith('nfl')]
df_regressed = df_regressed[cols_to_keep]
df_regressed = df_regressed.drop(columns=['player_weight', 'player_height'])

In [14]:
df_target = df[['nfl_id', 'player_height', 'player_weight','frame_id', 'player_role', 'play_id', 'x', 'y']]
df_train = df_target.merge(df_regressed, on=['nfl_id'], how='left')

In [15]:
max_frames = df_train.groupby(['play_id', 'nfl_id'])['frame_id'].max().reset_index()
max_frames.columns = ['play_id', 'nfl_id', 'max_frame_id']
df_2_extended = df_2.merge(max_frames, on=['play_id', 'nfl_id'], how='inner')

df_2_extended['frame_id'] = df_2_extended['frame_id'] + df_2_extended['max_frame_id']

common_cols = ['nfl_id', 'play_id', 'frame_id', 'x', 'y']
df_2_subset = df_2_extended[common_cols]

df_combined = pd.concat([df_train, df_2_subset], ignore_index=True)
df_combined = df_combined.sort_values(['play_id', 'nfl_id', 'frame_id']).reset_index(drop=True)

In [16]:
missing_cols = ['player_height', 'player_weight', 'player_role', 'player_name', 'player_position', 'play_count', 
                'reg_means_90', 'reg_means_50', 'reg_meandir_90', 'reg_meandir_50', 'reg_meano_90', 'reg_meano_50', 
                'reg_meana_90', 'reg_meana_50']

for col in missing_cols:
    if col in df_combined.columns:
        df_combined[col] = df_combined.groupby(['play_id', 'nfl_id'])[col].transform(lambda x: x.fillna(x.iloc[0] if x.notna().any() else np.nan))

In [18]:
df_train['player_height'] = df_train['player_height'].str.split('-').str[0].astype(int) * 12 + df_train['player_height'].str.split('-').str[1].astype(int)

In [19]:
df_train['change_x'] = df_train.groupby(['nfl_id', 'play_id'])['x'].diff(1)
df_train['change_y'] = df_train.groupby(['nfl_id', 'play_id'])['y'].diff(1)
df_train = df_train.dropna()

In [21]:
cols_to_normalize = ['player_height', 'player_weight', 'reg_means_90', 'reg_means_50', 
                    'reg_meandir_90', 'reg_meandir_50', 'reg_meano_90', 'reg_meano_50', 
                    'reg_meana_90', 'reg_meana_50']

# normalize each column to 0-1 range
for col in cols_to_normalize:
    if col in df_train.columns:
        min_val = df_train[col].min()
        max_val = df_train[col].max()
        df_train[col] = (df_train[col] - min_val) / (max_val - min_val)

#### creating embeddings

use neural network with embedding layer to create embeddings based on the change in player positon, model learns repersentation of prior features that are important for the movement of a player. the target feature is change x, change in y. we will have two heads for this. Create seperate postional embeddings per position, and then represent the others as 0s resulting in a sparese matrix 

In [22]:
class PlayerEmbeddingModel(nn.Module):
    def __init__(self, num_unique_players, num_prior_features, num_categories, embedding_dim=64, category_embedding_dim=8):

        super(PlayerEmbeddingModel, self).__init__()
        
        self.player_embedding = nn.Embedding(
            num_embeddings=num_unique_players,
            embedding_dim=embedding_dim
        )

        self.category_embedding = nn.Embedding(
            num_embeddings=num_categories,
            embedding_dim=category_embedding_dim
        )

        concatenated_feature_size = embedding_dim + category_embedding_dim + num_prior_features
        
        # standard sequential model 64-32-2 
        self.prediction_head = nn.Sequential(
            nn.Linear(concatenated_feature_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2) # two heads to predict change in x and change in y
        )

    def forward(self, player_id_input, category_input, prior_stats_input):
        embedded_player = self.player_embedding(player_id_input)
        embedded_category = self.category_embedding(category_input)

        if embedded_player.dim() == 3: # for batch processing
            embedded_player = embedded_player.squeeze(1)
        
        if embedded_category.dim() == 3:
            embedded_category = embedded_category.squeeze(1)

        concatenated_features = torch.cat([embedded_player, embedded_category, prior_stats_input], dim=1)

        output = self.prediction_head(concatenated_features)
        return output

def extract_embeddings(model):
    return model.player_embedding.weight.detach().cpu()


train

In [23]:
# make postions int values
position_mapping = {pos: idx for idx, pos in enumerate(df_train['player_position'].unique())}
df_train['position_encoded'] = df_train['player_position'].map(position_mapping)

# player_ids cons.
player_mapping = {player: idx for idx, player in enumerate(df_train['nfl_id'].unique())}
df_train['player_id_encoded'] = df_train['nfl_id'].map(player_mapping)

# continious cols
cols_cont = ['reg_means_90', 'reg_means_50', 'reg_meandir_90', 'reg_meandir_50', 
             'reg_meano_90', 'reg_meano_50', 'reg_meana_90', 'reg_meana_50', 
             'player_height', 'player_weight']
X_cont = df_train[cols_cont].values

# target
y = df_train[['change_x', 'change_y']].values

In [24]:
# tensors
player_ids = torch.LongTensor(df_train['player_id_encoded'].values)
pos_cat = torch.LongTensor(df_train['position_encoded'].values)
X_cont = torch.FloatTensor(X_cont)
y = torch.FloatTensor(y)

In [None]:
# data loader
dataset = TensorDataset(player_ids, pos_cat, X_cont, y)
train_load = DataLoader(dataset, batch_size=60,shuffle=True)

# counts
n_players = df_train['nfl_id'].nunique()
num_cat = 14
num__feat = len(cols_cont)

model = PlayerEmbeddingModel(
    num_unique_players=n_players,
    num_prior_features=num__feat,
    num_categories=num_cat,
    embedding_dim=10,
    category_embedding_dim=6
)

train loop

In [None]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

best_loss = float('inf')
patience = 10
patience_counter = 0

# train loop with early stopping
for epoch in range(1000):
    epoch_loss = 0
    num_batches = 0
    
    for player_batch, category_batch, features_batch, targets_batch in train_load:
        optimizer.zero_grad() # zero gradients so diff grad across batches dont gte summed
        
        predictions = model(player_batch, category_batch, features_batch) # forward pass
        loss = criterion(predictions, targets_batch) # mse
        
        loss.backward() # backprop
        optimizer.step() # update model weights, update opti
        
        epoch_loss += loss.item()
        num_batches += 1
    
    # early stopping logic, prevent overfitting
    avg_loss = epoch_loss / num_batches
    
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
    else:
        patience_counter += 1
    
    if patience_counter >= patience:
        break
    
    print(f'epoch {epoch+1}, loss: {avg_loss}')

epoch 1, loss: 0.0703675865528224
epoch 2, loss: 0.06897248218701715
epoch 3, loss: 0.06858578110563736
epoch 4, loss: 0.06842888336333149
epoch 5, loss: 0.06833271443339434
epoch 6, loss: 0.0682703221375908
epoch 7, loss: 0.0682306305537441
epoch 8, loss: 0.06819040807657471
epoch 9, loss: 0.06817217032144922
epoch 10, loss: 0.06815974488152983
epoch 11, loss: 0.06814105169200078
epoch 12, loss: 0.06813999298131154
epoch 13, loss: 0.06812294065812016
epoch 14, loss: 0.06811855975231619
epoch 15, loss: 0.06809500577584554
epoch 16, loss: 0.06809900186994094
epoch 17, loss: 0.06809792194191057
epoch 18, loss: 0.06808483151549549
epoch 19, loss: 0.06808324715747416
epoch 20, loss: 0.068088354948234
epoch 21, loss: 0.06808615774488731
epoch 22, loss: 0.0680822479384796
epoch 23, loss: 0.06806548086225689
epoch 24, loss: 0.06806387066367664
epoch 25, loss: 0.06804819055371968
epoch 26, loss: 0.0680474118081636
epoch 27, loss: 0.06805485267608338
epoch 28, loss: 0.06805429906509151
epoch 29

#### save embeddings / plot

In [None]:
# extract embeddings
player_embeddings = extract_embeddings(model)

# map embeddings to ids
embedding_tensor = player_embeddings
nfl_id_mapping = {nfl_id: idx for idx, nfl_id in enumerate(df_train['nfl_id'].unique())}
reverse_mapping = {idx: nfl_id for nfl_id, idx in nfl_id_mapping.items()}

# save mappings and embeddings
torch.save({
    'embeddings': embedding_tensor,
    'nfl_id_to_idx': nfl_id_mapping,
    'idx_to_nfl_id': reverse_mapping,
    'embedding_dim': embedding_tensor.shape[1],
    'num_players': embedding_tensor.shape[0]
}, 'nfl_embeddings.pt')



plot to see if location makes logical sense

In [None]:
tsne = TSNE(n_components=2, random_state=42)
player_embeddings_2d = tsne.fit_transform(player_embeddings.numpy())

embeddings_df = pd.DataFrame(player_embeddings_2d, columns=['x', 'y'])
embeddings_df['nfl_id'] = df_train['nfl_id'].unique() 
embeddings_df = embeddings_df.merge(df_bio[['nfl_id', 'player_name']], on='nfl_id', how='left')

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_df['x'], embeddings_df['y'], alpha=0.5)
plt.title('Player Embeddings (t-SNE)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')

num_labels = 50
random_indices = random.sample(range(len(embeddings_df)), num_labels)

for i in random_indices:
    plt.annotate(embeddings_df['player_name'].iloc[i], (embeddings_df['x'].iloc[i], embeddings_df['y'].iloc[i]))

plt.show()