### creating embeddings for player context

based on predicting player postion using a sequential model

In [12]:
import pandas as pd 
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Emory/NFL Lab/data/')

In [13]:
df = pd.read_csv('djM_games.csv')

context for a certian play includes distance to go, qbs average tedencies which also are reflected in the teams average tendencies, time left in qtr 

In [15]:
context = df[['play_direction', 'absolute_yardline_number', 'ball_land_x', 'ball_land_y', 'play_id_n', 'num_frames_output']].drop_duplicates()
qb_id = df[['play_id_n', 'player_name', 'player_position', 'nfl_id']]
qb_id = qb_id[qb_id['player_position'] == 'QB']
qb_id = qb_id[qb_id['player_name'] != 'Taysom Hill']

In [16]:
df_qb = df[df['player_position'] == 'QB']
df_qb = df_qb.loc[df_qb.groupby('play_id_n')['frame_id'].idxmax()]
df_qb['max_throw'] = np.sqrt(np.square(df_qb['ball_land_x'] - df_qb['x']) + np.square(df_qb['ball_land_y'] - df_qb['y'] ))
df_qb = df_qb[df_qb['player_to_predict'] != True]


In [17]:
df_qb_stats = df_qb.groupby('nfl_id')['max_throw'].quantile([0.2, 0.5, 0.95]).unstack()
df_qb_stats.columns = ['max_throw_20', 'max_throw_50', 'max_throw_90']
df_qb_stats = df_qb_stats.reset_index()

In [18]:
qb_id = qb_id.merge(df_qb_stats, on=['nfl_id'], how='left').drop_duplicates().drop(columns=['player_name', 'player_position', 'nfl_id'])
context = context.merge(qb_id, on=['play_id_n'], how='left')
df_train = df[['game_id', 'x', 'y', 'play_id_n', 'nfl_id']]
df_train = df_train.merge(context, on='play_id_n', how='left')

In [None]:
mask = df['play_direction'] == 'right'
df_train.loc[mask, 'dist_to_go'] = 110 - df[mask, 'absolute_yard_line']

mask = df['play_direction'] == 'left'
df_train.loc[mask, 'dist_to_go'] = df[mask, 'absolute_yard_line'] - 10

In [None]:
df_train['change_x'] = df_train.groupby(['nfl_id', 'play_id_n'])['x'].transform(lambda x: x.max() - x.min())
df_train['change_y'] = df_train.groupby(['nfl_id', 'play_id_n'])['y'].transform(lambda x: x.max() - x.min())
df_train = df_train.dropna().drop(columns=['x', 'y']).drop_duplicates()

Unnamed: 0,game_id,play_id_n,nfl_id,play_direction,absolute_yardline_number,ball_land_x,ball_land_y,num_frames_output,max_throw_20,max_throw_50,max_throw_90,change_x,change_y
0,2023091008,696,42381,left,79.0,80.800003,5.59,10.0,14.820613,20.772089,44.781288,2.93,4.95
38,2023091008,696,44911,left,79.0,80.800003,5.59,10.0,14.820613,20.772089,44.781288,2.74,10.07
76,2023091008,696,45021,left,79.0,80.800003,5.59,10.0,14.820613,20.772089,44.781288,7.03,3.02
104,2023091008,696,46087,left,79.0,80.800003,5.59,10.0,14.820613,20.772089,44.781288,1.65,6.48
132,2023091008,696,46093,left,79.0,80.800003,5.59,10.0,14.820613,20.772089,44.781288,6.83,8.9


In [20]:
cols_to_normalize = df_train.columns.drop(['play_id_n', 'nfl_id', 'game_id', 'play_direction'])

for col in cols_to_normalize:
    if col in df_train.columns:
        min_val = df_train[col].min()
        max_val = df_train[col].max()
        df_train[col] = (df_train[col] - min_val) / (max_val - min_val)

### Embedding Model

In [None]:
class PlayerEmbeddingModel(nn.Module):
    def __init__(self, num_unique_players, num_prior_features, embedding_dim=16):

        super(PlayerEmbeddingModel, self).__init__()
        
        self.player_embedding = nn.Embedding(
            num_embeddings=num_unique_players,
            embedding_dim=embedding_dim
        )


        concatenated_feature_size = embedding_dim + num_prior_features
        
        # standard sequential model 64-32-2 
        self.prediction_head = nn.Sequential(
            nn.Linear(concatenated_feature_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2) # two heads to predict change in x and change in y
        )

    def forward(self, player_id_input,  prior_stats_input):
        embedded_player = self.player_embedding(player_id_input)

        if embedded_player.dim() == 3: # for batch processing
            embedded_player = embedded_player.squeeze(1)

        concatenated_features = torch.cat([embedded_player, prior_stats_input], dim=1)

        output = self.prediction_head(concatenated_features)
        return output

def extract_embeddings(model):
    return model.player_embedding.weight.detach().cpu()

In [None]:
# player_ids cons.
id_mapping = {player: idx for idx, player in enumerate(df_train['play_id_n'].unique())}
df_train['play_id_encoded'] = df_train['play_id_n'].map(id_mapping)

# continious cols
cols_cont = ['absolute_yardline_number', 'ball_land_x', 'ball_land_y', 'num_frames_output', 'max_throw_20',
             'max_throw_50', 'max_throw_90']
X_cont = df_train[cols_cont].values

# target
y = df_train[['change_x', 'change_y']].values

In [None]:
# tensors
player_ids = torch.LongTensor(df_train['play_id_encoded'].values)
X_cont = torch.FloatTensor(X_cont)
y = torch.FloatTensor(y)

In [None]:
# data loader
dataset = TensorDataset(player_ids, X_cont, y)
train_load = DataLoader(dataset, batch_size=127, shuffle=True)

# counts
n_players = df_train['play_id_n'].nunique()
num_cat = 2
num__feat = len(cols_cont)

model = PlayerEmbeddingModel(
    num_unique_players=n_players,
    num_prior_features=num__feat,
    num_categories=num_cat,
    embedding_dim=4,
    category_embedding_dim=1
)

In [30]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

best_loss = float('inf')
patience = 10
patience_counter = 0

# train loop with early stopping
for epoch in range(1000):
    epoch_loss = 0
    num_batches = 0
    
    for player_batch, category_batch, features_batch, targets_batch in train_load:
        optimizer.zero_grad() # zero gradients so diff grad across batches dont gte summed
        
        predictions = model(player_batch, category_batch, features_batch) # forward pass
        loss = criterion(predictions, targets_batch) # mse
        
        loss.backward() # backprop
        optimizer.step() # update model weights, update opti
        
        epoch_loss += loss.item()
        num_batches += 1
    
    # early stopping logic, prevent overfitting
    avg_loss = epoch_loss / num_batches
    
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
    else:
        patience_counter += 1
    
    if patience_counter >= patience:
        break
    if epoch % 10 == 0:
        print(f'epoch {epoch+1}, loss: {avg_loss}')

epoch 1, loss: 0.008622333760855787
epoch 11, loss: 0.008573540952056646


In [None]:
# extract embeddings
player_embeddings = extract_embeddings(model)

# map embeddings to ids
embedding_tensor = player_embeddings
nfl_id_mapping = {nfl_id: idx for idx, nfl_id in enumerate(df_train['nfl_id'].unique())}
reverse_mapping = {idx: nfl_id for nfl_id, idx in nfl_id_mapping.items()}

# save mappings and embeddings
torch.save({
    'embeddings': embedding_tensor,
    'nfl_id_to_idx': nfl_id_mapping,
    'idx_to_nfl_id': reverse_mapping,
    'embedding_dim': embedding_tensor.shape[1],
    'num_players': embedding_tensor.shape[0]
}, 'dj_context_embeddings.pt')
