### creating embeddings for player context

Based on predicting player position using a sequential model.

In [1]:
import pandas as pd 
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

# Work from the project root so the relative 'data/...' paths used below resolve.
# The location can be overridden with the NFL_LAB_DIR environment variable so the
# notebook also runs on other machines; the original path remains the default.
os.chdir(os.environ.get('NFL_LAB_DIR', 'C:/Users/dalto/OneDrive/Pictures/Documents/Emory/NFL Lab/'))

In [2]:
# Read the week-7 input and output tracking CSVs (paths are relative to the
# working directory set above).
df, df_throw = (
    pd.read_csv(csv_path)
    for csv_path in ('data/input_2023_w07.csv', 'data/output_2023_w07.csv')
)

In [3]:
# Number every distinct (ball_land_x, ball_land_y, play_id) combination with its
# group index and store it as a single integer play key.
play_key_cols = ['ball_land_x', 'ball_land_y', 'play_id']
df['play_id_n'] = df.groupby(play_key_cols).ngroup()

In [4]:
# Last input frame number recorded for each (play, player) pair.
max_frames = (
    df.groupby(['play_id', 'nfl_id'], as_index=False)['frame_id']
    .max()
    .rename(columns={'frame_id': 'max_frame_id'})
)

# Attach that offset to the output rows; the inner join keeps only
# (play, player) pairs that also appear in the input frames.
df_2_extended = pd.merge(df_throw, max_frames, how='inner', on=['play_id', 'nfl_id'])

# Shift output frame numbers so they continue after the last input frame.
df_2_extended['frame_id'] = df_2_extended['frame_id'].add(df_2_extended['max_frame_id'])

# Only these columns are carried over from the output rows.
common_cols = ['nfl_id', 'play_id', 'frame_id', 'x', 'y']
df_2_subset = df_2_extended.loc[:, common_cols]

# Stack input and shifted output rows, then order each player's frames within a play.
df_combined = (
    pd.concat([df, df_2_subset], ignore_index=True)
    .sort_values(['play_id', 'nfl_id', 'frame_id'])
    .reset_index(drop=True)
)

In [5]:
# Remove the columns that are not used in the rest of the notebook.
unused_cols = ['s', 'a', 'dir', 'o', 'player_birth_date']
df_combined = df_combined.drop(columns=unused_cols)

In [6]:
# Metadata columns that the appended output rows do not carry (those rows only
# have nfl_id, play_id, frame_id, x and y), so they are NaN after the concat.
missing_cols = ['player_height', 'player_weight', 'player_role', 'player_name', 'player_position','game_id', 'player_to_predict',
                'play_direction', 'absolute_yardline_number', 'num_frames_output', 'ball_land_x', 'ball_land_y', 'player_side', 'play_id_n']

for col in missing_cols:
    if col in df_combined.columns:
        # Fill each NaN with the first non-null value found for the same
        # (play, player). The built-in 'first' aggregation skips NaN, so a group
        # whose first row happens to be missing is still filled, and it avoids
        # running a Python lambda once per group. Existing values are untouched.
        first_known = df_combined.groupby(['play_id', 'nfl_id'])[col].transform('first')
        df_combined[col] = df_combined[col].fillna(first_known)

  df_combined[col] = df_combined.groupby(['play_id', 'nfl_id'])[col].transform(lambda x: x.fillna(x.iloc[0] if x.notna().any() else np.nan))


Context for a certain play includes distance to go, the QB's average tendencies (which are also reflected in the team's average tendencies), and time left in the quarter.

In [7]:
# One row per distinct combination of the play-level context fields.
context_cols = ['play_direction', 'absolute_yardline_number', 'ball_land_x', 'ball_land_y', 'play_id_n', 'num_frames_output']
context = df.loc[:, context_cols].drop_duplicates()

# Rows identifying the quarterback on each play; 'Taysom Hill' is excluded by name.
qb_id = df.loc[:, ['play_id_n', 'player_name', 'player_position', 'nfl_id']]
is_qb = qb_id['player_position'] == 'QB'
not_excluded = qb_id['player_name'] != 'Taysom Hill'
qb_id = qb_id[is_qb & not_excluded]

In [8]:
# Keep, for every play, the QB row with the highest frame_id.
df_qb = df[df['player_position'] == 'QB']
last_frame_rows = df_qb.groupby('play_id_n')['frame_id'].idxmax()
df_qb = df_qb.loc[last_frame_rows]

# Euclidean distance from the QB's position on that frame to the ball landing spot.
dx = df_qb['ball_land_x'] - df_qb['x']
dy = df_qb['ball_land_y'] - df_qb['y']
df_qb['max_throw'] = np.sqrt(dx * dx + dy * dy)

# Drop rows where the QB is flagged as a player to predict.
df_qb = df_qb[df_qb['player_to_predict'].ne(True)]


In [9]:
# Per-QB quantiles of throw distance, one column per quantile.
# NOTE(review): the third quantile is 0.95 but its column is named 'max_throw_90';
# confirm which of the two was intended.
throw_quantiles = [0.2, 0.5, 0.95]
df_qb_stats = (
    df_qb.groupby('nfl_id')['max_throw']
    .quantile(throw_quantiles)
    .unstack()
)
df_qb_stats.columns = ['max_throw_20', 'max_throw_50', 'max_throw_90']
df_qb_stats = df_qb_stats.reset_index()

In [10]:
# Attach each QB's throw-distance quantiles, collapse repeated rows, then keep
# only play_id_n plus the quantile columns.
qb_id = (
    pd.merge(qb_id, df_qb_stats, how='left', on=['nfl_id'])
    .drop_duplicates()
    .drop(columns=['player_name', 'player_position', 'nfl_id'])
)

# Play-level context enriched with the QB statistics for that play.
context = pd.merge(context, qb_id, how='left', on=['play_id_n'])

# Per-frame positions joined with the context of the play they belong to.
df_train = pd.merge(
    df_combined[['game_id', 'x', 'y', 'play_id_n', 'nfl_id']],
    context,
    how='left',
    on='play_id_n',
)

In [11]:
# Targets: for every (player, play) the spread (max - min) of x and of y over all
# of that player's frames, broadcast back onto each row. The built-in 'max'/'min'
# transforms give the same values as a per-group lambda without calling Python
# once per group.
xy_by_player_play = df_train.groupby(['nfl_id', 'play_id_n'])[['x', 'y']]
xy_spread = xy_by_player_play.transform('max') - xy_by_player_play.transform('min')
df_train['change_x'] = xy_spread['x']
df_train['change_y'] = xy_spread['y']

# Drop incomplete rows, then the per-frame coordinates; what remains repeats per
# frame, so de-duplicating leaves one row per (player, play).
df_train = df_train.dropna().drop(columns=['x', 'y']).drop_duplicates()
df_train.head()

Unnamed: 0,game_id,play_id_n,nfl_id,play_direction,absolute_yardline_number,ball_land_x,ball_land_y,num_frames_output,max_throw_20,max_throw_50,max_throw_90,change_x,change_y
0,2023102000.0,293.0,38577,right,50,54.18,29.940001,7,11.620349,13.99594,28.451614,1.91,2.43
28,2023102000.0,293.0,39983,right,50,54.18,29.940001,7,11.620349,13.99594,28.451614,6.81,2.99
49,2023102000.0,585.0,40024,left,85,85.629997,47.150002,10,13.907774,21.585702,37.046851,8.17,0.41
75,2023102000.0,72.0,41233,right,35,30.07,21.299999,6,13.018115,15.201107,38.838003,7.27,5.99
98,2023102000.0,96.0,41238,right,35,32.73,10.98,6,11.200986,19.738685,30.81585,12.24,2.35


### Embedding Model

In [None]:
class PlayerEmbeddingModel(nn.Module):
    """Feed-forward regressor over two learned embeddings plus numeric features.

    An id embedding and a category embedding are concatenated with the numeric
    feature vector and passed through a 64-32-2 MLP. The two outputs are trained
    elsewhere in this file against the change in x and the change in y.
    """

    def __init__(self, num_unique_players, num_prior_features, num_categories, embedding_dim=16, category_embedding_dim=8):
        """Build the embedding tables and the prediction MLP.

        Args:
            num_unique_players: number of rows in the id embedding table.
            num_prior_features: width of the numeric feature vector.
            num_categories: number of rows in the category embedding table.
            embedding_dim: width of each id embedding vector.
            category_embedding_dim: width of each category embedding vector.
        """
        super().__init__()

        self.player_embedding = nn.Embedding(num_unique_players, embedding_dim)
        self.category_embedding = nn.Embedding(num_categories, category_embedding_dim)

        # Width of the vector fed to the MLP: both embeddings plus the numeric features.
        head_input_width = embedding_dim + category_embedding_dim + num_prior_features

        # A single head with two output units (not two separate heads).
        hidden_a, hidden_b, n_outputs = 64, 32, 2
        self.prediction_head = nn.Sequential(
            nn.Linear(head_input_width, hidden_a),
            nn.ReLU(),
            nn.Linear(hidden_a, hidden_b),
            nn.ReLU(),
            nn.Linear(hidden_b, n_outputs),
        )

    def forward(self, player_id_input, category_input, prior_stats_input):
        """Return the two-column prediction for a batch.

        Args:
            player_id_input: integer indices into the id embedding table.
            category_input: integer indices into the category embedding table.
            prior_stats_input: numeric features, concatenated along dim 1.
        """
        player_vec = self.player_embedding(player_id_input)
        category_vec = self.category_embedding(category_input)

        # A 3-D lookup result has squeeze(1) applied, which removes axis 1 when
        # its size is 1 (e.g. indices passed with a trailing singleton axis).
        if player_vec.dim() == 3:
            player_vec = player_vec.squeeze(1)
        if category_vec.dim() == 3:
            category_vec = category_vec.squeeze(1)

        head_input = torch.cat((player_vec, category_vec, prior_stats_input), dim=1)
        return self.prediction_head(head_input)

def extract_embeddings(model):
    """Return `model.player_embedding.weight` as a detached tensor on the CPU."""
    embedding_table = model.player_embedding.weight
    detached_table = embedding_table.detach()
    return detached_table.cpu()

In [None]:
# Integer code for each play direction, numbered in order of first appearance.
direction_values = df_train['play_direction'].unique()
dir_mapping = dict(zip(direction_values, range(len(direction_values))))
df_train['direction_encoded'] = df_train['play_direction'].map(dir_mapping)

# Consecutive integer index for each distinct play_id_n. Note this indexes plays
# (play_id_n), not nfl_id, even though downstream names say "player".
play_values = df_train['play_id_n'].unique()
id_mapping = dict(zip(play_values, range(len(play_values))))
df_train['play_id_encoded'] = df_train['play_id_n'].map(id_mapping)

# Continuous input features.
cols_cont = ['absolute_yardline_number', 'ball_land_x', 'ball_land_y', 'num_frames_output', 'max_throw_20',
             'max_throw_50', 'max_throw_90']
X_cont = df_train[cols_cont].to_numpy()

# Regression targets.
y = df_train[['change_x', 'change_y']].to_numpy()

In [15]:
# Index columns become int64 tensors (embedding lookups); features and targets
# become float32 tensors.
player_ids = torch.tensor(df_train['play_id_encoded'].values, dtype=torch.long)
pos_cat = torch.tensor(df_train['direction_encoded'].values, dtype=torch.long)
X_cont = torch.tensor(X_cont, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

In [None]:
# Batches of (id index, direction index, continuous features, targets),
# reshuffled every epoch.
dataset = TensorDataset(player_ids, pos_cat, X_cont, y)
train_load = DataLoader(dataset, batch_size=127, shuffle=True)

# Embedding-table sizes are taken from the data so they always cover every
# encoded index. The direction count was previously hard-coded to 2, which would
# fail on a lookup if the data ever held more distinct values.
n_players = df_train['play_id_n'].nunique()  # rows of the id embedding (distinct play_id_n values)
num_cat = df_train['play_direction'].nunique()
num__feat = len(cols_cont)

model = PlayerEmbeddingModel(
    num_unique_players=n_players,
    num_prior_features=num__feat,
    num_categories=num_cat,
    embedding_dim=4,
    category_embedding_dim=1
)

In [None]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Early-stopping bookkeeping: the lowest mean epoch loss seen so far, and how
# many epochs in a row have failed to beat it.
best_loss, patience, patience_counter = float('inf'), 10, 0

for epoch in range(1000):
    epoch_loss, num_batches = 0, 0

    for batch in train_load:
        player_batch, category_batch, features_batch, targets_batch = batch

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        predictions = model(player_batch, category_batch, features_batch)
        loss = criterion(predictions, targets_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Mean of the per-batch training losses for this epoch.
    avg_loss = epoch_loss / num_batches

    # Stop once the training loss has gone `patience` consecutive epochs without
    # a new minimum. This monitors the training loss itself; no held-out set is used.
    if avg_loss < best_loss:
        best_loss, patience_counter = avg_loss, 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            break

    print(f'epoch {epoch+1}, loss: {avg_loss}')

In [None]:
# Pull the trained player_embedding weight matrix out of the model as a detached
# CPU tensor and show it. Rows follow the integer index the model was fed, which
# this notebook builds from play_id_n (the 'play_id_encoded' column).
player_embeddings = extract_embeddings(model)
display(player_embeddings)

In [None]:
# Project the learned embedding table to two dimensions for plotting.
tsne = TSNE(n_components=2, random_state=42)
player_embeddings_2d = tsne.fit_transform(player_embeddings.numpy())

embeddings_df = pd.DataFrame(player_embeddings_2d, columns=['x', 'y'])
# Embedding row i belongs to the i-th distinct play_id_n: the index fed to the
# model enumerates df_train['play_id_n'].unique(). Labelling rows with the unique
# nfl_id values has a different length and raises on assignment, and the
# `df_bio` lookup table is not defined in this notebook, so points are labelled
# with the play key they were actually trained on.
embeddings_df['play_id_n'] = df_train['play_id_n'].unique()

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_df['x'], embeddings_df['y'], alpha=0.5)
# NOTE(review): the title still says "Player" although each point is one play_id_n.
plt.title('Player Embeddings (t-SNE)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')

# Annotate a random subset of points. The count is capped at the number of points
# because random.sample raises ValueError when asked for more items than exist;
# a fixed seed keeps the labelled subset the same between runs.
num_labels = min(50, len(embeddings_df))
random_indices = random.Random(42).sample(range(len(embeddings_df)), num_labels)

for i in random_indices:
    point = embeddings_df.iloc[i]
    plt.annotate(f"{point['play_id_n']:.0f}", (point['x'], point['y']))

plt.show()