In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.animation as animation
import nfl_data_py as nfl
from IPython.display import HTML

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Load the input rows (one row per player per frame, see input_df.info()).
# The path is relative to the notebook's working directory.
input_df = pd.read_csv("data/inputweeks.csv")

### Spatial Model

1. Create Tensors From the Dataframe
    - node feats
        - each frame contains N players
        - each player has F columns associated with the player within space
        - this N x F matrix is built once per frame, for each of the T frames
        - this group of T matrices is processed in a batch of size B
        - output shape = [B, T, N, F]
    - pos
        - only 2 columns associated with each player in a frame - x_LOS and y_centered
        - output shape = [B, T, N, 2]
    - mask
        - player counts are inconsistent across plays, so a mask is added
        - [B, T, N] = 1, real player
        - [B, T, N] = 0, padding
    - targets for training
        - output df
        - shape = [B, T, N, 2]
            - in the batch of size B
            - over the output frames size T
            - over the players that we need to predict
            - predict x_LOS and y_centered
        

In [3]:
# Summarise the input frame: row count, column names and dtypes.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4880579 entries, 0 to 4880578
Data columns (total 45 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   game_id                   int64  
 1   play_id                   int64  
 2   player_to_predict         bool   
 3   nfl_id                    int64  
 4   frame_id                  int64  
 5   absolute_yardline_number  int64  
 6   player_weight             int64  
 7   player_birth_date         object 
 8   player_position           object 
 9   player_role               object 
 10  x                         float64
 11  y                         float64
 12  s                         float64
 13  a                         float64
 14  dir                       float64
 15  o                         float64
 16  num_frames_output         int64  
 17  ball_land_x               float64
 18  ball_land_y               float64
 19  line_of_scrimmage         float64
 20  x_LOS                   

In [4]:
# Columns copied into the node-feature tensor. The order of this list is the
# order of the feature axis, so F == len(node_feature_cols) == 20.
node_feature_cols = [
    "x_smooth",
    "y_smooth",
    "v_x_smooth",
    "v_y_smooth",
    "speed_roll_mean",
    "acc_roll_mean",
    "heading_smooth",
    "angular_velocity_smooth",
    "distance_to_ball",
    "angle_to_ball_rad",
    "x_LOS",
    "y_centered",
    "ball_land_x_LOS",
    "ball_land_y_centered",
    "momentum",
    "kinetic_energy",
    "position_encoded",
    "player_weight",
    "player_height_inches",
    "player_to_predict",
]

# Columns copied into the position tensor (one coordinate pair per player).
pos_cols = [
    "x_LOS",
    "y_centered",
]

In [5]:
# Replace every missing value in every column with 0 and rebind input_df to
# the filled copy. After this, a filled 0 cannot be told apart from a real 0.
input_df = input_df.fillna(0)

In [6]:
# Load the output rows (the frames whose positions are to be predicted).
# The path is relative to the notebook's working directory.
output_df = pd.read_csv("data/outputweeks.csv")

In [7]:
# Peek at the first rows of the output frame.
output_df.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,player_name,x,y,x_LOS,y_centered
0,2023090700,101,True,46137,27,Justin Reid,56.22,17.28,14.22,-9.37
1,2023090700,101,True,46137,28,Justin Reid,56.63,16.88,14.63,-9.77
2,2023090700,101,True,46137,29,Justin Reid,57.06,16.46,15.06,-10.19
3,2023090700,101,True,46137,30,Justin Reid,57.48,16.02,15.48,-10.63
4,2023090700,101,True,46137,31,Justin Reid,57.91,15.56,15.91,-11.09


## Create Tensors

In [8]:
# Step 1
# Group the input and output rows by play; a (game_id, play_id) pair is the
# grouping key, so iterating either object yields one DataFrame per play.
plays_in  = input_df.groupby(["game_id", "play_id"])
plays_out = output_df.groupby(["game_id", "play_id"])

In [38]:
## Initially operate on a couple of plays
# NOTE: GroupBy.head() keeps the first 5 rows of *each* play (it does not pick
# the first 5 plays), so `sample` still touches every play in input_df.
sample = plays_in.head()
# Keep the first 200 of those rows (at most 5 rows per play).
sample_x = sample.head(200)
# Regroup the reduced sample by play.
plays_in_small = sample_x.groupby(["game_id","play_id"])

In [39]:
# Display the sampled rows.
sample_x 

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,absolute_yardline_number,player_weight,player_birth_date,player_position,player_role,...,speed_roll_mean,acc_roll_mean,x_smooth,y_smooth,v_x_smooth,v_y_smooth,heading_smooth,angular_velocity,angular_velocity_smooth,angle_to_ball_rad
0,2023090700,101,False,43290,1,42,223,1994-10-14,QB,Passer,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.863368
1,2023090700,101,False,43290,2,42,223,1994-10-14,QB,Passer,...,0.000000,0.000000,-4.643333,3.420000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.863368
2,2023090700,101,False,43290,3,42,223,1994-10-14,QB,Passer,...,0.000000,0.000000,-4.650000,3.420000,-0.066667,0.000000,3.141593,0.000000,0.000000,-0.863177
3,2023090700,101,False,43290,4,42,223,1994-10-14,QB,Passer,...,0.020000,0.456667,-4.660000,3.420000,-0.100000,0.000000,3.141593,0.000000,0.000000,-0.862986
4,2023090700,101,False,43290,5,42,223,1994-10-14,QB,Passer,...,0.123333,1.353333,-4.673333,3.420000,-0.133333,0.000000,3.141593,-0.143117,-0.047706,-0.862796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13625,2023090700,2638,False,38696,1,51,198,1990-03-12,WR,Other Route Runner,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.693693
13626,2023090700,2638,False,38696,2,51,198,1990-03-12,WR,Other Route Runner,...,0.000000,0.000000,-1.376667,-3.726667,0.000000,0.000000,0.000000,0.000000,0.000000,-0.692947
13627,2023090700,2638,False,38696,3,51,198,1990-03-12,WR,Other Route Runner,...,0.000000,0.000000,-1.380000,-3.723333,-0.033333,0.033333,2.356194,0.000000,0.000000,-0.693843
13628,2023090700,2638,False,38696,4,51,198,1990-03-12,WR,Other Route Runner,...,0.000000,0.000000,-1.380000,-3.720000,0.000000,0.033333,1.570796,0.000000,0.000000,-0.693843


In [41]:
# Iterating the groupby yields ((game_id, play_id), frame) pairs; keep the
# keys of the first three plays.
sample_ids = [play_key for play_key, _ in plays_in_small][:3]

In [50]:
# Keep only the rows whose (game_id, play_id) is one of the sampled plays.
# set_index is used only to obtain a (game_id, play_id) MultiIndex for the
# isin() membership test; input_df itself is left unchanged.
tiny_df = input_df[input_df.set_index(["game_id","play_id"]).index.isin(sample_ids)]
# Regroup the selected rows by play (all rows of each sampled play are kept).
plays_in_tiny = tiny_df.groupby(["game_id","play_id"])

In [51]:
def build_all_play_tensors(plays_in, node_feature_cols, pos_cols, T_max=32, N_max=13):
    """
    Build node-feature, position and mask tensors for every play.

    Each play gets its own nfl_id -> row-index mapping (ids in ascending
    order), so a given row of the player axis refers to the same player in
    every frame of that play. Frames are ordered by ascending frame_id and
    stored at consecutive time indices 0, 1, 2, ...

    Args:
        plays_in: iterable of (play_key, DataFrame) pairs, e.g. the result of
            ``df.groupby(["game_id", "play_id"])``. Every DataFrame must have
            "frame_id", "nfl_id" and all columns named in node_feature_cols
            and pos_cols.
        node_feature_cols: column names copied into the feature axis (F).
        pos_cols: column names copied into the position axis (P).
        T_max: number of frames every play is zero-padded up to.
        N_max: number of player rows every play is zero-padded up to.

    Output:
        node_list   -> list of float32 arrays [T, N_max, F]
        pos_list    -> list of float32 arrays [T, N_max, P]
        mask_list   -> list of float32 arrays [T, N_max]
                       (1 = real player in that frame, 0 = padding/absent)
        player_maps -> list of dictionaries per play (nfl_id -> row index)

        T equals T_max for plays with at most T_max frames. Plays with more
        frames are NOT truncated and keep their full length, so callers that
        batch plays together must choose T_max >= the longest play.

    Raises:
        ValueError: if a play contains more than N_max distinct players.
    """
    node_cols = list(node_feature_cols)
    position_cols = list(pos_cols)
    n_feats = len(node_cols)
    n_pos = len(position_cols)

    node_list = []
    pos_list = []
    mask_list = []
    player_maps = []   # store mapping for later use (training / output alignment)

    for play_key, play_df in plays_in:

        # ----------------------------
        # 1. Build per-play player index (nfl_id -> row 0..N_play-1)
        # ----------------------------
        player_ids = np.sort(play_df["nfl_id"].unique())
        n_play = len(player_ids)
        if n_play > N_max:
            raise ValueError(
                f"play {play_key} has {n_play} players, which exceeds N_max={N_max}"
            )
        player_maps.append({pid: i for i, pid in enumerate(player_ids)})

        # ----------------------------
        # 2. Locate every row on the (time, player) grid
        # ----------------------------
        # Both id arrays are sorted and contain every value looked up, so
        # searchsorted returns the exact position of each value.
        frame_ids = np.sort(play_df["frame_id"].unique())
        t_idx = np.searchsorted(frame_ids, play_df["frame_id"].to_numpy())
        n_idx = np.searchsorted(player_ids, play_df["nfl_id"].to_numpy())

        # ----------------------------
        # 3. Allocate zero tensors that already include all padding
        # ----------------------------
        t_out = max(len(frame_ids), T_max)  # long plays keep their length
        node_tensor = np.zeros((t_out, N_max, n_feats), dtype=np.float32)
        pos_tensor = np.zeros((t_out, N_max, n_pos), dtype=np.float32)
        mask_tensor = np.zeros((t_out, N_max), dtype=np.float32)

        # ----------------------------
        # 4. Scatter all rows of the play in one vectorised assignment
        # ----------------------------
        node_tensor[t_idx, n_idx] = play_df[node_cols].to_numpy(dtype=np.float32)
        pos_tensor[t_idx, n_idx] = play_df[position_cols].to_numpy(dtype=np.float32)
        mask_tensor[t_idx, n_idx] = 1.0

        node_list.append(node_tensor)
        pos_list.append(pos_tensor)
        mask_list.append(mask_tensor)

    return node_list, pos_list, mask_list, player_maps

In [52]:
# Build the tensors for the tiny sample of plays, using the function's
# default padding sizes (T_max=32, N_max=13).
node_list, pos_list, mask_list, player_maps = build_all_play_tensors(
    plays_in_tiny, node_feature_cols, pos_cols
)

In [53]:
print(node_list[0].shape)   # (T_max, N_max, F) -> (32, 13, 20) with the default padding
print(pos_list[0].shape)    # (T_max, N_max, 2) -> (32, 13, 2)
print(mask_list[0].shape)   # (T_max, N_max)    -> (32, 13)

# nfl_id -> row index on the player axis for play 0
print(player_maps[0]) 

(32, 13, 20)
(32, 13, 2)
(32, 13)
{43290: 0, 44930: 1, 46137: 2, 52546: 3, 53487: 4, 53541: 5, 53959: 6, 54486: 7, 54527: 8}


## Tiny GNN

- transforms player features into embeddings
    - takes a row and creates a vector representation: a compact summary of that player's state given the features

- hidden dimensions are defined by the coder
    - 16, 32, 64
    - how much space the model uses to define a player's state
- transformation layers
    - lin1
        - transform raw input into hidden_dim size embeddings
    - lin2
        - transform the embeddings again, giving a deeper and more expressive representation
        - lets the model learn complex relationships
- forward function
    - apply linear transformation on x
    - apply the ReLU nonlinearity

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyGNN(nn.Module):
    """Two-layer per-row encoder: Linear -> ReLU -> Linear -> ReLU.

    Each row of the input is transformed independently of the others; the
    layers only act on the last (feature) axis.

    Args:
        in_dim: size of the input feature axis.
        hidden_dim: size of the embedding produced by both layers.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        # Raw features -> hidden_dim embedding.
        self.lin1 = nn.Linear(in_dim, hidden_dim)
        # hidden_dim embedding -> refined hidden_dim embedding.
        self.lin2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Return the embeddings for x, where the last axis of x has size in_dim."""
        hidden = x
        for layer in (self.lin1, self.lin2):
            hidden = F.relu(layer(hidden))
        return hidden

In [65]:
# Tiny GNN setup
x0 = torch.tensor(node_list[0][0], dtype=torch.float32)
gnn = TinyGNN(in_dim=x0.shape[1], hidden_dim=16)

# Collect embeddings for all frames in play 0
all_embeddings = []

for n in node_list:
    play_embs = []
    for t in range(n.shape[0]):
        frame_feats = torch.tensor(n[t], dtype=torch.float32)
        play_embs.append(gnn(frame_feats))
    all_embeddings.append(torch.stack(play_embs))  # [T_play, N_max, 16]

print(len(all_embeddings))  # number of plays
print(all_embeddings[0].shape)  # shape of play 0

3
torch.Size([32, 13, 16])


In [69]:
# player_maps holds one mapping per play and only 3 plays were built, so the
# valid indices are 0..2 (index 3 raised IndexError). Show the last play.
print(player_maps[-1])