In [96]:
import pandas as pd
import numpy as np

In [97]:
def standardize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a direction-invariant view of all plays.

    Rows whose ``play_direction`` is ``'left'`` are rotated 180 degrees about
    the centre of the field (both x and y are mirrored) so that every play in
    the result is expressed as if it were a ``'right'`` play. Rows with any
    other ``play_direction`` value are left untouched.

    Required columns: ``play_direction``, ``x``, ``y``, ``o``, ``dir``,
    ``absolute_yardline_number``. A missing one raises ``KeyError``.
    Optional columns: ``ball_land_x``, ``ball_land_y``.

    Returns a new DataFrame (the input is not modified) in which:
    - ``x``, ``y``, ``o``, ``dir``, ``absolute_yardline_number`` and, when
      present, ``ball_land_x`` / ``ball_land_y`` are overwritten with their
      rotated values for 'left' plays
    - ``x_rel`` = ``x`` - ``absolute_yardline_number`` (0 at the line of
      scrimmage; the intent is offense at negative and defense at positive
      values)
    - ``y_rel`` = ``y`` minus half the field width (0 on the long centre line)
    - ``ball_land_x_rel`` / ``ball_land_y_rel`` are the same offsets applied to
      the ball landing spot, added only when the source column exists
    - ``dist_to_ball`` is the Euclidean distance from (x, y) to the landing
      spot, added only when both landing columns exist
    """
    # Field extent used for mirroring. FIELD_LENGTH stays an int so that
    # integer-typed columns keep their dtype after the flip.
    FIELD_LENGTH = 120
    FIELD_WIDTH = 53.3

    # Work on a copy so the caller's frame is never mutated
    df_rel = df.copy()

    has_ball_x = 'ball_land_x' in df_rel.columns
    has_ball_y = 'ball_land_y' in df_rel.columns

    # Only 'left' plays need to be rotated
    is_left = df_rel['play_direction'] == 'left'

    # Mirror everything that lives on the long (x) axis: x' = 120 - x.
    # absolute_yardline_number is mirrored with the same rule so that the
    # x_rel subtraction below stays in a single coordinate frame.
    # NOTE(review): assumes absolute_yardline_number is expressed on the same
    # 0-120 axis as x - confirm against the data dictionary.
    x_axis_cols = ['x'] + (['ball_land_x'] if has_ball_x else []) + ['absolute_yardline_number']
    for col in x_axis_cols:
        df_rel.loc[is_left, col] = FIELD_LENGTH - df_rel.loc[is_left, col]

    # Mirror the short (y) axis as well: y' = 53.3 - y.
    # Flipping both axes is a 180-degree rotation, so players stay on the same
    # relative side of the formation.
    y_axis_cols = ['y'] + (['ball_land_y'] if has_ball_y else [])
    for col in y_axis_cols:
        df_rel.loc[is_left, col] = FIELD_WIDTH - df_rel.loc[is_left, col]

    # A 180-degree rotation shifts every heading by half a turn:
    #   theta' = (theta - 180) mod 360   (identical to (theta + 180) mod 360)
    # The modulo keeps the result in [0, 360).
    for col in ('o', 'dir'):
        df_rel.loc[is_left, col] = (df_rel.loc[is_left, col] - 180) % 360

    # Express x relative to the line of scrimmage (LOS at x_rel = 0)
    los_x = df_rel['absolute_yardline_number']
    df_rel['x_rel'] = df_rel['x'] - los_x
    if has_ball_x:
        df_rel['ball_land_x_rel'] = df_rel['ball_land_x'] - los_x

    # Express y relative to the centre of the field (53.3 / 2 == 26.65)
    center_y = FIELD_WIDTH / 2
    df_rel['y_rel'] = df_rel['y'] - center_y
    if has_ball_y:
        df_rel['ball_land_y_rel'] = df_rel['ball_land_y'] - center_y

    # Straight-line distance from each row's position to the ball landing spot
    if has_ball_x and has_ball_y:
        df_rel['dist_to_ball'] = np.sqrt(
            (df_rel['x'] - df_rel['ball_land_x'])**2 +
            (df_rel['y'] - df_rel['ball_land_y'])**2
        )

    return df_rel

In [98]:
from pathlib import Path

# Get all input CSV files from dataset/train (sorted so the row order is reproducible)
train_path = Path('dataset/train')
input_files = sorted(train_path.glob('input*.csv'))

print(f"Found {len(input_files)} input files")

# Fail early with an actionable message: without this guard an empty glob
# only surfaces later as pd.concat's "No objects to concatenate".
if not input_files:
    raise FileNotFoundError(f"No input*.csv files found in {train_path.resolve()}")

# Read every weekly file, then concatenate once (cheaper than growing a frame in the loop)
dfs = []
for file in input_files:
    print(f"Reading {file.name}...")
    df = pd.read_csv(file)
    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
all_weeks = pd.concat(dfs, ignore_index=True)
print(f"\nTotal rows: {len(all_weeks):,}")

# Standardize all data (returns a new frame; all_weeks itself is left as read)
all_weeks_std = standardize(all_weeks)

Found 18 input files
Reading input_2023_w01.csv...
Reading input_2023_w02.csv...
Reading input_2023_w02.csv...
Reading input_2023_w03.csv...
Reading input_2023_w03.csv...
Reading input_2023_w04.csv...
Reading input_2023_w04.csv...
Reading input_2023_w05.csv...
Reading input_2023_w05.csv...
Reading input_2023_w06.csv...
Reading input_2023_w06.csv...
Reading input_2023_w07.csv...
Reading input_2023_w07.csv...
Reading input_2023_w08.csv...
Reading input_2023_w08.csv...
Reading input_2023_w09.csv...
Reading input_2023_w09.csv...
Reading input_2023_w10.csv...
Reading input_2023_w10.csv...
Reading input_2023_w11.csv...
Reading input_2023_w11.csv...
Reading input_2023_w12.csv...
Reading input_2023_w12.csv...
Reading input_2023_w13.csv...
Reading input_2023_w13.csv...
Reading input_2023_w14.csv...
Reading input_2023_w14.csv...
Reading input_2023_w15.csv...
Reading input_2023_w15.csv...
Reading input_2023_w16.csv...
Reading input_2023_w16.csv...
Reading input_2023_w17.csv...
Reading input_2023_

In [99]:
# Inspect the available columns (original inputs plus those added by standardize)
all_weeks_std.columns

Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y', 'x_rel',
       'ball_land_x_rel', 'y_rel', 'ball_land_y_rel', 'dist_to_ball'],
      dtype='object')

In [100]:
# Columns carried forward into the modelling table, in output order
play_features = [
    # row identifiers
    'game_id', 'play_id', 'frame_id', 'nfl_id',
    # per-player attributes
    'player_height', 'player_weight',
    'player_position', 'player_side', 'player_role',
    # per-frame motion columns
    's', 'a', 'dir', 'o',
    # standardized coordinates produced by standardize()
    'x_rel', 'ball_land_x_rel', 'y_rel', 'ball_land_y_rel',
]

# Keep only the listed columns; DataFrame.filter skips any label that is absent
all_weeks_std = all_weeks_std.filter(items=play_features)

# Random peek at the trimmed frame
all_weeks_std.sample(5)

Unnamed: 0,game_id,play_id,frame_id,nfl_id,player_height,player_weight,player_position,player_side,player_role,s,a,dir,o,x_rel,ball_land_x_rel,y_rel,ball_land_y_rel
614578,2023092401,1938,16,54533,6-2,200,CB,Defense,Defensive Coverage,3.04,3.26,117.14,236.56,2.03,18.290001,13.21,-4.709999
3958208,2023121708,3559,4,44926,6-1,220,SS,Defense,Defensive Coverage,0.6,0.77,73.63,286.81,8.9,20.36,6.63,26.4
4196455,2023122407,1722,8,39987,6-3,221,QB,Offense,Passer,1.24,4.2,285.71,83.1,-5.01,9.669998,0.03,-21.01
3785116,2023121600,3582,5,47855,6-2,245,OLB,Defense,Defensive Coverage,1.53,0.61,227.55,299.67,1.97,1.260002,-5.56,-19.53
4720336,2024010704,742,12,56248,6-3,202,WR,Offense,Other Route Runner,4.17,1.52,61.32,114.53,0.7,8.38,13.76,27.56


In [101]:
# Play-level columns needed from the supplementary file: the two merge keys
# plus the context columns used downstream.
supp_features = [
    'game_id', 'play_id', 'down', 'yards_to_go', 'dropback_type', 'team_coverage_type'
]

# Parse only the columns that are kept. usecols skips every other column, and
# low_memory=False makes pandas infer each dtype from the whole column instead
# of chunk by chunk.
# NOTE(review): the captured output of the previous run echoes the read_csv
# line, which looks like a pandas warning (likely mixed dtypes) - confirm it
# is gone after this change.
supp = pd.read_csv(
    'dataset/supplementary_data.csv',
    usecols=supp_features,
    low_memory=False,
)

# usecols returns columns in file order; filter() restores the order of supp_features
supp = supp.filter(supp_features)
supp.sample(5)

  supp = pd.read_csv('dataset/supplementary_data.csv')


Unnamed: 0,game_id,play_id,down,yards_to_go,dropback_type,team_coverage_type
2796,2023100103,3923,2,10,TRADITIONAL,COVER_3_ZONE
11097,2023121602,2467,1,10,TRADITIONAL,COVER_3_ZONE
15527,2024121600,1632,2,12,TRADITIONAL,COVER_2_ZONE
10991,2023121600,2136,2,7,TRADITIONAL,COVER_4_ZONE
4908,2023101900,3640,2,10,TRADITIONAL,COVER_6_ZONE


In [102]:
# Attach play-level context to every tracking row.
# how='left' keeps every tracking row even when a play has no supplementary
# record (its context columns are then NaN).
# validate='many_to_one' makes pandas raise MergeError if supp holds more than
# one row for a (game_id, play_id) pair, which would otherwise silently
# duplicate tracking rows.
merged = pd.merge(left=all_weeks_std,
                  right=supp,
                  how='left',
                  on=['game_id', 'play_id'],
                  validate='many_to_one')
merged.sample(5)

Unnamed: 0,game_id,play_id,frame_id,nfl_id,player_height,player_weight,player_position,player_side,player_role,s,...,dir,o,x_rel,ball_land_x_rel,y_rel,ball_land_y_rel,down,yards_to_go,dropback_type,team_coverage_type
103513,2023091004,3584,17,47877,6-1,206,CB,Defense,Defensive Coverage,2.7,...,109.77,231.8,10.3,-4.27,8.36,5.03,1,10,DESIGNED_ROLLOUT_RIGHT,COVER_4_ZONE
4287484,2023122411,4183,3,42929,6-2,242,ILB,Defense,Defensive Coverage,1.41,...,212.3,272.48,3.93,3.099998,4.26,3.17,1,10,TRADITIONAL,COVER_3_ZONE
3230518,2023112700,2541,15,47791,6-5,248,TE,Offense,Other Route Runner,2.05,...,112.77,108.84,-0.97,9.82,-2.76,-1.790001,4,3,TRADITIONAL,COVER_3_ZONE
4394434,2023123101,2065,18,47817,6-0,190,CB,Defense,Defensive Coverage,4.07,...,126.43,192.37,9.05,1.860001,16.31,-18.539999,2,9,TRADITIONAL,COVER_3_ZONE
3873332,2023121703,263,32,54483,6-3,225,WR,Offense,Targeted Receiver,6.74,...,94.5,58.81,15.53,24.330002,-7.9,1.319999,2,8,TRADITIONAL,COVER_4_ZONE


In [103]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Convert player_height from "feet-inches" format to total inches
def height_to_inches(height_str):
    """Convert a height string like '6-2' to total inches (74).

    Parameters
    ----------
    height_str : str or scalar
        Height written as "<feet>-<inches>". Missing values (None / NaN) and
        values that are not strings are also accepted.

    Returns
    -------
    int, None, or the input value
        Total inches for a string input; None for a missing value; the input
        unchanged when it is not a string (treated as already converted).

    Raises
    ------
    ValueError
        If a string input is not two integers separated by a single '-'.
    """
    if pd.isna(height_str):
        return None
    if not isinstance(height_str, str):
        # Already converted: lets the in-place column conversion be re-run
        # without failing on .split
        return height_str
    feet, inches = height_str.strip().split('-')
    return int(feet) * 12 + int(inches)

# Turn the "feet-inches" strings into inches first, so that player_height is
# no longer picked up by the object-dtype selection below
merged['player_height'] = merged['player_height'].apply(height_to_inches)

# Every column that is still object-typed is treated as categorical
categorical_cols = list(merged.select_dtypes(include=['object']).columns)
print(f"Categorical columns to encode: {categorical_cols}")

# Dense one-hot output; categories unseen at fit time encode as all zeros at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(merged[categorical_cols])

# Column labels generated by the encoder for the one-hot matrix
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the matrix in a frame that shares merged's index so rows line up on concat
encoded_df = pd.DataFrame(
    data=encoded_array,
    index=merged.index,
    columns=encoded_feature_names,
)

# Swap the raw categorical columns for their one-hot expansion
numeric_part = merged.drop(columns=categorical_cols)
merged_encoded = pd.concat([numeric_part, encoded_df], axis=1)

print(f"\nOriginal shape: {merged.shape}")
print(f"Encoded shape: {merged_encoded.shape}")
merged_encoded.sample(5)

Categorical columns to encode: ['player_position', 'player_side', 'player_role', 'dropback_type', 'team_coverage_type']

Original shape: (4880579, 21)
Encoded shape: (4880579, 57)

Original shape: (4880579, 21)
Encoded shape: (4880579, 57)


Unnamed: 0,game_id,play_id,frame_id,nfl_id,player_height,player_weight,s,a,dir,o,...,dropback_type_nan,team_coverage_type_COVER_0_MAN,team_coverage_type_COVER_1_MAN,team_coverage_type_COVER_2_MAN,team_coverage_type_COVER_2_ZONE,team_coverage_type_COVER_3_ZONE,team_coverage_type_COVER_4_ZONE,team_coverage_type_COVER_6_ZONE,team_coverage_type_PREVENT,team_coverage_type_nan
4365080,2023122800,4463,20,56252,69,180,3.53,4.11,66.42,94.42,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2741374,2023111901,3859,13,43503,72,238,2.41,2.74,171.83,258.93,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4418248,2023123103,1202,31,54504,72,195,6.02,3.52,26.96,103.05,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1433521,2023101501,1233,28,47871,74,237,4.17,1.36,112.99,236.43,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4262271,2023122410,3227,14,44827,73,212,1.83,3.88,52.39,291.96,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [104]:
# Transform data into the required format:
# Historical Agent Features (Batch, T_hist, N_agents, D_agent)
# Global Context Features (Batch, D_global)
# Ground Truth Trajectories (Batch, T_pred, N_agents, 2)
#
# For a play with T frames, frames 0..T-2 supply the agent features and
# frames 1..T-1 supply the target positions, so T_hist == T_pred == T - 1 and
# step i of the target is the position one frame after step i of the history.

import numpy as np

# Group by play (game_id, play_id)
grouped = merged_encoded.groupby(['game_id', 'play_id'])

# Agent features: per-player features that vary over time
agent_feature_cols = ['player_height', 'player_weight', 's', 'a', 'dir', 'o',
                      'x_rel', 'y_rel'] + [col for col in merged_encoded.columns
                                            if col.startswith('player_position_') or
                                            col.startswith('player_side_') or
                                            col.startswith('player_role_')]

# Global context features: per-play features that don't vary by agent
global_feature_cols = ['down', 'yards_to_go'] + [col for col in merged_encoded.columns
                                                   if col.startswith('dropback_type_') or
                                                   col.startswith('team_coverage_type_')]

# Ground truth trajectories: next timestep positions (x_rel, y_rel) for each agent
trajectory_cols = ['x_rel', 'y_rel']

historical_agent_features = []
global_context_features = []
ground_truth_trajectories = []

for (game_id, play_id), play_data in grouped:
    # Sort once by (frame, player): rows are then laid out frame-major with the
    # agents in nfl_id order inside every frame, so the play can be reshaped
    # instead of being re-filtered and re-sorted for every frame.
    play_data = play_data.sort_values(['frame_id', 'nfl_id'])

    n_frames = play_data['frame_id'].nunique()

    # Need at least 2 frames to have history + next timestep
    if n_frames < 2:
        continue

    # The reshape below (and the agent-by-agent pairing of a frame with the
    # next one) is only meaningful when every frame lists exactly the same
    # players. Verify that instead of silently pairing different players.
    rosters_aligned = len(play_data) % n_frames == 0
    if rosters_aligned:
        frame_grid = play_data['frame_id'].to_numpy().reshape(n_frames, -1)
        agent_grid = play_data['nfl_id'].to_numpy().reshape(n_frames, -1)
        rosters_aligned = bool(
            (frame_grid == frame_grid[:, :1]).all()      # each grid row is a single frame
            and (agent_grid == agent_grid[:1]).all()     # same nfl_ids, same order, in every frame
        )
    if not rosters_aligned:
        raise ValueError(
            f"Play (game_id={game_id}, play_id={play_id}) does not have the same "
            f"set of nfl_id values in every frame; agents cannot be aligned across frames"
        )
    n_agents = agent_grid.shape[1]

    # (T, N_agents, D_agent) and (T, N_agents, 2) views of the whole play
    agent_tensor = play_data[agent_feature_cols].to_numpy().reshape(
        n_frames, n_agents, len(agent_feature_cols))
    position_tensor = play_data[trajectory_cols].to_numpy().reshape(
        n_frames, n_agents, len(trajectory_cols))

    # History: every frame except the last -> (T_hist, N_agents, D_agent)
    historical_agent_features.append(agent_tensor[:-1])

    # Ground truth: positions one frame later -> (T_pred, N_agents, 2)
    ground_truth_trajectories.append(position_tensor[1:])

    # Global context (same for entire play, take first row)
    global_features = play_data[global_feature_cols].iloc[0].to_numpy()
    global_context_features.append(global_features)

# Convert to arrays
# Note: plays may have different numbers of frames/agents, so we can't stack directly
# You may need to pad or handle variable lengths depending on your model

print(f"Number of plays: {len(historical_agent_features)}")
print(f"Sample play shape (T_hist, N_agents, D_agent): {historical_agent_features[0].shape}")
print(f"Global context shape (D_global,): {global_context_features[0].shape}")
print(f"Ground truth trajectory shape (T_pred, N_agents, 2): {ground_truth_trajectories[0].shape}")

# Show sample
print(f"\nFirst play:")
print(f"  Historical features: {historical_agent_features[0].shape}")
print(f"  Global context: {global_context_features[0]}")
print(f"  Ground truth shape: {ground_truth_trajectories[0].shape}")

Number of plays: 14108
Sample play shape (T_hist, N_agents, D_agent): (25, 9, 33)
Global context shape (D_global,): (18,)
Ground truth trajectory shape (T_pred, N_agents, 2): (25, 9, 2)

First play:
  Historical features: (25, 9, 33)
  Global context: [3. 3. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  Ground truth shape: (25, 9, 2)


In [105]:
import torch

# Per-play arrays differ in shape, so they are kept as a list of float32
# tensors; the global context has one fixed-length row per play and is packed
# into a single 2-D tensor.
def _to_float32(array):
    """Copy a numpy array into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)

historical_agent_features_tensors = [_to_float32(play) for play in historical_agent_features]
ground_truth_trajectories_tensors = [_to_float32(play) for play in ground_truth_trajectories]
global_context_features_tensor = _to_float32(np.array(global_context_features))

# Output directory (created if missing; its parent must already exist)
save_path = Path('dataset/processed')
save_path.mkdir(exist_ok=True)

# Bundle the three pieces under stable keys and write them as one file
payload = {
    'historical_agent_features': historical_agent_features_tensors,
    'ground_truth_trajectories': ground_truth_trajectories_tensors,
    'global_context_features': global_context_features_tensor
}
torch.save(payload, save_path / 'processed_data.pt')

# Summary of what was written
print(f"\nSaved processed data to {save_path / 'processed_data.pt'}")
print(f"  - {len(historical_agent_features_tensors)} plays")
print(f"  - Historical features: list of tensors with shape (T_hist, N_agents, D_agent)")
print(f"  - Ground truth: list of tensors with shape (T_pred, N_agents, 2)")
print(f"  - Global context: tensor with shape ({global_context_features_tensor.shape})")


Saved processed data to dataset/processed/processed_data.pt
  - 14108 plays
  - Historical features: list of tensors with shape (T_hist, N_agents, D_agent)
  - Ground truth: list of tensors with shape (T_pred, N_agents, 2)
  - Global context: tensor with shape (torch.Size([14108, 18]))
