In [193]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import IPython

Reading in Data

In [194]:
"""Emily's reading in files"""
data_dir = os.getcwd() + '/data/nfl-big-data-bowl-2024/'
players = pd.read_csv(data_dir + "players.csv")
week = pd.read_csv(data_dir + "tracking_week_1.csv")
plays = pd.read_csv(data_dir + "plays.csv")
tackles = pd.read_csv(data_dir + "tackles.csv")


# join player positioning information onto a week's worth of tracking data 
week = week.merge(players.loc[:, ['nflId', 'position']], how='left')
week.shape

(1407439, 18)

Defining Functions for feature engineering at each level

In [195]:
"""Frame level functions"""
# def distance_to_ball(play_data, frame):
#     """
#     Calculates the distance to the ball carrier for each player per frame in a play.

#     Parameters:
#     - dataset_name: Name of the dataset to load
#     """
#     ball_carrier_id = play_data["ballCarrierId"].iloc[0]
#     ball_carrier = frame[frame["nflId"] == ball_carrier_id]
#     carr_x = ball_carrier["x"].values[0]
#     carr_y = ball_carrier["y"].values[0]
#     frame['distance_to_ball_carrier'] = cdist(frame[['x', 'y']], [[carr_x, carr_y]], metric='euclidean')
#     return frame

def create_dnn_input_df(frames, side_of_ball):
    """
    Creates the input dataframe for the DNN model.

    Parameters:
    - frames: Name of the dataset to load
    """
    sorted = frames.sort_values(by='distance_to_ball').reset_index(drop=True)
    
    input_df = pd.DataFrame()
    # Iterate over each row in the sorted DataFrame
    for idx, row in sorted.iterrows():
        # Extract player's x and y values
        x = row['x']
        y = row['y']
        s = row['s']
        a = row['a']
        dis = row['dis']
        o = row['o']
        dir = row['dir']
        distance_to_ball = row['distance_to_ball']
        # Add x_i columns to the new DataFrame
        input_df.loc[0,f'{side_of_ball}_x_{idx}'] = x
        input_df.loc[0,f'{side_of_ball}_y_{idx}'] = y
        input_df.loc[0,f'{side_of_ball}_s_{idx}'] = s
        input_df.loc[0,f'{side_of_ball}_a_{idx}'] = a
        input_df.loc[0,f'{side_of_ball}_dis_{idx}'] = dis
        input_df.loc[0,f'{side_of_ball}_o_{idx}'] = o
        input_df.loc[0,f'{side_of_ball}_dir_{idx}'] = dir
        input_df.loc[0,f'{side_of_ball}_distance_to_ball_{idx}'] = distance_to_ball

    # Concatenate the original DataFrame with the new x_i DataFrame
    return input_df
#create_dnn_input_df(week, 'd')
def create_dnn_output_df(frames, tackles_data):
    """
    Creates the output dataframe for the DNN model.

    Parameters:
    - frames: Name of the dataset to load
    - tackles_data: Name of the tackles dataset to load
    """
    total_num_tackles = len(tackles_data)
    tacklers = []
    for idx, row in tackles_data.iterrows():
        tacklers.append(row['nflId'])

    sorted = frames.sort_values(by='distance_to_ball').reset_index(drop=True)
    
    output_df = pd.DataFrame()
    # Iterate over each row in the sorted DataFrame
    for idx, row in sorted.iterrows():
        # Extract player's x and y values
        if row['nflId'] in tacklers:
            output_df.loc[0,f'tackle_{idx}'] = 1/total_num_tackles
        else:
            output_df.loc[0,f'tackle_{idx}'] = 0

    if total_num_tackles == 0:
        output_df.loc[0,f'tackle_11'] = 1
    else:
        output_df.loc[0,f'tackle_11'] = 0
            
    # Concatenate the original DataFrame with the new x_i DataFrame
    return output_df
    






In [196]:
week

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event,position
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.370000,27.27,1.62,1.15,0.16,231.74,147.90,,G
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.470000,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived,G
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.560000,27.01,1.57,0.49,0.15,230.98,147.05,,G
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.640000,26.90,1.44,0.89,0.14,232.38,145.42,,G
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.720000,26.80,1.29,1.24,0.13,233.36,141.95,,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407434,2022091200,3826,,football,49,2022-09-12 23:05:57.799999,,football,left,56.220001,9.89,2.56,1.25,0.25,,,tackle,
1407435,2022091200,3826,,football,50,2022-09-12 23:05:57.900000,,football,left,56.060001,10.08,2.50,1.14,0.24,,,,
1407436,2022091200,3826,,football,51,2022-09-12 23:05:58.000000,,football,left,55.889999,10.27,2.38,1.70,0.25,,,,
1407437,2022091200,3826,,football,52,2022-09-12 23:05:58.099999,,football,left,55.730000,10.44,2.07,2.83,0.24,,,,


Iterating through each Play

In [197]:
# getting distance to ball as separate columns
week = week[(week['gameId'].isin([2022090800])) & (week['playId'] == 56) & (week['frameId'].isin([1,2,3]))]
grouped = week.groupby(['playId', 'frameId']).apply(lambda g: g[g['club'] == 'football'])[['x','y']]
test = week.merge(grouped, on = ['playId', 'frameId'], how = 'left', suffixes=('','_ball') )

#using jersey number and position
play = game.loc[game['playId'] == pid].copy()
        play_data = game_plays[game_plays['playId'] == pid]
        play_tackles = game_tackles[game_tackles['playId'] == pid]  

In [198]:
week

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event,position
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.370000,27.270000,1.620000,1.15,0.16,231.74,147.90,,G
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.470000,27.130000,1.670000,0.61,0.17,230.98,148.53,pass_arrived,G
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.560000,27.010000,1.570000,0.49,0.15,230.98,147.05,,G
22,2022090800,56,38577.0,Bobby Wagner,1,2022-09-08 20:24:05.200000,45.0,LA,left,78.250000,27.530000,1.240000,3.81,0.11,4.20,331.67,,ILB
23,2022090800,56,38577.0,Bobby Wagner,2,2022-09-08 20:24:05.299999,45.0,LA,left,78.200000,27.690000,1.780000,4.43,0.16,357.21,346.31,pass_arrived,ILB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2022090800,56,53532.0,Ernest Jones,2,2022-09-08 20:24:05.299999,53.0,LA,left,76.400000,33.570000,0.770000,3.88,0.08,105.44,181.46,pass_arrived,ILB
464,2022090800,56,53532.0,Ernest Jones,3,2022-09-08 20:24:05.400000,53.0,LA,left,76.420000,33.500000,0.790000,3.76,0.07,107.15,148.69,,ILB
484,2022090800,56,,football,1,2022-09-08 20:24:05.200000,,football,left,85.050003,33.810001,22.209999,11.85,2.07,,,,
485,2022090800,56,,football,2,2022-09-08 20:24:05.299999,,football,left,83.150002,34.830002,20.900000,13.82,2.16,,,pass_arrived,


In [199]:
test['distance_to_ball'] = np.sqrt(
    (test["x"] - test["x_ball"]) ** 2
    + (test["y"] - test["y_ball"]) ** 2
)

In [200]:
test_sorted = test.sort_values(['gameId','playId', 'frameId', 'club', 'distance_to_ball'])
test_sorted.groupby(['gameId','playId', 'frameId', 'club']).cumcount()
# sorted = frames.sort_values(by='distance_to_ball_carrier').reset_index(drop=True)
pre_final = test_sorted.copy()
pre_final['number in line'] = pre_final.groupby(['gameId','playId', 'frameId', 'club']).cumcount()
#test_sorted

In [201]:
import warnings
warnings.filterwarnings("ignore")
# Process each game, play, and frame
all_play_types = pre_final.copy()
final_input_list = []
for gid in tqdm(all_play_types['gameId'].unique(), leave=True):
    game = all_play_types.loc[all_play_types['gameId'] == gid].copy()
    game_plays = plays[plays['gameId'] == gid]
    game_tackles = tackles[tackles['gameId'] == gid]

    for pid in tqdm(game['playId'].unique(), leave=False):
        play = game.loc[game['playId'] == pid].copy()
        play_data = game_plays[game_plays['playId'] == pid]
        play_tackles = game_tackles[game_tackles['playId'] == pid]  

        #Adding distance to ball carrier as a feature in tracking data
        #play = distance_to_ball(play_data, play)
        for fid in play['frameId'].unique():
            
            print(fid)
            offense = play[play["club"] == play_data["possessionTeam"].iloc[0]]
            defense = play[play["club"] == play_data["defensiveTeam"].iloc[0]]

            offense_input = create_dnn_input_df(offense, "o")
            defense_input = create_dnn_input_df(defense, "d")
            output_df = create_dnn_output_df(defense, play_tackles)
            
            player_tracking_data = pd.concat([offense_input, defense_input, output_df], axis=1)
        final_input_list.append(player_tracking_data)

        #Adding who tackled the ball carrier
        
final_input_df = pd.concat(final_input_list)


        



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

1
2
3


In [202]:
final_input_df

Unnamed: 0,o_x_0,o_y_0,o_s_0,o_a_0,o_dis_0,o_o_0,o_dir_0,o_distance_to_ball_0,o_x_1,o_y_1,...,tackle_23,tackle_24,tackle_25,tackle_26,tackle_27,tackle_28,tackle_29,tackle_30,tackle_31,tackle_32
0,80.34,37.09,5.98,2.48,0.6,127.16,194.4,2.051827,80.48,37.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# don't need ball carrier's distance to themselves
final_input_df = final_input_df.drop(columns=['o_distance_to_ball_carrier_0'])

In [31]:
#final_input_df.to_csv(index=False, path_or_buf="final_input_df_with_output.csv")

In [190]:
final_input_df.head()

Unnamed: 0,o_x_0,o_y_0,o_s_0,o_a_0,o_dis_0,o_o_0,o_dir_0,o_distance_to_ball_0,o_x_1,o_y_1,...,tackle_12,tackle_13,tackle_14,tackle_15,tackle_16,tackle_17,tackle_18,tackle_19,tackle_20,tackle_21
0,80.34,37.09,5.98,2.48,0.6,127.16,194.4,2.051827,80.48,37.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
