In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import IPython

Reading in Data

In [2]:
"""Katies reading in files"""
players = pd.read_csv("players.csv")
week = pd.read_csv("tracking_week_1.csv")
plays = pd.read_csv("plays.csv")
tackles = pd.read_csv("tackles.csv")


# join player positioning information onto a week's worth of tracking data 
week = week.merge(players.loc[:, ['nflId', 'position']], how='left')
week.shape

(1407439, 18)

In [None]:
""" Matts reading in files """
def load_dataset(dataset_name):
    """
    Download a specific dataset from data directory.

    Parameters:
    - dataset_name: Name of the dataset to load
    """
    return pd.read_csv(f"C:\\Users\\mattd\\Documents\\GitHub\\big-data-bowl-2024\\data\\{dataset_name}.csv")

# Read In csvs
plays = load_dataset("plays")
players = load_dataset("players")
week = load_dataset("tracking_week_1")
tackles = load_dataset("tackles")
week = week.merge(players.loc[:, ['nflId', 'position']], how='left')
week.shape

Defining Functions for feature engineering at each level

(Just put in ones I thought of, if it has #TO DO: over it then it isnt complete/not started)

In [3]:
"""Game level functions"""

def get_frames_of_catch(games):
    """
    Returns only the frame of the play where the ball is caught.

    Parameters:
    - games: Dataset of games

    Returns:
    - frame: The tracking data of only the data where the ball is caught
    """
    return games.loc[games['event'] == 'pass_outcome_caught'].copy()

In [4]:
"""Frame level functions"""
def distance_to_ball(play_data, frame, ball_carrier_id):
    """
    Calculates the distance to the ball for each player per frame in a play.

    Parameters:
    - dataset_name: Name of the dataset to load
    """
    ball_carrier = frame[frame["nflId"] == ball_carrier_id]
    carr_x = ball_carrier["x"].values[0]
    carr_y = ball_carrier["y"].values[0]
    frame['distance_to_ball_carrier'] = cdist(frame[['x', 'y']], [[carr_x, carr_y]], metric='euclidean')
    frame['ballCarrierId'] = ball_carrier_id  # Include ballCarrierId in the frame
    return frame

def who_tackles(play_data, frame):
    """
    Adds who tackles the ball carrier to the tracking data set.

    Parameters:
    - dataset_name: Name of the dataset to load
    """
    ball_carrier_id = play_data["ballCarrierId"].iloc[0]
    ball_carrier = frame[frame["nflId"] == ball_carrier_id]
    carr_x = ball_carrier["x"].values[0]
    carr_y = ball_carrier["y"].values[0]
    frame['distance_to_ball_carrier'] = cdist(frame[['x', 'y']], [[carr_x, carr_y]], metric='euclidean')
    return frame

def create_dnn_input_df(frames, side_of_ball):
    """
    Creates the input dataframe for the DNN model.

    Parameters:
    - frames: Name of the dataset to load
    """
    sorted = frames.sort_values(by='distance_to_ball_carrier').reset_index(drop=True)
    
    input_df = pd.DataFrame()
    # Iterate over each row in the sorted DataFrame
    for idx, row in sorted.iterrows():
        # Extract player's x and y values
        x = row['x']
        y = row['y']
        s = row['s']
        a = row['a']
        dis = row['dis']
        o = row['o']
        dir = row['dir']
        distance_to_ball_carrier = row['distance_to_ball_carrier']


        # Add x_i columns to the new DataFrame
        input_df.loc[0,f'{side_of_ball}_x_{idx}'] = x
        input_df.loc[0,f'{side_of_ball}_y_{idx}'] = y
        input_df.loc[0,f'{side_of_ball}_s_{idx}'] = s
        input_df.loc[0,f'{side_of_ball}_a_{idx}'] = a
        input_df.loc[0,f'{side_of_ball}_dis_{idx}'] = dis
        input_df.loc[0,f'{side_of_ball}_o_{idx}'] = o
        input_df.loc[0,f'{side_of_ball}_dir_{idx}'] = dir
        input_df.loc[0,f'{side_of_ball}_distance_to_ball_carrier_{idx}'] = distance_to_ball_carrier

    # Concatenate the original DataFrame with the new x_i DataFrame
    return input_df
    


def calculate_nearby_players(play, radius=5):
    
    """
    Calculate the number of defenders and blockers within a specified radius of the ball carrier.
    
    Parameters:
    play - DataFrame containing play data including player positions and teams
    radius - Distance within which a player is considered 'nearby' ( 5 yards)
    
    Returns:
    DataFrame with additional columns 'defenders_nearby' and 'blockers_nearby'
    """
    
    ball_carrier_id = play["ballCarrierId"].iloc[0]
    ball_carrier = play[play["nflId"] == ball_carrier_id]

    # Calculate distances from the ball carrier to all players
    play['distance_to_carrier'] = np.sqrt((play['x'] - ball_carrier['x'].values[0])**2 + (play['y'] - ball_carrier['y'].values[0])**2)

    # Identify defenders and blockers
    is_defender = (play['club'] != ball_carrier['club'].values[0]) & (play['nflId'] != ball_carrier_id)
    is_blocker = (play['club'] == ball_carrier['club'].values[0]) & (play['nflId'] != ball_carrier_id)

    # Count nearby defenders and blockers
    defenders_nearby = play[is_defender & (play['distance_to_carrier'] < radius)].groupby(['gameId', 'playId']).nflId.nunique().rename('defenders_nearby')
    blockers_nearby = play[is_blocker & (play['distance_to_carrier'] < radius)].groupby(['gameId', 'playId']).nflId.nunique().rename('blockers_nearby')

    return defenders_nearby, blockers_nearby

In [None]:
#do not run until fixed:

#ERROR FOR BOTH:

1. It runs but cannot get it to merge in the "Process each game, play, and frame" portion (view 2 comments there)
2. ValueError: The column label 'gameId' is not unique. Merge issue????

    
def calculate_snap_to_catch_time(plays, tracking):
    """
    Calculates the time in milliseconds from ball_snap to pass_outcome_caught for each play.

    Parameters:
    - plays: DataFrame containing play data
    - tracking: DataFrame containing tracking data

    Returns:
    - DataFrame with gameId, playId, and time difference in milliseconds
    """
    time_diffs = []

    for pid in plays['playId'].unique():
        # Get the gameId for the current playId from the plays DataFrame
        gid = plays[plays['playId'] == pid]['gameId'].iloc[0]
        
        play_tracking = tracking[tracking['playId'] == pid]

        snap_tracking = play_tracking[play_tracking['event'] == 'ball_snap']
        catch_tracking = play_tracking[play_tracking['event'] == 'pass_outcome_caught']

        if not snap_tracking.empty and not catch_tracking.empty:
            snap_time = pd.to_datetime(snap_tracking['time'].iloc[0])
            catch_time = pd.to_datetime(catch_tracking['time'].iloc[0])

            time_diff = (catch_time - snap_time).total_seconds() * 1000
            time_diffs.append({'gameId': gid, 'playId': pid, 'time_diff_ms': time_diff})
        else:
            time_diffs.append({'gameId': gid, 'playId': pid, 'time_diff_ms': None})

    return pd.DataFrame(time_diffs)


def merge_play_data(final_df, plays_df, columns_to_merge):
    """
    Merges specified columns from the plays DataFrame into the final DataFrame.

    Parameters:
    - final_df: The final DataFrame to which the columns will be added.
    - plays_df: The DataFrame containing play data.
    - columns_to_merge: List of column names to be merged from plays_df to final_df.

    Returns:
    - DataFrame with merged columns.
    """
    # Select required columns from plays DataFrame along with gameId and playId
    plays_subset = plays_df[['gameId', 'playId'] + columns_to_merge]

    # Merge the plays data with the final DataFrame
    merged_df = final_df.merge(plays_subset, on=['gameId', 'playId'], how='left')
    return merged_df

# Usage of the function
columns_to_add = ['quarter', 'down', 'yardsToGo', 'gameClock', 'passLength', 'expectedPoints']
final_input_df = merge_play_data(final_input_df, plays, columns_to_add)



Iterating through each Play

In [5]:
# Process each game, play, and frame
pass_only = get_frames_of_catch(week)
#time_diffs_df = calculate_snap_to_catch_time(plays, week)-- this was for the calculate_snap_to_catch_time


final_input_list = []
for gid in tqdm(pass_only['gameId'].unique(), leave=True):
    
    # print(f"Processing Game ID: {gid}")  # Printing Game ID 
    game = pass_only.loc[pass_only['gameId'] == gid].copy()
    game_plays = plays[plays['gameId'] == gid]
    game_tackles = tackles[tackles['gameId'] == gid]

    for pid in tqdm(game['playId'].unique(), leave=False):
        #print(f"  Processing Play ID: {pid}")  # Print Play ID
        play = game.loc[game['playId'] == pid].copy()
        play_data = game_plays[game_plays['playId'] == pid]
        ball_carrier_id = play_data["ballCarrierId"].iloc[0]  # Extract ballCarrierId

        
        #print("Columns in play_data:", play_data.columns)


        #Adding distance to ball carrier as a feature in tracking data
        play = distance_to_ball(play_data, play, ball_carrier_id)  # Pass ballCarrierId

        
        #print("Columns in play after distance_to_ball:", play.columns)

            
        defenders_nearby, blockers_nearby = calculate_nearby_players(play)

        offense = play[play["club"] == play_data["possessionTeam"].iloc[0]]
        defense = play[play["club"] == play_data["defensiveTeam"].iloc[0]]
        #print(defense.head())
        #print("Length of ",len(defense))
        #print("Length of ",len(offense))


        offense_input = create_dnn_input_df(offense, "o")
        defense_input = create_dnn_input_df(defense, "d")
        
        # Including the game and play IDs in the offense and defense dataframes
        offense_input['gameId'] = gid
        defense_input['gameId'] = gid
        offense_input['playId'] = pid
        defense_input['playId'] = pid
        offense_input['nflid'] = pid
        defense_input['nflid'] = pid
        
        player_tracking_data = pd.concat([offense_input, defense_input], axis=1)
        player_tracking_data['defenders_nearby'] = defenders_nearby.get((gid, pid), 0)
        player_tracking_data['blockers_nearby'] = blockers_nearby.get((gid, pid), 0)
        
        
        final_input_list.append(player_tracking_data)

        
final_input_df = pd.concat(final_input_list, ignore_index=True)

#this was for the calculate_snap_to_catch_time
#final_input_df = final_input_df.merge(time_diffs_df, on=['gameId', 'playId'], how='left')


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=52.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=32.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=22.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=39.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=51.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=37.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=47.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=43.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=49.0), HTML(value='')))




In [None]:
final_input_df.to_csv(index=False, path_or_buf="final_input_df.csv")

In [None]:
final_input_df.columns.tolist() 


In [6]:
final_input_df



Unnamed: 0,o_x_0,o_y_0,o_s_0,o_a_0,o_dis_0,o_o_0,o_dir_0,o_distance_to_ball_carrier_0,o_x_1,o_y_1,...,d_a_10,d_dis_10,d_o_10,d_dir_10,d_distance_to_ball_carrier_10,gameId,playId,nflid,defenders_nearby,blockers_nearby
0,79.85,35.59,4.61,4.82,0.45,114.27,202.20,0.0,74.99,29.43,...,4.14,0.27,331.57,278.33,29.415605,2022090800,56,56,2,0
1,67.89,38.41,4.75,1.46,0.47,52.19,335.55,0.0,68.74,32.54,...,2.12,0.39,82.35,239.91,35.097602,2022090800,122,122,2,0
2,51.68,27.87,5.00,1.40,0.50,117.58,226.52,0.0,60.26,25.86,...,4.05,0.61,321.01,314.26,16.807335,2022090800,167,167,1,0
3,37.68,46.87,7.26,2.74,0.74,93.99,331.40,0.0,47.86,47.24,...,0.78,0.16,1.30,27.20,37.464150,2022090800,212,212,1,0
4,19.30,44.85,8.43,0.93,0.85,315.91,304.27,0.0,33.24,32.57,...,1.18,0.07,319.93,246.50,30.755188,2022090800,236,236,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,80.24,39.11,0.32,3.83,0.04,168.11,354.59,0.0,87.61,25.38,...,4.06,0.47,109.09,136.58,34.780549,2022091200,3596,3596,1,0
686,72.33,41.04,2.50,1.77,0.24,33.31,336.17,0.0,63.69,45.32,...,4.48,0.55,342.72,298.57,32.704350,2022091200,3628,3628,2,0
687,68.01,38.45,5.95,1.20,0.60,24.30,348.39,0.0,74.56,30.32,...,1.52,0.65,152.68,225.08,30.846175,2022091200,3723,3723,1,0
688,64.24,35.96,4.62,2.50,0.47,2.78,10.50,0.0,62.88,35.32,...,1.91,0.22,34.60,284.04,32.248597,2022091200,3747,3747,2,2
