In [1232]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import IPython

Reading in Data

In [1233]:
"""Emily's reading in files"""
# All competition CSVs are expected under ./data/nfl-big-data-bowl-2024/
# relative to the directory the kernel was started in.
data_dir = os.getcwd() + '/data/nfl-big-data-bowl-2024/'
players = pd.read_csv(data_dir + "players.csv")

# Tracking data comes as one CSV per week (weeks 1-9); read them in a loop
# rather than with nine copy-pasted calls.
weeks_list = [
    pd.read_csv(data_dir + f"tracking_week_{week_number}.csv")
    for week_number in range(1, 10)
]
# Keep the individual per-week names bound in case other cells refer to them.
week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8, week_9 = weeks_list

plays = pd.read_csv(data_dir + "plays.csv")
tackles = pd.read_csv(data_dir + "tackles.csv")

# Stack all weeks into a single tracking frame.
week = pd.concat(weeks_list)

# Join each player's position onto the tracking rows. The key is spelled out
# so the join cannot silently change if the two frames ever share more columns.
week = week.merge(players.loc[:, ['nflId', 'position']], on='nflId', how='left')

Creating df with input and output data

In [None]:
# --- Distance from every tracked row to the ball in the same frame ---
frame_keys = ['gameId', 'playId', 'frameId']

# The ball is tracked as its own row with club == 'football'. Select those rows
# with a boolean mask: a groupby(...).apply(...) that filters inside each group
# calls a Python lambda once per frame and returns the join keys only as index
# levels, whereas the mask is a single vectorised pass with plain key columns.
ball = week.loc[week['club'] == 'football', frame_keys + ['x', 'y']]

# Left-join so every tracking row gets the ball coordinates of its frame as
# x_ball / y_ball (rows of frames without a ball row get NaN).
week_and_ball = week.merge(ball, on=frame_keys, how='left', suffixes=('', '_ball'))

# Euclidean distance between the row's (x, y) and the ball's (x, y).
dx_to_ball = week_and_ball["x"] - week_and_ball["x_ball"]
dy_to_ball = week_and_ball["y"] - week_and_ball["y_ball"]
week_and_ball['distance_to_ball'] = np.sqrt(dx_to_ball ** 2 + dy_to_ball ** 2)

# Sort within each frame and club by position, then jerseyNumber, so that the
# per-club player numbering computed later is assigned in a stable order.
week_and_ball = week_and_ball.sort_values(['gameId', 'playId', 'frameId', 'club', 'position', 'jerseyNumber'])

# --- Attach tackle / assist flags to every tracking row ---
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept here because other cells may refer to it.
input = week_and_ball.copy().merge(
    tackles[['gameId', 'playId', 'nflId', 'tackle', 'assist']],
    on=['gameId', 'playId', 'nflId'],
    how='left',
)
# Rows with no matching entry in `tackles` come back as NaN -> treat as 0.
for flag_col in ('tackle', 'assist'):
    input[flag_col] = input[flag_col].fillna(0)

# --- Number the rows of each club within a frame (0, 1, 2, ...) ---
# cumcount follows the current row order, i.e. the position / jerseyNumber sort
# applied to week_and_ball.
input['number'] = input.groupby(['gameId', 'playId', 'frameId', 'club']).cumcount()

# Drop the ball rows, re-sort and rebuild a clean 0..n-1 index.
input = (
    input[input['club'] != 'football']
    .sort_values(by=['gameId', 'playId', 'frameId', 'position', 'jerseyNumber'])
    .reset_index(drop=True)
)

# --- Bring in which team is on offense / defense for each play ---
input_transform = input.copy().reset_index(drop=True)  # not used again in this cell
plays = plays[['gameId', 'playId', 'possessionTeam', 'defensiveTeam']]
all_players = input.merge(plays, on=['gameId', 'playId'])

# Per-player tracking measurements that get a side prefix ('o_' / 'd_').
side_feature_cols = ['x', 'y', 's', 'a', 'dis', 'o', 'dir', 'distance_to_ball']
frame_index_cols = ['gameId', 'playId', 'frameId']


def _rows_to_wide(players_df, team_col, prefix, extra_values=()):
    """Select one side of the ball and pivot it to one row per frame.

    Keeps the rows of ``players_df`` whose ``club`` equals ``team_col``,
    prefixes the tracking measurement columns with ``prefix`` and pivots on
    ``number`` so each (value, number) pair becomes a ``<value>_<number>``
    column.

    Returns a ``(long_rows, wide_table)`` tuple: the filtered/renamed long
    frame and the pivoted frame with gameId / playId / frameId as columns.
    """
    side_rows = players_df[players_df[team_col] == players_df['club']]
    side_rows = side_rows.rename(
        columns={c: prefix + c for c in side_rows.columns if c in side_feature_cols}
    )

    value_cols = [prefix + c for c in side_feature_cols] + list(extra_values)
    wide = pd.pivot_table(
        side_rows,
        values=value_cols,
        index=frame_index_cols,
        columns=['number'],
        aggfunc="mean",
    )
    # Flatten the (value, number) column MultiIndex into '<value>_<number>'.
    wide.columns = wide.columns.map('{0[0]}_{0[1]}'.format)
    wide = wide.reset_index(['gameId', 'playId', 'frameId'])
    return side_rows, wide


# offense
offense, o_table = _rows_to_wide(all_players, 'possessionTeam', 'o_')

# defense - includes output data (tackle / assist flags)
defense, d_table = _rows_to_wide(
    all_players, 'defensiveTeam', 'd_', extra_values=['tackle', 'assist']
)

# combine offense and defense; the right join keeps every defensive frame
input_output_df = o_table.merge(d_table, on=['gameId', 'playId', 'frameId'], how='right')

# --- Calculate tackle credit ---
# One tackle / assist column per defensive slot (0-10).
tackle_cols = [f'tackle_{slot}' for slot in range(11)]
assist_cols = [f'assist_{slot}' for slot in range(11)]
tackle_assist_cols = tackle_cols + assist_cols

# Number of defenders involved in the tackle (solo tackles + assists) per row.
total_tacklers = input_output_df[tackle_assist_cols].sum(axis=1)

# Extra class 'tackle_11': 1 when no defender is credited at all, else 0.
input_output_df['tackle_11'] = np.where(total_tacklers == 0, 1, 0)

# Each slot's involvement is its tackle flag plus its assist flag. Previously
# only the tackle flags were divided by total_tacklers and the assist columns
# were then dropped, so assisting defenders received no credit and the credit
# columns of a row did not add up to 1 (e.g. two assists -> all zeros).
# NOTE(review): assumes credit is meant to be shared equally among everyone
# involved, which is what dividing by total_tacklers implies - confirm.
involvement = pd.DataFrame(
    input_output_df[tackle_cols].fillna(0).to_numpy()
    + input_output_df[assist_cols].fillna(0).to_numpy(),
    index=input_output_df.index,
    columns=tackle_cols,
)
# A zero denominator becomes NaN so those rows end up as 0 credit, not inf/NaN.
input_output_df[tackle_cols] = involvement.div(
    total_tacklers.replace(0, np.nan), axis=0
).fillna(0)

# Drop the assist columns now that they are folded into the credit columns.
input_output_df = input_output_df.drop(columns=assist_cols)

In [None]:
# Display the assembled wide table: one row per (gameId, playId, frameId),
# shown by pandas in truncated form.
input_output_df

Unnamed: 0,gameId,playId,frameId,o_a_0,o_a_1,o_a_2,o_a_3,o_a_4,o_a_5,o_a_6,...,tackle_2,tackle_3,tackle_4,tackle_5,tackle_6,tackle_7,tackle_8,tackle_9,tackle_10,tackle_11
0,2022090800,56,1,1.97,2.46,1.15,1.00,0.33,2.47,2.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2022090800,56,2,1.82,2.53,0.61,1.22,0.66,2.59,2.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2022090800,56,3,1.60,2.73,0.49,1.24,0.92,2.56,2.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2022090800,56,4,1.26,2.74,0.89,1.03,1.37,2.40,2.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2022090800,56,5,0.99,2.94,1.24,0.76,1.63,2.50,2.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529874,2022110700,3787,40,1.30,0.94,0.41,4.04,1.30,1.16,1.08,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529875,2022110700,3787,41,1.54,0.91,0.28,4.24,1.46,1.24,1.14,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529876,2022110700,3787,42,1.58,0.94,0.27,4.13,1.50,1.25,1.22,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529877,2022110700,3787,43,1.60,1.01,0.48,3.65,1.61,1.32,1.27,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# Save the table to the current working directory. The row index is written as
# the first (unnamed) CSV column because index=False is not passed.
input_output_df.to_csv('input_output_df_base.csv')