In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
import IPython
from sklearn.preprocessing import StandardScaler

Reading in Data

In [1233]:
"""Emily's reading in files"""
# Competition files are expected under <current working directory>/data/nfl-big-data-bowl-2024/.
data_dir = os.getcwd() + '/data/nfl-big-data-bowl-2024/'

# Reference tables.
players = pd.read_csv(data_dir + "players.csv")
plays = pd.read_csv(data_dir + "plays.csv")
tackles = pd.read_csv(data_dir + "tackles.csv")

# One tracking file per week (weeks 1 through 9), read in week order.
weeks_list = [
    pd.read_csv(data_dir + f"tracking_week_{week_number}.csv")
    for week_number in range(1, 10)
]
# Keep the individual per-week names available alongside the list.
week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8, week_9 = weeks_list

# Stack all weeks into a single tracking table.
week = pd.concat(weeks_list)

# join player positioning information onto the tracking data
# (no explicit key is given, so pandas joins on the columns the two frames share)
week = week.merge(players[['nflId', 'position']], how='left')

In [4]:
# Directory used when the caller does not pass ``data_dir``. It is the same
# location the function originally had hard-coded, so existing calls are unchanged.
_DEFAULT_DATA_DIR = "C:\\Users\\mattd\\Documents\\GitHub\\big-data-bowl-2024\\data"


def load_dataset(dataset_name, data_dir=None):
    """
    Read one dataset CSV from the data directory into a DataFrame.

    Nothing is downloaded; the file must already exist on disk.

    Parameters:
    - dataset_name: Name of the dataset to load, without the ".csv" extension
      (e.g. "plays", "tracking_week_1").
    - data_dir: Optional directory that contains the CSV files. When omitted,
      ``_DEFAULT_DATA_DIR`` is used, which only exists on the original
      author's machine; pass a path to run the notebook elsewhere.

    Returns:
    - pandas.DataFrame with the contents of ``<data_dir>/<dataset_name>.csv``.

    Raises:
    - FileNotFoundError (from ``pd.read_csv``) if the file does not exist.
    """
    if data_dir is None:
        data_dir = _DEFAULT_DATA_DIR
    # os.path.join uses the platform's separator, so a caller-supplied
    # directory works on any operating system.
    return pd.read_csv(os.path.join(data_dir, f"{dataset_name}.csv"))

# Read in the CSVs: the three reference tables first ...
plays, players, tackles = (
    load_dataset(table_name) for table_name in ("plays", "players", "tackles")
)

# ... then one tracking table per week (weeks 1 through 9), in week order.
weeks_list = [
    load_dataset(f"tracking_week_{week_number}") for week_number in range(1, 10)
]
# Keep the individual per-week names available alongside the list.
week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8, week_9 = weeks_list

# Stack every week into one tracking table.
week = pd.concat(weeks_list)

# Attach each player's position to the tracking rows
# (no explicit key is given, so pandas joins on the columns the two frames share).
week = week.merge(players[['nflId', 'position']], how='left')
week.shape

(12187398, 18)

Creating df with input and output data

In [5]:
# get distance from ball
# The football's own tracking rows have club == 'football'. A boolean mask
# selects them directly; grouping by (gameId, playId, frameId) and applying a
# lambda selected exactly the same rows but ran Python code once per group.
ball = week.loc[week['club'] == 'football', ['gameId', 'playId', 'frameId', 'x', 'y']]
week_and_ball = week.merge(
    ball, on=['gameId', 'playId', 'frameId'], how='left', suffixes=('', '_ball')
)
# Euclidean distance between each tracked row and the ball in the same frame.
week_and_ball['distance_to_ball'] = np.sqrt(
    (week_and_ball["x"] - week_and_ball["x_ball"]) ** 2
    + (week_and_ball["y"] - week_and_ball["y_ball"]) ** 2
)

# sort by position and jerseyNumber (within game / play / frame / club) so the
# per-club player numbering assigned below follows that order
week_and_ball = week_and_ball.sort_values(
    ['gameId', 'playId', 'frameId', 'club', 'position', 'jerseyNumber']
)

# merge with tackles
# NOTE(review): the name `input` shadows the Python builtin of the same name.
# It is kept so that any later cell referring to `input` keeps working.
input = week_and_ball.copy()
input = input.merge(
    tackles[['gameId', 'playId', 'nflId', 'tackle', 'assist']],
    on=['gameId', 'playId', 'nflId'],
    how='left',
)
# Rows with no matching tackles entry get 0 for both flags.
input['tackle'] = input['tackle'].fillna(0)
input['assist'] = input['assist'].fillna(0)

# get order of players: 0, 1, 2, ... within each (game, play, frame, club)
input['number'] = input.groupby(['gameId', 'playId', 'frameId', 'club']).cumcount()
# The ball itself is not a player; drop its rows now that distances are computed.
input = input[input['club'] != 'football']
# check if needed
input = input.sort_values(by=['gameId', 'playId', 'frameId', 'position', 'jerseyNumber'])
input = input.reset_index(drop=True)

# merge with plays
# `input_transform` is not used below; it is kept in case later cells rely on it.
input_transform = input.copy()
#check
input_transform = input_transform.reset_index(drop=True)
# Only the team identifiers are needed to split rows into offense and defense.
plays = plays[['gameId', 'playId', 'possessionTeam', 'defensiveTeam']]
all_players = input.merge(plays, on=['gameId', 'playId'])

# Per-player measurements that are spread into one column per player slot.
kinematic_cols = ['x', 'y', 's', 'a', 'dis', 'o', 'dir', 'distance_to_ball']

# offense
offense = all_players[all_players['possessionTeam'] == all_players['club']]
offense = offense.rename(columns={c: 'o_' + c for c in offense.columns if c in kinematic_cols})

# One row per frame, one column per (measurement, player number).
o_table = pd.pivot_table(
    offense,
    values=['o_' + c for c in kinematic_cols],
    index=['gameId', 'playId', 'frameId'],
    columns=['number'],
    aggfunc="mean",
)
# Flatten the (measurement, number) column pairs into names like "o_x_3".
o_table.columns = o_table.columns.map('{0[0]}_{0[1]}'.format)
o_table = o_table.reset_index(['gameId', 'playId', 'frameId'])

# defense - includes output data
defense = all_players[all_players['defensiveTeam'] == all_players['club']]
defense = defense.rename(columns={c: 'd_' + c for c in defense.columns if c in kinematic_cols})

d_table = pd.pivot_table(
    defense,
    values=['d_' + c for c in kinematic_cols] + ['tackle', 'assist'],
    index=['gameId', 'playId', 'frameId'],
    columns=['number'],
    aggfunc="mean",
)
d_table.columns = d_table.columns.map('{0[0]}_{0[1]}'.format)
d_table = d_table.reset_index(['gameId', 'playId', 'frameId'])

# combine offense and defense (every defensive frame is kept)
input_output_df = o_table.merge(d_table, on=['gameId', 'playId', 'frameId'], how='right')

# calculate tackle credit
tackle_cols = ['tackle_' + str(slot) for slot in range(11)]
assist_cols = ['assist_' + str(slot) for slot in range(11)]
tackle_assist_cols = tackle_cols + assist_cols

# Number of defenders credited with a tackle or an assist on the play.
total_tacklers = input_output_df[tackle_assist_cols].sum(axis=1)
# tackle_11 is the "nobody was credited" class.
input_output_df['tackle_11'] = np.where(total_tacklers == 0, 1, 0)

# Share the credit equally among everyone credited, assists included.
# Previously only the tackle flags were divided by the tackle + assist count
# and the assist flags were then dropped, so assisting defenders received no
# credit and assist-only plays ended up with an all-zero target row.
shared_credit = pd.DataFrame(
    input_output_df[tackle_cols].to_numpy() + input_output_df[assist_cols].to_numpy(),
    index=input_output_df.index,
    columns=tackle_cols,
)
# 0 / 0 (no tackler at all) and missing player slots become 0.
input_output_df[tackle_cols] = shared_credit.div(total_tacklers, axis=0).fillna(0)

# drop unnecessary columns (the assist flags are now folded into the credit)
input_output_df = input_output_df.drop(columns=assist_cols)

In [6]:
# Display the assembled table of per-frame input features and tackle targets.
input_output_df

Unnamed: 0,gameId,playId,frameId,o_a_0,o_a_1,o_a_2,o_a_3,o_a_4,o_a_5,o_a_6,...,tackle_2,tackle_3,tackle_4,tackle_5,tackle_6,tackle_7,tackle_8,tackle_9,tackle_10,tackle_11
0,2022090800,56,1,1.97,2.46,1.15,1.00,0.33,2.47,2.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2022090800,56,2,1.82,2.53,0.61,1.22,0.66,2.59,2.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2022090800,56,3,1.60,2.73,0.49,1.24,0.92,2.56,2.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2022090800,56,4,1.26,2.74,0.89,1.03,1.37,2.40,2.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2022090800,56,5,0.99,2.94,1.24,0.76,1.63,2.50,2.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529874,2022110700,3787,40,1.30,0.94,0.41,4.04,1.30,1.16,1.08,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529875,2022110700,3787,41,1.54,0.91,0.28,4.24,1.46,1.24,1.14,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529876,2022110700,3787,42,1.58,0.94,0.27,4.13,1.50,1.25,1.22,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529877,2022110700,3787,43,1.60,1.01,0.48,3.65,1.61,1.32,1.27,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [1236]:
# Save the un-normalized table next to the notebook. to_csv also writes the
# DataFrame index by default, so reading this file back with pd.read_csv gives
# an extra leading "Unnamed: 0" column unless index_col=0 is passed.
input_output_df.to_csv('input_output_df_base.csv')

In [1238]:
# Print every column name on its own line to inspect the feature/target layout.
for column_name in input_output_df.columns:
    print(column_name)

gameId
playId
frameId
o_a_0
o_a_1
o_a_2
o_a_3
o_a_4
o_a_5
o_a_6
o_a_7
o_a_8
o_a_9
o_a_10
o_dir_0
o_dir_1
o_dir_2
o_dir_3
o_dir_4
o_dir_5
o_dir_6
o_dir_7
o_dir_8
o_dir_9
o_dir_10
o_dis_0
o_dis_1
o_dis_2
o_dis_3
o_dis_4
o_dis_5
o_dis_6
o_dis_7
o_dis_8
o_dis_9
o_dis_10
o_distance_to_ball_0
o_distance_to_ball_1
o_distance_to_ball_2
o_distance_to_ball_3
o_distance_to_ball_4
o_distance_to_ball_5
o_distance_to_ball_6
o_distance_to_ball_7
o_distance_to_ball_8
o_distance_to_ball_9
o_distance_to_ball_10
o_o_0
o_o_1
o_o_2
o_o_3
o_o_4
o_o_5
o_o_6
o_o_7
o_o_8
o_o_9
o_o_10
o_s_0
o_s_1
o_s_2
o_s_3
o_s_4
o_s_5
o_s_6
o_s_7
o_s_8
o_s_9
o_s_10
o_x_0
o_x_1
o_x_2
o_x_3
o_x_4
o_x_5
o_x_6
o_x_7
o_x_8
o_x_9
o_x_10
o_y_0
o_y_1
o_y_2
o_y_3
o_y_4
o_y_5
o_y_6
o_y_7
o_y_8
o_y_9
o_y_10
d_a_0
d_a_1
d_a_2
d_a_3
d_a_4
d_a_5
d_a_6
d_a_7
d_a_8
d_a_9
d_a_10
d_dir_0
d_dir_1
d_dir_2
d_dir_3
d_dir_4
d_dir_5
d_dir_6
d_dir_7
d_dir_8
d_dir_9
d_dir_10
d_dis_0
d_dis_1
d_dis_2
d_dis_3
d_dis_4
d_dis_5
d_dis_6
d_dis_7
d_dis_8
d_dis

In [7]:
# Model inputs only: skip the first 3 columns (gameId, playId, frameId) and
# the last 12 columns (tackle_0 .. tackle_11), keeping the o_* / d_* features.
just_input = input_output_df.iloc[:, 3:-12]
just_input

Unnamed: 0,o_a_0,o_a_1,o_a_2,o_a_3,o_a_4,o_a_5,o_a_6,o_a_7,o_a_8,o_a_9,...,d_y_1,d_y_2,d_y_3,d_y_4,d_y_5,d_y_6,d_y_7,d_y_8,d_y_9,d_y_10
0,1.97,2.46,1.15,1.00,0.33,2.47,2.57,4.49,6.34,0.79,...,40.78,33.78,25.47,31.61,21.15,27.53,33.64,28.50,37.91,20.33
1,1.82,2.53,0.61,1.22,0.66,2.59,2.75,4.09,6.53,0.81,...,40.21,33.65,25.54,31.32,21.26,27.69,33.57,28.69,37.60,20.33
2,1.60,2.73,0.49,1.24,0.92,2.56,2.90,3.72,6.70,0.78,...,39.63,33.52,25.59,30.98,21.44,27.88,33.50,28.90,37.25,20.34
3,1.26,2.74,0.89,1.03,1.37,2.40,2.92,3.42,6.31,0.93,...,39.03,33.39,25.63,30.65,21.67,28.13,33.44,29.14,36.87,20.39
4,0.99,2.94,1.24,0.76,1.63,2.50,2.75,2.89,5.84,0.91,...,38.44,33.26,25.65,30.30,21.95,28.42,33.39,29.38,36.48,20.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529874,1.30,0.94,0.41,4.04,1.30,1.16,1.08,0.40,1.89,2.13,...,9.19,19.39,19.81,38.17,21.37,20.89,17.36,24.32,24.13,18.39
529875,1.54,0.91,0.28,4.24,1.46,1.24,1.14,0.38,2.00,2.56,...,9.21,19.28,19.68,37.61,21.44,20.64,17.50,24.33,23.76,18.38
529876,1.58,0.94,0.27,4.13,1.50,1.25,1.22,0.45,2.18,3.01,...,9.23,19.16,19.60,37.06,21.51,20.40,17.67,24.35,23.42,18.39
529877,1.60,1.01,0.48,3.65,1.61,1.32,1.27,0.61,2.38,2.93,...,9.23,19.13,19.55,36.51,21.60,20.19,17.83,24.38,23.11,18.40


In [8]:
# Model targets only: the last 12 columns (tackle_0 .. tackle_11).
just_output = input_output_df.iloc[:, -12:]
just_output

Unnamed: 0,tackle_0,tackle_1,tackle_2,tackle_3,tackle_4,tackle_5,tackle_6,tackle_7,tackle_8,tackle_9,tackle_10,tackle_11
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
529874,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529875,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529876,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529877,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
# Standardize each input feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fit on every row; if a train/test split is made
# later, fit it on the training rows only - confirm against the modelling code.
scaler = StandardScaler()
normalized_input = pd.DataFrame(
    scaler.fit_transform(just_input),
    columns=just_input.columns,
    # Carry the original row labels over. fit_transform returns a bare array,
    # and without this the frame would get a fresh 0..n-1 index that may not
    # line up with the id/target frames when they are concatenated column-wise.
    index=just_input.index,
)
normalized_input

Unnamed: 0,o_a_0,o_a_1,o_a_2,o_a_3,o_a_4,o_a_5,o_a_6,o_a_7,o_a_8,o_a_9,...,d_y_1,d_y_2,d_y_3,d_y_4,d_y_5,d_y_6,d_y_7,d_y_8,d_y_9,d_y_10
0,0.982338,1.348459,-0.137390,-0.446534,-1.127424,1.018461,1.568344,2.197760,3.364562,-0.911863,...,1.073038,0.654039,-0.194692,0.641409,-0.713721,0.072564,0.791930,0.196206,1.261203,-0.695015
1,0.790808,1.428394,-0.733708,-0.272019,-0.888456,1.133029,1.780948,1.872814,3.509933,-0.897046,...,1.029438,0.641896,-0.184856,0.602057,-0.699989,0.091490,0.783622,0.219676,1.225649,-0.695015
2,0.509898,1.656781,-0.866224,-0.256154,-0.700178,1.104387,1.958118,1.572239,3.640001,-0.919271,...,0.985073,0.629752,-0.177829,0.555919,-0.677520,0.113963,0.775315,0.245616,1.185508,-0.693955
3,0.075763,1.668201,-0.424506,-0.422736,-0.374313,0.951629,1.981741,1.328530,3.341609,-0.808145,...,0.939178,0.617608,-0.172208,0.511139,-0.648808,0.143534,0.768194,0.275263,1.141926,-0.688655
4,-0.268991,1.896588,-0.038003,-0.636913,-0.186035,1.047103,1.780948,0.897977,2.982008,-0.822962,...,0.894048,0.605465,-0.169398,0.463644,-0.613856,0.177837,0.762260,0.304909,1.097197,-0.682296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529874,0.126838,-0.387284,-0.954567,1.964937,-0.425003,-0.232250,-0.191546,-1.124810,-0.040168,0.080858,...,-1.343328,-0.690168,-0.990066,1.531593,-0.686258,-0.712838,-1.140181,-0.320132,-0.319215,-0.900654
529875,0.433286,-0.421542,-1.098125,2.123586,-0.309140,-0.155870,-0.120678,-1.141057,0.043994,0.399418,...,-1.341798,-0.700444,-1.008334,1.455602,-0.677520,-0.742409,-1.123566,-0.318897,-0.361650,-0.901714
529876,0.484360,-0.387284,-1.109168,2.036329,-0.280174,-0.146323,-0.026188,-1.084192,0.181713,0.732794,...,-1.340268,-0.711653,-1.019576,1.380967,-0.668781,-0.770797,-1.103390,-0.316426,-0.400644,-0.900654
529877,0.509898,-0.307349,-0.877266,1.655570,-0.200518,-0.079491,0.032869,-0.954213,0.334735,0.673527,...,-1.340268,-0.714456,-1.026603,1.306333,-0.657547,-0.795637,-1.084402,-0.312720,-0.436198,-0.899594


In [10]:
# Reassemble the full table side by side: identifiers, standardized features,
# then the (unscaled) tackle targets. Rows are matched by index label.
normalized_input_output_df = pd.concat(
    [
        input_output_df[['gameId', 'playId', 'frameId']],
        normalized_input,
        just_output,
    ],
    axis='columns',
)

In [85]:
def last_5_converter(df):
    """
    Build 5-frame sliding windows (four previous frames + current) per row.

    Parameters:
    - df: DataFrame with ``gameId``, ``playId``, ``frameId``, any number of
      feature columns, and the target columns ``tackle_0`` .. ``tackle_11``.
      Rows of one play are expected to be adjacent and ordered by ``frameId``.

    Returns:
    - DataFrame with one row per row of ``df`` and a default 0..n-1 index:
      * ``four``, ``three``, ``two``, ``one``, ``current``: each cell is a
        list with the feature values of the frame that many steps back
        (``current`` is the row itself). A frame that does not exist in the
        same play (start of the play, or a gap in ``frameId``) is an
        all-zero list with the same length as a real feature list.
      * ``tackle_1`` .. ``tackle_12``: the row's targets. The names are
        shifted by one relative to the input: output ``tackle_k`` holds
        input ``tackle_{k-1}``. The names are kept as they were so existing
        consumers of this frame keep working.
    """
    id_cols = ['gameId', 'playId', 'frameId']
    target_cols = ['tackle_' + str(k) for k in range(12)]
    window_cols = ['four', 'three', 'two', 'one', 'current']
    output_target_cols = ['tackle_' + str(k) for k in range(1, 13)]

    # Pick the features by name rather than by position. A fixed positional
    # slice starting at column 4 silently dropped the first feature whenever
    # the frame had only the three id columns in front of the features.
    # A leftover CSV index column ("Unnamed: ...") is ignored as well.
    excluded = set(id_cols) | set(target_cols)
    feature_cols = [
        c for c in df.columns
        if c not in excluded and not str(c).startswith('Unnamed')
    ]

    # Work on plain arrays and positions so the result does not depend on the
    # DataFrame's index labels.
    features = df[feature_cols].to_numpy()
    targets = df[target_cols].to_numpy()
    game_ids = df['gameId'].to_numpy()
    play_ids = df['playId'].to_numpy()
    frame_ids = df['frameId'].to_numpy()

    # Padding for missing history: same length as a real feature list.
    zero_frame = [0.0] * len(feature_cols)

    # Collect the windows column by column and build the DataFrame once at the
    # end; appending rows to a DataFrame one at a time is quadratic.
    windows = {name: [] for name in window_cols}
    for pos in range(len(df)):
        # lag 4 -> 'four', ..., lag 0 -> 'current'
        for lag, name in zip(range(4, -1, -1), window_cols):
            prev = pos - lag
            # Only use an earlier row if it really is this play's frame
            # `lag` steps back; otherwise pad with zeros.
            in_same_play = (
                prev >= 0
                and game_ids[prev] == game_ids[pos]
                and play_ids[prev] == play_ids[pos]
                and frame_ids[prev] == frame_ids[pos] - lag
            )
            if in_same_play:
                windows[name].append(features[prev].tolist())
            else:
                windows[name].append(list(zero_frame))

    final_input = pd.DataFrame(windows, columns=window_cols)
    # targets.T yields one array per input target column, in tackle_0..11 order.
    for out_name, column_values in zip(output_target_cols, targets.T):
        final_input[out_name] = column_values
    return final_input

In [None]:
# Convert the normalized per-frame table into windows of the last five frames
# plus tackle targets (see last_5_converter).
model_input_no_id = last_5_converter(normalized_input_output_df)

In [88]:
# Sanity check: number of rows (windows) in the converted model input.
len(model_input_no_id)

100