Data Preprocessing with test data creation commented out

In [47]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm_notebook
import IPython
from sklearn.preprocessing import StandardScaler
import random

Reading in NFL Data

In [48]:
"""reading in files"""
data_dir = os.getcwd() + '/data/nfl-big-data-bowl-2024/'
players = pd.read_csv(data_dir + "players.csv")
week_1 = pd.read_csv(data_dir + "tracking_week_1.csv")
week_2 = pd.read_csv(data_dir + "tracking_week_2.csv")
week_3 = pd.read_csv(data_dir + "tracking_week_3.csv")
week_4 = pd.read_csv(data_dir + "tracking_week_4.csv")
week_5 = pd.read_csv(data_dir + "tracking_week_5.csv")
week_6 = pd.read_csv(data_dir + "tracking_week_6.csv")
week_7 = pd.read_csv(data_dir + "tracking_week_7.csv")
week_8 = pd.read_csv(data_dir + "tracking_week_8.csv")
week_9 = pd.read_csv(data_dir + "tracking_week_9.csv")
plays = pd.read_csv(data_dir + "plays.csv")
tackles = pd.read_csv(data_dir + "tackles.csv")

weeks_list = [week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8, week_9]
week = pd.concat(weeks_list)

# join player positioning information onto a week's worth of tracking data 
week = week.merge(players.loc[:, ['nflId', 'position']], how='left')

Creating df with input and output data (player features)

In [49]:
# get distance from ball
ball = week.groupby(['gameId','playId', 'frameId']).apply(lambda g: g[g['club'] == 'football'])[['x','y']]
week_and_ball = week.merge(ball, on = ['gameId','playId', 'frameId'], how = 'left', suffixes=('','_ball') )
week_and_ball['distance_to_ball'] = np.sqrt((week_and_ball["x"] - week_and_ball["x_ball"]) ** 2 + (week_and_ball["y"] - week_and_ball["y_ball"]) ** 2)


#sort by position and jerseyNumber
week_and_ball = week_and_ball.sort_values(['gameId','playId', 'frameId', 'club', 'position', 'jerseyNumber'])

#merge with tackles
input = week_and_ball.copy()
input = input.merge(tackles[['gameId', 'playId', 'nflId', 'tackle', 'assist']], on = ['gameId', 'playId', 'nflId'], how = 'left')
input['tackle'] = input['tackle'].fillna(0)
input['assist'] = input['assist'].fillna(0)

#get order of players (0-10)
input['number'] = input.groupby(['gameId','playId', 'frameId', 'club']).cumcount()
input = input[input['club'] != 'football']
# check if needed
input = input.sort_values(by = ['gameId', 'playId', 'frameId', 'position', 'jerseyNumber'])
input = input.reset_index(drop = True)

# merge with plays
input_transform = input.copy()
#check
input_transform = input_transform.reset_index(drop = True)
plays = plays[['gameId', 'playId', 'possessionTeam', 'defensiveTeam']]
all_players = input.merge(plays, on = ['gameId', 'playId'])

# offense
offense = all_players[all_players['possessionTeam'] == all_players['club']]
offense = offense.rename(columns={c: 'o_'+c for c in offense.columns if c in ['x', 'y', 's', 'a', 'dis', 'o', 'dir', 'distance_to_ball']})

o_table = pd.pivot_table(offense, values=['o_x', 'o_y', 'o_s', 'o_a', 'o_dis', 'o_o', 'o_dir', 'o_distance_to_ball'], index=['gameId', 'playId', 'frameId'],
                                                                     #,'position', 'jerseyNumber'],
                       columns=['number'], aggfunc="mean")
o_table.columns = o_table.columns.map('{0[0]}_{0[1]}'.format) 
o_table = o_table.reset_index(['gameId','playId','frameId'])

# defense - includes output data
defense = all_players[all_players['defensiveTeam'] == all_players['club']]
defense = defense.rename(columns={c: 'd_'+c for c in defense.columns if c in ['x', 'y', 's', 'a', 'dis', 'o', 'dir', 'distance_to_ball']})

d_table = pd.pivot_table(defense, values=['d_x', 'd_y', 'd_s', 'd_a', 'd_dis', 'd_o', 'd_dir', 'd_distance_to_ball', 'tackle', 'assist'], index=['gameId', 'playId', 'frameId'],
                                                                     #,'position', 'jerseyNumber'],
                       columns=['number'], aggfunc="mean")
d_table.columns = d_table.columns.map('{0[0]}_{0[1]}'.format) 
d_table = d_table.reset_index(['gameId','playId','frameId'])

# combinate offense and defense
input_output_df = o_table.merge(d_table, on = ['gameId', 'playId', 'frameId'], how = 'right')

# calculate tackle credit 
tackle_cols = ['tackle_0', 'tackle_1', 'tackle_2', 'tackle_3', 'tackle_4', 'tackle_5', 'tackle_6', 'tackle_7', 'tackle_8', 'tackle_9', 'tackle_10']
assist_cols = ['assist_0', 'assist_1', 'assist_2', 'assist_3', 'assist_4', 'assist_5', 'assist_6', 'assist_7', 'assist_8', 'assist_9', 'assist_10']
tackle_assist_cols = tackle_cols + assist_cols

input_output_df['total_tacklers'] = input_output_df[tackle_assist_cols].sum(axis=1)
input_output_df['tackle_11'] = np.where(input_output_df['total_tacklers'] == 0 , 1, 0)

input_output_df['tackle_0'] = input_output_df['tackle_0'] + input_output_df['assist_0']
input_output_df['tackle_1'] = input_output_df['tackle_1'] + input_output_df['assist_1']
input_output_df['tackle_2'] = input_output_df['tackle_2'] + input_output_df['assist_2']
input_output_df['tackle_3'] = input_output_df['tackle_3'] + input_output_df['assist_3']
input_output_df['tackle_4'] = input_output_df['tackle_4'] + input_output_df['assist_4']
input_output_df['tackle_5'] = input_output_df['tackle_5'] + input_output_df['assist_5']
input_output_df['tackle_6'] = input_output_df['tackle_6'] + input_output_df['assist_6']
input_output_df['tackle_7'] = input_output_df['tackle_7'] + input_output_df['assist_7']
input_output_df['tackle_8'] = input_output_df['tackle_8'] + input_output_df['assist_8']
input_output_df['tackle_9'] = input_output_df['tackle_9'] + input_output_df['assist_9']
input_output_df['tackle_10'] = input_output_df['tackle_10'] + input_output_df['assist_10']

input_output_df.loc[:, tackle_cols] = input_output_df.loc[:, tackle_cols].div(input_output_df['total_tacklers'], axis=0).fillna(0)

# drop unnecessary columns
input_output_df.drop(assist_cols, axis=1, inplace=True)
input_output_df.drop('total_tacklers', axis=1, inplace=True)

Reformat data to feed into LSTM 

In [50]:
# remove ids and output columns for preprocessing
just_input = input_output_df.iloc[:, 3:-12]

# get output columns 11 players and "no tackle"
just_output = input_output_df.iloc[:, -12:]

In [51]:
# test_input_with_id = input_output_df[~input_output_df[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_250k)]
# test_input_no_id = test_input_with_id.iloc[:, 3:-12]
# test_input_just_id = test_input_with_id.iloc[:, :3]
# just_test_output = test_input_with_id.iloc[:, -12:]
# just_test_output = just_test_output.reset_index(drop = True)

Normalization (train and test data)

In [52]:
scaler = StandardScaler()
normalized_input = pd.DataFrame(scaler.fit_transform(just_input), columns = just_input.columns)

# normalized_test_input = pd.DataFrame(scaler.transform(test_input_no_id), columns = test_input_no_id.columns)
# test_input_just_id = test_input_just_id.reset_index(drop=True)
# normalized_test_input_df = pd.concat([test_input_just_id, normalized_test_input, just_test_output], axis = 1)

In [53]:
# add ids and output back in for LSTM formatting
normalized_input_output_df = pd.concat([input_output_df[['gameId', 'playId', 'frameId']], normalized_input, just_output], axis = 1)

In [54]:
def last_5_converter(df):
    ''' takes each frame and grabs 4 previous frames, inputting zeros when frame < 5'''
    rows = []
    for idx, row in tqdm_notebook(df.iterrows()):
        current = df.iloc[idx, 3:-12].tolist()
        tackles = [df.at[idx, f'tackle_{i}'] for i in range(12)]
        #ids = [df.at[idx, f'{i}'] for i in ['gameId', 'playId', 'frameId']]
        if row['frameId'] < 5:
            previous_frames = [np.zeros(176) for _ in range(4)]
            for i in range(1, int(row['frameId'])):
                previous_frames[4-i] = df.iloc[idx - i, 3:-12].tolist()
        else:
            previous_frames = [df.iloc[idx - i, 3:-12].tolist() for i in range(1, 5)]
        new_row = previous_frames + [current] + tackles
        rows.append(new_row)
    final_input = pd.DataFrame(rows, columns=['four', 'three', 'two', 'one', 'current', 'tackle_1','tackle_2','tackle_3','tackle_4','tackle_5','tackle_6','tackle_7','tackle_8','tackle_9','tackle_10','tackle_11', 'tackle_12'])
    return final_input

In [55]:
# puts test data in correct format for eval
# test_model_input_no_id = last_5_converter(normalized_test_input_df)
# test_model_input = pd.concat([test_input_just_id, test_model_input_no_id], axis = 1)
# test_model_input.to_pickle("test_model_input.pkl")

In [56]:
model_input_no_id = last_5_converter(normalized_input_output_df)
model_input = pd.concat([input_output_df[['gameId', 'playId', 'frameId']], model_input_no_id], axis = 1)
model_input

0it [00:00, ?it/s]

Unnamed: 0,gameId,playId,frameId,four,three,two,one,current,tackle_1,tackle_2,tackle_3,tackle_4,tackle_5,tackle_6,tackle_7,tackle_8,tackle_9,tackle_10,tackle_11,tackle_12
0,2022090800,56,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9823384598581234, 1.3484585884884548, -0.13...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2022090800,56,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9823384598581234, 1.3484585884884548, -0.13...","[0.790808421685418, 1.4283941077210496, -0.733...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2022090800,56,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9823384598581234, 1.3484585884884548, -0.13...","[0.790808421685418, 1.4283941077210496, -0.733...","[0.5098976990321165, 1.656781305528464, -0.866...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2022090800,56,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9823384598581234, 1.3484585884884548, -0.13...","[0.790808421685418, 1.4283941077210496, -0.733...","[0.5098976990321165, 1.656781305528464, -0.866...","[0.07576294584065033, 1.668200665418835, -0.42...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2022090800,56,5,"[0.07576294584065033, 1.668200665418835, -0.42...","[0.5098976990321165, 1.656781305528464, -0.866...","[0.790808421685418, 1.4283941077210496, -0.733...","[0.9823384598581234, 1.3484585884884548, -0.13...","[-0.26899112287021976, 1.8965878632262487, -0....",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529874,2022110700,3787,40,"[-0.16684176917811003, -0.30734859561529776, -...","[-0.3839091457738431, -0.0561226780271422, -0....","[-0.4732898302544391, -0.19315499671159056, -0...","[-0.30729713050476093, -0.3301873153960392, -0...","[0.1268376226867052, -0.38728411484789277, -0....",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529875,2022110700,3787,41,"[0.1268376226867052, -0.38728411484789277, -0....","[-0.16684176917811003, -0.30734859561529776, -...","[-0.3839091457738431, -0.0561226780271422, -0....","[-0.4732898302544391, -0.19315499671159056, -0...","[0.4332856837630341, -0.4215421945190048, -1.0...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529876,2022110700,3787,42,"[0.4332856837630341, -0.4215421945190048, -1.0...","[0.1268376226867052, -0.38728411484789277, -0....","[-0.16684176917811003, -0.30734859561529776, -...","[-0.3839091457738431, -0.0561226780271422, -0....","[0.484360360609089, -0.38728411484789277, -1.1...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
529877,2022110700,3787,43,"[0.484360360609089, -0.38728411484789277, -1.1...","[0.4332856837630341, -0.4215421945190048, -1.0...","[0.1268376226867052, -0.38728411484789277, -0....","[-0.16684176917811003, -0.30734859561529776, -...","[0.5098976990321165, -0.30734859561529776, -0....",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [57]:
# average number of frames per play - can work backwards 
# (x thousand / 44 to get approx num of plays to sample for desired number of frames )
model_input.groupby(['gameId', 'playId'])[['frameId']].agg('count')['frameId'].median()

44.0

Sample final data set for training data - done in chunks due to capacity limitations

In [58]:
# extract unique game/play combos
game_play_unique = [list(x) for x in set(tuple(x) for x in input_output_df[['gameId', 'playId']].values.tolist())]

In [59]:
# set seed in each chunk for reproducibility
# sample ~100k frames from all data
random.seed(41)
random_sample_100k = random.choices(game_play_unique, k=2600)
model_input_100k_random_sample = model_input[model_input[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_100k)]

In [60]:
model_input_100k_random_sample

Unnamed: 0,gameId,playId,frameId,four,three,two,one,current,tackle_1,tackle_2,tackle_3,tackle_4,tackle_5,tackle_6,tackle_7,tackle_8,tackle_9,tackle_10,tackle_11,tackle_12
22,2022090800,80,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4803165591071579, 1.1314907505714114, -0.99...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
23,2022090800,80,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4803165591071579, 1.1314907505714114, -0.99...","[1.2377118440883974, 1.0972326709002995, -0.38...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
24,2022090800,80,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4803165591071579, 1.1314907505714114, -0.99...","[1.2377118440883974, 1.0972326709002995, -0.38...","[0.6248157219357396, 1.2456843494751186, 0.127...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
25,2022090800,80,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.4803165591071579, 1.1314907505714114, -0.99...","[1.2377118440883974, 1.0972326709002995, -0.38...","[0.6248157219357396, 1.2456843494751186, 0.127...","[0.4077483453400067, 1.4055553879403082, 0.657...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
26,2022090800,80,5,"[0.4077483453400067, 1.4055553879403082, 0.657...","[0.6248157219357396, 1.2456843494751186, 0.127...","[1.2377118440883974, 1.0972326709002995, -0.38...","[1.4803165591071579, 1.1314907505714114, -0.99...","[0.16514363032124635, 1.9308459428973612, 1.26...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529595,2022110700,3630,57,"[0.4971290298206027, 1.4854909071729034, 0.028...","[0.5098976990321165, 1.6339425857477226, 0.171...","[0.4077483453400067, 1.9080072231166199, -0.08...","[0.4205170145515204, 1.9765233824588435, -0.08...","[0.28006165322486953, 1.154329470352153, -0.32...",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0
529596,2022110700,3630,58,"[0.28006165322486953, 1.154329470352153, -0.32...","[0.4971290298206027, 1.4854909071729034, 0.028...","[0.5098976990321165, 1.6339425857477226, 0.171...","[0.4077483453400067, 1.9080072231166199, -0.08...","[0.31836766085941065, 0.8003293137506609, -0.6...",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0
529597,2022110700,3630,59,"[0.31836766085941065, 0.8003293137506609, -0.6...","[0.28006165322486953, 1.154329470352153, -0.32...","[0.4971290298206027, 1.4854909071729034, 0.028...","[0.5098976990321165, 1.6339425857477226, 0.171...","[0.24175564559032836, 0.4349097972587982, -0.9...",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0
529598,2022110700,3630,60,"[0.24175564559032836, 0.4349097972587982, -0.9...","[0.31836766085941065, 0.8003293137506609, -0.6...","[0.28006165322486953, 1.154329470352153, -0.32...","[0.4971290298206027, 1.4854909071729034, 0.028...","[0.25452431480184207, -0.10180011758862507, -1...",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0


In [61]:
# same as above for 200k
random.seed(41)
random_sample_200k = random.sample(game_play_unique, k=5100)
model_input_200k_random_sample = model_input[model_input[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_200k)]

In [62]:
# same as above for 250k
random.seed(41)
random_sample_250k = random.sample(game_play_unique, k=6300)
model_input_250k_random_sample = model_input[model_input[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_250k)]

In [63]:
#same as above for 300k
random.seed(41)
random_sample_300k = random.sample(game_play_unique, k=7500)
model_input_300k_random_sample = model_input[model_input[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_300k)]

In [64]:
model_input_100k_random_sample.to_pickle("model_input_100k_random_sample.pkl")
model_input_200k_random_sample.to_pickle("model_input_200k_random_sample.pkl")
model_input_250k_random_sample.to_pickle("model_input_250k_random_sample.pkl")
model_input_300k_random_sample.to_pickle("model_input_300k_random_sample.pkl")

In [65]:
# test is just all the values that were not part of the 250k training set
#test_sample_250k = model_input[~model_input[['gameId', 'playId']].apply(list, axis = 1).isin(random_sample_250k)]
#test_sample_250k.to_pickle("test_sample_250k.pkl")