In [4]:
import pandas as pd
import numpy as np
import glob

#### Import NFL BDB 2021 and NFL BDB 2021 Bonus data

In [5]:
players = pd.read_csv('nfl-big-data-bowl-2021/players.csv')
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')
targetedReceiver = pd.read_csv('nfl-big-data-bowl-2021-bonus/targetedReceiver.csv')

extension = 'csv'
all_filenames = [i for i in glob.glob('nfl-big-data-bowl-2021/week*.{}'.format(extension))]
tracking = pd.concat([pd.read_csv(f) for f in all_filenames])

#### Import relative distance data

In [6]:
all_filenames1 = [i for i in glob.glob('additional_data/realtive_info/Relative_Info_Week*.{}'.format(extension))]
distances = pd.concat([pd.read_csv(f) for f in all_filenames1])
distances = distances.drop(columns='Unnamed: 0')

In [7]:
distances.shape

(17092915, 11)

In [8]:
distances.head()

Unnamed: 0,gameId,playId,frameId,nflId,Opp_Dist,closestOpp_Id,Team_Dist,closestTeam_Id,QB_Dist,closestQB_Id,FootDist
0,2018101800,96,1,494307.0,17.555184,2559248.0,10.716604,2532898.0,21.280359,2560858.0,17.405683
1,2018101800,96,1,2495202.0,5.111008,2553435.0,6.498138,2550930.0,7.322739,2560858.0,5.288875
2,2018101800,96,1,2506106.0,4.550011,2530510.0,4.983172,2559248.0,11.464903,2560858.0,11.425625
3,2018101800,96,1,2507917.0,7.598066,2560711.0,5.919206,2555550.0,20.026026,2560858.0,18.144073
4,2018101800,96,1,2530510.0,4.550011,2506106.0,7.339244,2532898.0,13.230155,2560858.0,11.701115


In [9]:
tracking.shape # includes rows for ball location

(18309388, 19)

#### Get data at time of snap

In [10]:
tsnaps = tracking[tracking['event']=='ball_snap']
snaps = tsnaps[['gameId','playId', 'x', 'y','nflId','event','displayName','position','frameId','route']]

In [11]:
snaps.shape

(282543, 10)

#### Merge with targeted reciever

In [12]:
df1 = pd.merge(snaps, targetedReceiver, \
         how='inner', \
         left_on=['gameId','playId','nflId'], \
         right_on=['gameId','playId','targetNflId'])
df1.shape

(19196, 11)

#### Merge with distance info

In [13]:
df2 = pd.merge(df1, distances, \
                     left_on=['gameId','playId','nflId','frameId'], \
                     right_on=['gameId','playId','nflId','frameId'])
df2.shape

(18848, 18)

#### Merge with play info

In [14]:
df3 = pd.merge(df2, plays[['gameId','playId','quarter','down','yardsToGo',\
                           'defendersInTheBox','numberOfPassRushers',\
                           'absoluteYardlineNumber','passResult','penaltyCodes', \
                           'offensePlayResult','epa','playResult','isDefensivePI']], \
                     left_on=['gameId','playId'], \
                     right_on=['gameId','playId'])
df3.shape

(18848, 30)

#### Merge with player info

In [15]:
df4 = pd.merge(df3, players[['nflId', 'height','weight']], \
                     on=['nflId'])
df4.shape

(18848, 32)

#### Make penalty a dummy variable

In [16]:
df4.penaltyCodes.value_counts()

DPI         241
DH          227
RPS         161
OPI          96
OH           88
           ... 
OPI;ILF       1
DSQ;DSQd      1
ICB           1
RPS;UNRd      1
ICT;DH        1
Name: penaltyCodes, Length: 82, dtype: int64

In [17]:
df5 = df4.rename(columns={"penaltyCodes":"penalty"})
df5['penalty'] = np.where(pd.notna(df5['penalty']), 1,0)
df5.penalty.value_counts()

0    17653
1     1195
Name: penalty, dtype: int64

In [18]:
# filter out plays with penalties
df6 = df5[df5['penalty'] != 1]
df6.shape

(17653, 32)

#### Make passResult a dummy variable

In [19]:
df6.passResult.value_counts()

C     10853
I      5155
S      1206
IN      439
Name: passResult, dtype: int64

In [20]:
df6['passResult'] = np.where(df6['passResult'] == 'C', 1,0)
df6.passResult.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


1    10853
0     6800
Name: passResult, dtype: int64

### Export data

In [124]:
df6.to_csv('boost_models/data/finaldata_w_xy.csv')
print('Dataframe has been exported to finaldata.csv in boost_models>data')

Dataframe has been exported to finaldata.csv in boost_models>data


In [74]:
df6.iloc[0]

gameId                     2018090600
playId                             75
x                               91.36
y                               44.14
nflId                     2.49545e+06
event                       ball_snap
displayName               Julio Jones
position                           WR
frameId                            11
route                           HITCH
targetNflId               2.49545e+06
Opp_Dist                      7.47286
closestOpp_Id             2.55538e+06
Team_Dist                     8.00105
closestTeam_Id            2.53304e+06
QB_Dist                       17.4741
closestQB_Id                      310
FootDist                      17.3351
quarter                             1
down                                1
yardsToGo                          15
defendersInTheBox                   7
numberOfPassRushers                 4
absoluteYardlineNumber             90
passResult                          1
penalty                             0
height      

#### Make dummies for position

In [21]:
df7 = pd.get_dummies(df6, columns=['position','route'])
df7.shape

(17653, 54)

In [22]:
# filter out plays with undefined route
df8 = df7[df7['route_undefined']!=1]
df8.shape

(17639, 54)

In [24]:
df9.to_csv('boost_models/data/finaldata_w_xy_and_dummies.csv')
print('Dataframe has been exported to finaldata_w_xy_and_dummies in boost_models>data')

Dataframe has been exported to finaldata_w_xy_and_dummies in boost_models>data


In [None]:
df9.columns

In [23]:
df9.iloc[0]

NameError: name 'df9' is not defined