In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")

Note we have 7 files from the competition hosts: 
- Game Data: games.csv, the data for each game (season, date & time, location, home and visitor teams), key variable is `gameId`.
- PFF Scouting Data: PFFScoutingData.csv, "play-level scouting information for each game", key variables are `gameId` and `playId` (note `nflId` is not included). Information about kick types, directions, and air time throughout the game.
- Player Data: players.csv, information for each player (height, weight, birth, college, position, name), key variable is `nflId` (does not include `gameId` and `playId`).
- Play Data: plays.csv, "play-level information from each game", key variables are `gameId` and `playId`. Game-specific temporal information, type of play and play result. `kickerId` is the `nflId` of the kicker.
- Tracking Data: tracking2018.csv, tracking2019.csv, and tracking2020.csv. Each contains "player tracking data" from the indicated season, key variables are `gameId`, `nflId`, and `playId`. Each player's position on the field as well as the football for all special teams plays per game.

We also have a weather dataset from ThomasJBliss.

In [2]:
# Locations of the raw inputs. These are the only machine-specific values in
# the notebook: point PROJECT_DIR at your own checkout to run it elsewhere.
PROJECT_DIR = Path("/Users/elizabethgrace/Documents/coding/fall-2021 copia/NFL_Big_Data_Bowl_2022")
BDB_DIR = PROJECT_DIR / "nfl-big-data-bowl-2022"   # files from the competition hosts
WEATHER_DIR = PROJECT_DIR / "WeatherData" / "data"  # external weather dataset

# Competition files
games = pd.read_csv(BDB_DIR / "games.csv")

scout = pd.read_csv(BDB_DIR / "PFFScoutingData.csv")

players = pd.read_csv(BDB_DIR / "players.csv")

play = pd.read_csv(BDB_DIR / "plays.csv")

# One tracking file per season
track18 = pd.read_csv(BDB_DIR / "tracking2018.csv")
track19 = pd.read_csv(BDB_DIR / "tracking2019.csv")
track20 = pd.read_csv(BDB_DIR / "tracking2020.csv")

# Weather dataset: hourly readings, per-game metadata, per-stadium metadata
g_weather = pd.read_csv(WEATHER_DIR / "games_weather.csv")
game_ident = pd.read_csv(WEATHER_DIR / "games.csv")
stadium_ident = pd.read_csv(WEATHER_DIR / "stadium_coordinates.csv")


Weather data needs to be combined into three datasets, one per year.

In [3]:
def get_weather_data(weather_df=None, games_df=None, stadiums_df=None,
                     years=(2018, 2019, 2020)):
    """Join weather readings with game and stadium metadata, split by year.

    Parameters
    ----------
    weather_df, games_df, stadiums_df : pandas.DataFrame, optional
        Weather readings, per-game metadata and per-stadium metadata. When an
        argument is omitted the notebook-level frame `g_weather`,
        `game_ident` or `stadium_ident` (respectively) is used, so a bare
        `get_weather_data()` call behaves as before.
    years : iterable of int, optional
        Calendar years to slice out; defaults to 2018, 2019 and 2020.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per entry of `years`, in the same order, holding the rows
        whose `TimeMeasure` falls in that calendar year. Each frame is an
        independent copy, so it can be modified without touching the others
        and without triggering pandas' SettingWithCopyWarning.

    Raises
    ------
    ValueError
        Propagated from `pd.to_datetime` if a time column does not match the
        expected `%m/%d/%Y %H:%M` layout.
    """
    # Fall back to the frames loaded earlier in the notebook.
    if weather_df is None:
        weather_df = g_weather
    if games_df is None:
        games_df = game_ident
    if stadiums_df is None:
        stadiums_df = stadium_ident

    # Inner joins: readings -> game (on game_id) -> stadium (on StadiumName).
    merged = pd.merge(weather_df, games_df, on='game_id')
    merged = pd.merge(merged, stadiums_df, on='StadiumName')

    # Parse the time columns so the .dt accessor can be used below.
    for col in ('TimeMeasure', 'TimeStartGame', 'TimeEndGame'):
        merged[col] = pd.to_datetime(merged[col], format='%m/%d/%Y %H:%M')

    # NOTE(review): rows are bucketed by the calendar year of the reading, so
    # any reading timestamped in January 2021 is in none of the default
    # frames. Confirm this is intended, or filter on `Season` instead.
    measure_year = merged['TimeMeasure'].dt.year

    return tuple(merged[measure_year == year].copy() for year in years)

In [4]:
# Unpack the three per-year weather frames returned by get_weather_data().
weather2018, weather2019, weather2020 = get_weather_data()

In [7]:
weather2018

Unnamed: 0,game_id,Source,DistanceToStation,TimeMeasure,Temperature,DewPoint,Humidity,Precipitation,WindSpeed,WindDirection,...,Season,StadiumName,TimeStartGame,TimeEndGame,TZOffset,HomeTeam,RoofType,Longitude,Latitude,StadiumAzimuthAngle
3099,2018091608,Meteostat,3.98,2018-09-16 12:00:00,75.56,68.90,80.0,0.0,10.31,90.0,...,2018,FedExField,2018-09-16 13:00:00,2018-09-16 15:53:00,-4,WAS,Outdoor,-76.864444,38.907778,295.0
3100,2018091608,Meteostat,3.98,2018-09-16 13:00:00,75.92,68.54,78.0,0.0,8.08,40.0,...,2018,FedExField,2018-09-16 13:00:00,2018-09-16 15:53:00,-4,WAS,Outdoor,-76.864444,38.907778,295.0
3101,2018091608,Meteostat,3.98,2018-09-16 14:00:00,77.72,69.62,76.0,0.0,9.20,30.0,...,2018,FedExField,2018-09-16 13:00:00,2018-09-16 15:53:00,-4,WAS,Outdoor,-76.864444,38.907778,295.0
3102,2018091608,Meteostat,3.98,2018-09-16 15:00:00,78.80,69.44,73.0,0.0,9.20,70.0,...,2018,FedExField,2018-09-16 13:00:00,2018-09-16 15:53:00,-4,WAS,Outdoor,-76.864444,38.907778,295.0
3103,2018091608,Meteostat,3.98,2018-09-16 16:00:00,80.24,69.62,70.0,0.0,12.74,100.0,...,2018,FedExField,2018-09-16 13:00:00,2018-09-16 15:53:00,-4,WAS,Outdoor,-76.864444,38.907778,295.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39067,2018121600,Meteostat,4.47,2018-12-16 14:00:00,51.98,46.94,83.0,0.0,6.96,310.0,...,2018,Mercedes-Benz Stadium,2018-12-16 13:00:00,2018-12-16 16:07:00,-5,ATL,Retractable,-84.400000,33.755556,70.9
39068,2018121600,Meteostat,4.47,2018-12-16 15:00:00,53.06,44.96,74.0,0.0,5.84,310.0,...,2018,Mercedes-Benz Stadium,2018-12-16 13:00:00,2018-12-16 16:07:00,-5,ATL,Retractable,-84.400000,33.755556,70.9
39069,2018121600,Meteostat,4.47,2018-12-16 16:00:00,53.06,46.04,77.0,0.0,9.20,310.0,...,2018,Mercedes-Benz Stadium,2018-12-16 13:00:00,2018-12-16 16:07:00,-5,ATL,Retractable,-84.400000,33.755556,70.9
39070,2018121600,Meteostat,4.47,2018-12-16 17:00:00,53.06,46.04,77.0,0.0,14.98,310.0,...,2018,Mercedes-Benz Stadium,2018-12-16 13:00:00,2018-12-16 16:07:00,-5,ATL,Retractable,-84.400000,33.755556,70.9


We need to standardize the height of all players. Inches will be easier to code with, so we first define a function `ft_in` that converts heights written as ft-in (e.g. `6-1`) to inches and casts heights that are already given in inches to an `int`. Then we will apply it to the `height` column of our dataframe.

In [5]:
def ft_in(x):
    """Convert a player height to whole inches.

    Parameters
    ----------
    x : str or number
        Either a ``'feet-inches'`` string such as ``'6-1'``, a string that
        already holds inches such as ``'73'``, or a numeric inch value.

    Returns
    -------
    int
        The height in inches.

    Raises
    ------
    ValueError
        If a string part cannot be parsed as an integer.

    Notes
    -----
    Numeric input is accepted so that applying the function to a column that
    was already converted (e.g. when the cell is re-run) is a no-op instead of
    a ``TypeError`` from ``'-' in <int>``.
    """
    if not isinstance(x, str):
        # Already a number (int, float, numpy scalar): just normalise the type.
        return int(x)

    if '-' in x:
        # meas is ['ft', 'in']
        meas = x.split('-')
        return int(meas[0]) * 12 + int(meas[1])

    return int(x)

In [6]:
# Overwrite the `height` column with the result of ft_in applied to each value.
players['height'] = players['height'].apply(ft_in)

Now, we turn to the tracking data. We must reorient this to reflect movement in the offense direction instead of the on-field coordinates (for plays moving left, we move the origin from the bottom left to the top right so that every play runs in the same direction).

In [8]:
# Extents used to mirror the coordinates, in yards.
FIELD_LENGTH = 120     # extent of the x axis
FIELD_WIDTH = 160 / 3  # the football field is 160 ft wide, but our units are yards


def reorient_left_plays(track):
    """Mirror x and y, in place, for every row whose playDirection is 'left'.

    Parameters
    ----------
    track : pandas.DataFrame
        Tracking frame with `playDirection`, `x` and `y` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.

    Notes
    -----
    The operation is NOT idempotent: `playDirection` is left unchanged, so
    calling it a second time on the same frame mirrors the rows back to their
    original coordinates. Run it exactly once per freshly loaded frame.

    NOTE(review): only `x` and `y` are mirrored. If the tracking frames also
    carry heading/orientation angles they are left untouched; confirm whether
    those need a 180 degree rotation as well.
    """
    # Build the mask once; the original recomputed it four times per frame.
    moving_left = track['playDirection'] == 'left'
    track.loc[moving_left, 'x'] = FIELD_LENGTH - track.loc[moving_left, 'x']
    track.loc[moving_left, 'y'] = FIELD_WIDTH - track.loc[moving_left, 'y']
    return track


# 2018, 2019 and 2020 tracking data
for season_track in (track18, track19, track20):
    reorient_left_plays(season_track)


We are specifically looking at `Extra Point` in this Notebook. So we pull just that play data.

In [9]:
# Keep only the rows of `play` whose specialTeamsPlayType is 'Extra Point'.
play_extrapoint = play.loc[play['specialTeamsPlayType']=='Extra Point']

In [10]:
# Tally of outcomes recorded for extra-point plays.
play_extrapoint.value_counts('specialTeamsResult')

specialTeamsResult
Kick Attempt Good           3252
Kick Attempt No Good         199
Blocked Kick Attempt          24
Non-Special Teams Result      13
dtype: int64

In [11]:
# Check whether any extra-point play has a kick-return yardage recorded.
play_extrapoint.value_counts('kickReturnYardage')

Series([], dtype: int64)

In [16]:
# Tally of the pass results recorded on extra-point plays.
play_extrapoint.value_counts('passResult')

passResult
I    4
dtype: int64

4 of our Non-Special Teams Results are incomplete passes.

In [22]:
# Distribution of `yardlineNumber` across extra-point plays.
play_extrapoint.value_counts('yardlineNumber')

yardlineNumber
15    3438
20      29
10       8
25       6
30       5
7        2
dtype: int64

In [35]:
# Distribution of `penaltyYards` across extra-point plays that have a value recorded.
play_extrapoint.value_counts('penaltyYards')

penaltyYards
 15.0    31
 5.0     28
-15.0     3
 0.0      3
dtype: int64

Remove columns that have no values or only a single constant value, e.g., `kickReturnYardage` is always null and `yardsToGo` is always `0`.

Note that `playDescription` should never be included in analysis, just good for reference later.

In [30]:
# Discard the fields judged uninformative for extra-point plays; every other
# column of play_extrapoint is carried over unchanged.
ep = play_extrapoint.drop(
    columns=[
        'kickReturnYardage',
        'kickLength',
        'playResult',
        'returnerId',
        'yardsToGo',
        'down',
        'specialTeamsPlayType',
    ]
)


In [61]:
ep

Unnamed: 0,gameId,playId,playDescription,quarter,possessionTeam,specialTeamsResult,kickerId,kickBlockerId,yardlineSide,yardlineNumber,gameClock,penaltyCodes,penaltyJerseyNumbers,penaltyYards,preSnapHomeScore,preSnapVisitorScore,passResult,absoluteYardlineNumber
15,2018090600,2883,"J.Elliott extra point is GOOD, Center-R.Lovato...",3,PHI,Kick Attempt Good,44966.0,,ATL,15,04:37:00,,,,9,6,,25
19,2018090600,3553,"M.Bryant extra point is No Good, Hit Right Upr...",4,ATL,Kick Attempt No Good,27091.0,,PHI,15,09:48:00,,,,10,12,,25
25,2018090900,380,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,08:42:00,,,,6,0,,95
30,2018090900,972,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,01:32:00,,,,13,0,,95
44,2018090900,2757,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",3,BAL,Kick Attempt Good,39470.0,,BUF,15,12:28:00,,,,32,0,,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19968,2021010315,2813,"T.Vizcaino extra point is GOOD, Center-C.Holba...",4,SF,Kick Attempt Good,47590.0,,SEA,15,14:22:00,,,,15,6,,95
19970,2021010315,3074,"J.Myers extra point is No Good, Wide Left, Cen...",4,SEA,Kick Attempt No Good,41175.0,,SF,15,10:54:00,,,,16,12,,25
19973,2021010315,3667,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,02:20:00,,,,16,18,,25
19975,2021010315,3870,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,01:49:00,,,,16,25,,25


In [60]:
# Attach the kicker's player attributes to every extra-point play.
# - how='left' keeps plays without a matching kicker (their attributes are NaN).
# - validate='many_to_one' raises a MergeError if an nflId appears more than
#   once in `players`, which would otherwise silently duplicate play rows.
# - The former `suffixes=(False, '_kicker')` was removed: the two frames share
#   no column names, so no suffix was ever applied, and pandas documents None
#   (not False) as the "no suffix" value.
ep_play = pd.merge(
    ep,
    players[['nflId', 'height', 'weight', 'Position', 'displayName']],
    how='left',
    left_on='kickerId',
    right_on='nflId',
    validate='many_to_one',
)
ep_play

Unnamed: 0,gameId,playId,playDescription,quarter,possessionTeam,specialTeamsResult,kickerId,kickBlockerId,yardlineSide,yardlineNumber,...,penaltyYards,preSnapHomeScore,preSnapVisitorScore,passResult,absoluteYardlineNumber,nflId,height,weight,Position,displayName
0,2018090600,2883,"J.Elliott extra point is GOOD, Center-R.Lovato...",3,PHI,Kick Attempt Good,44966.0,,ATL,15,...,,9,6,,25,44966.0,69.0,167.0,K,Jake Elliott
1,2018090600,3553,"M.Bryant extra point is No Good, Hit Right Upr...",4,ATL,Kick Attempt No Good,27091.0,,PHI,15,...,,10,12,,25,27091.0,69.0,203.0,K,Matt Bryant
2,2018090900,380,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,6,0,,95,39470.0,73.0,183.0,K,Justin Tucker
3,2018090900,972,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,13,0,,95,39470.0,73.0,183.0,K,Justin Tucker
4,2018090900,2757,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",3,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,32,0,,25,39470.0,73.0,183.0,K,Justin Tucker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483,2021010315,2813,"T.Vizcaino extra point is GOOD, Center-C.Holba...",4,SF,Kick Attempt Good,47590.0,,SEA,15,...,,15,6,,95,47590.0,74.0,205.0,K,Tristan Vizcaino
3484,2021010315,3074,"J.Myers extra point is No Good, Wide Left, Cen...",4,SEA,Kick Attempt No Good,41175.0,,SF,15,...,,16,12,,25,41175.0,70.0,190.0,K,Jason Myers
3485,2021010315,3667,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,16,18,,25,41175.0,70.0,190.0,K,Jason Myers
3486,2021010315,3870,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,16,25,,25,41175.0,70.0,190.0,K,Jason Myers


In [42]:
ep_play.value_counts('Position')

Position
K    3466
P       9
dtype: int64

Rename player information to indicate kicker, then drop duplicate `nflId`.

In [62]:
# Label the merged player columns as belonging to the kicker, then discard the
# join key `nflId`, which duplicates `kickerId`.
ep_plays = (
    ep_play
    .rename(
        columns={
            "height": 'kicker_height',
            "weight": 'kicker_weight',
            "Position": 'kicker_position',
            "displayName": 'kicker_name',
        }
    )
    .drop(columns=['nflId'])
)


In [63]:
ep_plays

Unnamed: 0,gameId,playId,playDescription,quarter,possessionTeam,specialTeamsResult,kickerId,kickBlockerId,yardlineSide,yardlineNumber,...,penaltyJerseyNumbers,penaltyYards,preSnapHomeScore,preSnapVisitorScore,passResult,absoluteYardlineNumber,kicker_height,kicker_weight,kicker_position,kicker_name
0,2018090600,2883,"J.Elliott extra point is GOOD, Center-R.Lovato...",3,PHI,Kick Attempt Good,44966.0,,ATL,15,...,,,9,6,,25,69.0,167.0,K,Jake Elliott
1,2018090600,3553,"M.Bryant extra point is No Good, Hit Right Upr...",4,ATL,Kick Attempt No Good,27091.0,,PHI,15,...,,,10,12,,25,69.0,203.0,K,Matt Bryant
2,2018090900,380,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,,6,0,,95,73.0,183.0,K,Justin Tucker
3,2018090900,972,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,,13,0,,95,73.0,183.0,K,Justin Tucker
4,2018090900,2757,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",3,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,,32,0,,25,73.0,183.0,K,Justin Tucker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483,2021010315,2813,"T.Vizcaino extra point is GOOD, Center-C.Holba...",4,SF,Kick Attempt Good,47590.0,,SEA,15,...,,,15,6,,95,74.0,205.0,K,Tristan Vizcaino
3484,2021010315,3074,"J.Myers extra point is No Good, Wide Left, Cen...",4,SEA,Kick Attempt No Good,41175.0,,SF,15,...,,,16,12,,25,70.0,190.0,K,Jason Myers
3485,2021010315,3667,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,,16,18,,25,70.0,190.0,K,Jason Myers
3486,2021010315,3870,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,,16,25,,25,70.0,190.0,K,Jason Myers


Now we add in the Blockers.

In [66]:
# Attach the blocker's player attributes to every extra-point play.
# - how='left' keeps plays without a blocker (their attributes are NaN).
# - validate='many_to_one' raises a MergeError if an nflId appears more than
#   once in `players`, which would otherwise silently duplicate play rows.
ep_full = pd.merge(
    ep_plays,
    players[['nflId', 'height', 'weight', 'Position', 'displayName']],
    how='left',
    left_on='kickBlockerId',
    right_on='nflId',
    validate='many_to_one',
)

In [67]:
# Label the merged player columns as belonging to the blocker, then discard
# the join key `nflId`, which duplicates `kickBlockerId`.
eps = (
    ep_full
    .rename(
        columns={
            "height": 'blocker_height',
            "weight": 'blocker_weight',
            "Position": 'blocker_position',
            "displayName": 'blocker_name',
        }
    )
    .drop(columns=['nflId'])
)

eps


Unnamed: 0,gameId,playId,playDescription,quarter,possessionTeam,specialTeamsResult,kickerId,kickBlockerId,yardlineSide,yardlineNumber,...,passResult,absoluteYardlineNumber,kicker_height,kicker_weight,kicker_position,kicker_name,blocker_height,blocker_weight,blocker_position,blocker_name
0,2018090600,2883,"J.Elliott extra point is GOOD, Center-R.Lovato...",3,PHI,Kick Attempt Good,44966.0,,ATL,15,...,,25,69.0,167.0,K,Jake Elliott,,,,
1,2018090600,3553,"M.Bryant extra point is No Good, Hit Right Upr...",4,ATL,Kick Attempt No Good,27091.0,,PHI,15,...,,25,69.0,203.0,K,Matt Bryant,,,,
2,2018090900,380,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,95,73.0,183.0,K,Justin Tucker,,,,
3,2018090900,972,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",1,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,95,73.0,183.0,K,Justin Tucker,,,,
4,2018090900,2757,"J.Tucker extra point is GOOD, Center-M.Cox, Ho...",3,BAL,Kick Attempt Good,39470.0,,BUF,15,...,,25,73.0,183.0,K,Justin Tucker,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483,2021010315,2813,"T.Vizcaino extra point is GOOD, Center-C.Holba...",4,SF,Kick Attempt Good,47590.0,,SEA,15,...,,95,74.0,205.0,K,Tristan Vizcaino,,,,
3484,2021010315,3074,"J.Myers extra point is No Good, Wide Left, Cen...",4,SEA,Kick Attempt No Good,41175.0,,SF,15,...,,25,70.0,190.0,K,Jason Myers,,,,
3485,2021010315,3667,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,25,70.0,190.0,K,Jason Myers,,,,
3486,2021010315,3870,"J.Myers extra point is GOOD, Center-T.Ott, Hol...",4,SEA,Kick Attempt Good,41175.0,,SF,15,...,,25,70.0,190.0,K,Jason Myers,,,,
