# 1 Data Wrangling

## Introduction

In the National Football League (NFL), the team on offense has 4 downs to gain 10 yards; doing so resets the down count and gives the team another 4 downs to gain the next 10 yards. Gaining 10 yards within a set of downs is vital to the game, as it allows the team to progress down the field and potentially score. But what if you could find a way to predict the play on 3rd downs (typically the most important downs) based on previous data?

The data can be found here: https://www.dolthub.com/repositories/Liquidata/nfl-play-by-play/data/master/plays 

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
#import raw data
YEARS = [2019,2018,2017]

# Read one frame per season and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied every previously loaded row on each iteration.
season_frames = []

for year in YEARS:
    season_url = ('https://github.com/nflverse/nflfastR-data/blob/master/data/'
                  'play_by_play_' + str(year) + '.csv.gz?raw=True')

    #low_memory=False eliminates a warning
    season_frames.append(pd.read_csv(season_url, compression='gzip', low_memory=False))

#sort=True alphabetically sorts columns;
#ignore_index=True gives each row a unique index across all seasons
data = pd.concat(season_frames, sort=True, ignore_index=True)

## Explore Raw Data

In [5]:
# Show every column in all later outputs; the row limit below only applies
# inside the `with` block.
pd.options.display.max_columns = None
with pd.option_context('display.max_rows', 5):
    display(data)

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,assist_tackle_2_team,assist_tackle_3_player_id,assist_tackle_3_player_name,assist_tackle_3_team,assist_tackle_4_player_id,assist_tackle_4_player_name,assist_tackle_4_team,away_coach,away_score,away_team,away_timeouts_remaining,away_wp,away_wp_post,blocked_player_id,blocked_player_name,comp_air_epa,comp_air_wpa,comp_yac_epa,comp_yac_wpa,complete_pass,cp,cpoe,def_wp,defensive_extra_point_attempt,defensive_extra_point_conv,defensive_two_point_attempt,defensive_two_point_conv,defteam,defteam_score,defteam_score_post,defteam_timeouts_remaining,desc,div_game,down,drive,drive_end_transition,drive_end_yard_line,drive_ended_with_score,drive_first_downs,drive_game_clock_end,drive_game_clock_start,drive_inside20,drive_play_count,drive_play_id_ended,drive_play_id_started,drive_quarter_end,drive_quarter_start,drive_real_start_time,drive_start_transition,drive_start_yard_line,drive_time_of_possession,drive_yards_penalized,end_clock_time,end_yard_line,ep,epa,extra_point_attempt,extra_point_prob,extra_point_result,fantasy,fantasy_id,fantasy_player_id,fantasy_player_name,fg_prob,field_goal_attempt,field_goal_result,first_down,first_down_pass,first_down_penalty,first_down_rush,fixed_drive,fixed_drive_result,forced_fumble_player_1_player_id,forced_fumble_player_1_player_name,forced_fumble_player_1_team,forced_fumble_player_2_player_id,forced_fumble_player_2_player_name,forced_fumble_player_2_team,fourth_down_converted,fourth_down_failed,fumble,fumble_forced,fumble_lost,fumble_not_forced,fumble_out_of_bounds,fumble_recovery_1_player_id,fumble_recovery_1_player_name,fumble_recovery_1_team,fumble_recovery_1_yards,fumble_recovery_2_player_id,fumble_recovery_2_player_name,fumble_recovery_2_team,fumble_recovery_2_yards,fumbled_1_player_id,fumbled_1_player_name,fumbled_1_team,fumbled_2_player_id,fumble
d_2_player_name,fumbled_2_team,game_date,game_half,game_id,game_seconds_remaining,game_stadium,goal_to_go,half_sack_1_player_id,half_sack_1_player_name,half_sack_2_player_id,half_sack_2_player_name,half_seconds_remaining,home_coach,home_opening_kickoff,home_score,home_team,home_timeouts_remaining,home_wp,home_wp_post,id,incomplete_pass,interception,interception_player_id,interception_player_name,jersey_number,kick_distance,kicker_player_id,kicker_player_name,kickoff_attempt,kickoff_downed,kickoff_fair_catch,kickoff_in_endzone,kickoff_inside_twenty,kickoff_out_of_bounds,kickoff_returner_player_id,kickoff_returner_player_name,lateral_interception_player_id,lateral_interception_player_name,lateral_kickoff_returner_player_id,lateral_kickoff_returner_player_name,lateral_punt_returner_player_id,lateral_punt_returner_player_name,lateral_receiver_player_id,lateral_receiver_player_name,lateral_receiving_yards,lateral_reception,lateral_recovery,lateral_return,lateral_rush,lateral_rusher_player_id,lateral_rusher_player_name,lateral_rushing_yards,lateral_sack_player_id,lateral_sack_player_name,location,name,nfl_api_id,no_huddle,no_score_prob,old_game_id,opp_fg_prob,opp_safety_prob,opp_td_prob,order_sequence,out_of_bounds,own_kickoff_recovery,own_kickoff_recovery_player_id,own_kickoff_recovery_player_name,own_kickoff_recovery_td,pass,pass_attempt,pass_defense_1_player_id,pass_defense_1_player_name,pass_defense_2_player_id,pass_defense_2_player_name,pass_length,pass_location,pass_oe,pass_touchdown,passer,passer_id,passer_jersey_number,passer_player_id,passer_player_name,passing_yards,penalty,penalty_player_id,penalty_player_name,penalty_team,penalty_type,penalty_yards,play,play_clock,play_deleted,play_id,play_type,play_type_nfl,posteam,posteam_score,posteam_score_post,posteam_timeouts_remaining,posteam_type,punt_attempt,punt_blocked,punt_downed,punt_fair_catch,punt_in_endzone,punt_inside_twenty,punt_out_of_bounds,punt_returner_player_id,punt_returner_player_name,punter_player_id,
punter_player_name,qb_dropback,qb_epa,qb_hit,qb_hit_1_player_id,qb_hit_1_player_name,qb_hit_2_player_id,qb_hit_2_player_name,qb_kneel,qb_scramble,qb_spike,qtr,quarter_end,quarter_seconds_remaining,receiver,receiver_id,receiver_jersey_number,receiver_player_id,receiver_player_name,receiving_yards,replay_or_challenge,replay_or_challenge_result,result,return_team,return_touchdown,return_yards,roof,run_gap,run_location,rush,rush_attempt,rush_touchdown,rusher,rusher_id,rusher_jersey_number,rusher_player_id,rusher_player_name,rushing_yards,sack,sack_player_id,sack_player_name,safety,safety_player_id,safety_player_name,safety_prob,score_differential,score_differential_post,season,season_type,series,series_result,series_success,shotgun,side_of_field,solo_tackle,solo_tackle_1_player_id,solo_tackle_1_player_name,solo_tackle_1_team,solo_tackle_2_player_id,solo_tackle_2_player_name,solo_tackle_2_team,sp,special,special_teams_play,spread_line,st_play_type,stadium,stadium_id,start_time,success,surface,tackle_for_loss_1_player_id,tackle_for_loss_1_player_name,tackle_for_loss_2_player_id,tackle_for_loss_2_player_name,tackle_with_assist,tackle_with_assist_1_player_id,tackle_with_assist_1_player_name,tackle_with_assist_1_team,tackle_with_assist_2_player_id,tackle_with_assist_2_player_name,tackle_with_assist_2_team,tackled_for_loss,td_player_id,td_player_name,td_prob,td_team,temp,third_down_converted,third_down_failed,time,time_of_day,timeout,timeout_team,total,total_away_comp_air_epa,total_away_comp_air_wpa,total_away_comp_yac_epa,total_away_comp_yac_wpa,total_away_epa,total_away_pass_epa,total_away_pass_wpa,total_away_raw_air_epa,total_away_raw_air_wpa,total_away_raw_yac_epa,total_away_raw_yac_wpa,total_away_rush_epa,total_away_rush_wpa,total_away_score,total_home_comp_air_epa,total_home_comp_air_wpa,total_home_comp_yac_epa,total_home_comp_yac_wpa,total_home_epa,total_home_pass_epa,total_home_pass_wpa,total_home_raw_air_epa,total_home_raw_air_wpa,total_home_raw_yac_epa,total_home_ra
w_yac_wpa,total_home_rush_epa,total_home_rush_wpa,total_home_score,total_line,touchback,touchdown,two_point_attempt,two_point_conv_result,two_point_conversion_prob,vegas_home_wp,vegas_home_wpa,vegas_wp,vegas_wpa,weather,week,wind,wp,wpa,xpass,xyac_epa,xyac_fd,xyac_mean_yardage,xyac_median_yardage,xyac_success,yac_epa,yac_wpa,yardline_100,yards_after_catch,yards_gained,ydsnet,ydstogo,yrdln
0,0,,,,,,,,,,,,,,,,,Dan Quinn,12,ATL,3,0.433208,,,,,,,,,,,0.566792,,,,,,,,,GAME,0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,0.000000,,,,,,,1,Punt,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-09-08,Half1,2019_01_ATL_MIN,3600.0,U.S. Bank Stadium,0,,,,,1800.0,Mike Zimmer,0,28,MIN,3,0.566792,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Home,,10160000-0579-055e-7350-c0a2552a523f,0,0.000000,2019090804,0.000000,0.00000,0.000000,1.0,0,,,,,0,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,,GAME_START,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,0,900.0,,,,,,,0,,16,,,,dome,,,0,,,,,,,,,,,,,,,0.000000,,,2019,REG,1,Punt,0,0,,,,,,,,,0,0,0,3.5,,U.S. Bank Stadium,MIN01,13:00:00,,sportturf,,,,,,,,,,,,,,,0.000000,,,,,15:00,,,,40,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0,47.0,0,,,,0.0,0.632430,0.000000,0.367570,-0.000000,"N/A (Indoors) Temp: ° F, Wind: mph",1,,0.433208,-0.000000,,,,,,,,,,,,,0,MIN 35
1,0,,,,0.0,,,,,,,,,,,,,Dan Quinn,12,ATL,3,0.433208,0.433208,,,0.0,0.0,0.0,0.0,0.0,,,0.566792,0.0,0.0,0.0,0.0,MIN,0.0,0.0,3.0,5-D.Bailey kicks 65 yards from MIN 35 to end z...,0,,1.0,BLOCKED_PUNT,ATL 33,0.0,0.0,12:53,15:00,0.0,3.0,121.0,36.0,1.0,1.0,,KICKOFF,ATL 25,2:07,0.0,,ATL 25,1.008252,-0.000000,0.0,0.0,,,,,,0.208195,0.0,,0.0,0.0,0.0,0.0,1,Punt,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,2019-09-08,Half1,2019_01_ATL_MIN,3600.0,U.S. Bank Stadium,0,,,,,1800.0,Mike Zimmer,0,28,MIN,3,0.566792,0.566792,,0.0,0.0,,,,,00-0028660,D.Bailey,1.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,Home,,10160000-0579-055e-7350-c0a2552a523f,0,0.003473,2019090804,0.128879,0.00227,0.272088,36.0,0,0.0,,,0.0,0,0.0,,,,,,,,0.0,,,,,,,0.0,,,,,,0,10,0,36,kickoff,KICK_OFF,ATL,0.0,0.0,3.0,away,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,-0.000000,0.0,,,,,0,0,0,1,0,900.0,,,,,,,0,,16,ATL,0.0,0.0,dome,,,0,0.0,0.0,,,,,,,0.0,,,0.0,,,0.003240,0.0,0.0,2019,REG,1,Punt,0,0,MIN,0.0,,,,,,,0,1,1,3.5,,U.S. Bank Stadium,MIN01,13:00:00,0.0,sportturf,,,,,0.0,,,,,,,0.0,,,0.381854,,,0.0,0.0,15:00,17:04:02,0.0,,40,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0,47.0,1,0.0,0.0,,0.0,0.632430,0.000000,0.367570,-0.000000,"N/A (Indoors) Temp: ° F, Wind: mph",1,,0.433208,-0.000000,,,,,,,,,35.0,,0.0,8.0,0,MIN 35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143903,0,6.693423,0.0,51.0,0.0,,,,,,,,,,,,,Doug Pederson,41,PHI,0,0.982579,1.000000,,,0.0,0.0,0.0,0.0,0.0,0.23966,-23.965967,0.982579,0.0,0.0,0.0,0.0,PHI,41.0,41.0,0.0,(:09) (Shotgun) 12-T.Brady pass incomplete dee...,0,2.0,21.0,,NE 49,0.0,3.0,00:00,01:05,0.0,9.0,4441.0,4193.0,4.0,4.0,,KICKOFF,NE 9,1:05,0.0,,NE 49,0.306577,-0.306577,0.0,0.0,,R.Gronkowski,00-0027656,00-0027656,R.Gronkowski,0.076756,0.0,,0.0,0.0,0.0,0.0,21,End of half,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,2018-02-04,Half2,2017_21_PHI_NE,9.0,U.S. Bank Stadium,0,,,,,9.0,Bill Belichick,0,33,NE,0,0.017421,0.000000,00-0019596,1.0,0.0,,,12.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,Neutral,T.Brady,10160000-0575-00de-6a31-688199abb666,0,0.900149,2018020400,0.001236,0.00026,0.004919,4419.0,0,0.0,,,0.0,1,1.0,,,,,deep,right,0.225604,0.0,T.Brady,00-0019596,12.0,00-0019596,T.Brady,,0.0,,,,,,1,7,0,4419,pass,PASS,NE,33.0,33.0,0.0,home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,1.0,-0.306577,1.0,00-0029653,F.Cox,,,0,0,0,4,0,9.0,R.Gronkowski,00-0027656,87.0,00-0027656,R.Gronkowski,,0,,-8,,0.0,0.0,dome,,,0,0.0,0.0,,,,,,,0.0,,,0.0,,,0.000359,-8.0,-8.0,2017,POST,66,End of half,0,1,NE,0.0,,,,,,,0,0,0,4.5,,U.S. Bank Stadium,MIN01,18:30:00,0.0,sportturf,,,,,0.0,,,,,,,0.0,,,0.016322,,,0.0,0.0,00:09,03:17:14,0.0,,74,0.255085,0.356768,6.205628,-0.26306,13.875322,8.615163,0.208042,6.618793,0.361956,-1.071844,-0.287941,4.67062,0.293709,41,-0.255085,-0.356768,-6.205628,0.26306,-13.875322,-8.615163,-0.208042,-6.618793,-0.361956,1.071844,0.287941,-4.67062,-0.293709,33,48.5,0,0.0,0.0,,0.0,0.012703,-0.012703,0.012703,-0.012703,"Temp: 3° F, Wind: mph",21,,0.017421,-0.017421,0.997744,,,,,,-7.0,-0.017421,51.0,,0.0,40.0,10,NE 49
143904,0,,,,,,,,,,,,,,,,,Doug Pederson,41,PHI,0,1.000000,1.000000,,,,,,,,,,,,,,,,,,,END GAME,0,,21.0,,NE 49,0.0,3.0,00:00,01:05,0.0,9.0,4441.0,4193.0,4.0,4.0,,KICKOFF,NE 9,1:05,0.0,,,,,,0.0,,,,,,0.000000,,,,,,,21,End of half,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-04,Half2,2017_21_PHI_NE,0.0,U.S. Bank Stadium,0,,,,,0.0,Bill Belichick,0,33,NE,0,0.000000,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Neutral,,10160000-0575-00de-6a31-688199abb666,0,0.000000,2018020400,0.000000,0.00000,0.000000,4441.0,0,,,,,0,,,,,,,,,,,,,,,,,,,,,,0,0,0,4441,,END_GAME,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,4,0,0.0,,,,,,,0,,-8,,,,dome,,,0,,,,,,,,,,,,,,,0.000000,,,2017,POST,66,End of half,0,0,,,,,,,,,0,0,0,4.5,,U.S. Bank Stadium,MIN01,18:30:00,,sportturf,,,,,,,,,,,,,,,0.000000,,,,,00:00,,,,74,0.255085,0.356768,6.205628,-0.26306,13.875322,8.615163,0.208042,6.618793,0.361956,-1.071844,-0.287941,4.67062,0.293709,41,-0.255085,-0.356768,-6.205628,0.26306,-13.875322,-8.615163,-0.208042,-6.618793,-0.361956,1.071844,0.287941,-4.67062,-0.293709,33,48.5,0,,,,0.0,0.000000,,,,"Temp: 3° F, Wind: mph",21,,,,,,,,,,,,,,,40.0,0,


In [6]:
#Select important data columns
# The names are listed in the order the columns should appear in the result.
important_column_names = [
    'air_epa', 'air_wpa', 'air_yards', 'away_score', 'away_team', 'posteam',
    'complete_pass', 'defteam', 'defteam_score', 'defteam_score_post', 'down',
    'drive_start_yard_line', 'drive_time_of_possession', 'first_down_pass',
    'first_down_penalty', 'first_down_rush', 'game_half',
    'game_seconds_remaining', 'home_score', 'incomplete_pass', 'interception',
    'lateral_receiving_yards', 'lateral_reception', 'lateral_rush',
    'lateral_rushing_yards', 'no_huddle', 'no_score_prob', 'pass_attempt',
    'pass_length', 'pass_location', 'passing_yards', 'play_type',
    'play_type_nfl', 'posteam_score', 'qb_kneel', 'qb_scramble', 'qb_spike',
    'qtr', 'quarter_end', 'quarter_seconds_remaining', 'receiving_yards',
    'run_gap', 'run_location', 'rush', 'rush_attempt', 'rushing_yards',
    'score_differential', 'series', 'series_result', 'series_success',
    'shotgun', 'side_of_field', 'success', 'third_down_converted',
    'third_down_failed', 'time', 'timeout', 'timeout_team', 'touchdown',
    'weather', 'wp', 'wpa', 'xpass', 'xyac_epa', 'xyac_fd',
    'xyac_mean_yardage', 'xyac_median_yardage', 'xyac_success',
    'yardline_100', 'yards_after_catch', 'yards_gained', 'ydsnet', 'ydstogo',
    'yrdln',
]
important_columns = data[important_column_names]

In [9]:
# Keep only the plays run on third down. NaN-heavy columns are still present
# at this stage, hence the name.
third_downs_with_nan = important_columns[important_columns.down == 3.0]
# The following cells refer to this frame as `third_downs`, so bind that name
# too; previously only `third_downs` was assigned and the line below raised
# NameError on a fresh kernel.
third_downs = third_downs_with_nan
third_downs_with_nan.head()

Unnamed: 0,air_epa,air_wpa,air_yards,away_score,away_team,posteam,complete_pass,defteam,defteam_score,defteam_score_post,down,drive_start_yard_line,drive_time_of_possession,first_down_pass,first_down_penalty,first_down_rush,game_half,game_seconds_remaining,home_score,incomplete_pass,interception,lateral_receiving_yards,lateral_reception,lateral_rush,lateral_rushing_yards,no_huddle,no_score_prob,pass_attempt,pass_length,pass_location,passing_yards,play_type,play_type_nfl,posteam_score,qb_kneel,qb_scramble,qb_spike,qtr,quarter_end,quarter_seconds_remaining,receiving_yards,run_gap,run_location,rush,rush_attempt,rushing_yards,score_differential,series,series_result,series_success,shotgun,side_of_field,success,third_down_converted,third_down_failed,time,timeout,timeout_team,touchdown,weather,wp,wpa,xpass,xyac_epa,xyac_fd,xyac_mean_yardage,xyac_median_yardage,xyac_success,yardline_100,yards_after_catch,yards_gained,ydsnet,ydstogo,yrdln
4,,,,12,ATL,ATL,0.0,MIN,0.0,0.0,3.0,ATL 25,2:07,0.0,0.0,0.0,Half1,3521.0,28,0.0,0.0,,0.0,0.0,,0,0.003781,0.0,,,,run,RUSH,0.0,0,1,0,1,0,821.0,,end,left,0,1.0,12.0,0.0,1,Punt,0,1,ATL,1.0,0.0,1.0,13:41,0.0,,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.379757,0.018007,0.933516,,,,,,79.0,,12.0,8.0,14,ATL 21
13,,,,12,ATL,ATL,0.0,MIN,7.0,7.0,3.0,ATL 16,1:55,0.0,1.0,0.0,Half1,3323.0,28,0.0,0.0,,0.0,0.0,,0,0.005124,0.0,,,,no_play,PENALTY,0.0,0,0,0,1,0,623.0,,,,0,0.0,,-7.0,3,First down,1,1,ATL,1.0,0.0,0.0,10:23,0.0,,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.231108,0.015072,0.918794,,,,,,77.0,,0.0,12.0,3,ATL 23
23,,,,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,1.0,0.0,Half1,3123.0,28,0.0,0.0,,0.0,0.0,,0,0.010779,0.0,,,,no_play,PENALTY,0.0,0,0,0,1,0,423.0,,,,0,0.0,,-14.0,7,First down,1,1,ATL,1.0,0.0,0.0,07:03,0.0,,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.121559,0.013919,0.96993,,,,,,70.0,,0.0,34.0,5,ATL 30
26,1.520888,0.0,5.0,12,ATL,ATL,1.0,MIN,14.0,14.0,3.0,ATL 25,5:51,1.0,0.0,0.0,Half1,3061.0,28,0.0,0.0,,0.0,0.0,,0,0.013355,1.0,short,left,6.0,pass,PASS,0.0,0,0,0,1,0,361.0,6.0,,,0,0.0,,-14.0,8,First down,1,1,ATL,1.0,1.0,0.0,06:01,0.0,,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.128082,0.016882,0.967478,0.275905,0.97492,4.289433,2.0,0.97492,60.0,1.0,6.0,34.0,5,ATL 40
32,3.544151,0.0,21.0,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,0.0,0.0,Half1,2865.0,28,1.0,0.0,,0.0,0.0,,0,0.018408,1.0,deep,left,,pass,PASS,0.0,0,0,0,1,0,165.0,,,,0,0.0,,-14.0,10,Punt,0,1,MIN,0.0,0.0,1.0,02:45,0.0,,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.117711,-0.012735,0.972965,0.323985,0.997187,3.835246,2.0,1.0,41.0,,0.0,34.0,19,MIN 41


In [10]:
#Explore data: list each column's non-null count and dtype
third_downs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22791 entries, 4 to 143896
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   air_epa                    14704 non-null  float64
 1   air_wpa                    14704 non-null  float64
 2   air_yards                  14725 non-null  float64
 3   away_score                 22791 non-null  int64  
 4   away_team                  22791 non-null  object 
 5   posteam                    22791 non-null  object 
 6   complete_pass              22745 non-null  float64
 7   defteam                    22791 non-null  object 
 8   defteam_score              22791 non-null  float64
 9   defteam_score_post         22791 non-null  float64
 10  down                       22791 non-null  float64
 11  drive_start_yard_line      22791 non-null  object 
 12  drive_time_of_possession   22791 non-null  object 
 13  first_down_pass            22745 non-null  fl

In [11]:
#Keep only the columns whose share of NaN values is below 50%
# Note: after this cell `third_downs` holds column labels, not a DataFrame.
nan_share_per_column = third_downs_with_nan.isna().mean()
third_downs = third_downs_with_nan.columns[nan_share_per_column < 0.5]

In [12]:
# Take the retained columns (all rows) from the full play-by-play frame
third_downs_df = data.loc[:, third_downs]

In [13]:
# Restrict the rows to third-down plays
is_third_down = third_downs_df['down'] == 3.0
third_downs_df = third_downs_df[is_third_down]

In [14]:
# Preview the first rows after the column and row filtering
third_downs_df.head()

Unnamed: 0,air_epa,air_wpa,air_yards,away_score,away_team,posteam,complete_pass,defteam,defteam_score,defteam_score_post,down,drive_start_yard_line,drive_time_of_possession,first_down_pass,first_down_penalty,first_down_rush,game_half,game_seconds_remaining,home_score,incomplete_pass,interception,lateral_reception,lateral_rush,no_huddle,no_score_prob,pass_attempt,pass_length,pass_location,play_type,play_type_nfl,posteam_score,qb_kneel,qb_scramble,qb_spike,qtr,quarter_end,quarter_seconds_remaining,rush,rush_attempt,score_differential,series,series_result,series_success,shotgun,side_of_field,success,third_down_converted,third_down_failed,time,timeout,touchdown,weather,wp,wpa,xpass,xyac_epa,xyac_fd,xyac_mean_yardage,xyac_median_yardage,xyac_success,yardline_100,yards_gained,ydsnet,ydstogo,yrdln
4,,,,12,ATL,ATL,0.0,MIN,0.0,0.0,3.0,ATL 25,2:07,0.0,0.0,0.0,Half1,3521.0,28,0.0,0.0,0.0,0.0,0,0.003781,0.0,,,run,RUSH,0.0,0,1,0,1,0,821.0,0,1.0,0.0,1,Punt,0,1,ATL,1.0,0.0,1.0,13:41,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.379757,0.018007,0.933516,,,,,,79.0,12.0,8.0,14,ATL 21
13,,,,12,ATL,ATL,0.0,MIN,7.0,7.0,3.0,ATL 16,1:55,0.0,1.0,0.0,Half1,3323.0,28,0.0,0.0,0.0,0.0,0,0.005124,0.0,,,no_play,PENALTY,0.0,0,0,0,1,0,623.0,0,0.0,-7.0,3,First down,1,1,ATL,1.0,0.0,0.0,10:23,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.231108,0.015072,0.918794,,,,,,77.0,0.0,12.0,3,ATL 23
23,,,,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,1.0,0.0,Half1,3123.0,28,0.0,0.0,0.0,0.0,0,0.010779,0.0,,,no_play,PENALTY,0.0,0,0,0,1,0,423.0,0,0.0,-14.0,7,First down,1,1,ATL,1.0,0.0,0.0,07:03,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.121559,0.013919,0.96993,,,,,,70.0,0.0,34.0,5,ATL 30
26,1.520888,0.0,5.0,12,ATL,ATL,1.0,MIN,14.0,14.0,3.0,ATL 25,5:51,1.0,0.0,0.0,Half1,3061.0,28,0.0,0.0,0.0,0.0,0,0.013355,1.0,short,left,pass,PASS,0.0,0,0,0,1,0,361.0,0,0.0,-14.0,8,First down,1,1,ATL,1.0,1.0,0.0,06:01,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.128082,0.016882,0.967478,0.275905,0.97492,4.289433,2.0,0.97492,60.0,6.0,34.0,5,ATL 40
32,3.544151,0.0,21.0,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,0.0,0.0,Half1,2865.0,28,1.0,0.0,0.0,0.0,0,0.018408,1.0,deep,left,pass,PASS,0.0,0,0,0,1,0,165.0,0,0.0,-14.0,10,Punt,0,1,MIN,0.0,0.0,1.0,02:45,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.117711,-0.012735,0.972965,0.323985,0.997187,3.835246,2.0,1.0,41.0,0.0,34.0,19,MIN 41


In [15]:
# Renumber the rows from 0. Without drop=True, reset_index() also inserts the
# old row labels as a new first column.
third_downs_df = third_downs_df.reset_index()

In [16]:
# Remove the 'index' column that reset_index() created from the old row labels.
# Dropping it by name (and tolerating its absence) keeps this cell safe to
# re-run; dropping "whatever is at position 0" in place deleted air_epa when
# the cell was executed a second time.
third_downs_df = third_downs_df.drop(columns='index', errors='ignore')

In [18]:
# Summary statistics for the numeric columns, transposed so that each column
# of the frame becomes one row of the summary
third_downs_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
air_epa,14704.0,0.780055,1.695466,-9.400001,-0.841593,1.065626,2.087206,6.761917
air_wpa,14704.0,0.004731,0.031003,-0.772345,0.0,0.0,0.0,0.495123
air_yards,14725.0,8.851409,9.736584,-15.0,3.0,6.0,13.0,61.0
away_score,22791.0,21.66636,9.923196,0.0,15.0,21.0,28.0,59.0
complete_pass,22745.0,0.381403,0.485742,0.0,0.0,0.0,1.0,1.0
defteam_score,22791.0,11.343952,9.975712,0.0,3.0,10.0,17.0,59.0
defteam_score_post,22791.0,11.360976,9.982198,0.0,3.0,10.0,17.0,59.0
down,22791.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0
first_down_pass,22745.0,0.257024,0.437002,0.0,0.0,0.0,1.0,1.0
first_down_penalty,22745.0,0.044801,0.206871,0.0,0.0,0.0,0.0,1.0


In [19]:
# Short alias: `df` refers to the same DataFrame object (no copy is made)
df = third_downs_df

In [20]:
# Preview the first rows of the frame that is about to be saved
df.head()

Unnamed: 0,air_epa,air_wpa,air_yards,away_score,away_team,posteam,complete_pass,defteam,defteam_score,defteam_score_post,down,drive_start_yard_line,drive_time_of_possession,first_down_pass,first_down_penalty,first_down_rush,game_half,game_seconds_remaining,home_score,incomplete_pass,interception,lateral_reception,lateral_rush,no_huddle,no_score_prob,pass_attempt,pass_length,pass_location,play_type,play_type_nfl,posteam_score,qb_kneel,qb_scramble,qb_spike,qtr,quarter_end,quarter_seconds_remaining,rush,rush_attempt,score_differential,series,series_result,series_success,shotgun,side_of_field,success,third_down_converted,third_down_failed,time,timeout,touchdown,weather,wp,wpa,xpass,xyac_epa,xyac_fd,xyac_mean_yardage,xyac_median_yardage,xyac_success,yardline_100,yards_gained,ydsnet,ydstogo,yrdln
0,,,,12,ATL,ATL,0.0,MIN,0.0,0.0,3.0,ATL 25,2:07,0.0,0.0,0.0,Half1,3521.0,28,0.0,0.0,0.0,0.0,0,0.003781,0.0,,,run,RUSH,0.0,0,1,0,1,0,821.0,0,1.0,0.0,1,Punt,0,1,ATL,1.0,0.0,1.0,13:41,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.379757,0.018007,0.933516,,,,,,79.0,12.0,8.0,14,ATL 21
1,,,,12,ATL,ATL,0.0,MIN,7.0,7.0,3.0,ATL 16,1:55,0.0,1.0,0.0,Half1,3323.0,28,0.0,0.0,0.0,0.0,0,0.005124,0.0,,,no_play,PENALTY,0.0,0,0,0,1,0,623.0,0,0.0,-7.0,3,First down,1,1,ATL,1.0,0.0,0.0,10:23,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.231108,0.015072,0.918794,,,,,,77.0,0.0,12.0,3,ATL 23
2,,,,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,1.0,0.0,Half1,3123.0,28,0.0,0.0,0.0,0.0,0,0.010779,0.0,,,no_play,PENALTY,0.0,0,0,0,1,0,423.0,0,0.0,-14.0,7,First down,1,1,ATL,1.0,0.0,0.0,07:03,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.121559,0.013919,0.96993,,,,,,70.0,0.0,34.0,5,ATL 30
3,1.520888,0.0,5.0,12,ATL,ATL,1.0,MIN,14.0,14.0,3.0,ATL 25,5:51,1.0,0.0,0.0,Half1,3061.0,28,0.0,0.0,0.0,0.0,0,0.013355,1.0,short,left,pass,PASS,0.0,0,0,0,1,0,361.0,0,0.0,-14.0,8,First down,1,1,ATL,1.0,1.0,0.0,06:01,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.128082,0.016882,0.967478,0.275905,0.97492,4.289433,2.0,0.97492,60.0,6.0,34.0,5,ATL 40
4,3.544151,0.0,21.0,12,ATL,ATL,0.0,MIN,14.0,14.0,3.0,ATL 25,5:51,0.0,0.0,0.0,Half1,2865.0,28,1.0,0.0,0.0,0.0,0,0.018408,1.0,deep,left,pass,PASS,0.0,0,0,0,1,0,165.0,0,0.0,-14.0,10,Punt,0,1,MIN,0.0,0.0,1.0,02:45,0.0,0.0,"N/A (Indoors) Temp: ° F, Wind: mph",0.117711,-0.012735,0.972965,0.323985,0.997187,3.835246,2.0,1.0,41.0,0.0,34.0,19,MIN 41


In [21]:
# (number of rows, number of columns) of the wrangled frame
third_downs_df.shape

(22791, 65)

In [22]:
# save the data to a new csv file
# The path is relative to the working directory so the cell runs on any
# machine; the previous absolute path only existed on one user's computer.
# The extension is also corrected from '.cvs' to '.csv' -- anything that reads
# the old file name must be pointed at the new one.
output_file = os.path.join('.', 'third_downs_df_1.csv')
df.to_csv(output_file, index=False)