In [8]:
import pandas as pd 
import numpy as np 
import nfl_data_py as nfl 
from IPython.display import display, HTML

In [106]:
# Load play-by-play data for the 2023 season through nfl_data_py.
seasons = [2023]
plays = nfl.import_pbp_data(seasons)

2023 done.
Downcasting floats.


In [107]:
# Columns carried forward for prediction / feature engineering.
fields = [
    'play_type_nfl', 'play_type',
    'week',
    'half_seconds_remaining', 'game_seconds_remaining', 'game_half',
    'play_id', 'drive', 'game_id',
    'posteam', 'defteam',
    'yardline_100',
    'season_type', 'posteam_type',
    'down', 'goal_to_go', 'ydstogo',
    'ydsnet', 'yards_gained', 'shotgun', 'no_huddle',
    'pass', 'rush', 'first_down',
    'posteam_score', 'defteam_score', 'score_differential',
    'game_date', 'home_team', 'away_team',
    'total_home_score', 'total_away_score',
    'home_timeouts_remaining', 'away_timeouts_remaining',
    'penalty_team', 'penalty_yards', 'penalty_type', 'penalty',
    'air_yards', 'yards_after_catch', 'incomplete_pass', 'complete_pass', 'passing_yards',
    'rush_attempt', 'pass_attempt', 'sack', 'qb_hit',
    'touchdown', 'pass_touchdown', 'rush_touchdown', 'field_goal_attempt',
    'interception', 'fumble_forced', 'fumble_not_forced', 'fumble',
]

# Keep only rows that were not deleted and that carry a play type.
not_deleted = plays['play_deleted'].eq(0)
has_play_type = plays['play_type'].notna()
sub_plays = plays.loc[not_deleted & has_play_type, fields]

Next I want to create a series of play sequence IDs. I want to know the following: 

* play_seq_game: How many plays have been run in the game (excluding kickoffs, no-plays such as penalties and timeouts, extra points, QB kneels, and QB spikes)
* play_seq_drive: What number play is this in this drive 
* play_seq_posteam: What number play this is for the possessing team


In [108]:
# Play types that never advance a play counter.
excl_pt = ['kickoff', 'no_play', 'extra_point', 'qb_kneel', 'qb_spike']


def _seq_spec(grouper, extra_filter=None):
    """Build one ``seq_config`` entry.

    Parameters
    ----------
    grouper : list of str
        Columns whose combined value defines the group in which the counter
        restarts.
    extra_filter : callable, optional
        Receives the whole plays DataFrame and returns a boolean Series.
        A row is counted only if its ``play_type`` is not in ``excl_pt`` and,
        when given, ``extra_filter`` is True for it.

    Returns
    -------
    dict
        Keys ``'filters'``, ``'sort_col'``, ``'sort_orders'``, ``'grouper'``.
        ``'filters'`` is a vectorised predicate: it takes the whole DataFrame
        and returns a boolean row mask (it is NOT meant for
        ``DataFrame.apply(axis=1)``).
    """
    def keep(frame):
        # One vectorised pass instead of a Python-level call per row.
        mask = ~frame['play_type'].isin(excl_pt)
        if extra_filter is not None:
            mask = mask & extra_filter(frame)
        return mask

    return {
        'filters': keep
        # Every sequence is ordered by game, then half, then clock counting down.
        # NOTE(review): assumes the game_half labels sort chronologically as
        # strings - confirm. Plays sharing the same clock value have no explicit
        # tie-breaker here.
        , 'sort_col': ['game_id', 'game_half', 'game_seconds_remaining']
        , 'sort_orders': [True, True, False]
        , 'grouper': list(grouper)
    }


_by_game = ['game_id']
_by_drive = ['game_id', 'drive']
_by_posteam = ['game_id', 'posteam']

# Insertion order is the order in which the columns are appended to sub_plays.
seq_config = {
    'play_seq_game': _seq_spec(_by_game)
    , 'play_seq_drive': _seq_spec(_by_drive)
    , 'play_seq_posteam': _seq_spec(_by_posteam)
    , 'pass_seq_posteam': _seq_spec(_by_posteam, lambda x: x['pass'] == 1)
    , 'rush_seq_posteam': _seq_spec(_by_posteam, lambda x: x['rush'] == 1)
    , 'shot_seq_posteam': _seq_spec(_by_posteam, lambda x: x['shotgun'] == 1)
    , 'lead_seq_posteam': _seq_spec(_by_posteam, lambda x: x['score_differential'] > 0)
    , 'fd_seq_posteam': _seq_spec(_by_posteam, lambda x: x['first_down'] == 1)
    , 'sacked_seq_posteam': _seq_spec(_by_posteam, lambda x: x['sack'] == 1)
    , 'fumble_seq_posteam': _seq_spec(_by_posteam, lambda x: x['fumble'] == 1)
    , 'int_seq_posteam': _seq_spec(_by_posteam, lambda x: x['interception'] == 1)
}

# Columns used to attach each counter back onto sub_plays.
join_col = ['game_id', 'game_half', 'game_seconds_remaining', 'play_id', 'drive']

# Names of the sequence columns added by THIS run of the loop.
old_keys = []

for key, sub in seq_config.items():
    # Re-running the cell must not add a second copy of an existing column.
    if key in sub_plays.columns:
        print(f'{key} already exists')
        continue

    # Rows that take part in this sequence.
    df = sub_plays[sub['filters'](sub_plays)]

    df_sorted = df.sort_values(sub['sort_col']
                               , ascending = sub['sort_orders'])

    # cumcount() is 0-based: the first qualifying play in each group gets 0.
    df_sorted = df_sorted.assign(**{key: df_sorted.groupby(sub['grouper']).cumcount()})

    keepers = join_col + [key]

    right = df_sorted.loc[:, keepers]

    # Left join: plays that did not pass the filter get NaN for this counter.
    # validate raises instead of silently duplicating rows should the join
    # keys ever stop being unique on the right-hand side.
    sub_plays = sub_plays.merge(right
                                , on = join_col
                                , how = 'left'
                                , validate = 'many_to_one')

    old_keys = old_keys + [key]

In [104]:
# Debug check: sequence columns appended by the most recent run of the loop.
# Keys skipped because the column already existed are not listed here.
print(old_keys)


['fd_seq_posteam', 'sacked_seq_posteam', 'fumble_seq_posteam', 'int_seq_posteam']


In [109]:
# Column order used for eyeballing the engineered sequence features next to
# the raw fields they were derived from.
test_columns = [
    'play_type', 'week', 'game_seconds_remaining', 'game_half',
    'play_seq_game', 'play_seq_drive', 'play_seq_posteam',
    'play_id', 'drive', 'game_id',
    'posteam', 'defteam',
    'pass', 'rush', 'first_down', 'shotgun',
    'pass_seq_posteam', 'rush_seq_posteam', 'shot_seq_posteam',
    'yardline_100',
    'down', 'goal_to_go', 'ydstogo',
    'ydsnet', 'yards_gained', 'no_huddle',
    'fd_seq_posteam', 'sacked_seq_posteam', 'fumble_seq_posteam', 'int_seq_posteam',
    'posteam_score', 'defteam_score', 'score_differential', 'lead_seq_posteam',
    'game_date', 'home_team', 'away_team',
    'total_home_score', 'total_away_score',
    'home_timeouts_remaining', 'away_timeouts_remaining',
    'penalty_team', 'penalty_yards', 'penalty_type', 'penalty',
    'air_yards', 'yards_after_catch', 'incomplete_pass', 'complete_pass', 'passing_yards',
    'rush_attempt', 'pass_attempt', 'sack', 'qb_hit',
    'touchdown', 'pass_touchdown', 'rush_touchdown', 'field_goal_attempt',
    'interception', 'fumble_forced', 'fumble_not_forced', 'fumble',
]

test = sub_plays.loc[:, test_columns]

In [44]:
# Distinct play_type values present in the raw play-by-play data.
plays['play_type'].unique()

array([None, 'kickoff', 'run', 'pass', 'punt', 'no_play', 'extra_point',
       'field_goal', 'qb_kneel', 'qb_spike'], dtype=object)