In [10]:
from datetime import datetime
from io import StringIO

import pandas as pd
import numpy as np
import requests

In [8]:
BASE_URL = "https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play"
YEARS = range(1999, datetime.now().year)

In [9]:
def fetch_pbp_data(years):
    pbp_dfs = []
    for year in years:
        print(f"Fetching play-by-play data for year: {year}")
        response = requests.get(f"{BASE_URL}_{year}.csv")
        response.raise_for_status()  # Raises error if download fails
        df = pd.read_csv(StringIO(response.text))
        pbp_dfs.append(df)

    return pd.concat(pbp_dfs, ignore_index=True)

pbp = fetch_pbp_data(YEARS)

Fetching play-by-play data for year: 1999


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2000


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2001


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2002


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2003


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2004


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2005


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2006


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2007


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2008


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2009


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2010


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2011


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2012


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2013


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2014


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2015


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2016


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2017


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2018


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2019


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2020


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2021


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2022


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2023


  df = pd.read_csv(StringIO(response.text))


Fetching play-by-play data for year: 2024


  df = pd.read_csv(StringIO(response.text))


In [None]:
pbp = pbp.sort_values(['game_id', 'play_id'])

# Group by game_id and create lagged features
pbp['prev_time'] = pbp.groupby('game_id')['quarter_seconds_remaining'].shift(1)
pbp['prev_incomplete'] = pbp.groupby('game_id')['incomplete_pass'].shift(1)
pbp['prev_timeout'] = pbp.groupby('game_id')['timeout'].shift(1)
pbp['prev_spike'] = pbp.groupby('game_id')['qb_spike'].shift(1)
pbp['prev_out_of_bounds'] = pbp.groupby('game_id')['out_of_bounds'].shift(1)
pbp['prev_qb_kneel'] = pbp.groupby('game_id')['qb_kneel'].shift(1)

# is_clock_running logic
def get_is_clock_running(row):
    if pd.isna(row['prev_time']):
        return np.nan
    if row['prev_timeout'] == 1 or row['prev_spike'] == 1 or row['prev_incomplete'] == 1 or row['prev_out_of_bounds'] == 1:
        return False
    if row['prev_qb_kneel'] == 1:
        return True
    if row['prev_time'] > row['quarter_seconds_remaining']:
        return True
    return False

pbp['is_clock_running'] = pbp.apply(get_is_clock_running, axis=1)

pbp = pbp[
    pbp['down'].notna() &
    pbp['play_type'].notna() &
    (pbp['play_type'] != 'no_play')
]

  pbp['prev_time'] = pbp.groupby('game_id')['quarter_seconds_remaining'].shift(1)
  pbp['prev_incomplete'] = pbp.groupby('game_id')['incomplete_pass'].shift(1)
  pbp['prev_timeout'] = pbp.groupby('game_id')['timeout'].shift(1)
  pbp['prev_spike'] = pbp.groupby('game_id')['qb_spike'].shift(1)
  pbp['prev_out_of_bounds'] = pbp.groupby('game_id')['out_of_bounds'].shift(1)
  pbp['prev_qb_kneel'] = pbp.groupby('game_id')['qb_kneel'].shift(1)


In [None]:
pbp_model = pbp[[
    'qtr',
    'game_seconds_remaining',
    'posteam_score',
    'defteam_score',
    'down',
    'ydstogo',
    'yardline_100',
    'posteam_timeouts_remaining',
    'defteam_timeouts_remaining',
    'play_type',
    'is_clock_running'
]].copy()

pbp_model.rename(columns={'qtr': 'quarter'}, inplace=True)

pbp_model['play_type'] = pbp_model['play_type'].str.upper().astype('category')

pbp_model.head()