In [1]:
import pandas as pd

In [2]:
import sys
from pathlib import Path
repo_root = Path.cwd().resolve().parents[3]
print(f"Adding {repo_root} to sys.path")
sys.path.append(str(repo_root))
import utils

Adding /home/mrmath/sports_betting_empire/sports_betting_empire to sys.path


In [3]:
pbp_stats = utils.rush_yard_stats_from_s3("play_by_play", 2018, 2025)

In [4]:
pbp_stats

Unnamed: 0,Date,Tm,Opp,Quarter,Time,Down,ToGo,Location,Score,Detail,Yds,EPB,EPA,Diff,season
0,2018-11-11,Eagles,Cowboys,2,14:48,1,10,PHI 26,0-3,Josh Adams right end for 29 yards (tackle by A...,29,0.67,2.59,1.92,2018
1,2018-11-18,Eagles,Saints,2,8:49,2,4,NOR 28,6-17,"Josh Adams left guard for 28 yards, touchdown",28,3.58,7.00,3.42,2018
2,2018-09-06,Eagles,Falcons,4,3:57,3,2,ATL 35,10-12,Corey Clement right tackle for 21 yards (tackl...,21,2.54,4.65,2.11,2018
3,2018-10-28,Eagles,Jaguars,3,11:30,2,6,PHI 22,10-6,Josh Adams right guard for 21 yards (tackle by...,21,0.14,1.80,1.66,2018
4,2018-09-16,Eagles,Buccaneers,3,15:00,1,10,PHI 25,7-20,Jay Ajayi up the middle for 20 yards (tackle b...,20,0.61,1.93,1.32,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116727,2025-12-28,Raiders,Giants,1,10:29,1,10,RAI 16,0-0,Ashton Jeanty up the middle for -2 yards (tack...,-2,-0.14,-0.89,-0.75,2025
116728,2025-12-21,Raiders,Texans,2,6:09,3,1,RAI 45,7-10,Ashton Jeanty left end for -3 yards (tackle by...,-3,1.29,-0.59,-1.88,2025
116729,2025-11-23,Raiders,Browns,2,10:41,2,8,CLE 24,0-14,Geno Smith right tackle for -5 yards (tackle b...,-5,3.57,2.21,-1.36,2025
116730,2025-11-30,Raiders,Chargers,4,8:21,1,1,SDG 1,7-24,Geno Smith up the middle for -5 yards (tackle ...,-5,6.97,4.95,-2.02,2025


In [5]:
base_stats = utils.rush_yard_stats_from_s3("base_stats", 2018, 2025)

In [6]:
teams = base_stats['Team'].unique()

In [7]:
import re

def clean_player_name(name: str) -> str:
    """
    Remove generational suffixes from player names.

    Why this matters:
    - Player name keys must be consistent across datasets
    - Some sources include suffixes (e.g., "Jr.", "III")
    - Others omit them
    - Removing them prevents join mismatches and duplicate identities

    Handles:
    - Jr, Jr.
    - Sr, Sr.
    - II, III, IV, V, VI
    - Case-insensitive
    - Extra whitespace
    """

    if not isinstance(name, str):
        return name

    # Normalize whitespace
    name = name.strip()

    # Regex to remove suffix at end of string
    # \b ensures we only match whole suffix tokens
    suffix_pattern = r"\b(JR|SR|II|III|IV|V|VI)\.?$"

    # Remove suffix (case-insensitive)
    cleaned = re.sub(suffix_pattern, "", name, flags=re.IGNORECASE)

    # Remove any leftover trailing spaces
    return cleaned.strip()


In [8]:
base_stats['player_name_clean'] = base_stats['Player'].apply(clean_player_name)

In [9]:
base_stats = base_stats[base_stats['Pos.'] == "RB"]

In [10]:
from collections import defaultdict

def build_team_date_lookup(base_stats):
    """
    Precompute team → date → set(player_name_clean)

    Why:
    - Avoid filtering base_stats inside every map_name call
    - Reduces runtime from O(N^2) style scanning
    - Makes name mapping near O(1) lookup per row
    """

    lookup = defaultdict(set)
    for row in base_stats.itertuples(index=False):
        lookup[row.Date].add(row.player_name_clean)

    return lookup
def map_name_fast(row, team_date_lookup):
    """
    Faster name mapping.

    Logic:
    - Get all players on same team
    - Exclude same game date
    - Check if play Detail starts with player name
    """

    game_date = row['Date']
    detail = row['Detail']

    if game_date not in team_date_lookup:
        return None

    for date, players in team_date_lookup.items():

        # Skip same game
        if date == game_date:
            continue

        for player in players:
            if detail.startswith(player):
                return player

    return None


In [11]:
pbp_stats['Date'] = pd.to_datetime(pbp_stats['Date'])

In [12]:
pbp_stats = pbp_stats[pbp_stats['Detail'].notna()]

In [13]:
date_player_lookup = build_team_date_lookup(base_stats)
pbp_stats['player'] = pbp_stats.apply(lambda row: map_name_fast(row, date_player_lookup), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp_stats['player'] = pbp_stats.apply(lambda row: map_name_fast(row, date_player_lookup), axis=1)


In [14]:
pbp_stats = pbp_stats[pbp_stats['player'].notna()]

In [15]:
pbp_stats['player_key'] = pbp_stats['player'].astype(str) + '_' + pbp_stats['Date'].astype(str)

In [16]:
pbp_stats = pbp_stats.dropna(subset=['Detail', 'player'])

In [17]:
unique_zones = set()
zones = []

for row in pbp_stats.itertuples(index=False):
    player_name = row.player
    detail = row.Detail

    words = detail.split()  # split ONCE
    zone_words = []

    for word in words:
        if word.lower() == "for":
            break

        # If word is not part of player name, treat as zone descriptor
        if word not in player_name.split(" "):
            zone_words.append(word)

    zone = " ".join(zone_words)

    unique_zones.add(zone)
    zones.append(zone)

# Assign column once (much faster than .at in loop)
pbp_stats["zone"] = zones

unique_zones

{'',
 'aborted snap, recovered by Arden Key at TEN-30 (tackle by Dennis Daley)',
 'aborted snap, recovered by Baker Mayfield at MIA-45 (tackle by Raekwon McMillan)',
 'aborted snap, recovered by Bilal Nichols at KC-8 and returned',
 'aborted snap, recovered by Calvin Ridley at TEN-42 (tackle by Andre Cisco)',
 'aborted snap, recovered by Dean Lowry at GB-26 (tackle by Charles Leno)',
 'aborted snap, recovered by Desmond Ridder at CHI-39',
 'aborted snap, recovered by Frank Clark at DEN-28 and returned',
 'aborted snap, recovered by Jalyx Hunt at WAS-28',
 'aborted snap, recovered by Jared Goff at DET-32',
 'aborted snap, recovered by Jerry Jeudy at DEN-10',
 'aborted snap, recovered by Lavonte David at TB-29',
 'aborted snap, recovered by Quinton Jefferson at NYJ-49 and returned',
 'aborted snap, recovered by Trent Murphy at BUF-48 (tackle by Ballage)',
 'aborted snap, recovered by at BAL-30 and returned',
 'aborted snap, recovered by at CHI-39. Penalty on Kenny Clark: Defensive Holdin

In [18]:
good_zones = ['', 'left end',
 'left guard',
 'left tackle',
 'middle',
 'right end',
 'right guard',
 'right tackle',
 'up the middle']
pbp_stats = pbp_stats[pbp_stats['zone'].isin(good_zones)]

In [19]:
zones = {
    'left end': 'left end',
    'left tackle': 'left tackle',
    'left guard': 'left guard',
    'middle': 'up the middle',
    'up the middle': 'up the middle',
    'right guard': 'right guard',
    'right tackle': 'right tackle',
    'right end': 'right end'
}

In [20]:
play_by_play_player_level_stats = []
for k, v in pbp_stats.groupby('player_key'):
    player_name = v.iloc[0]['player']
    rushes_less_than_eq_zero = len(v[v['Yds'] <= 0])
    rushes_one_to_two = len(v[(v['Yds'] > 0) & (v['Yds'] <= 2)])
    rushes_three_to_five = len(v[(v['Yds'] > 2) & (v['Yds'] <= 5)])
    rushes_six_plus = len(v[v['Yds'] > 5])
    rushes_ten_plus = len(v[v['Yds'] > 10])
    rushes_twenty_plus = len(v[v['Yds'] > 20])
    rushes_forty_plus = len(v[v['Yds'] > 40])
    
    zone_diff = {}

    for i in range(len(v)):
        zone = v.iloc[i]['zone']
        if zone in zones:
            zone = zones[zone]
        else:
            continue
        diff = v.iloc[i]['Diff']
        if zone not in zone_diff:
            zone_diff[zone] = diff
        else:
            zone_diff[zone] += diff
    data_row = {
        'player_key': k,
        'player': player_name,
        'rushes_less_than_eq_zero': rushes_less_than_eq_zero,
        'rushes_one_to_two': rushes_one_to_two,
        'rushes_three_to_five': rushes_three_to_five,
        'rushes_six_plus': rushes_six_plus,
        'rushes_ten_plus': rushes_ten_plus,
        'rushes_twenty_plus': rushes_twenty_plus,
        'rushes_forty_plus': rushes_forty_plus,
        'total_rushes': len(v),
        'total_diff': v['Diff'].sum(),
        'Date': v.iloc[0]['Date'],
        'Team': v.iloc[0]['Tm'],
        'Opponent': v.iloc[0]['Opp']
    }
    for zone, diff in zone_diff.items():
        data_row[f'{zone}_diff'] = diff
    play_by_play_player_level_stats.append(data_row)
play_by_play_player_level_stat_df = pd.DataFrame(play_by_play_player_level_stats)

In [21]:
pbp_stats['team_key'] = pbp_stats['Tm'] + '_' + pbp_stats['Date'].dt.strftime('%Y-%m-%d')

In [22]:
play_by_play_team_level_stats = []
for k, v in pbp_stats.groupby('team_key'):
    team_name = v.iloc[0]['Tm']
    rushes_less_than_eq_zero = len(v[v['Yds'] <= 0])
    rushes_one_to_two = len(v[(v['Yds'] > 0) & (v['Yds'] <= 2)])
    rushes_three_to_five = len(v[(v['Yds'] > 2) & (v['Yds'] <= 5)])
    rushes_six_plus = len(v[v['Yds'] > 5])
    rushes_ten_plus = len(v[v['Yds'] > 10])
    rushes_twenty_plus = len(v[v['Yds'] > 20])
    rushes_forty_plus = len(v[v['Yds'] > 40])
    
    zone_diff = {}

    for i in range(len(v)):
        zone = v.iloc[i]['zone']
        if zone in zones:
            zone = zones[zone]
        else:
            continue
        diff = v.iloc[i]['Diff']
        if zone not in zone_diff:
            zone_diff[zone] = diff
        else:
            zone_diff[zone] += diff
    data_row = {
        'team_key': k,
        'team': team_name,
        'team_rushes_less_than_eq_zero': rushes_less_than_eq_zero,
        'team_rushes_one_to_two': rushes_one_to_two,
        'team_rushes_three_to_five': rushes_three_to_five,
        'team_rushes_six_plus': rushes_six_plus,
        'team_rushes_ten_plus': rushes_ten_plus,
        'team_rushes_twenty_plus': rushes_twenty_plus,
        'team_rushes_forty_plus': rushes_forty_plus,
        'team_total_diff': v['Diff'].sum(),
        'team_total_rushes': len(v),
        'Date': v.iloc[0]['Date'],
        'Team': v.iloc[0]['Tm'],
        'Opponent': v.iloc[0]['Opp']
    }
    for zone, diff in zone_diff.items():
        data_row[f'team_{zone}_diff'] = diff
    play_by_play_team_level_stats.append(data_row)
play_by_play_team_level_stat_df = pd.DataFrame(play_by_play_team_level_stats)

In [23]:
play_by_play_opponent_level_stats = []
pbp_stats['opponent_key'] = pbp_stats['Opp'] + '_' + pbp_stats['Date'].dt.strftime('%Y-%m-%d')
for k, v in pbp_stats.groupby('opponent_key'):
    opponent_name = v.iloc[0]['Opp']
    rushes_less_than_eq_zero = len(v[v['Yds'] <= 0])
    rushes_one_to_two = len(v[(v['Yds'] > 0) & (v['Yds'] <= 2)])
    rushes_three_to_five = len(v[(v['Yds'] > 2) & (v['Yds'] <= 5)])
    rushes_six_plus = len(v[v['Yds'] > 5])
    rushes_ten_plus = len(v[v['Yds'] > 10])
    rushes_twenty_plus = len(v[v['Yds'] > 20])
    rushes_forty_plus = len(v[v['Yds'] > 40])
    
    zone_diff = {}

    for i in range(len(v)):
        zone = v.iloc[i]['zone']
        if zone in zones:
            zone = zones[zone]
        else:
            continue
        diff = v.iloc[i]['Diff']
        if zone not in zone_diff:
            zone_diff[zone] = diff
        else:            
            zone_diff[zone] += diff
    data_row = {
        'opponent_key': k,
        'opponent': opponent_name,
        'opponent_rushes_less_than_eq_zero': rushes_less_than_eq_zero,
        'opponent_rushes_one_to_two': rushes_one_to_two,
        'opponent_rushes_three_to_five': rushes_three_to_five,
        'opponent_rushes_six_plus': rushes_six_plus,
        'opponent_rushes_ten_plus': rushes_ten_plus,
        'opponent_rushes_twenty_plus': rushes_twenty_plus,
        'opponent_rushes_forty_plus': rushes_forty_plus,
        'opponent_total_diff': v['Diff'].sum(),
        'opponent_total_rushes': len(v),
        'Date': v.iloc[0]['Date'],
        'Team': v.iloc[0]['Tm'],
        'Opponent': v.iloc[0]['Opp']
    }
    for zone, diff in zone_diff.items():
        data_row[f'opponent_{zone}_diff'] = diff
    play_by_play_opponent_level_stats.append(data_row)
play_by_play_opponent_level_stat_df = pd.DataFrame(play_by_play_opponent_level_stats)

In [24]:
play_by_play_opponent_level_stat_df['team_key'] = play_by_play_opponent_level_stat_df['Team'] + '_' + play_by_play_opponent_level_stat_df['Date'].dt.strftime('%Y-%m-%d')

In [25]:
zones = {
    'left end': 'left end',
    'left tackle': 'left tackle',
    'left guard': 'left guard',
    'middle': 'up the middle',
    'up the middle': 'up the middle',
    'right guard': 'right guard',
    'right tackle': 'right tackle',
    'right end': 'right end'
}

In [26]:
def build_rolling_features(
    df,
    group_col,
    ratio_cols,
    diff_cols,
    total_col,
    windows=(1, 3, 5)
):
    """
    Generic rolling feature builder.

    - Applies per-group rolling windows
    - Prevents leakage with shift(1)
    - Builds ratio-to-total features
    - Builds directional diff rolling means
    """

    lookup = {}

    for key, g in df.groupby(group_col):

        g = g.sort_values("Date").copy()

        # Precompute total rolling means once per window
        total_roll = {
            w: g[total_col].rolling(w).mean().shift(1)
            for w in windows
        }

        # Ratio features
        for col in ratio_cols:
            for w in windows:
                numerator = g[col].rolling(w).mean().shift(1)
                g[f"{col}_{w}ma"] = numerator / total_roll[w]

        # Diff features
        for col in diff_cols:
            for w in windows:
                g[f"{col}_{w}ma"] = (
                    g[col]
                    .rolling(w)
                    .mean()
                    .shift(1)
                )

        lookup[key] = g

    return lookup


In [27]:
play_by_play_opponent_level_stat_df

Unnamed: 0,opponent_key,opponent,opponent_rushes_less_than_eq_zero,opponent_rushes_one_to_two,opponent_rushes_three_to_five,opponent_rushes_six_plus,opponent_rushes_ten_plus,opponent_rushes_twenty_plus,opponent_rushes_forty_plus,opponent_total_diff,...,Team,Opponent,opponent_up the middle_diff,opponent_right guard_diff,opponent_left end_diff,opponent_right tackle_diff,opponent_left tackle_diff,opponent_left guard_diff,opponent_right end_diff,team_key
0,49ers_2018-09-09,49ers,4,12,6,5,1,0,0,-9.84,...,Vikings,49ers,-7.42,0.45,-0.74,-0.63,-0.82,-0.68,,Vikings_2018-09-09
1,49ers_2018-09-16,49ers,3,4,7,4,3,1,0,-0.38,...,Lions,49ers,-1.00,0.19,1.66,0.39,-1.48,-0.14,0.00,Lions_2018-09-16
2,49ers_2018-09-23,49ers,3,7,6,3,0,0,0,-0.78,...,Chiefs,49ers,0.79,0.39,,,0.26,-0.03,-2.19,Chiefs_2018-09-23
3,49ers_2018-09-30,49ers,3,7,6,7,5,1,0,0.80,...,Chargers,49ers,-1.54,-0.33,3.30,0.28,-1.25,-0.72,1.06,Chargers_2018-09-30
4,49ers_2018-10-07,49ers,4,5,5,5,0,0,0,-1.08,...,Cardinals,49ers,-3.15,-0.14,,-0.55,1.20,0.69,0.87,Cardinals_2018-10-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4249,Washington_2025-12-07,Washington,3,4,14,8,2,0,0,10.38,...,Vikings,Washington,0.34,2.37,2.57,-0.62,0.67,4.49,0.56,Vikings_2025-12-07
4250,Washington_2025-12-14,Washington,2,5,8,5,2,0,0,0.84,...,Giants,Washington,-0.60,-0.27,-0.21,0.65,-1.26,-1.48,0.13,Giants_2025-12-14
4251,Washington_2025-12-20,Washington,5,6,3,11,4,2,1,10.68,...,Eagles,Washington,0.83,3.32,,0.00,7.46,-1.32,0.39,Eagles_2025-12-20
4252,Washington_2025-12-25,Washington,3,10,16,12,2,1,0,3.36,...,Cowboys,Washington,1.55,3.22,-1.78,-0.69,1.21,0.42,-0.57,Cowboys_2025-12-25


In [28]:
player_pbp_stat_lookup = build_rolling_features(
    play_by_play_player_level_stat_df.fillna(0),
    group_col="player",
    ratio_cols=[
        "rushes_less_than_eq_zero",
        "rushes_one_to_two",
        "rushes_three_to_five",
        "rushes_six_plus",
        "rushes_ten_plus",
        "rushes_twenty_plus",
        "rushes_forty_plus",
    ],
    diff_cols=[
        "left end_diff",
        "left tackle_diff",
        "left guard_diff",
        "up the middle_diff",
        "right guard_diff",
        "right tackle_diff",
        "right end_diff",
        "total_diff",
    ],
    total_col="total_rushes",
)

team_pbp_stat_lookup = build_rolling_features(
    play_by_play_team_level_stat_df.fillna(0),
    group_col="team",
    ratio_cols=[
        "team_rushes_less_than_eq_zero",
        "team_rushes_one_to_two",
        "team_rushes_three_to_five",
        "team_rushes_six_plus",
        "team_rushes_ten_plus",
        "team_rushes_twenty_plus",
        "team_rushes_forty_plus",
    ],
    diff_cols=[
        "team_left end_diff",
        "team_left tackle_diff",
        "team_left guard_diff",
        "team_up the middle_diff",
        "team_right guard_diff",
        "team_right tackle_diff",
        "team_right end_diff",
        "team_total_diff",
    ],
    total_col="team_total_rushes",
)

opponent_pbp_stat_lookup = build_rolling_features(
    play_by_play_opponent_level_stat_df.fillna(0),
    group_col="Opponent",
    ratio_cols=[
        "opponent_rushes_less_than_eq_zero",
        "opponent_rushes_one_to_two",
        "opponent_rushes_three_to_five",
        "opponent_rushes_six_plus",
        "opponent_rushes_ten_plus",
        "opponent_rushes_twenty_plus",
        "opponent_rushes_forty_plus",
    ],
    diff_cols=[
        "opponent_left end_diff",
        "opponent_left tackle_diff",
        "opponent_left guard_diff",
        "opponent_up the middle_diff",
        "opponent_right guard_diff",
        "opponent_right tackle_diff",
        "opponent_right end_diff",
        "opponent_total_diff",
    ],
    total_col="opponent_total_rushes",
)

In [29]:
base_stats['player_key'] = base_stats['player_name_clean'] + '_' + base_stats['Date'].dt.strftime('%Y-%m-%d')

In [30]:
team_abv_to_name_map = {}
for i in range(len(base_stats)):
    row = base_stats.iloc[i]
    player_key = row.player_key
    curr_pbp_stats = pbp_stats[pbp_stats['player_key'] == player_key]
    if curr_pbp_stats.empty:
        continue
    team_abv_to_name_map[row.Team] = curr_pbp_stats.iloc[0]['Tm']

In [31]:
base_stats['Tm'] = base_stats['Team'].map(team_abv_to_name_map)
base_stats['Opponent'] = base_stats['Opp'].map(team_abv_to_name_map)

In [32]:
base_stats['team_key'] = base_stats['Tm'] + '_' + base_stats['Date'].dt.strftime('%Y-%m-%d')
base_stats['opponent_key'] = base_stats['Opponent'] + '_' + base_stats['Date'].dt.strftime('%Y-%m-%d')

In [33]:
base_stats = base_stats.sort_values(["Date", 'Team']).copy()

In [34]:
# Pre-index each lookup by key for O(1) access

def index_by_key(lookup_dict, key_col):
    indexed = {}
    for k, df in lookup_dict.items():
        if key_col in df.columns:
            indexed[k] = df.set_index(key_col)
    return indexed


player_indexed = index_by_key(player_pbp_stat_lookup, "player_key")
team_indexed = index_by_key(team_pbp_stat_lookup, "team_key")
opponent_indexed = index_by_key(opponent_pbp_stat_lookup, "opponent_key")


rows = []

for row in base_stats.itertuples(index=False):

    player_key = row.player_key
    team_key = row.team_key
    opponent_key = row.opponent_key
    player_name = row.player_name_clean
    team = row.Tm
    opponent = row.Opponent

    curr_player = (
        player_indexed.get(player_name, {}).loc[player_key].to_dict()
        if player_name in player_indexed and player_key in player_indexed[player_name].index
        else {}
    )

    curr_team = (
        team_indexed.get(team, {}).loc[team_key].to_dict()
        if team in team_indexed and team_key in team_indexed[team].index
        else {}
    )

    curr_opponent = (
        opponent_indexed.get(opponent, {}).loc[opponent_key].to_dict()
        if opponent in opponent_indexed and opponent_key in opponent_indexed[opponent].index
        else {}
    )
    
    if not curr_player:

        print(f"Missing data for player_key={player_key})")
    if not curr_team:
        print(f"Missing data for team_key={team_key})")
    if not curr_opponent:
        print(f"Missing data for opponent_key={opponent_key})")
    combined = {
        **curr_player,
        **curr_team,
        **curr_opponent,
        "player_key": player_key,
        "team_key": team_key,
        "opponent_key": opponent_key,
    }

    rows.append(combined)

pbp_train_df = pd.DataFrame(rows)


Missing data for player_key=Ito Smith_2018-09-06)
Missing data for player_key=Wendell Smallwood_2018-09-06)
Missing data for player_key=Derrick Coleman_2018-09-09)
Missing data for player_key=Taiwan Jones_2018-09-09)
Missing data for player_key=Michael Burton_2018-09-09)
Missing data for player_key=Benny Cunningham_2018-09-09)
Missing data for player_key=Tra Carson_2018-09-09)
Missing data for player_key=Brandon Wilson_2018-09-09)
Missing data for player_key=Darius Jackson_2018-09-09)
Missing data for player_key=Tyler Ervin_2018-09-09)
Missing data for player_key=Christine Michael_2018-09-09)
Missing data for player_key=Anthony Sherman_2018-09-09)
Missing data for player_key=De'Anthony Thomas_2018-09-09)
Missing data for player_key=Detrez Newsome_2018-09-09)
Missing data for player_key=Brandon Bolden_2018-09-09)
Missing data for player_key=Senorise Perry_2018-09-09)
Missing data for player_key=C.J. Ham_2018-09-09)
Missing data for player_key=Zach Line_2018-09-09)
Missing data for playe

In [35]:
pbp_train_df = pbp_train_df.dropna(subset=['player'])
pbp_train_df

Unnamed: 0,player,rushes_less_than_eq_zero,rushes_one_to_two,rushes_three_to_five,rushes_six_plus,rushes_ten_plus,rushes_twenty_plus,rushes_forty_plus,total_rushes,total_diff,...,opponent_right tackle_diff_3ma,opponent_right tackle_diff_5ma,opponent_right end_diff_1ma,opponent_right end_diff_3ma,opponent_right end_diff_5ma,opponent_total_diff_1ma,opponent_total_diff_3ma,opponent_total_diff_5ma,player_key,opponent_key
0,Devonta Freeman,3.0,0.0,2.0,2.0,1.0,0.0,0.0,7.0,-2.64,...,,,,,,,,,Devonta Freeman_2018-09-06,Eagles_2018-09-06
1,Tevin Coleman,4.0,3.0,2.0,1.0,0.0,0.0,0.0,10.0,-3.62,...,,,,,,,,,Tevin Coleman_2018-09-06,Eagles_2018-09-06
3,Darren Sproles,2.0,3.0,1.0,0.0,0.0,0.0,0.0,6.0,-1.26,...,,,,,,,,,Darren Sproles_2018-09-06,Falcons_2018-09-06
4,Jay Ajayi,1.0,5.0,5.0,4.0,1.0,0.0,0.0,15.0,3.26,...,,,,,,,,,Jay Ajayi_2018-09-06,Falcons_2018-09-06
5,Corey Clement,2.0,2.0,1.0,1.0,1.0,1.0,0.0,6.0,1.14,...,,,,,,,,,Corey Clement_2018-09-06,Falcons_2018-09-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13472,Tony Pollard,1.0,4.0,6.0,3.0,0.0,0.0,0.0,14.0,0.13,...,0.140000,-0.082,0.73,0.376667,0.226,0.91,2.156667,-0.128,Tony Pollard_2026-01-04,Jaguars_2026-01-04
13473,Tyjae Spears,0.0,0.0,1.0,2.0,0.0,0.0,0.0,3.0,1.05,...,0.140000,-0.082,0.73,0.376667,0.226,0.91,2.156667,-0.128,Tyjae Spears_2026-01-04,Jaguars_2026-01-04
13476,Jacory Croskey-Merritt,3.0,5.0,3.0,2.0,0.0,0.0,0.0,13.0,-4.39,...,-0.713333,-0.326,0.00,0.433333,1.050,-3.50,-1.140000,0.250,Jacory Croskey-Merritt_2026-01-04,Eagles_2026-01-04
13477,Chris Rodriguez,1.0,7.0,3.0,5.0,2.0,0.0,0.0,16.0,-0.97,...,-0.713333,-0.326,0.00,0.433333,1.050,-3.50,-1.140000,0.250,Chris Rodriguez_2026-01-04,Eagles_2026-01-04


In [36]:
pbp_train_df.to_csv("play_by_play_feature_engineering.csv", index=False)