In [1]:
pip install requests pandas nflreadpy scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import requests
import nflreadpy as nfl
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import os

In [3]:
# Load current season play-by-play data
pbp = nfl.load_pbp()

# Load player game-level stats for multiple seasons
player_stats = nfl.load_player_stats()

# Load all available team level stats
team_stats = nfl.load_team_stats([2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

schedules = nfl.load_schedules(seasons=[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

# nflreadpy uses Polars instead of pandas. Convert to pandas if needed:
df_pbp = pbp.to_pandas()
df_ps = player_stats.to_pandas()
df_ts = team_stats.to_pandas()
df_schedules = schedules.to_pandas()

In [4]:
df_ts = df_ts[df_ts["season_type"] == "REG"].copy()
df_ts["def_tackles"] = df_ts['def_tackles_solo'] + df_ts['def_tackles_with_assist']
df_ts["fumbles"] = df_ts["sack_fumbles"] + df_ts["rushing_fumbles"] + df_ts["receiving_fumbles"]
remove = ['passing_2pt_conversions', 'punt_returns', 'punt_return_yards', 'kickoff_returns', 'kickoff_return_yards', 'fg_made', 'fg_att', 'fg_missed',
          'fg_blocked', 'fg_long', 'fg_made_0_19', 'fg_made_20_29', 'fg_made_30_39', 'fg_made_40_49', 'fg_made_50_59', 'fg_made_60_', 'fg_missed_0_19',
          'fg_missed_20_29', 'fg_missed_30_39', 'fg_missed_40_49', 'fg_missed_50_59', 'fg_missed_60_', 'fg_made_list', 'fg_missed_list', 'fg_blocked_list',
          'fg_made_distance', 'fg_missed_distance', 'fg_blocked_distance', 'pat_made', 'pat_att', 'pat_missed', 'pat_blocked', 'pat_pct', 'gwfg_made', 'gwfg_att',
          'gwfg_missed', 'gwfg_blocked', 'gwfg_distance', 'def_safeties', 'misc_yards', 'fumble_recovery_own', 'fumble_recovery_yards_own', 'fumble_recovery_opp',
          'fumble_recovery_yards_opp', 'fumble_recovery_tds', 'sack_yards_lost', 'sack_fumbles_lost', 'timeouts', 'passing_air_yards', 'passing_yards_after_catch',
          'rushing_fumbles_lost', 'rushing_2pt_conversions', 'targets', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'season_type',
          'receiving_2pt_conversions', 'special_teams_tds', 'def_tackles_solo', 'def_tackles_with_assist', 'def_tackle_assists', 'def_tackles_for_loss_yards', 'def_sack_yards',
          'def_interception_yards', 'def_safeties', 'def_tds', 'rushing_fumbles', 'receiving_fumbles', 'sack_fumbles', 'def_fumbles']

df_ts_cleaned = df_ts.drop(remove, axis=1)
df_ts_cleaned

Unnamed: 0,season,week,team,opponent_team,completions,attempts,passing_yards,passing_tds,passing_interceptions,sacks_suffered,...,def_fumbles_forced,def_sacks,def_qb_hits,def_interceptions,def_pass_defended,penalties,penalty_yards,fg_pct,def_tackles,fumbles
0,2016,1,ARI,NE,24,37,271,2,0,3,...,2,2.0,4,0,1,6,58,0.000000,53,1
1,2016,1,ATL,TB,27,39,334,2,0,3,...,0,0.0,6,1,3,7,74,1.000000,40,0
2,2016,1,BAL,BUF,22,33,258,1,0,4,...,0,2.0,6,0,2,6,35,1.000000,36,1
3,2016,1,BUF,BAL,15,22,111,0,0,2,...,0,4.0,9,0,3,8,89,0.000000,41,0
4,2016,1,CAR,DEN,18,33,194,1,1,3,...,1,2.0,3,2,6,8,85,0.666667,43,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5135,2025,6,TB,SF,17,23,256,2,0,1,...,1,6.0,10,2,5,5,31,0.500000,46,1
5136,2025,6,TEN,LV,26,38,222,1,1,6,...,0,2.0,7,1,4,7,34,1.000000,36,2
5137,2025,6,WAS,CHI,19,26,211,3,1,3,...,0,3.0,5,0,5,5,40,0.500000,31,3
5138,2025,7,CIN,PIT,31,47,342,3,0,2,...,0,0.0,2,2,6,4,35,1.000000,33,0


In [5]:
df_schedules_cleaned = df_schedules[df_schedules["game_type"] == "REG"].copy()
is_home_first = df_schedules_cleaned["home_team"] < df_schedules_cleaned["away_team"]

df_schedules_cleaned["team"] = np.where(
    is_home_first,
    df_schedules_cleaned["home_team"],
    df_schedules_cleaned["away_team"]
)


df_schedules_cleaned["opponent_team"] = np.where(
    is_home_first,
    df_schedules_cleaned["away_team"],
    df_schedules_cleaned["home_team"]
)


score_team = np.where(
    is_home_first,
    df_schedules_cleaned["home_score"],
    df_schedules_cleaned["away_score"]
)


score_opponent = np.where(
    is_home_first,
    df_schedules_cleaned["away_score"],
    df_schedules_cleaned["home_score"]
)

df_schedules_cleaned["is_team_win"] = (score_team > score_opponent).astype(int)

df_schedules_final = df_schedules_cleaned[["season", "week", "team", "opponent_team", "is_team_win"]].copy()

df_schedules_final

Unnamed: 0,season,week,team,opponent_team,is_team_win
0,2016,1,CAR,DEN,0
1,2016,1,ATL,TB,0
2,2016,1,BAL,BUF,1
3,2016,1,CHI,HOU,0
4,2016,1,GB,JAX,1
...,...,...,...,...,...
2743,2025,18,DAL,NYG,0
2744,2025,18,PHI,WAS,0
2745,2025,18,BAL,PIT,0
2746,2025,18,SEA,SF,0


In [6]:
# Sort the data by season and week to ensure the rolling average is correct
df_ts_cleaned_sorted = df_ts_cleaned.sort_values(by=['season', 'week'])

# Define a list of ALL statistical columns
# Note: Ensure ALL columns used in d_feature calculations (like passing_interceptions, etc.) are included here
stat_cols = [col for col in df_ts_cleaned_sorted.columns if col not in ['season', 'week', 'team', 'opponent_team', 'score', 'opponent_score']]

df_ts_rolling = df_ts_cleaned_sorted.groupby('team')[stat_cols].transform(
    lambda x: x.expanding().mean()
)

# Combine the IDs and the rolling stats
df_ts_std = df_ts_cleaned_sorted[['season', 'week', 'team', 'opponent_team']].reset_index(drop=True)
df_ts_rolling.index = df_ts_std.index
df_ts_std = pd.concat([df_ts_std, df_ts_rolling], axis=1)

# FIX: Impute the Week 1 NaN values with 0.
# This preserves Week 1 games for training, giving them a "neutral" (0) prior average.
# The model will learn that having a neutral average at the start is often less predictive.
df_ts_std.fillna(0, inplace=True)


# --- CREATE MATCHUPS (Differential Stats) using STD Averages ---

# Create the opponent's STD stats DataFrame
df_opponent_std = df_ts_std.copy()
df_opponent_std.rename(columns={'team': 'opponent_match_key', 'opponent_team': 'team_match_check'}, inplace=True)

# Merge the STD dataframes to create the matchups using historical averages
df_std_matchups = pd.merge(
    df_ts_std,
    df_opponent_std,
    left_on=['season', 'week', 'opponent_team'],
    right_on=['season', 'week', 'opponent_match_key'],
    suffixes=('_team', '_opp'),
    how='inner'
)

# Apply your DIFFERENTIAL STATS formulas to the STD data
# Note: You need to re-create team_a_margin, team_b_margin, etc., using the _std_matchups columns

# EXAMPLE DIFFERENTIALS (You must do this for ALL your 'd_' features)
team_a_margin = (df_std_matchups["def_interceptions_team"] + df_std_matchups["def_fumbles_forced_team"]) - (df_std_matchups["passing_interceptions_team"] + df_std_matchups["fumbles_team"])
team_b_margin = (df_std_matchups["def_interceptions_opp"] + df_std_matchups["def_fumbles_forced_opp"]) - (df_std_matchups["passing_interceptions_opp"] + df_std_matchups["fumbles_opp"])

team_a_touchdowns = df_std_matchups["passing_tds_team"] + df_std_matchups["rushing_tds_team"]
team_b_touchdowns = df_std_matchups["passing_tds_opp"] + df_std_matchups["rushing_tds_opp"]

df_std_matchups["d_passing_yards"] = df_std_matchups["passing_yards_team"] - df_std_matchups["passing_yards_opp"]
df_std_matchups["d_touchdowns"] = team_a_touchdowns - team_b_touchdowns
df_std_matchups["d_sacks_suffered"] = df_std_matchups["sacks_suffered_opp"] - df_std_matchups["sacks_suffered_team"] # negative stat
df_std_matchups["d_passing_epa"] = df_std_matchups["passing_epa_team"] - df_std_matchups["passing_epa_opp"]
df_std_matchups["d_passing_cpoe"] = df_std_matchups["passing_cpoe_team"] - df_std_matchups["passing_cpoe_opp"]
df_std_matchups["d_rushing_yards"] = df_std_matchups["rushing_yards_team"] - df_std_matchups["rushing_yards_opp"]
df_std_matchups["d_receiving_yards"] = df_std_matchups["receiving_yards_team"] - df_std_matchups["receiving_yards_opp"]
df_std_matchups["d_receiving_epa"] = df_std_matchups["receiving_epa_team"] - df_std_matchups["receiving_epa_opp"]
df_std_matchups["d_tackles_for_loss"] = df_std_matchups["def_tackles_for_loss_team"] - df_std_matchups["def_tackles_for_loss_opp"]
df_std_matchups["d_sacks"] = df_std_matchups["def_sacks_team"] - df_std_matchups["def_sacks_opp"]
df_std_matchups["d_qb_hits"] = df_std_matchups["def_qb_hits_team"] - df_std_matchups["def_qb_hits_opp"] # could be removed later
df_std_matchups["d_pass_defended"] = df_std_matchups["def_pass_defended_team"] - df_std_matchups["def_pass_defended_opp"] # could be removed later
df_std_matchups["d_penalties"] = df_std_matchups["penalties_opp"] - df_std_matchups["penalties_team"] # negative stat
df_std_matchups["d_penalty_yards"] = df_std_matchups["penalty_yards_opp"] - df_std_matchups["penalty_yards_team"] # negative stat
df_std_matchups["d_fg_pct"] = df_std_matchups["fg_pct_team"] - df_std_matchups["fg_pct_opp"]
df_std_matchups["d_turnover_margin"] = team_a_margin - team_b_margin

remove_merged = ['completions_team', 'attempts_team', 'passing_yards_team', 'passing_tds_team', 'passing_interceptions_team', 'sacks_suffered_team',
                'passing_first_downs_team', 'passing_epa_team', 'passing_cpoe_team', 'carries_team', 'rushing_yards_team', 'rushing_tds_team',
                'rushing_first_downs_team', 'rushing_epa_team', 'receptions_team', 'receiving_yards_team', 'receiving_tds_team', 'receiving_first_downs_team',
                'receiving_epa_team', 'def_tackles_for_loss_team', 'def_sacks_team', 'def_qb_hits_team', 'def_interceptions_team', 'def_pass_defended_team', 'penalties_team',
                'penalty_yards_team', 'fg_pct_team', 'def_tackles_team', 'fumbles_team', 'def_fumbles_forced_team', 'completions_opp', 'attempts_opp',
                'passing_yards_opp', 'passing_tds_opp', 'passing_interceptions_opp', 'sacks_suffered_opp', 'passing_first_downs_opp', 'passing_epa_opp', 'passing_cpoe_opp',
                'carries_opp', 'rushing_yards_opp', 'rushing_tds_opp', 'rushing_first_downs_opp', 'rushing_epa_opp', 'receptions_opp', 'receiving_yards_opp', 'receiving_tds_opp',
                'receiving_first_downs_opp', 'receiving_epa_opp', 'def_tackles_for_loss_opp', 'def_sacks_opp', 'def_qb_hits_opp', 'def_interceptions_opp', 'def_pass_defended_opp',
                'penalties_opp', 'penalty_yards_opp', 'fg_pct_opp', 'def_tackles_opp', 'fumbles_opp', 'def_fumbles_forced_opp', 'opponent_match_key', 'team_match_check']

df_std_matchups = df_std_matchups.drop(remove_merged, axis=1)
df_std_matchups

Unnamed: 0,season,week,team,opponent_team,d_passing_yards,d_touchdowns,d_sacks_suffered,d_passing_epa,d_passing_cpoe,d_rushing_yards,d_receiving_yards,d_receiving_epa,d_tackles_for_loss,d_sacks,d_qb_hits,d_pass_defended,d_penalties,d_penalty_yards,d_fg_pct,d_turnover_margin
0,2016,1,ARI,NE,7.000000,1.000000,-1.000000,-5.018846,-2.862210,-14.000000,7.000000,-5.413889,0.000000,-1.000000,-1.000000,-5.000000,2.000000,11.000000,-1.000000,4.000000
1,2016,1,ATL,TB,53.000000,-2.000000,-3.000000,-4.093628,-10.577591,-38.000000,53.000000,-0.812834,-6.000000,-3.000000,2.000000,-3.000000,-1.000000,-26.000000,0.000000,2.000000
2,2016,1,BAL,BUF,147.000000,0.000000,-2.000000,4.996015,1.054210,18.000000,147.000000,8.585967,2.000000,-2.000000,-3.000000,-1.000000,2.000000,54.000000,1.000000,-1.000000
3,2016,1,BUF,BAL,-147.000000,0.000000,2.000000,-4.996015,-1.054210,-18.000000,-147.000000,-8.585967,-2.000000,2.000000,3.000000,1.000000,-2.000000,-54.000000,-1.000000,1.000000
4,2016,1,CAR,DEN,16.000000,-1.000000,-1.000000,4.040165,-5.363583,9.000000,16.000000,4.718431,3.000000,-1.000000,-5.000000,1.000000,-4.000000,-63.000000,0.666667,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4917,2025,6,TB,SF,32.000000,0.149351,0.259740,1.754908,2.124591,-26.220779,32.000000,1.178846,0.415584,0.227273,0.493506,0.305195,-0.116883,1.688312,-0.051428,0.032468
4918,2025,6,TEN,LV,-33.857143,0.142857,-0.610390,-1.053370,0.195477,23.142857,-33.857143,-0.654948,0.227273,0.441558,0.337662,0.155844,0.493506,3.584416,-0.052244,0.279221
4919,2025,6,WAS,CHI,20.169425,0.188015,-0.001443,1.183454,1.409123,-6.308505,20.169425,1.234515,0.396401,0.329599,0.817503,-0.124905,0.215686,2.745140,-0.013197,-0.193574
4920,2025,7,CIN,PIT,9.948052,0.142857,-0.935065,0.156278,-0.229158,-7.883117,9.948052,1.950694,-1.084416,-0.909091,-0.889610,-0.396104,0.551948,5.896104,-0.053037,-0.409091


In [7]:
# --- MERGE WIN/LOSS DATA (REQUIRED BEFORE ML STUFF) ---
# NOTE: This part was missing from your latest post, but is necessary to get 'is_team_win'
df_std_matchups = pd.merge(
    df_std_matchups,
    df_schedules_final,
    left_on=['season', 'week', 'team', 'opponent_team'],
    right_on=['season', 'week', 'team', 'opponent_team'],
    how='inner'
)

# --- CREATE FINAL MODEL DATAFRAME (df_model_ready) ---
# Select ONLY the final differential features and the target/ID columns.
model_cols = [col for col in df_std_matchups.columns if col.startswith('d_') or col in ['season', 'week', 'is_team_win']]

# Add 'team' and 'opponent_team' back in for clarity in X_final split later
model_cols.extend(['team', 'opponent_team'])
model_cols = list(set(model_cols)) # Remove duplicates

df_model_ready = df_std_matchups[model_cols].copy()

In [8]:
# ML Stuff

y = df_model_ready['is_team_win']

# X now contains ONLY d_features, 'season', and 'week'
X_cols = [col for col in df_model_ready.columns if col.startswith('d_') or col in ['season', 'week']]
X = df_model_ready[X_cols] # X is now the final feature set including ID columns for splitting

TRAIN_SEASONS = df_model_ready['season'].unique()[:-2]
TEST_SEASONS = df_model_ready['season'].unique()[-2:-1]

# Split X and y based on season
X_train = X[X['season'].isin(TRAIN_SEASONS)].copy()
y_train = y[X['season'].isin(TRAIN_SEASONS)].copy()

X_test = X[X['season'].isin(TEST_SEASONS)].copy()
y_test = y[X['season'].isin(TEST_SEASONS)].copy()

# Drop the ID columns from the feature sets before scaling/training
X_train = X_train.drop(columns=['season', 'week'], errors='ignore')
X_test = X_test.drop(columns=['season', 'week'], errors='ignore')

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {'C': [0.005, 0.01, 0.05, 0.1, 0.5, 1.0]}

model = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=1),
    param_grid,
    scoring='roc_auc',  # Key to maximizing AUC
    cv=5                 # 5-fold cross-validation
)

model.fit(X_train_scaled, y_train)

print("Model Training Complete.")

# 1. Predict probabilities on the test set
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# 2. Predict class labels (0 or 1) using a standard threshold of 0.5
y_pred = (y_pred_proba > 0.5).astype(int)

# 3. Print Classification Report and AUC
print("--- Test Set Evaluation ---")
print(classification_report(y_test, y_pred))
print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

Model Training Complete.
--- Test Set Evaluation ---
              precision    recall  f1-score   support

           0       0.57      0.67      0.62       125
           1       0.67      0.56      0.61       147

    accuracy                           0.61       272
   macro avg       0.62      0.62      0.61       272
weighted avg       0.62      0.61      0.61       272

AUC Score: 0.6838
