# NFL Game Prediction Model - Prior Features Only (No Data Leakage)

This notebook builds a predictive model using **only prior game statistics** to avoid data leakage.

All features end with `_prior` suffix and are calculated from games **before** the target game.


In [2]:
# Import libraries
import nflreadpy as nfl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler


In [3]:
# ============================================================================
# STEP 1: Load NFL datasets
# ============================================================================
print("Loading NFL data...")
team_stats = nfl.load_team_stats([2021, 2022, 2023, 2024, 2025]).to_pandas()
schedules = nfl.load_schedules([2021, 2022, 2023, 2024, 2025]).to_pandas()
player_stats = nfl.load_player_stats([2021, 2022, 2023, 2024, 2025]).to_pandas()
ff_rankings = nfl.load_ff_rankings().to_pandas()
print("‚úì Data loaded successfully")


Loading NFL data...
‚úì Data loaded successfully


In [5]:
# ============================================================================
# STEP 2: Create PRIOR GAME features (no data leakage!)
# ============================================================================
# These features only use data from games BEFORE the current game

print("\nBuilding prior features...")

# Start with completed regular season games
completed_games = schedules[schedules['game_type'] == 'REG'].dropna(subset=['home_score', 'away_score']).copy()

# Prepare game-level stats
game_stats = team_stats[['season', 'week', 'team', 
                          'passing_yards', 'rushing_yards', 'passing_interceptions',
                          'def_interceptions', 'sacks_suffered', 'rushing_fumbles_lost',
                          'receiving_fumbles_lost', 'sack_fumbles_lost', 'def_fumbles',
                          'penalty_yards', 'attempts', 'carries']].copy()

# Calculate derived metrics at game level
game_stats['total_fumbles_lost'] = (
    game_stats['rushing_fumbles_lost'] + 
    game_stats['receiving_fumbles_lost'] + 
    game_stats['sack_fumbles_lost']
)

game_stats['total_plays'] = game_stats['attempts'] + game_stats['carries']
game_stats['total_yards'] = game_stats['passing_yards'] + game_stats['rushing_yards']
game_stats['yards_per_play'] = game_stats['total_yards'] / game_stats['total_plays'].replace(0, 1)

game_stats['turnovers_forced'] = game_stats['def_interceptions'] + game_stats['def_fumbles']
game_stats['turnovers_committed'] = game_stats['passing_interceptions'] + game_stats['total_fumbles_lost']
game_stats['turnover_diff'] = game_stats['turnovers_forced'] - game_stats['turnovers_committed']

# Sort by season and week to ensure temporal ordering
game_stats = game_stats.sort_values(['season', 'week']).reset_index(drop=True)
completed_games = completed_games.sort_values(['season', 'week']).reset_index(drop=True)

# ============================================================================
# Calculate cumulative PRIOR stats for each team at each game
# ============================================================================

def calculate_prior_features(df, team_col, season_col='season', week_col='week'):
    """
    Calculate cumulative statistics from all prior games for each team.
    Returns a dataframe with one row per team-season-week combo.
    """
    df = df.sort_values([season_col, week_col]).reset_index(drop=True)
    
    # Features to calculate rolling means for
    stat_cols = [
        'passing_yards', 'rushing_yards', 'passing_interceptions',
        'def_interceptions', 'sacks_suffered', 'total_fumbles_lost',
        'def_fumbles', 'penalty_yards', 'yards_per_play', 'turnover_diff'
    ]
    
    results = []
    
    # Group by season and team
    for (season, team), group in df.groupby([season_col, team_col]):
        group = group.sort_values(week_col).reset_index(drop=True)
        
        for idx, row in group.iterrows():
            current_week = row[week_col]
            
            # Get all prior games (exclude current game)
            prior_games = group[group[week_col] < current_week]
            
            if len(prior_games) == 0:
                # First game of season - use neutral priors
                prior_stats = {f'{col}_prior': 0.0 for col in stat_cols}
                prior_stats['games_played_prior'] = 0
            else:
                # Calculate cumulative averages from all prior games
                prior_stats = {f'{col}_prior': prior_games[col].mean() for col in stat_cols}
                prior_stats['games_played_prior'] = len(prior_games)
            
            result = {
                'season': season,
                'team': team,
                'week': current_week,
                **prior_stats
            }
            results.append(result)
    
    return pd.DataFrame(results)

# Calculate prior features for all teams
team_prior_features = calculate_prior_features(game_stats, 'team')

# ============================================================================
# Add win percentage and scoring stats from schedules
# ============================================================================

# Create a record of each team's games with outcomes
home_games = completed_games[['season', 'week', 'home_team', 'home_score', 'away_score']].copy()
home_games['team'] = home_games['home_team']
home_games['points_for'] = home_games['home_score']
home_games['points_against'] = home_games['away_score']
home_games['won'] = (home_games['home_score'] > home_games['away_score']).astype(int)

away_games = completed_games[['season', 'week', 'away_team', 'away_score', 'home_score']].copy()
away_games['team'] = away_games['away_team']
away_games['points_for'] = away_games['away_score']
away_games['points_against'] = away_games['home_score']
away_games['won'] = (away_games['away_score'] > away_games['home_score']).astype(int)

# Combine all games
all_team_games = pd.concat([
    home_games[['season', 'week', 'team', 'points_for', 'points_against', 'won']],
    away_games[['season', 'week', 'team', 'points_for', 'points_against', 'won']]
]).sort_values(['season', 'week']).reset_index(drop=True)

# Calculate cumulative win% and scoring stats
scoring_prior_features = []

for (season, team), group in all_team_games.groupby(['season', 'team']):
    group = group.sort_values('week').reset_index(drop=True)
    
    for idx, row in group.iterrows():
        current_week = row['week']
        prior_games = group[group['week'] < current_week]
        
        if len(prior_games) == 0:
            # First game - use neutral priors
            win_pct_prior = 0.5
            avg_points_for_prior = 0.0
            avg_points_against_prior = 0.0
            avg_scoring_margin_prior = 0.0
        else:
            win_pct_prior = prior_games['won'].mean()
            avg_points_for_prior = prior_games['points_for'].mean()
            avg_points_against_prior = prior_games['points_against'].mean()
            avg_scoring_margin_prior = avg_points_for_prior - avg_points_against_prior
        
        scoring_prior_features.append({
            'season': season,
            'team': team,
            'week': current_week,
            'win_pct_prior': win_pct_prior,
            'avg_points_for_prior': avg_points_for_prior,
            'avg_points_against_prior': avg_points_against_prior,
            'avg_scoring_margin_prior': avg_scoring_margin_prior
        })

scoring_prior_df = pd.DataFrame(scoring_prior_features)

# Merge scoring stats
team_prior_features = team_prior_features.merge(
    scoring_prior_df, on=['season', 'team', 'week'], how='left'
)

# ============================================================================
# Add QB stats (using prior games only)
# ============================================================================

qb_prior_list = []
qb_game_data = player_stats[player_stats["position"] == "QB"][
    ['season', 'week', 'team', 'passing_yards']
].copy()

for (season, team), group in qb_game_data.groupby(['season', 'team']):
    group = group.sort_values('week').reset_index(drop=True)
    
    for idx, row in group.iterrows():
        current_week = row['week']
        prior_games = group[group['week'] < current_week]
        
        if len(prior_games) == 0:
            avg_qb_passing_yards_prior = 0.0
        else:
            avg_qb_passing_yards_prior = prior_games['passing_yards'].mean()
        
        qb_prior_list.append({
            'season': season,
            'team': team,
            'week': current_week,
            'avg_qb_passing_yards_prior': avg_qb_passing_yards_prior
        })

qb_prior_df = pd.DataFrame(qb_prior_list)

team_prior_features = team_prior_features.merge(
    qb_prior_df, on=['season', 'team', 'week'], how='left'
)

# ============================================================================
# Add fantasy football rankings
# ============================================================================

ff_strength = (
    ff_rankings.groupby("team")
    .agg({"ecr": "mean"})
    .reset_index()
    .rename(columns={"ecr": "avg_ff_rank_prior"})
)

team_prior_features = team_prior_features.merge(
    ff_strength, on='team', how='left'
)

# ============================================================================
# Calculate Strength of Schedule (prior games only)
# ============================================================================

sos_prior_list = []

for (season, team), group in all_team_games.groupby(['season', 'team']):
    group = group.sort_values('week').reset_index(drop=True)
    
    for idx, row in group.iterrows():
        current_week = row['week']
        
        if current_week == group['week'].min():
            # First game - no prior opponents
            avg_sos_prior = 0.5
        else:
            # Find opponents from prior weeks
            home_opponents = completed_games[
                (completed_games['season'] == season) & 
                (completed_games['home_team'] == team) & 
                (completed_games['week'] < current_week)
            ]['away_team'].tolist()
            
            away_opponents = completed_games[
                (completed_games['season'] == season) & 
                (completed_games['away_team'] == team) & 
                (completed_games['week'] < current_week)
            ]['home_team'].tolist()
            
            all_opponents = home_opponents + away_opponents
            
            if len(all_opponents) == 0:
                avg_sos_prior = 0.5
            else:
                # Get win% of these opponents up to when they were faced
                opponent_win_pcts = []
                for opp in all_opponents:
                    opp_prior_games = all_team_games[
                        (all_team_games['season'] == season) &
                        (all_team_games['team'] == opp) &
                        (all_team_games['week'] < current_week)
                    ]
                    if len(opp_prior_games) > 0:
                        opp_win_pct = opp_prior_games['won'].mean()
                    else:
                        opp_win_pct = 0.5
                    opponent_win_pcts.append(opp_win_pct)
                
                avg_sos_prior = np.mean(opponent_win_pcts) if opponent_win_pcts else 0.5
        
        sos_prior_list.append({
            'season': season,
            'team': team,
            'week': current_week,
            'avg_sos_prior': avg_sos_prior
        })

sos_prior_df = pd.DataFrame(sos_prior_list)

team_prior_features = team_prior_features.merge(
    sos_prior_df, on=['season', 'team', 'week'], how='left'
)

# ============================================================================
# Display summary
# ============================================================================

print("\n" + "="*80)
print("PRIOR FEATURES (No Data Leakage!)")
print("="*80)
print(f"\nTotal records: {len(team_prior_features):,}")
print(f"Seasons covered: {team_prior_features['season'].min()} - {team_prior_features['season'].max()}")
print(f"Teams: {team_prior_features['team'].nunique()}")

print("\nFeature List:")
feature_cols = [c for c in team_prior_features.columns if c.endswith('_prior')]
for i, col in enumerate(feature_cols, 1):
    print(f"   {i:2d}. {col}")

print("\n‚úì All features use ONLY data from games prior to the current game")
print("‚úì No data leakage - safe to use for prediction!")

print("\nSample data:")

display(team_prior_features.head(10))



Building prior features...

PRIOR FEATURES (No Data Leakage!)

Total records: 3,062
Seasons covered: 2021 - 2025
Teams: 32

Feature List:
    1. passing_yards_prior
    2. rushing_yards_prior
    3. passing_interceptions_prior
    4. def_interceptions_prior
    5. sacks_suffered_prior
    6. total_fumbles_lost_prior
    7. def_fumbles_prior
    8. penalty_yards_prior
    9. yards_per_play_prior
   10. turnover_diff_prior
   11. games_played_prior
   12. win_pct_prior
   13. avg_points_for_prior
   14. avg_points_against_prior
   15. avg_scoring_margin_prior
   16. avg_qb_passing_yards_prior
   17. avg_ff_rank_prior
   18. avg_sos_prior

‚úì All features use ONLY data from games prior to the current game
‚úì No data leakage - safe to use for prediction!

Sample data:


Unnamed: 0,season,team,week,passing_yards_prior,rushing_yards_prior,passing_interceptions_prior,def_interceptions_prior,sacks_suffered_prior,total_fumbles_lost_prior,def_fumbles_prior,...,yards_per_play_prior,turnover_diff_prior,games_played_prior,win_pct_prior,avg_points_for_prior,avg_points_against_prior,avg_scoring_margin_prior,avg_qb_passing_yards_prior,avg_ff_rank_prior,avg_sos_prior
0,2021,ARI,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.5,0.0,0.0,0.0,0.0,132.594826,0.5
1,2021,ARI,2,289.0,136.0,1.0,1.0,2.0,0.0,0.0,...,6.538462,0.0,1,1.0,38.0,13.0,25.0,289.0,132.594826,0.0
2,2021,ARI,3,344.5,119.5,1.5,0.5,2.5,0.0,0.0,...,7.681511,-1.0,2,1.0,36.0,23.0,13.0,344.5,132.594826,0.25
3,2021,ARI,4,335.0,110.0,1.333333,1.0,1.666667,0.0,0.333333,...,7.240799,0.0,3,1.0,34.333333,21.666667,12.666667,335.0,132.594826,0.333333
4,2021,ARI,4,335.0,110.0,1.333333,1.0,1.666667,0.0,0.333333,...,7.240799,0.0,3,1.0,34.333333,21.666667,12.666667,335.0,132.594826,0.333333
5,2021,ARI,5,318.25,136.5,1.0,1.0,2.0,0.0,0.25,...,7.111155,0.25,4,1.0,35.0,21.25,13.75,254.6,132.594826,0.375
6,2021,ARI,6,302.4,128.0,0.8,1.0,2.0,0.2,0.2,...,6.8372,0.2,5,1.0,31.4,19.0,12.4,252.0,132.594826,0.44
7,2021,ARI,6,302.4,128.0,0.8,1.0,2.0,0.2,0.2,...,6.8372,0.2,5,1.0,31.4,19.0,12.4,252.0,132.594826,0.44
8,2021,ARI,7,290.166667,130.666667,0.666667,1.0,2.0,0.166667,0.166667,...,6.625527,0.333333,6,1.0,32.333333,18.166667,14.166667,217.625,132.594826,0.511111
9,2021,ARI,7,290.166667,130.666667,0.666667,1.0,2.0,0.166667,0.166667,...,6.625527,0.333333,6,1.0,32.333333,18.166667,14.166667,217.625,132.594826,0.511111


In [11]:
# ============================================================================
# STEP 3: Create modeling dataset
# ============================================================================

print("\nCreating modeling dataset...")

# Filter to regular season completed games only
games = completed_games.copy()

# Merge home team prior features
games = games.merge(
    team_prior_features.add_prefix("home_"),
    left_on=["season", "week", "home_team"],
    right_on=["home_season", "home_week", "home_team"],
    how="left"
)

# Merge away team prior features
games = games.merge(
    team_prior_features.add_prefix("away_"),
    left_on=["season", "week", "away_team"],
    right_on=["away_season", "away_week", "away_team"],
    how="left"
)

# Create target variable
games["home_team_win"] = (games["home_score"] > games["away_score"]).astype(int)

# Select feature columns
base_cols = ["season", "week", "home_team", "away_team", "home_team_win"]
home_feature_cols = sorted([col for col in games.columns if col.startswith("home_") and col.endswith("_prior")])
away_feature_cols = sorted([col for col in games.columns if col.startswith("away_") and col.endswith("_prior")])

model_df = games[base_cols + home_feature_cols + away_feature_cols].dropna()

# Compute feature differences (home - away)
diff_features = [
    c.replace("home_", "diff_")
    for c in model_df.columns if c.startswith("home_") and c.endswith("_prior")
]

for hf, af, df in zip(
    [c for c in model_df.columns if c.startswith("home_") and c.endswith("_prior")],
    [c for c in model_df.columns if c.startswith("away_") and c.endswith("_prior")],
    diff_features
):
    model_df[df] = model_df[hf] - model_df[af]

# Select features and target
X = model_df[diff_features]
y = model_df["home_team_win"]

print("\n" + "="*80)
print("Modeling Dataset Summary")
print("="*80)
print(model_df.head())
print(f"Total games: {len(model_df):,}")
print(f"Features: {len(diff_features)}")
print(f"Home team win rate: {y.mean():.3f}")

print("\nFeature differences (home - away):")
for i, col in enumerate(diff_features, 1):
    print(f"   {i:2d}. {col}")

print("\nSample features:")
display(X.head())



Creating modeling dataset...

Modeling Dataset Summary
   season  week home_team away_team  home_team_win  home_avg_ff_rank_prior  \
0    2021     1        TB       DAL              1              115.175826   
1    2021     1       ATL       PHI              0              109.172867   
2    2021     1       BUF       PIT              0              115.410872   
3    2021     1       CAR       NYJ              1              127.472235   
4    2021     1       CIN       MIN              1              113.042530   

   home_avg_points_against_prior  home_avg_points_for_prior  \
0                            0.0                        0.0   
1                            0.0                        0.0   
2                            0.0                        0.0   
3                            0.0                        0.0   
4                            0.0                        0.0   

   home_avg_qb_passing_yards_prior  home_avg_scoring_margin_prior  ...  \
0                     

Unnamed: 0,diff_avg_ff_rank_prior,diff_avg_points_against_prior,diff_avg_points_for_prior,diff_avg_qb_passing_yards_prior,diff_avg_scoring_margin_prior,diff_avg_sos_prior,diff_def_fumbles_prior,diff_def_interceptions_prior,diff_games_played_prior,diff_passing_interceptions_prior,diff_passing_yards_prior,diff_penalty_yards_prior,diff_rushing_yards_prior,diff_sacks_suffered_prior,diff_total_fumbles_lost_prior,diff_turnover_diff_prior,diff_win_pct_prior,diff_yards_per_play_prior
0,-4.887191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-14.942171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.336527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.684438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# ============================================================================
# STEP 4: Train/Test Split
# ============================================================================

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n" + "="*80)
print("Train/Test Split")
print("="*80)
print(f"Training set size: {len(X_train):,}")
print(f"Testing set size: {len(X_test):,}")
print(f"Training set home win rate: {y_train.mean():.3f}")
print(f"Testing set home win rate: {y_test.mean():.3f}")

# Scale the features (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n‚úì Features scaled using StandardScaler")



Train/Test Split
Training set size: 1,279
Testing set size: 320
Training set home win rate: 0.556
Testing set home win rate: 0.556

‚úì Features scaled using StandardScaler


In [27]:
# ============================================================================
# STEP 5b: Compare Previous vs Updated Model Performance
# ============================================================================

print("\n" + "="*80)
print("MODEL PERFORMANCE COMPARISON")
print("="*80)

print("PREVIOUS MODEL (with data leakage):")
print("   Training Accuracy: 0.6781")
print("   Testing Accuracy:  0.6792")

print ("\n")
print("UPDATED MODEL (no data leakage - prior features only):")
print(f"   Training Accuracy: {train_acc:.4f}")
print(f"   Testing Accuracy:  {test_acc:.4f}")



MODEL PERFORMANCE COMPARISON
PREVIOUS MODEL (with data leakage):
   Training Accuracy: 0.6781
   Testing Accuracy:  0.6792


UPDATED MODEL (no data leakage - prior features only):
   Training Accuracy: 0.6544
   Testing Accuracy:  0.6469


In [20]:
# ============================================================================
# STEP 5: L1 Regularization for Feature Selection
# ============================================================================

print("\n" + "="*80)
print("L1 REGULARIZATION - FEATURE SELECTION")
print("="*80)

# Get feature names
feature_names = X.columns.tolist()

# Train L1 logistic regression
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',  
    C=1.0,              
    max_iter=1000,
    random_state=42
)

log_reg_l1.fit(X_train_scaled, y_train)

# Get coefficients and identify selected features
coeffs = log_reg_l1.coef_.flatten()
selected_mask = coeffs != 0
selected_features = np.array(feature_names)[selected_mask]
dropped_features = np.array(feature_names)[~selected_mask]

# Display results
print(f"\nSelected features ({len(selected_features)}):")
for i, (feat, coef) in enumerate(zip(selected_features, coeffs[selected_mask]), 1):
    print(f"  {i:2d}. {feat:40s} ‚Üí {coef:7.4f}")

print(f"\nDropped features ({len(dropped_features)}):")
for feat in dropped_features:
    print(f"  - {feat}")

# Evaluate performance
y_train_pred = log_reg_l1.predict(X_train_scaled)
y_test_pred = log_reg_l1.predict(X_test_scaled)
y_train_proba = log_reg_l1.predict_proba(X_train_scaled)[:, 1]
y_test_proba = log_reg_l1.predict_proba(X_test_scaled)[:, 1]

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print("\n" + "="*80)
print("UPDATED MODEL PERFORMANCE")
print("="*80)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy:  {test_acc:.4f}")

print("\n" + "="*80)
print("Classification Report (Test Set)")
print("="*80)
print(classification_report(y_test, y_test_pred, target_names=['Away Win', 'Home Win']))

print("\n" + "="*80)
print("Confusion Matrix (Test Set)")
print("="*80)
cm = confusion_matrix(y_test, y_test_pred)
print(f"\n                Predicted")
print(f"                Away  Home")
print(f"Actual Away     {cm[0,0]:4d}  {cm[0,1]:4d}")
print(f"       Home     {cm[1,0]:4d}  {cm[1,1]:4d}")



L1 REGULARIZATION - FEATURE SELECTION

Selected features (15):
   1. diff_avg_ff_rank_prior                   ‚Üí -0.1419
   2. diff_avg_qb_passing_yards_prior          ‚Üí  0.0121
   3. diff_avg_scoring_margin_prior            ‚Üí  0.1652
   4. diff_avg_sos_prior                       ‚Üí  0.1915
   5. diff_def_fumbles_prior                   ‚Üí -0.0073
   6. diff_def_interceptions_prior             ‚Üí  0.1820
   7. diff_games_played_prior                  ‚Üí -0.0633
   8. diff_passing_interceptions_prior         ‚Üí  0.1326
   9. diff_passing_yards_prior                 ‚Üí  0.0979
  10. diff_penalty_yards_prior                 ‚Üí  0.0452
  11. diff_rushing_yards_prior                 ‚Üí  0.0997
  12. diff_sacks_suffered_prior                ‚Üí -0.2863
  13. diff_total_fumbles_lost_prior            ‚Üí  0.2067
  14. diff_win_pct_prior                       ‚Üí  0.1670
  15. diff_yards_per_play_prior                ‚Üí  0.3137

Dropped features (3):
  - diff_avg_points_against_

In [9]:
# ============================================================================
# STEP 6: Display Coefficients Sorted by Magnitude
# ============================================================================

coef_df = (
    pd.DataFrame({
        "Feature": feature_names,
        "Coefficient": coeffs
    })
    .query("Coefficient != 0")  # keep only selected features
    .assign(AbsCoeff=lambda d: d["Coefficient"].abs())  # magnitude for sorting
    .sort_values("AbsCoeff", ascending=False)
    .drop(columns=['AbsCoeff'])
    .reset_index(drop=True)
)

print("\n" + "="*80)
print("COEFFICIENTS FOR SELECTED FEATURES (sorted by magnitude)")
print("="*80)
display(coef_df)



COEFFICIENTS FOR SELECTED FEATURES (sorted by magnitude)


Unnamed: 0,Feature,Coefficient
0,diff_yards_per_play_prior,0.313689
1,diff_sacks_suffered_prior,-0.286279
2,diff_total_fumbles_lost_prior,0.206734
3,diff_avg_sos_prior,0.191543
4,diff_def_interceptions_prior,0.181973
5,diff_win_pct_prior,0.167033
6,diff_avg_scoring_margin_prior,0.165228
7,diff_avg_ff_rank_prior,-0.14187
8,diff_passing_interceptions_prior,0.132632
9,diff_rushing_yards_prior,0.099665


In [24]:
# ============================================================================
# STEP 7: Predict Winner of Upcoming Game 
# ============================================================================

# Thursday Night Football: Denver Broncos @ Las Vegas Raiders (Nov 6, 2025)
home_team = "LV"   # Las Vegas Raiders (home)
away_team = "DEN"  # Denver Broncos (away)
season_year = 2024

print("\n" + "="*80)
print(f"GAME PREDICTION: {away_team} @ {home_team}")
print("="*80)

# Get ALL data for both teams in the season
home_all_data = team_prior_features[
    (team_prior_features['team'] == home_team) & 
    (team_prior_features['season'] == season_year)
].sort_values('week', ascending=False)

away_all_data = team_prior_features[
    (team_prior_features['team'] == away_team) & 
    (team_prior_features['season'] == season_year)
].sort_values('week', ascending=False)

if len(home_all_data) == 0 or len(away_all_data) == 0:
    print("\n‚ö†Ô∏è  ERROR: One or both teams have NO data for 2024 season!")
    print("\nAvailable teams in 2024:")
    teams_2024 = team_prior_features[team_prior_features['season'] == season_year]['team'].unique()
    print(sorted(teams_2024))
else:
    # OPTION 1 IMPLEMENTATION: Find latest week with complete data for BOTH teams
    print(f"\n1. Finding latest week with complete data...")
    
    # Filter to rows with NO NaN values
    home_complete = home_all_data.dropna()
    away_complete = away_all_data.dropna()
    
    if len(home_complete) == 0 or len(away_complete) == 0:
        print(f"\n‚ö†Ô∏è  ERROR: No complete data found for one or both teams!")
        print(f"   {home_team}: {len(home_complete)} complete weeks")
        print(f"   {away_team}: {len(away_complete)} complete weeks")
    else:
        # Get the latest week that exists for BOTH teams with complete data
        home_latest_week = home_complete.iloc[0]['week']
        away_latest_week = away_complete.iloc[0]['week']
        
        # Use the minimum of the two (the week both teams have data for)
        latest_common_week = min(home_latest_week, away_latest_week)
        
        # Get stats for that week
        home_stats = home_complete[home_complete['week'] == latest_common_week].head(1)
        away_stats = away_complete[away_complete['week'] == latest_common_week].head(1)
        
        if len(home_stats) == 0 or len(away_stats) == 0:
            print(f"\n‚ö†Ô∏è  ERROR: Could not find matching week {latest_common_week} for both teams")
        else:
            print(f"   ‚úÖ Using Week {int(latest_common_week)} (latest week with complete data for both teams)")
            print(f"\n2. Prior statistics being used:")
            print(f"   {home_team}: Week {int(home_stats['week'].values[0])}, Games played: {home_stats['games_played_prior'].values[0]:.0f}")
            print(f"   {away_team}: Week {int(away_stats['week'].values[0])}, Games played: {away_stats['games_played_prior'].values[0]:.0f}")
            
            # Calculate difference features (home - away) for all features
            print(f"\n3. Building prediction features...")
            diff_data = {}
            for feature in X.columns:
                orig_feature = feature.replace('diff_', '')
                home_val = home_stats[orig_feature].values[0] if orig_feature in home_stats.columns else 0
                away_val = away_stats[orig_feature].values[0] if orig_feature in away_stats.columns else 0
                diff_data[feature] = [home_val - away_val]
            
            prediction_df = pd.DataFrame(diff_data)
            
            # Verify no NaN values (should be clean now!)
            if prediction_df.isna().any().any():
                print("   ‚ö†Ô∏è  Warning: Some NaN values still present, filling with 0")
                prediction_df = prediction_df.fillna(0)
            else:
                print("   ‚úÖ All features complete - no missing data!")
            
            # Scale the features using the same scaler
            prediction_scaled = scaler.transform(prediction_df)
            
            # Predict using L1 model
            prediction_proba = log_reg_l1.predict_proba(prediction_scaled)[0, 1]
            prediction = 1 if prediction_proba > 0.5 else 0
            
            # Display results
            print("\n" + "="*80)
            print("PREDICTION RESULTS")
            print("="*80)
            
            if prediction == 1:
                winner = home_team
                confidence = prediction_proba * 100
            else:
                winner = away_team
                confidence = (1 - prediction_proba) * 100
            
            print(f"\nüèà PREDICTED WINNER: {winner}")
            print(f"   Confidence: {confidence:.1f}%")
            print(f"\n   Home ({home_team}) win probability: {prediction_proba:.1%}")
            print(f"   Away ({away_team}) win probability: {(1-prediction_proba):.1%}")
            
            # Show key feature differences
            print(f"\nüìä Key Feature Differences (Home - Away):")
            top_features = ['diff_avg_scoring_margin_prior', 'diff_win_pct_prior', 
                           'diff_yards_per_play_prior', 'diff_sacks_suffered_prior',
                           'diff_turnover_diff_prior']
            for feat in top_features:
                if feat in prediction_df.columns:
                    val = prediction_df[feat].values[0]
                    direction = "‚Üë" if val > 0 else "‚Üì" if val < 0 else "="
                    print(f"   {direction} {feat.replace('diff_', '').replace('_prior', '')}: {val:+.3f}")



GAME PREDICTION: DEN @ LV

1. Finding latest week with complete data...
   ‚úÖ Using Week 18 (latest week with complete data for both teams)

2. Prior statistics being used:
   LV: Week 18, Games played: 16
   DEN: Week 18, Games played: 16

3. Building prediction features...
   ‚úÖ All features complete - no missing data!

PREDICTION RESULTS

üèà PREDICTED WINNER: DEN
   Confidence: 63.2%

   Home (LV) win probability: 36.8%
   Away (DEN) win probability: 63.2%

üìä Key Feature Differences (Home - Away):
   ‚Üì avg_scoring_margin: -11.688
   ‚Üì win_pct: -0.312
   ‚Üì yards_per_play: -0.078
   ‚Üë sacks_suffered: +1.625
   = turnover: +0.000


In [18]:
# ============================================================================
# STEP 8: Diagnose Missing Broncos Data
# ============================================================================

print("\n" + "="*80)
print("DIAGNOSING: Why are DEN's scoring stats missing?")
print("="*80)

# Check if DEN exists in the scoring_prior_df
print("\n1. Does DEN exist in scoring_prior_df?")
den_scoring = scoring_prior_df[scoring_prior_df['team'] == 'DEN']
print(f"   Records found: {len(den_scoring)}")
if len(den_scoring) > 0:
    print(f"   Seasons: {sorted(den_scoring['season'].unique())}")
    print(f"   Sample data:")
    display(den_scoring.head())
else:
    print("   ‚ö†Ô∏è DEN NOT FOUND in scoring_prior_df!")

# Check if DEN exists in sos_prior_df
print("\n2. Does DEN exist in sos_prior_df?")
den_sos = sos_prior_df[sos_prior_df['team'] == 'DEN']
print(f"   Records found: {len(den_sos)}")
if len(den_sos) > 0:
    print(f"   Seasons: {sorted(den_sos['season'].unique())}")
else:
    print("   ‚ö†Ô∏è DEN NOT FOUND in sos_prior_df!")

# Check if DEN exists in completed_games
print("\n3. Does DEN appear in completed_games (schedules)?")
den_home = completed_games[completed_games['home_team'] == 'DEN']
den_away = completed_games[completed_games['away_team'] == 'DEN']
print(f"   Home games: {len(den_home)}")
print(f"   Away games: {len(den_away)}")
print(f"   Total games: {len(den_home) + len(den_away)}")

if len(den_home) > 0 or len(den_away) > 0:
    print(f"   Seasons: {sorted(set(list(den_home['season'].unique()) + list(den_away['season'].unique())))}")
else:
    print("   ‚ö†Ô∏è DEN NOT FOUND in completed_games!")

# Check what team codes exist in schedules
print("\n4. All team codes in schedules:")
all_home_teams = completed_games['home_team'].unique()
all_away_teams = completed_games['away_team'].unique()
all_schedule_teams = sorted(set(list(all_home_teams) + list(all_away_teams)))
print(f"   Total teams: {len(all_schedule_teams)}")
print(f"   Teams: {all_schedule_teams}")

# Check if Broncos are under a different code
print("\n5. Looking for 'Broncos' under different codes:")
broncos_variants = [t for t in all_schedule_teams if 'DEN' in t or 'den' in t.lower()]
print(f"   Found: {broncos_variants if broncos_variants else 'None'}")

# Check team_stats for DEN
print("\n6. Does DEN exist in team_stats (game_stats)?")
den_game_stats = game_stats[game_stats['team'] == 'DEN']
print(f"   Records found: {len(den_game_stats)}")
if len(den_game_stats) > 0:
    print(f"   Seasons: {sorted(den_game_stats['season'].unique())}")
    print(f"   Weeks: {sorted(den_game_stats['week'].unique())}")
else:
    print("   ‚ö†Ô∏è DEN NOT FOUND in game_stats!")

# NOW THE KEY DIAGNOSTIC: Check the merge keys
print("\n" + "="*80)
print("üîç MERGE INVESTIGATION: Why isn't scoring_prior_df joining properly?")
print("="*80)

# Get DEN data from base features (before scoring merge)
print("\n7. DEN in base team_prior_features (before scoring merge):")
den_base = calculate_prior_features(game_stats[game_stats['team'] == 'DEN'], 'team')
print(f"   Records: {len(den_base)}")
if len(den_base) > 0:
    print(f"   Seasons: {sorted(den_base['season'].unique())}")
    print(f"   2024 records: {len(den_base[den_base['season'] == 2024])}")
    print(f"   Sample weeks in 2024: {sorted(den_base[den_base['season'] == 2024]['week'].unique())[:5]}")
    print(f"   Data types: season={den_base['season'].dtype}, week={den_base['week'].dtype}, team={den_base['team'].dtype}")

# Get DEN data from scoring_prior_df
print("\n8. DEN in scoring_prior_df:")
den_scoring_2024 = scoring_prior_df[(scoring_prior_df['team'] == 'DEN') & (scoring_prior_df['season'] == 2024)]
print(f"   2024 records: {len(den_scoring_2024)}")
if len(den_scoring_2024) > 0:
    print(f"   Sample weeks in 2024: {sorted(den_scoring_2024['week'].unique())[:5]}")
    print(f"   Data types: season={den_scoring_2024['season'].dtype}, week={den_scoring_2024['week'].dtype}, team={den_scoring_2024['team'].dtype}")
    print(f"\n   Sample data:")
    display(den_scoring_2024.head())

# Check the actual merge result for DEN
print("\n9. DEN in final team_prior_features (after merge):")
den_final = team_prior_features[(team_prior_features['team'] == 'DEN') & (team_prior_features['season'] == 2024)]
print(f"   2024 records: {len(den_final)}")
if len(den_final) > 0:
    print(f"   Weeks: {sorted(den_final['week'].unique())[:5]}")
    print(f"\n   Checking specific columns for NaN:")
    print(f"   - win_pct_prior: {den_final['win_pct_prior'].isna().sum()} NaN out of {len(den_final)}")
    print(f"   - avg_scoring_margin_prior: {den_final['avg_scoring_margin_prior'].isna().sum()} NaN out of {len(den_final)}")
    print(f"\n   Week 19 data (latest):")
    display(den_final[den_final['week'] == 19][['season', 'team', 'week', 'games_played_prior', 
                                                   'win_pct_prior', 'avg_scoring_margin_prior', 
                                                   'avg_sos_prior']].T)



DIAGNOSING: Why are DEN's scoring stats missing?

1. Does DEN exist in scoring_prior_df?
   Records found: 77
   Seasons: [2021, 2022, 2023, 2024, 2025]
   Sample data:


Unnamed: 0,season,team,week,win_pct_prior,avg_points_for_prior,avg_points_against_prior,avg_scoring_margin_prior
153,2021,DEN,1,0.5,0.0,0.0,0.0
154,2021,DEN,2,1.0,27.0,13.0,14.0
155,2021,DEN,3,1.0,25.0,13.0,12.0
156,2021,DEN,4,1.0,25.333333,8.666667,16.666667
157,2021,DEN,5,0.75,20.75,12.25,8.5



2. Does DEN exist in sos_prior_df?
   Records found: 77
   Seasons: [2021, 2022, 2023, 2024, 2025]

3. Does DEN appear in completed_games (schedules)?
   Home games: 38
   Away games: 39
   Total games: 77
   Seasons: [2021, 2022, 2023, 2024, 2025]

4. All team codes in schedules:
   Total teams: 32
   Teams: ['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

5. Looking for 'Broncos' under different codes:
   Found: ['DEN']

6. Does DEN exist in team_stats (game_stats)?
   Records found: 78
   Seasons: [2021, 2022, 2023, 2024, 2025]
   Weeks: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

üîç MERGE INVESTIGATION: Why isn't scoring_prior_df joining properly?

7. DEN in base team_prior_features (before scoring merge):
   Records: 78
   Seasons: [2021, 2022, 2023, 2024, 2025]
   2024 records: 18
   Sa

Unnamed: 0,season,team,week,win_pct_prior,avg_points_for_prior,avg_points_against_prior,avg_scoring_margin_prior
1783,2024,DEN,1,0.5,0.0,0.0,0.0
1784,2024,DEN,2,0.0,20.0,26.0,-6.0
1785,2024,DEN,3,0.0,13.0,19.5,-6.5
1786,2024,DEN,4,0.333333,17.333333,15.333333,2.0
1787,2024,DEN,5,0.5,15.5,13.75,1.75



9. DEN in final team_prior_features (after merge):
   2024 records: 20
   Weeks: [1, 2, 3, 4, 5]

   Checking specific columns for NaN:
   - win_pct_prior: 1 NaN out of 20
   - avg_scoring_margin_prior: 1 NaN out of 20

   Week 19 data (latest):


Unnamed: 0,2259
season,2024
team,DEN
week,19
games_played_prior,17
win_pct_prior,
avg_scoring_margin_prior,
avg_sos_prior,


In [None]:
# ============================================================================
# STEP 9: Final Diagnosis - Week-by-Week Comparison
# ============================================================================

print("\n" + "="*80)
print("üìä WEEK-BY-WEEK COMPARISON FOR DEN 2024")
print("="*80)

# Show which weeks exist in each dataset
den_base_weeks = set(den_base[den_base['season'] == 2024]['week'].unique())
den_scoring_weeks = set(den_scoring_2024['week'].unique())
den_final_weeks = set(den_final['week'].unique())

print(f"\nWeeks in base features: {sorted(den_base_weeks)}")
print(f"Weeks in scoring_prior_df: {sorted(den_scoring_weeks)}")
print(f"Weeks in final team_prior_features: {sorted(den_final_weeks)}")

# Find the missing weeks
missing_in_scoring = den_base_weeks - den_scoring_weeks
extra_in_final = den_final_weeks - den_base_weeks

print(f"\n‚ö†Ô∏è  Weeks in base BUT NOT in scoring: {sorted(missing_in_scoring) if missing_in_scoring else 'None'}")
print(f"‚ö†Ô∏è  Weeks in final BUT NOT in base: {sorted(extra_in_final) if extra_in_final else 'None'}")

# Show the actual Week 19 data
print(f"\nüìã Full Week 19 row for DEN:")
display(den_final[den_final['week'] == 19].T)

print(f"\nüí° SOLUTION:")
print("   Since Week 19 exists in team_stats but NOT in schedules (no completed games),")
print("   we should use the LATEST WEEK that has complete data for both teams.")
print(f"\n   For predictions, we should use:")
print(f"   - Latest week with NO NaN values")
print(f"   - Or filter to only weeks where scoring data exists")


In [None]:
)