# Week 10 Diagnostics

Evaluate saved predictions vs actual results, inspect the largest misses, and pull feature diagnostics.

In [11]:
import sys
from pathlib import Path
PROJ_ROOT = Path('..').resolve()
if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))


In [12]:
import pandas as pd
import plotly.express as px
from evaluate_predictions import load_predictions, load_actual_results, evaluate
from src.data import nfl_fetcher
from src.models.predictor import GamePredictor


## 1. Load Predictions and Actual Results

In [13]:
PREDICTIONS_PATH = '../data/predictions/week10.csv'
SEASON = 2025
WEEK = 10

preds = load_predictions(PREDICTIONS_PATH)
actuals = load_actual_results(SEASON, WEEK)
merged, metrics = evaluate(preds, actuals)
metrics

{'games_evaluated': 13,
 'direction_accuracy': np.float64(0.5384615384615384),
 'mean_abs_spread_error': np.float64(10.985611288126897),
 'spread_rmse': np.float64(12.486527254700658),
 'brier_score': np.float64(0.21650146164575906)}

## 2. Biggest Misses

In [14]:
top_misses = merged.sort_values('abs_spread_error', ascending=False).head(5)
top_misses[['home_team', 'away_team', 'game_date', 'predicted_spread', 'actual_margin', 'spread_error']]

Unnamed: 0,home_team,away_team,game_date,predicted_spread,actual_margin,spread_error
11,WAS,DET,2025-11-09,1.103845,-22.0,23.103845
5,MIA,BUF,2025-11-09,-0.98149,17.0,-17.98149
10,SF,LA,2025-11-09,0.470755,-16.0,16.470755
6,MIN,BAL,2025-11-09,6.309941,-8.0,14.309941
9,SEA,ARI,2025-11-09,7.832373,22.0,-14.167627


## 3. Visualize Predicted vs Actual Margin

In [15]:
fig = px.scatter(merged, x='predicted_spread', y='actual_margin', hover_data=['home_team', 'away_team', 'game_date'])
fig.add_hline(y=0, line_dash='dash', line_color='gray')
fig.add_vline(x=0, line_dash='dash', line_color='gray')
line_df = pd.DataFrame({'predicted_spread': [merged['predicted_spread'].min(), merged['predicted_spread'].max()]})
line_df['actual_margin'] = line_df['predicted_spread']
fig.add_trace(px.line(line_df, x='predicted_spread', y='actual_margin').data[0])
fig.update_layout(title='Predicted vs Actual Margin')
fig

## 4. Pull Feature Diagnostics For Top Misses

In [16]:
predictor = GamePredictor('NFL', 'v2')
schedule = nfl_fetcher.fetch_nfl_schedule(SEASON)
schedule['game_date'] = pd.to_datetime(schedule['gameday']) if 'gameday' in schedule.columns else pd.to_datetime(schedule['game_date'])

feature_dfs = []
for _, row in top_misses.iterrows():
    game_row = pd.DataFrame([{
        'home_team': row['home_team'],
        'away_team': row['away_team'],
        'game_date': row['game_date'],
        'season': SEASON
    }])
    features = predictor.build_features_for_game(game_row, schedule)
    matchup_label = f"{row['away_team']}@{row['home_team']}"
    features['matchup'] = matchup_label
    feature_dfs.append(features)

diagnostic_features = pd.concat(feature_dfs, ignore_index=True)
diagnostic_features.head()


Unnamed: 0,home_team,away_team,game_date,season,rest_home,rest_away,b2b_home,b2b_away,opp_strength_home_season,opp_strength_away_season,...,away_team_point_diff,rest_differential,rest_advantage_home,win_pct_differential,point_diff_differential,opp_strength_differential,week_number,month,is_playoff,matchup
0,WAS,DET,2025-11-09,2025,7,7,False,False,0.42284,-0.564236,...,7.625,0,0,-0.291667,-11.513889,0.987076,45,11,0,DET@WAS
1,MIA,BUF,2025-11-09,2025,10,7,False,False,0.746914,-2.413194,...,8.5,3,1,-0.527778,-15.5,3.160108,45,11,0,BUF@MIA
2,SF,LA,2025-11-09,2025,7,7,False,False,0.770062,-0.875,...,10.25,0,0,-0.083333,-9.583333,1.645062,45,11,0,LA@SF
3,MIN,BAL,2025-11-09,2025,7,10,False,False,-1.079861,3.118056,...,-1.75,-3,0,0.125,1.25,-4.197917,45,11,0,BAL@MIN
4,SEA,ARI,2025-11-09,2025,7,6,False,False,-0.704861,-0.414931,...,1.125,1,1,0.375,9.0,-0.289931,45,11,0,ARI@SEA


Use `diagnostic_features` to inspect rest, strength, and form values for the largest misses. Feed these rows into `diagnose_key_games.py` or additional plots to pinpoint mis-specified features.

## 5. Join Diagnostics With Residuals

In [17]:
diagnostic_summary = diagnostic_features.merge(
    top_misses[['home_team','away_team','game_date','predicted_spread','home_win_probability',
                'actual_margin','spread_error','abs_spread_error','brier_component']],
    on=['home_team','away_team','game_date'],
    how='left'
)
diagnostic_summary[['matchup','predicted_spread','actual_margin','spread_error','abs_spread_error']].head()


Unnamed: 0,matchup,predicted_spread,actual_margin,spread_error,abs_spread_error
0,DET@WAS,1.103845,-22.0,23.103845,23.103845
1,BUF@MIA,-0.98149,17.0,-17.98149,17.98149
2,LA@SF,0.470755,-16.0,16.470755,16.470755
3,BAL@MIN,6.309941,-8.0,14.309941,14.309941
4,ARI@SEA,7.832373,22.0,-14.167627,14.167627


## 6. Feature vs Error Correlations

In [18]:
corr = diagnostic_summary.select_dtypes('number').corrwith(diagnostic_summary['spread_error']).dropna().sort_values(key=abs, ascending=False)
corr.head(15)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



spread_error                 1.000000
rest_advantage_home         -0.982735
actual_margin               -0.980884
rest_differential           -0.701442
rest_home                   -0.657995
brier_component              0.498206
opp_strength_away_season     0.463023
rest_away                    0.420618
abs_spread_error             0.402693
opp_strength_differential   -0.324376
home_team_point_diff        -0.205847
point_diff_differential     -0.182905
predicted_spread            -0.114956
away_team_point_diff         0.099885
home_win_probability        -0.081216
dtype: float64

## 7. Bucket Errors By Rest & Point Differential

In [19]:
diagnostic_summary['rest_bin'] = pd.cut(diagnostic_summary['rest_differential'], bins=[-10,-3,-1,0,1,3,10])
diagnostic_summary['point_diff_bin'] = pd.cut(diagnostic_summary['point_diff_differential'], bins=[-30,-10,-5,0,5,10,30])
rest_error = diagnostic_summary.groupby('rest_bin')['abs_spread_error'].mean().to_frame('mean_abs_error')
point_error = diagnostic_summary.groupby('point_diff_bin')['abs_spread_error'].mean().to_frame('mean_abs_error')
rest_error, point_error







(           mean_abs_error
 rest_bin                 
 (-10, -3]       14.309941
 (-3, -1]              NaN
 (-1, 0]         19.787300
 (0, 1]          14.167627
 (1, 3]          17.981490
 (3, 10]               NaN,
                 mean_abs_error
 point_diff_bin                
 (-30, -10]           20.542667
 (-10, -5]            16.470755
 (-5, 0]                    NaN
 (0, 5]               14.309941
 (5, 10]              14.167627
 (10, 30]                   NaN)

## 8. Compare To Training Feature Medians

In [20]:
import pickle
from pathlib import Path
candidate_paths = [
    Path('models/feature_medians_nfl_v2.pkl'),
    Path('../models/feature_medians_nfl_v2.pkl'),
    Path('../../models/feature_medians_nfl_v2.pkl')
]
for medians_path in candidate_paths:
    if medians_path.exists():
        break
else:
    raise FileNotFoundError('feature_medians_nfl_v2.pkl not found in expected locations')
with open(medians_path, 'rb') as f:
    feature_medians = pickle.load(f)
median_diffs = []
for feature in ['rest_differential','point_diff_differential','opp_strength_differential','model_spread_feature']:
    if feature in diagnostic_summary.columns:
        median_val = feature_medians.get(feature, 0)
        mean_val = diagnostic_summary[feature].mean()
        median_diffs.append({
            'feature': feature,
            'training_median': median_val,
            'mean_on_misses': mean_val,
            'delta': mean_val - median_val
        })
pd.DataFrame(median_diffs)


Unnamed: 0,feature,training_median,mean_on_misses,delta
0,rest_differential,0.0,0.2,0.2
1,point_diff_differential,0.0,-5.269444,-5.269444
2,opp_strength_differential,0.014286,0.26088,0.246594


## 9. Notes & Next Steps
- Use the correlation and binning outputs to decide which features need scaling/capping.
- If certain bins show huge errors, consider adding interaction terms or retraining with those segments emphasized.