# NFL 2025 Season - Data Exploration

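This notebook explores NFL season data loaded from nflverse using nflreadpy. The target is the 2025 season; until that data is available, the Load Data cell defaults to the 2024 season via the `SEASON` constant.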


In [None]:
# Import libraries
import sys
# Add the parent directory to the module search path so the `src` package
# imported below can be resolved (assumes the notebook is run from a folder
# one level below the project root — TODO confirm).
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): merge_game_data is imported but not used in the cells visible here.
from src.data_loader import load_nfl_data, merge_game_data

# Set display options: show every column and up to 100 rows when rendering DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Render matplotlib figures inline in the notebook and apply seaborn's whitegrid style
%matplotlib inline
sns.set_style('whitegrid')

print("✓ All imports successful!")


## Load Data

Load schedules, betting lines, and play-by-play data for the 2024 season (or 2025 when available).


In [None]:
# Season to analyze. 2024 is used for now; switch to 2025 once that data is available.
SEASON = 2024

print(f"Loading {SEASON} season data...")
# load_nfl_data hands back three tables: schedules, betting lines, play-by-play
schedules, betting_lines, pbp_data = load_nfl_data(SEASON)

# Confirm the load and report the (rows, columns) shape of each table
print("\n✓ Data loaded successfully!")
for label, frame in (
    ("Schedules", schedules),
    ("Betting Lines", betting_lines),
    ("Play-by-Play", pbp_data),
):
    print(f"  {label}: {frame.shape}")


## Explore Schedules


In [None]:
# Preview the first rows of the schedules table (head() shows five by default)
# to get a feel for the available columns
schedules.head()


In [None]:
# Print each column's dtype and non-null count so that columns with
# missing values stand out
schedules.info()


In [None]:
# Summary statistics for home and away scores.
# The table is left as the cell's final expression so Jupyter renders it with
# its rich (HTML) display instead of the plain-text dump that print() produces.
print("Score Statistics:")
schedules[['home_score', 'away_score']].describe()


## Explore Betting Lines


In [None]:
# Collect every schedules column whose (case-insensitive) name mentions one of
# the betting keywords, then preview those columns
betting_keywords = ('spread', 'line', 'total')
betting_cols = [
    name
    for name in schedules.columns
    if any(keyword in name.lower() for keyword in betting_keywords)
]

print("Betting-related columns:")
print(betting_cols)
print("\nSample data:")
schedules[betting_cols].head(10)


In [None]:
# Side-by-side histograms of the spread and over/under lines in schedules
fig, (spread_ax, total_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: spread lines, with a dashed reference at zero (an even matchup)
spread_values = schedules['spread_line'].dropna()
spread_ax.hist(spread_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
spread_ax.set_xlabel('Spread Line')
spread_ax.set_ylabel('Frequency')
spread_ax.set_title('Distribution of Betting Spreads')
spread_ax.axvline(x=0, color='red', linestyle='--', label='Even', linewidth=2)
spread_ax.legend()
spread_ax.grid(alpha=0.3)

# Right panel: over/under totals
total_values = schedules['total_line'].dropna()
total_ax.hist(total_values, bins=30, edgecolor='black', alpha=0.7, color='green')
total_ax.set_xlabel('Total Line (O/U)')
total_ax.set_ylabel('Frequency')
total_ax.set_title('Distribution of Over/Under Lines')
total_ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()


## Explore Play-by-Play Data


In [None]:
# Report the size of the play-by-play table, then preview its first rows
n_plays, n_columns = pbp_data.shape

print(f"Total plays: {n_plays:,}")
print(f"\nColumns: {n_columns}")
print("\nSample data:")
pbp_data.head()


In [None]:
# EPA histograms: all offensive plays, then passing and rushing plays separately.
# Offensive plays are rows whose play_type is 'pass' or 'run' and whose EPA is present.
is_pass_or_run = pbp_data['play_type'].isin(['pass', 'run'])
has_epa = pbp_data['epa'].notna()
offensive_plays = pbp_data[is_pass_or_run & has_epa]

# Split on the `pass` / `rush` indicator columns; later cells reuse both subsets
pass_plays = offensive_plays[offensive_plays['pass'] == 1]
rush_plays = offensive_plays[offensive_plays['rush'] == 1]

# One entry per panel: (plays, bar colour, x-axis label, title stem, zero-line label).
# Only the first panel labels its zero line, so only that panel gets a legend.
epa_panels = [
    (offensive_plays, 'purple', 'EPA (Expected Points Added)', 'Distribution of EPA per Play', 'Zero EPA'),
    (pass_plays, 'blue', 'EPA', 'Passing Plays EPA', None),
    (rush_plays, 'green', 'EPA', 'Rushing Plays EPA', None),
]

fig, axes = plt.subplots(1, 3, figsize=(14, 5))

for ax, (plays, bar_color, x_label, title_stem, zero_label) in zip(axes, epa_panels):
    ax.hist(plays['epa'], bins=50, edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{title_stem}\n({len(plays):,} plays)')
    ax.axvline(x=0, color='red', linestyle='--', label=zero_label, linewidth=2)
    if zero_label is not None:
        ax.legend()
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
# Headline offensive numbers: mean EPA and the share of plays with positive EPA,
# reported separately for the pass and rush subsets built in the previous cell
epa_by_play_type = {
    "Pass": pass_plays['epa'],
    "Rush": rush_plays['epa'],
}

print("Average EPA by play type:")
for label, epa in epa_by_play_type.items():
    print(f"  {label} plays: {epa.mean():.3f}")

print("\nSuccess rate (EPA > 0):")
for label, epa in epa_by_play_type.items():
    # The mean of a boolean mask is the fraction of True values
    print(f"  {label} plays: {(epa > 0).mean():.1%}")


## Quick Analysis: Home vs Away Performance


In [None]:
# Home vs. away results for completed games.
# A game counts as completed when both scores are present; rows with a
# missing score are excluded.
completed = schedules[schedules['home_score'].notna() & schedules['away_score'].notna()].copy()

# Classify every outcome explicitly. Deriving away wins as "not a home win"
# would silently count tied games as away wins.
completed['home_win'] = completed['home_score'] > completed['away_score']
completed['away_win'] = completed['away_score'] > completed['home_score']
completed['tie'] = completed['home_score'] == completed['away_score']

n_completed = len(completed)
# The mean of a boolean column is the fraction of True values. With no
# completed games it is NaN, which prints as "nan%" rather than raising.
home_win_pct = completed['home_win'].mean()
away_win_pct = completed['away_win'].mean()
n_ties = int(completed['tie'].sum())

print(f"Completed games: {n_completed}")
print(f"Home team win percentage: {home_win_pct:.1%}")
print(f"Away team win percentage: {away_win_pct:.1%}")
if n_ties:
    print(f"Ties: {n_ties} ({completed['tie'].mean():.1%})")

# Visualize: one bar per outcome; the tie bar is only drawn when ties occurred
categories = ['Home Wins', 'Away Wins']
counts = [int(completed['home_win'].sum()), int(completed['away_win'].sum())]
colors = ['steelblue', 'coral']
if n_ties:
    categories.append('Ties')
    counts.append(n_ties)
    colors.append('lightgray')

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Number of Games')
ax.set_title(f'{SEASON} Season: Home vs Away Performance', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its count and share of completed games
for i, count in enumerate(counts):
    # Guard the share so a season with no completed games does not divide by zero
    share = count / n_completed if n_completed else 0.0
    ax.text(i, count + 2, f'{count}\n({share:.1%})',
            ha='center', fontsize=12, fontweight='bold')

fig.tight_layout()
plt.show()


## Next Steps

For deeper analysis, see:
1. **`02_spread_analysis.ipynb`** - Detailed spread coverage analysis
2. **`03_offensive_stats.ipynb`** - Offensive efficiency correlations  
3. **`04_ml_predictions.ipynb`** - Machine learning models

Or run the complete analysis pipeline:
```bash
python main.py
```

This will generate:
- Visualizations in `outputs/figures/`
- Data reports in `outputs/reports/`
- Execution log in `nfl_analysis.log`
