# NFL Data Exploratory Analysis

This notebook explores the NFL data loaded from nflfastR to understand patterns and prepare for model development.

## Contents
1. Data Loading
2. Basic Statistics
3. Team Performance Analysis
4. Feature Exploration
5. Target Variable Analysis


In [None]:
# Setup and imports
import sys
import os
# Put the project's `src` directory at the front of the import path. The path
# is relative, so os.path.abspath resolves it against the kernel's current
# working directory; this must run before the project imports further down.
sys.path.insert(0, os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Blanket filter: hides every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set plotting style
# NOTE: the 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Import our modules
# (presumably resolved through the `../src` entry added to sys.path above)
from data.nfl_data_loader import NFLDataLoader
from data.data_processor import NFLDataProcessor
from utils.config import get_config
from utils.logger import setup_logger

# Initialize
# Notebook-wide logger and project configuration used by later cells.
logger = setup_logger("eda_notebook")
config = get_config()

print("Setup complete!")


## 1. Data Loading

Load NFL play-by-play data using our data loader. We use cached data when it is available and fall back to generated sample data otherwise.


In [None]:
# Load data
loader = NFLDataLoader()
processor = NFLDataProcessor()

# Try to load real data, fall back to sample if needed
seasons = [2022, 2023]

# Start from None so the sample-data fallback below runs both when the cache
# has nothing for these seasons and when the lookup itself raises. Without
# this, an exception would leave `pbp_data` undefined and the final print
# would fail with a NameError.
pbp_data = None
try:
    # Try loading from cache
    pbp_data = loader.get_cached_data('pbp', seasons)
except Exception as e:
    print(f"Error loading data: {e}")
    print("Using sample data...")

if pbp_data is None:
    print("Creating sample data for demonstration...")
    # Synthetic play-by-play frame for demonstration only. Every column is an
    # independent random draw, so the team columns are NOT consistent with
    # `game_id` (and `posteam` may equal `defteam` on a given row).
    np.random.seed(42)  # fixed seed keeps the sample reproducible across runs
    teams = ['BUF', 'KC', 'PHI', 'SF', 'DAL', 'MIA', 'CIN', 'MIN']
    plays_per_game = 50

    # Pair consecutive teams into one game each: (BUF, KC), (PHI, SF), ...
    game_ids = [
        f'2023_01_{first}_{second}'
        for first, second in zip(teams[::2], teams[1::2])
    ]
    n_plays = len(game_ids) * plays_per_game

    # The order of the random draws below is kept stable so the generated
    # values stay the same for a given seed.
    pbp_data = pd.DataFrame({
        'game_id': np.repeat(game_ids, plays_per_game),
        'play_id': range(n_plays),
        'posteam': np.random.choice(teams, n_plays),
        'defteam': np.random.choice(teams, n_plays),
        'home_team': np.random.choice(teams[:4], n_plays),
        'away_team': np.random.choice(teams[4:], n_plays),
        'play_type': np.random.choice(['pass', 'run'], n_plays),
        'yards_gained': np.random.normal(5, 8, n_plays),
        'epa': np.random.normal(0, 1.5, n_plays),
        'wp': np.random.uniform(0.2, 0.8, n_plays),
        'interception': np.random.choice([0, 1], n_plays, p=[0.95, 0.05]),
        'fumble_lost': np.random.choice([0, 1], n_plays, p=[0.98, 0.02])
    })

print(f"Loaded {len(pbp_data)} plays")
pbp_data.head()


## 2. Basic Statistics


In [None]:
# Basic statistics: headline counts, play-type mix, then a numeric summary
summary_columns = ['yards_gained', 'epa', 'wp']

print("Dataset Overview:")
total_plays = len(pbp_data)
print(f"Total plays: {total_plays}")
game_count = pbp_data['game_id'].nunique()
print(f"Unique games: {game_count}")
team_count = pbp_data['posteam'].nunique()
print(f"Unique teams: {team_count}")

print("\nPlay type distribution:")
play_type_counts = pbp_data['play_type'].value_counts()
print(play_type_counts)

print("\nNumeric columns summary:")
# Left as the cell's last expression so the notebook renders it as a table.
pbp_data[summary_columns].describe()


## 3. Visualizations


In [None]:
# Create visualizations: a 2x2 grid of summary plots for the play-by-play data
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_yards, ax_epa), (ax_play_type, ax_wp) = axes

# Top-left: yards gained per play, with the sample mean marked
yards = pbp_data['yards_gained']
yards_mean = yards.mean()
ax_yards.hist(yards, bins=30, edgecolor='black', alpha=0.7)
ax_yards.axvline(yards_mean, color='red', linestyle='--', label=f'Mean: {yards_mean:.2f}')
ax_yards.set(
    title='Distribution of Yards Gained per Play',
    xlabel='Yards Gained',
    ylabel='Frequency',
)
ax_yards.legend()

# Top-right: EPA per play, with a reference line at zero
ax_epa.hist(pbp_data['epa'], bins=30, edgecolor='black', alpha=0.7, color='green')
ax_epa.axvline(0, color='red', linestyle='--', label='Zero EPA')
ax_epa.set(
    title='Distribution of EPA (Expected Points Added)',
    xlabel='EPA',
    ylabel='Frequency',
)
ax_epa.legend()

# Bottom-left: mean yards gained for each play type
play_stats = pbp_data.groupby('play_type')['yards_gained'].mean()
ax_play_type.bar(play_stats.index, play_stats.values, color=['blue', 'orange'])
ax_play_type.set(
    title='Average Yards by Play Type',
    xlabel='Play Type',
    ylabel='Average Yards',
)

# Bottom-right: win probability across plays
ax_wp.hist(pbp_data['wp'], bins=20, edgecolor='black', alpha=0.7, color='purple')
ax_wp.set(
    title='Win Probability Distribution',
    xlabel='Win Probability',
    ylabel='Frequency',
)

plt.tight_layout()
plt.show()
