# 🔍 NFL Data Exploration - Deep Dive

**Comprehensive Data Analysis and Visualization**

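This notebook provides an in-depth exploration of the NFL player tracking data. To keep it fast, it reads only the first two input/output training files and analyzes a random sample of at most 50,000 input rows.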

---

## 📋 Table of Contents

1. [Setup](#setup)
2. [Data Loading](#loading)
3. [Data Dictionary](#dictionary)
4. [Statistical Analysis](#stats)
5. [Distribution Analysis](#distributions)
6. [Correlation Analysis](#correlations)
7. [Player Position Analysis](#positions)
8. [Game/Play Analysis](#games)
9. [Field Position Heatmaps](#heatmaps)
10. [Key Insights](#insights)

---

## 1. Setup 🔧

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning category so exploratory output stays uncluttered.
warnings.filterwarnings(action='ignore')

# Global look-and-feel applied to every figure produced below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

# Input / output locations, relative to the notebook's working directory.
DATA_DIR = Path('../data')
OUTPUT_DIR = Path('../outputs/data_exploration')
# Create the figure/report folder up front; harmless if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("✅ Setup complete")

## 2. Data Loading 📂

In [None]:
# Load a small, fixed subset of the training files (the first two of each kind,
# in sorted filename order) so exploration stays fast.
train_dir = DATA_DIR / 'raw' / 'train'
input_files = sorted(train_dir.glob('input_*.csv'))[:2]
output_files = sorted(train_dir.glob('output_*.csv'))[:2]

# pd.concat() on an empty list raises an opaque "No objects to concatenate";
# fail early with a message that names the directory that was searched.
if not input_files or not output_files:
    raise FileNotFoundError(
        f"No input_*.csv / output_*.csv files found in {train_dir.resolve()}"
    )

print("📂 Loading data files...")
input_df = pd.concat([pd.read_csv(f) for f in input_files], ignore_index=True)
output_df = pd.concat([pd.read_csv(f) for f in output_files], ignore_index=True)

# Sample for faster exploration
SAMPLE_SIZE = 50000
KEY_COLS = ['game_id', 'play_id', 'nfl_id', 'frame_id']
if len(input_df) > SAMPLE_SIZE:
    # Fixed seed keeps the sample (and every downstream figure) reproducible.
    input_df = input_df.sample(n=SAMPLE_SIZE, random_state=42)
    # De-duplicate the keys so the inner merge can only filter output rows,
    # never multiply them.
    sampled_keys = input_df[KEY_COLS].drop_duplicates()
    # NOTE(review): this keeps only output rows whose frame_id also occurs in
    # the sampled input rows — confirm input and output files share the same
    # frame numbering, otherwise filter on (game_id, play_id, nfl_id) instead.
    output_df = output_df.merge(sampled_keys, on=KEY_COLS, how='inner')

print(f"✅ Data loaded")
print(f"   Input: {input_df.shape}")
print(f"   Output: {output_df.shape}")

## 3. Data Dictionary 📖

Understanding the columns and their meaning

In [None]:
# Human-readable descriptions of the columns we expect, grouped by theme.
data_dict = {
    'Tracking Data': {
        'x': 'Player X position (yards, 0-120)',
        'y': 'Player Y position (yards, 0-53.3)',
        's': 'Speed (yards/second)',
        'a': 'Acceleration (yards/second²)',
        'dir': 'Direction of motion (degrees, 0-360)',
        'o': 'Orientation/body angle (degrees, 0-360)',
    },
    'Player Info': {
        'nfl_id': 'Unique player identifier',
        'player_name': 'Player name',
        'player_position': 'Player position (QB, WR, CB, etc.)',
        'player_weight': 'Player weight (pounds)',
        'player_height': 'Player height (feet-inches)',
        'player_role': 'Role in play (Passer, Targeted Receiver, etc.)',
    },
    'Game Context': {
        'game_id': 'Unique game identifier',
        'play_id': 'Unique play identifier within game',
        'frame_id': 'Frame number in play sequence',
        'ball_land_x': 'X coordinate where ball lands',
        'ball_land_y': 'Y coordinate where ball lands',
    },
}

print("📖 DATA DICTIONARY")
print("=" * 70)

# Only list entries whose column is actually present in the loaded frame;
# the section heading is printed regardless.
present_columns = set(input_df.columns)
for section_name, section_fields in data_dict.items():
    print(f"\n{section_name}:")
    documented = [
        (column_name, meaning)
        for column_name, meaning in section_fields.items()
        if column_name in present_columns
    ]
    for column_name, meaning in documented:
        print(f"  ✓ {column_name:20s} - {meaning}")

# Column counts by broad dtype family
n_total = input_df.shape[1]
n_numeric = input_df.select_dtypes(include=[np.number]).shape[1]
n_object = input_df.select_dtypes(include=['object']).shape[1]

print(f"\n\n📋 Column Info:")
print(f"   Total columns: {n_total}")
print(f"   Numeric columns: {n_numeric}")
print(f"   Categorical columns: {n_object}")

## 4. Statistical Analysis 📊

In [None]:
print("📊 STATISTICAL SUMMARY")
print("=" * 70)

# Descriptive statistics for whichever of the key numeric columns exist
key_cols = ['x', 'y', 's', 'a', 'dir', 'o', 'player_weight']
available_cols = [name for name in key_cols if name in input_df.columns]

stats_df = input_df.loc[:, available_cols].describe()
display(stats_df)

# Per-column null counts, largest first; columns without nulls are omitted
print("\n❓ Missing Values:")
null_counts = input_df.isna().sum()
missing = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
if missing.empty:
    print("   ✓ No missing values")
else:
    n_rows = len(input_df)
    for col_name, n_missing in missing.items():
        share = 100 * n_missing / n_rows
        print(f"   {col_name:20s}: {n_missing:>8,} ({share:>5.1f}%)")

# How many columns there are of each dtype
print("\n📋 Data Types:")
dtype_counts = input_df.dtypes.value_counts()
print(dtype_counts)

## 5. Distribution Analysis 📈

Analyzing distributions of key variables

In [None]:
# 3×3 grid of histograms, one panel per variable
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 14))
axes = axes.ravel()

plot_cols = ['x', 'y', 's', 'a', 'dir', 'o', 'player_weight', 'ball_land_x', 'ball_land_y']
colors = plt.cm.Set3(np.linspace(0, 1, len(plot_cols)))

for ax, col, bar_color in zip(axes, plot_cols, colors):
    if col not in input_df.columns:
        # Column absent from this dataset: its panel is simply left blank.
        continue

    values = input_df[col].dropna()
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

    # Overlay the mean (red) and median (green) as dashed reference lines
    reference_lines = (
        ('Mean', values.mean(), 'red'),
        ('Median', values.median(), 'green'),
    )
    for stat_name, stat_value, line_color in reference_lines:
        ax.axvline(stat_value, color=line_color, linestyle='--', linewidth=2,
                   label=f'{stat_name}: {stat_value:.2f}')
    ax.legend(fontsize=9)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'distributions.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Distribution plots saved")

In [None]:
# Compare speed across player roles: summary table plus a box plot.
# Skipped entirely unless both the speed and role columns are present.
if {'s', 'player_role'}.issubset(input_df.columns):
    print("🏃 SPEED ANALYSIS BY PLAYER ROLE")
    print("=" * 70)

    # One row per role, fastest average first
    speed_by_role = (
        input_df.groupby('player_role')['s']
        .agg(['mean', 'median', 'std', 'count'])
        .sort_values('mean', ascending=False)
    )
    display(speed_by_role)

    # Box plot of the raw speeds, one box per role
    fig, ax = plt.subplots(figsize=(12, 6))
    input_df.boxplot(column='s', by='player_role', ax=ax, rot=45)
    fig.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
    ax.set_title('Speed Distribution by Player Role', fontweight='bold', fontsize=14)
    ax.set_xlabel('Player Role', fontsize=12)
    ax.set_ylabel('Speed (yards/sec)', fontsize=12)
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / 'speed_by_role.png', dpi=150, bbox_inches='tight')
    plt.show()

## 6. Correlation Analysis 🔗

Exploring relationships between variables

In [None]:
# Correlation heatmap over whichever of the key numeric columns exist
numeric_cols = ['x', 'y', 's', 'a', 'dir', 'o', 'player_weight']
available_numeric = [c for c in numeric_cols if c in input_df.columns]

corr_matrix = input_df[available_numeric].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=1, cbar_kws={'label': 'Correlation'})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

# Rank variable pairs by correlation.
# The matrix is symmetric with a diagonal of self-correlations, so keep only the
# strict lower triangle: each unordered pair then appears exactly once and the
# diagonal is excluded by position (not by a fragile `< 1.0` value test, which
# would also discard genuinely perfectly-correlated pairs).
pair_mask = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
corr_pairs = (
    corr_matrix.where(pair_mask)   # everything outside the mask becomes NaN
    .unstack()                     # -> Series indexed by (column, row)
    .dropna()                      # drops masked cells and undefined correlations
    .sort_values(ascending=False)
)

print("\n🔗 Top Positive Correlations:")
for (var1, var2), corr in corr_pairs.head(5).items():
    print(f"   {var1} <-> {var2}: {corr:.3f}")

# Most negative first, mirroring the ordering of the positive list
print("\n🔗 Top Negative Correlations:")
for (var1, var2), corr in corr_pairs.sort_values(ascending=True).head(5).items():
    print(f"   {var1} <-> {var2}: {corr:.3f}")

## 7. Player Position Analysis 👥

In [None]:
# Row counts per player position, shown as a table, a bar chart and a pie chart,
# followed by mean weight per position when the weight column is available.
if 'player_position' in input_df.columns:
    print("👥 PLAYER POSITION ANALYSIS")
    print("=" * 70)

    # value_counts() already orders positions from most to least frequent
    pos_counts = input_df['player_position'].value_counts()
    print("\n📊 Position Distribution:")
    print(pos_counts)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    bar_ax, pie_ax = axes

    # Left panel: horizontal bars for the 15 most frequent positions
    pos_counts.head(15).plot.barh(ax=bar_ax, color='skyblue', edgecolor='black')
    bar_ax.set_title('Top 15 Player Positions', fontweight='bold', fontsize=14)
    bar_ax.set_xlabel('Count', fontsize=12)
    bar_ax.set_ylabel('Position', fontsize=12)
    bar_ax.grid(alpha=0.3)

    # Right panel: pie of the 10 most frequent positions
    # (percentages are relative to those 10, not to the whole dataset)
    pos_counts.head(10).plot.pie(ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set_title('Top 10 Position Distribution', fontweight='bold', fontsize=14)
    pie_ax.set_ylabel('')

    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / 'position_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Mean weight per position, heaviest first
    if 'player_weight' in input_df.columns:
        print("\n⚖️ Average Weight by Position (Top 10):")
        weight_by_pos = (
            input_df.groupby('player_position')['player_weight']
            .mean()
            .sort_values(ascending=False)
        )
        print(weight_by_pos.head(10))

## 8. Game/Play Analysis 🏈

In [None]:
def _print_count_summary(title, counts):
    """Print mean / median / min / max for a Series of per-group counts."""
    print(f"\n{title}")
    print(f"   Mean: {counts.mean():.1f}")
    print(f"   Median: {counts.median():.1f}")
    print(f"   Min: {counts.min()}")
    print(f"   Max: {counts.max()}")


print("🏈 GAME/PLAY ANALYSIS")
print("=" * 70)

has_game = 'game_id' in input_df.columns
has_play = 'play_id' in input_df.columns

# play_id is only unique *within* a game (see the data dictionary), so a play
# must be identified by (game_id, play_id). Grouping on play_id alone would
# merge unrelated plays from different games that happen to share an id.
play_key = ['game_id', 'play_id'] if has_game else ['play_id']

# NOTE: input_df may be a random row sample (see the data-loading cell), so the
# per-play counts below describe the sampled rows, not complete plays.

if has_game:
    print(f"\n📊 Number of games: {input_df['game_id'].nunique()}")

if has_play:
    print(f"📊 Number of plays: {input_df.groupby(play_key).ngroups}")

    # Plays per game
    if has_game:
        plays_per_game = input_df.groupby('game_id')['play_id'].nunique()
        _print_count_summary("📈 Plays per game:", plays_per_game)

if 'frame_id' in input_df.columns:
    # Count of distinct frame_id values across the whole frame
    print(f"\n📊 Number of frames: {input_df['frame_id'].nunique()}")

    # Frames per play
    if has_play:
        frames_per_play = input_df.groupby(play_key)['frame_id'].nunique()
        _print_count_summary("📈 Frames per play:", frames_per_play)

# Players per play
if has_play and 'nfl_id' in input_df.columns:
    players_per_play = input_df.groupby(play_key)['nfl_id'].nunique()
    _print_count_summary("👥 Players per play:", players_per_play)

## 9. Field Position Heatmaps 🗺️

Visualizing player positions on the field

In [None]:
def _style_field_axis(ax, title):
    """Apply the shared title, axis labels and 120 × 53.3 yard limits to a panel."""
    ax.set_title(title, fontweight='bold', fontsize=14)
    ax.set_xlabel('X (yards)')
    ax.set_ylabel('Y (yards)')
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 53.3)


# 2×2 figure of spatial views; requires both coordinate columns.
if 'x' in input_df.columns and 'y' in input_df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(18, 12))

    # Sample for heatmap. Seeded (same seed as the data-loading sample) so the
    # saved figure is identical on every Restart & Run All.
    sample = input_df.sample(min(10000, len(input_df)), random_state=42)

    # 1. Overall position heatmap (hexagonal bins; empty bins are not drawn)
    axes[0, 0].hexbin(sample['x'], sample['y'], gridsize=50, cmap='YlOrRd', mincnt=1)
    _style_field_axis(axes[0, 0], 'Player Position Heatmap')

    # 2. Speed heatmap: one point per sampled row, coloured by speed
    if 's' in input_df.columns:
        speed_scatter = axes[0, 1].scatter(sample['x'], sample['y'], c=sample['s'],
                                           cmap='viridis', s=1, alpha=0.5)
        _style_field_axis(axes[0, 1], 'Speed Distribution on Field')
        plt.colorbar(speed_scatter, ax=axes[0, 1], label='Speed (yards/sec)')

    # 3. Ball landing positions
    if 'ball_land_x' in input_df.columns and 'ball_land_y' in input_df.columns:
        axes[1, 0].hexbin(sample['ball_land_x'], sample['ball_land_y'],
                          gridsize=30, cmap='Blues', mincnt=1)
        _style_field_axis(axes[1, 0], 'Ball Landing Positions')

    # 4. Field zones: raw positions with reference lines
    axes[1, 1].scatter(sample['x'], sample['y'], s=1, alpha=0.3, c='blue')
    # NOTE(review): if x spans 0-120 including two 10-yard end zones, the
    # 20-yard lines would sit at x=30 and x=90 rather than x=20 — confirm the
    # intended position of the 'Red Zone' marker.
    axes[1, 1].axvline(20, color='red', linestyle='--', linewidth=2, label='Red Zone')
    axes[1, 1].axvline(60, color='green', linestyle='--', linewidth=2, label='Midfield')
    axes[1, 1].axhline(26.65, color='orange', linestyle='--', linewidth=2, label='Field Center')
    _style_field_axis(axes[1, 1], 'Field Zones')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'field_heatmaps.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✅ Field heatmaps saved")

## 10. Key Insights 💡

Summary of findings from data exploration

In [None]:
# Collect one-line findings, print them, and persist them to insights.txt.
print("💡 KEY INSIGHTS")
print("=" * 70)

insights = []

# Speed insights
if 's' in input_df.columns:
    mean_speed = input_df['s'].mean()
    max_speed = input_df['s'].max()
    insights.append(f"Average player speed: {mean_speed:.2f} yards/sec")
    insights.append(f"Maximum observed speed: {max_speed:.2f} yards/sec")

# Field coverage: extent of observed coordinates along each axis
if 'x' in input_df.columns and 'y' in input_df.columns:
    x_coverage = (input_df['x'].max() - input_df['x'].min())
    y_coverage = (input_df['y'].max() - input_df['y'].min())
    insights.append(f"Field coverage: {x_coverage:.1f} yards (length) × {y_coverage:.1f} yards (width)")

# Player diversity
if 'player_position' in input_df.columns:
    n_positions = input_df['player_position'].nunique()
    insights.append(f"Number of unique positions: {n_positions}")
    # mode() is empty when the column holds no non-null values; indexing [0]
    # would raise in that case, so only report a top position when one exists.
    position_modes = input_df['player_position'].mode()
    if not position_modes.empty:
        top_position = position_modes.iloc[0]
        insights.append(f"Most common position: {top_position}")

# Missing data: share of null cells over all cells (0 for an empty frame)
total_cells = input_df.size
missing_pct = 100 * input_df.isnull().sum().sum() / total_cells if total_cells else 0.0
insights.append(f"Overall missing data: {missing_pct:.2f}%")

# Data volume
if 'game_id' in input_df.columns and 'play_id' in input_df.columns:
    # play_id is only unique within a game, so plays are keyed by
    # (game_id, play_id); counting play_id alone would merge plays across games.
    plays = input_df.groupby(['game_id', 'play_id'])
    n_games = input_df['game_id'].nunique()
    n_plays = plays.ngroups
    insights.append(f"Games in dataset: {n_games}")
    insights.append(f"Plays in dataset: {n_plays}")
    if 'frame_id' in input_df.columns:
        insights.append(f"Frames per play (avg): {plays['frame_id'].nunique().mean():.1f}")

# Print insights
for i, insight in enumerate(insights, 1):
    print(f"\n{i}. {insight}")

# Save insights. Explicit UTF-8 because the text contains non-ASCII characters
# (e.g. "×"), which the platform default encoding may not be able to write.
with open(OUTPUT_DIR / 'insights.txt', 'w', encoding='utf-8') as f:
    f.write("KEY INSIGHTS FROM DATA EXPLORATION\n")
    f.write("=" * 70 + "\n\n")
    for i, insight in enumerate(insights, 1):
        f.write(f"{i}. {insight}\n")

print("\n✅ Insights saved to insights.txt")

---

## 🎉 Exploration Complete!

### What we learned:
- ✅ Data structure and column meanings
- ✅ Statistical distributions and patterns
- ✅ Correlations between features
- ✅ Player position characteristics
- ✅ Spatial patterns on the field

### Next Steps:
1. Use insights for feature engineering (`03_feature_engineering.ipynb`)
2. Build and compare models (`04_model_comparison.ipynb`)
3. Try sequence modeling (`05_lstm_sequence_modeling.ipynb`)

---