# NFL Game Prediction - Explorative Datenanalyse
## Verstehe die Daten, bevor du modellierst!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nfl_data_py as nfl

# Notebook-wide plot appearance: seaborn "whitegrid" theme and a
# default figure size of 12x6 inches for every figure created without an
# explicit figsize.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

%matplotlib inline

## 1. Daten Laden

In [None]:
# Load the game schedules for the 2018-2023 seasons (end of range is exclusive)
years = list(range(2018, 2024))
games = nfl.import_schedules(years)

# Keep regular-season games that have a final score.
# Rows with a missing score (e.g. games not played yet or cancelled) would
# otherwise be counted as home losses further down, because comparing NaN
# scores evaluates to False.
is_regular_season = games['game_type'] == 'REG'
has_final_score = games['home_score'].notna() & games['away_score'].notna()
# .copy() detaches the filtered frame so later column assignments do not
# trigger SettingWithCopyWarning.
games = games[is_regular_season & has_final_score].copy()
games['gameday'] = pd.to_datetime(games['gameday'])

print(f"Anzahl Spiele: {len(games)}")
print(f"Zeitraum: {games['gameday'].min()} bis {games['gameday'].max()}")
# Last expression of the cell: the notebook renders the first rows as a table
games.head()

## 2. Home Field Advantage

In [None]:
# Flag every game as a home win (1) or not (0).
# The comparison is strict, so ties are counted as 0 just like home losses.
games['home_win'] = games['home_score'].gt(games['away_score']).astype(int)

# Overall share of games won by the home team
home_win_rate = games['home_win'].mean()
print(f"Home Win Rate: {home_win_rate:.2%}")

# The same share, broken down by season
home_win_by_season = games.groupby('season')['home_win'].mean()

fig, ax = plt.subplots(figsize=(10, 5))
home_win_by_season.plot.bar(ax=ax, color='steelblue')
# Reference line at 50 %: the rate one would see without any home advantage
ax.axhline(y=0.5, color='red', linestyle='--', label='50% (kein Vorteil)')
ax.set_xlabel('Saison')
ax.set_ylabel('Home Win Rate')
ax.set_title('Home Field Advantage über die Jahre')
ax.legend()
# Zoom the y-axis so season-to-season differences stay visible
ax.set_ylim([0.4, 0.65])
fig.tight_layout()
plt.show()

## 3. Punkteverteilung

In [None]:
# Combined points of both teams for every game
games['total_points'] = games['home_score'] + games['away_score']

# One histogram panel per entry: (column, bar color, x-axis label, title)
panels = [
    ('home_score', 'green', 'Points', 'Home Team Score Distribution'),
    ('away_score', 'orange', 'Points', 'Away Team Score Distribution'),
    ('total_points', 'purple', 'Total Points', 'Total Points per Game'),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (column, bar_color, x_label, title) in zip(axes, panels):
    values = games[column]
    column_mean = values.mean()
    ax.hist(values, bins=30, color=bar_color, alpha=0.7, edgecolor='black')
    # Dashed red line marks the mean of the plotted column
    ax.axvline(column_mean, color='red', linestyle='--', label=f'Mean: {column_mean:.1f}')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

# Print the same three means with two decimals
for caption, column in (
    ('Home Score', 'home_score'),
    ('Away Score', 'away_score'),
    ('Total Points', 'total_points'),
):
    print(f"Durchschnitt {caption}: {games[column].mean():.2f}")

## 4. Punktedifferenz

In [None]:
# Point differential from the home team's perspective (positive = home win)
games['point_diff'] = games['home_score'] - games['away_score']
mean_diff = games['point_diff'].mean()

plt.figure(figsize=(12, 6))
plt.hist(games['point_diff'], bins=50, color='steelblue', alpha=0.7, edgecolor='black')
# Red line at 0 marks a tie; green line marks the average differential
plt.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Unentschieden')
plt.axvline(mean_diff, color='green', linestyle='--', linewidth=2, label=f'Mean: {mean_diff:.2f}')
plt.xlabel('Point Differential (Home - Away)')
plt.ylabel('Frequency')
plt.title('Point Differential Distribution')
plt.legend()
plt.tight_layout()
plt.show()

# Margin of victory regardless of which side won; computed once and reused
margin = games['point_diff'].abs()

# Close games: decided by at most 7 points (inclusive, matching the printed label)
close_games = games[margin <= 7]
# Blowouts: decided by more than 14 points
blowouts = games[margin > 14]

print(f"\nClose Games (<= 7 Punkte): {len(close_games)} ({len(close_games)/len(games):.1%})")
print(f"Blowouts (> 14 Punkte): {len(blowouts)} ({len(blowouts)/len(games):.1%})")

## 5. Team Performance

In [None]:
# Win rate per team over all of its home and away games.
# Ties count as a played game but not as a win for either side.
# NOTE(review): teams are keyed by the abbreviation found in the schedule
# data; if a franchise changed its abbreviation within the loaded seasons it
# would appear as two separate teams here - verify against the data.
away_win = games['away_score'] > games['home_score']

home_wins = games.groupby('home_team')['home_win'].sum()
# Vectorised grouped sum instead of groupby().apply(lambda ...): same result,
# without a Python-level call per team and without the pandas deprecation
# warning about apply operating on the grouping column.
away_wins = away_win.groupby(games['away_team']).sum()

home_games = games.groupby('home_team').size()
away_games = games.groupby('away_team').size()

# fill_value=0 keeps a team that appears on only one side (home-only or
# away-only) instead of turning its totals into NaN during index alignment.
total_games = home_games.add(away_games, fill_value=0)
total_wins = home_wins.add(away_wins, fill_value=0)

win_rate = (total_wins / total_games).sort_values(ascending=False)

# Best ten and worst ten teams side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Top 10: first ten entries of the descending ranking
win_rate.head(10).plot(kind='barh', ax=axes[0], color='green')
axes[0].set_xlabel('Win Rate')
axes[0].set_title('Top 10 Teams (2018-2023)')
axes[0].set_xlim([0, 1])

# Bottom 10: last ten entries of the descending ranking
win_rate.tail(10).plot(kind='barh', ax=axes[1], color='red')
axes[1].set_xlabel('Win Rate')
axes[1].set_title('Bottom 10 Teams (2018-2023)')
axes[1].set_xlim([0, 1])

plt.tight_layout()
plt.show()

## 6. Zeitliche Trends

In [None]:
# Average total points and home win rate per week number, pooled over all
# loaded seasons
weekly_stats = games.groupby('week')[['total_points', 'home_win']].mean().reset_index()

fig, axes = plt.subplots(2, 1, figsize=(14, 10))
points_ax, wins_ax = axes

weeks = weekly_stats['week']
# Marker/line settings shared by both curves
line_style = dict(marker='o', linewidth=2, markersize=8)

# Upper panel: average total points by week
points_ax.plot(weeks, weekly_stats['total_points'], **line_style)
points_ax.set_xlabel('Week')
points_ax.set_ylabel('Average Total Points')
points_ax.set_title('Average Total Points by Week')
points_ax.grid(True, alpha=0.3)

# Lower panel: home win rate by week, with a 50 % reference line
wins_ax.plot(weeks, weekly_stats['home_win'], color='green', **line_style)
wins_ax.axhline(y=0.5, color='red', linestyle='--', label='50%')
wins_ax.set_xlabel('Week')
wins_ax.set_ylabel('Home Win Rate')
wins_ax.set_title('Home Win Rate by Week')
wins_ax.legend()
wins_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Korrelationsanalyse

In [None]:
# Columns included in the pairwise correlation
numeric_features = ['home_score', 'away_score', 'total_points', 'point_diff', 'week', 'home_win']
correlation_matrix = games.loc[:, numeric_features].corr()

# Annotated heatmap; the diverging colormap is centered at 0 so positive and
# negative correlations get opposite colors
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

## 8. Key Insights

Basierend auf dieser EDA:

1. **Home Field Advantage existiert**: ~57% Home Win Rate
2. **Punkteverteilung**: Annähernd normalverteilt, Mittelwert ~23 Punkte pro Team
3. **Close Games**: ~40% der Spiele sind "close" (≤7 Punkte)
4. **Varianz**: Die NFL ist sehr kompetitiv, selbst die besten Teams verlieren ~30% ihrer Spiele
5. **Zeitliche Muster**: Leichte Trends über die Saison (z.B. mehr Punkte später in der Saison)

### Implikationen für ML:
- Accuracy von ~65% wäre sehr gut (vs. 57% Baseline, d.h. immer auf das Heimteam tippen)
- Features für "Form" (recent performance) wichtig
- Home/Away Split Features essentiell
- Point Differential besser als Win/Loss als Target?

## 9. Nächste Schritte

1. **Feature Engineering**: Rolling averages, Elo ratings, advanced stats
2. **Model Selection**: Logistic Regression → Random Forest → XGBoost
3. **Validation**: Time-based split, Cross-Validation
4. **Tuning**: Hyperparameter optimization
5. **Evaluation**: Multiple metrics, feature importance analysis