# Exploratory Data Analysis for Sports Analytics

This notebook explores and analyzes sports data to understand patterns and relationships that can be used for game outcome prediction.

In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Set the style for the visualizations
sns.set(style="whitegrid")

# Increase the font size for better readability
plt.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14
})

# Make plots larger by default
%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 8]

## 1. Data Loading and Inspection

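First, let's set up our sports data and examine its structure. Until the real game file is available, the cell below generates a small synthetic dataset (random teams and scores) so the rest of the notebook can be run end to end.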

In [None]:
# This path will be updated once we have data
# data_path = '../data/raw/games.csv'

# For now, let's create some sample data for demonstration.
# NOTE: scores are drawn uniformly at random, so this data carries no real
# signal (no home advantage, no team strength); it only exercises the code.

# Create a list of teams
teams = ['Team A', 'Team B', 'Team C', 'Team D', 'Team E', 'Team F', 'Team G', 'Team H']

# Generate sample game data
np.random.seed(42)  # For reproducibility

# Create dates for the games (one game per day for 50 days)
start_date = datetime.datetime(2023, 1, 1)
dates = [start_date + datetime.timedelta(days=i) for i in range(50)]

# Create a list to store game data
games_data = []

for game_date in dates:
    # Randomly select two distinct teams; the first one plays at home
    home_team, away_team = np.random.choice(teams, size=2, replace=False)

    # Generate random scores in the half-open range [70, 110)
    home_score = np.random.randint(70, 110)
    away_score = np.random.randint(70, 110)

    # The outcome column only knows 'home_win' / 'away_win'. A drawn score
    # would otherwise fall through to 'away_win' and bias the home-vs-away
    # analysis, so redraw the away score until the game has a winner.
    while away_score == home_score:
        away_score = np.random.randint(70, 110)

    # Determine the outcome
    outcome = 'home_win' if home_score > away_score else 'away_win'

    # Add a row to the data
    games_data.append({
        'date': game_date.strftime('%Y-%m-%d'),
        'home_team': home_team,
        'away_team': away_team,
        'home_score': home_score,
        'away_score': away_score,
        'outcome': outcome
    })

# Create a DataFrame (one row per game)
df = pd.DataFrame(games_data)

# Display the first few rows
df.head()

In [None]:
# Quick structural overview of the dataset: size, column types, null counts
print("Dataset shape:", df.shape)

column_types = df.dtypes
print("\nData types:")
print(column_types)

null_counts = df.isnull().sum()
print("\nMissing values:")
print(null_counts)

# The numeric summary is left as the last expression so it renders as a table
print("\nSummary statistics:")
df.describe()

## 2. Basic Exploratory Analysis

Now, let's explore some basic statistics and distributions in our data.

In [None]:
# Calculate the win-loss record for each team

# Function to count wins and losses for a team
def count_wins_losses(team, games=None):
    """Summarise one team's win/loss record.

    Parameters
    ----------
    team : str
        Team name as it appears in the ``home_team`` / ``away_team`` columns.
    games : pandas.DataFrame, optional
        Game log with ``home_team``, ``away_team`` and ``outcome`` columns,
        where ``outcome`` is ``'home_win'`` or ``'away_win'``. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    dict
        Keys ``team``, ``wins``, ``losses``, ``total_games`` and ``win_rate``.
        ``win_rate`` is NaN when the team appears in no games, instead of
        raising ``ZeroDivisionError``.
    """
    if games is None:
        games = df

    played_home = games['home_team'] == team
    played_away = games['away_team'] == team
    home_side_won = games['outcome'] == 'home_win'
    away_side_won = games['outcome'] == 'away_win'

    # A team wins when it is on the side that won, and loses otherwise.
    wins = int((played_home & home_side_won).sum() + (played_away & away_side_won).sum())
    losses = int((played_home & away_side_won).sum() + (played_away & home_side_won).sum())
    total_games = wins + losses

    return {
        'team': team,
        'wins': wins,
        'losses': losses,
        'total_games': total_games,
        # Undefined (NaN) rather than a crash for a team with no games.
        'win_rate': wins / total_games if total_games else float('nan')
    }

# One record per team, collected in roster order
team_records = [count_wins_losses(name) for name in teams]

# Turn the records into a table ranked by win rate, best team first
team_records_df = pd.DataFrame(team_records).sort_values(by='win_rate', ascending=False)

# Display the team records
team_records_df

In [None]:
# Visualize the win-loss record for each team

# Create the figure explicitly and pass its axes to pandas. Calling
# plt.figure() and then DataFrame.plot() without `ax=` makes pandas open a
# second figure, leaving an empty one in the cell output.
fig, ax = plt.subplots(figsize=(12, 8))

# Create a stacked bar chart (bars follow the row order of team_records_df)
team_records_df.plot(
    x='team',
    y=['wins', 'losses'],
    kind='bar',
    stacked=True,
    color=['#1f77b4', '#ff7f0e'],
    ax=ax,
)

# Add labels and title
ax.set_xlabel('Team')
ax.set_ylabel('Number of Games')
ax.set_title('Win-Loss Record by Team')

# Add a legend
ax.legend(title='Result')

# Add value labels on the bars
for container in ax.containers:
    ax.bar_label(container, label_type='center')

# Rotate the x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Tight layout to ensure everything fits
fig.tight_layout()

# Display the plot
plt.show()

In [None]:
# Visualize the win rate for each team

# Create the figure explicitly and pass its axes to pandas. Without `ax=`,
# DataFrame.plot() opens its own figure, so the (12, 6) size requested here
# would only apply to an empty figure and never reach the actual chart.
fig, ax = plt.subplots(figsize=(12, 6))

# Create a bar chart (bars follow the row order of team_records_df)
team_records_df.plot(x='team', y='win_rate', kind='bar', color='#2ca02c', ax=ax)

# Add labels and title
ax.set_xlabel('Team')
ax.set_ylabel('Win Rate')
ax.set_title('Win Rate by Team')

# Add a horizontal line at 0.5 (50% win rate)
ax.axhline(y=0.5, color='r', linestyle='--', label='50% Win Rate')

# Add value labels just above the bars; bar i sits at x position i
for position, rate in enumerate(team_records_df['win_rate']):
    if pd.notna(rate):  # nothing sensible to print for an undefined rate
        ax.text(position, rate + 0.01, f'{rate:.2f}', ha='center')

# Rotate the x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Set the y-axis limits (win rate is a fraction)
ax.set_ylim(0, 1)

# Add a legend (rebuilt so it includes the 50% reference line)
ax.legend()

# Tight layout to ensure everything fits
fig.tight_layout()

# Display the plot
plt.show()

## 3. Score Analysis

Let's analyze the scoring patterns in our games.

In [None]:
# Calculate the average score for each team

# Function to calculate average scores for a team
def calculate_average_scores(team, games=None):
    """Compute a team's average points scored and allowed across all its games.

    Parameters
    ----------
    team : str
        Team name as it appears in the ``home_team`` / ``away_team`` columns.
    games : pandas.DataFrame, optional
        Game log with ``home_team``, ``away_team``, ``home_score`` and
        ``away_score`` columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        Keys ``team``, ``avg_points_scored``, ``avg_points_allowed`` and
        ``point_differential`` (scored minus allowed). All three averages are
        NaN when the team appears in no games.
    """
    if games is None:
        games = df

    home_games = games[games['home_team'] == team]
    away_games = games[games['away_team'] == team]
    total_games = len(home_games) + len(away_games)

    if total_games == 0:
        # No games: the averages are undefined. Return NaN explicitly rather
        # than dividing by zero.
        avg_points_scored = avg_points_allowed = float('nan')
    else:
        # Averages are taken over home and away games together, so each game
        # carries equal weight regardless of venue.
        points_scored = home_games['home_score'].sum() + away_games['away_score'].sum()
        points_allowed = home_games['away_score'].sum() + away_games['home_score'].sum()
        avg_points_scored = points_scored / total_games
        avg_points_allowed = points_allowed / total_games

    return {
        'team': team,
        'avg_points_scored': avg_points_scored,
        'avg_points_allowed': avg_points_allowed,
        'point_differential': avg_points_scored - avg_points_allowed
    }

# One scoring summary per team, collected in roster order
team_scores = [calculate_average_scores(name) for name in teams]

# Turn the summaries into a table ranked by point differential, best first
team_scores_df = pd.DataFrame(team_scores).sort_values(by='point_differential', ascending=False)

# Display the team scores
team_scores_df

In [None]:
# Grouped bar chart: average points scored vs. allowed, one pair per team
fig, ax = plt.subplots(figsize=(12, 8))

bar_width = 0.35
positions = np.arange(len(team_scores_df))

# "Scored" bars sit on the integer positions, "allowed" bars one width to the right
ax.bar(positions, team_scores_df['avg_points_scored'], bar_width,
       label='Points Scored', color='#1f77b4')
ax.bar(positions + bar_width, team_scores_df['avg_points_allowed'], bar_width,
       label='Points Allowed', color='#ff7f0e')

# Axis labels and title
ax.set_xlabel('Team')
ax.set_ylabel('Average Points')
ax.set_title('Average Points Scored and Allowed by Team')

# Legend for the two series
ax.legend()

# Centre each tick between the two bars of its team
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(team_scores_df['team'], rotation=45, ha='right')

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of each team's average point differential
fig, ax = plt.subplots(figsize=(12, 6))

differentials = team_scores_df['point_differential']
bars = ax.bar(team_scores_df['team'], differentials)

# Green for a strictly positive differential, red otherwise
for bar, diff in zip(bars, differentials):
    bar.set_color('#2ca02c' if diff > 0 else '#d62728')

# Axis labels and title
ax.set_xlabel('Team')
ax.set_ylabel('Point Differential')
ax.set_title('Average Point Differential by Team')

# Faint zero line separating positive from negative differentials
ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)

# Value labels: nudged above non-negative bars and below negative ones
for position, diff in enumerate(differentials):
    nudge = 0.5 if diff >= 0 else -0.5
    ax.text(position, diff + nudge, f'{diff:.2f}', ha='center')

# Slant the team names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

## 4. Home vs. Away Performance

Let's analyze the performance of teams at home versus away.

In [None]:
# League-wide tally of games won by the home side vs. the away side
total_games = len(df)
outcomes = df['outcome']
home_wins = int((outcomes == 'home_win').sum())
away_wins = int((outcomes == 'away_win').sum())

print(f"Home Win Rate: {home_wins / total_games:.2f} ({home_wins} out of {total_games} games)")
print(f"Away Win Rate: {away_wins / total_games:.2f} ({away_wins} out of {total_games} games)")

In [None]:
# Pie chart of the league-wide split between home wins and away wins
fig, ax = plt.subplots(figsize=(10, 6))

# The home-win slice is pulled out slightly to emphasise it
ax.pie(
    [home_wins, away_wins],
    labels=['Home Wins', 'Away Wins'],
    autopct='%1.1f%%',
    colors=['#1f77b4', '#ff7f0e'],
    explode=(0.1, 0),
)

# Title
ax.set_title('Home vs. Away Win Distribution')

# Render the figure
plt.show()

In [None]:
# Calculate home and away performance for each team
home_away_performance = []

for team in teams:
    # Per-team counters use a `team_` prefix on purpose: the league-wide
    # `home_wins` / `away_wins` totals are read by the pie-chart cell, and
    # reusing those names here would overwrite them with the last team's counts.

    # Home performance
    team_home_games = df[df['home_team'] == team]
    team_home_wins = len(team_home_games[team_home_games['outcome'] == 'home_win'])
    team_home_losses = len(team_home_games[team_home_games['outcome'] == 'away_win'])
    team_home_total = team_home_wins + team_home_losses
    # A team with no home games is reported with a rate of 0
    home_win_rate = team_home_wins / team_home_total if team_home_total > 0 else 0

    # Away performance
    team_away_games = df[df['away_team'] == team]
    team_away_wins = len(team_away_games[team_away_games['outcome'] == 'away_win'])
    team_away_losses = len(team_away_games[team_away_games['outcome'] == 'home_win'])
    team_away_total = team_away_wins + team_away_losses
    # A team with no away games is reported with a rate of 0
    away_win_rate = team_away_wins / team_away_total if team_away_total > 0 else 0

    home_away_performance.append({
        'team': team,
        'home_win_rate': home_win_rate,
        'away_win_rate': away_win_rate,
        # Positive means the team does better at home than on the road
        'home_away_diff': home_win_rate - away_win_rate
    })

# Create a DataFrame with the home-away performance, largest home edge first
home_away_df = pd.DataFrame(home_away_performance)
home_away_df = home_away_df.sort_values(by='home_away_diff', ascending=False)

# Display the home-away performance
home_away_df

In [None]:
# Scatter of each team's home win rate (x) against its away win rate (y)
fig, ax = plt.subplots(figsize=(12, 8))

home_rates = home_away_df['home_win_rate']
away_rates = home_away_df['away_win_rate']
ax.scatter(home_rates, away_rates, alpha=0.8, s=100)

# Label every point with its team name, offset slightly from the marker
for name, home_rate, away_rate in zip(home_away_df['team'], home_rates, away_rates):
    ax.annotate(name, (home_rate, away_rate), xytext=(5, 5), textcoords='offset points')

# Reference diagonal: points below it belong to teams that are better at home
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Home Win Rate = Away Win Rate')

# Axis labels and title
ax.set_xlabel('Home Win Rate')
ax.set_ylabel('Away Win Rate')
ax.set_title('Home vs. Away Win Rate by Team')

# Both axes are fractions, so pin them to [0, 1]
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# Dashed gridlines
ax.grid(True, linestyle='--', alpha=0.7)

# Legend for the reference diagonal
ax.legend()

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

## 5. Correlation Analysis

Finally, let's look at the relationships between different performance metrics.

In [None]:
# Combine the three per-team tables (record, scoring, home/away split) into
# one wide table, joining on the shared 'team' column
records_with_scores = team_records_df.merge(team_scores_df, on='team')
team_stats = records_with_scores.merge(home_away_df, on='team')

# Display the merged DataFrame
team_stats

In [None]:
# Metrics to compare against each other
correlation_columns = [
    'win_rate',
    'avg_points_scored',
    'avg_points_allowed',
    'point_differential',
    'home_win_rate',
    'away_win_rate',
    'home_away_diff',
]

# Pairwise correlations between the selected metrics, across teams
metric_table = team_stats[correlation_columns]
correlation_matrix = metric_table.corr()

# Display the correlation matrix
correlation_matrix

In [None]:
# Heatmap of the metric correlations
fig, ax = plt.subplots(figsize=(12, 10))

# Colour scale fixed to [-1, 1] and centred on 0 so the sign reads off the colour
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    ax=ax,
)

# Title
ax.set_title('Correlation Matrix of Team Performance Metrics')

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

In [None]:
# Visualize the relationship between point differential and win rate
fig, ax = plt.subplots(figsize=(12, 8))

diff_values = team_stats['point_differential']
rate_values = team_stats['win_rate']

# Create a scatter plot (one point per team)
ax.scatter(diff_values, rate_values, alpha=0.8, s=100)

# Add team labels to each point
for name, diff, rate in zip(team_stats['team'], diff_values, rate_values):
    ax.annotate(name, (diff, rate), xytext=(5, 5), textcoords='offset points')

# Add a least-squares regression line. `stats` (scipy.stats) is imported in
# the notebook's import cell. Only the slope, intercept and r-value of the
# fit are needed here.
fit = stats.linregress(diff_values, rate_values)
# Dedicated names for the fitted line, so generic notebook globals such as
# `x` (used by an earlier plotting cell) are not overwritten.
fit_x = np.linspace(diff_values.min(), diff_values.max(), 100)
fit_y = fit.slope * fit_x + fit.intercept
ax.plot(fit_x, fit_y, 'r-', label=f'R² = {fit.rvalue**2:.2f}')

# Add labels and title
ax.set_xlabel('Point Differential')
ax.set_ylabel('Win Rate')
ax.set_title('Relationship Between Point Differential and Win Rate')

# Add a horizontal line at 0.5 win rate
ax.axhline(y=0.5, color='k', linestyle='--', alpha=0.3)

# Add a vertical line at 0 point differential
ax.axvline(x=0, color='k', linestyle='--', alpha=0.3)

# Add gridlines
ax.grid(True, linestyle='--', alpha=0.7)

# Add a legend
ax.legend()

# Tight layout to ensure everything fits
fig.tight_layout()

# Display the plot
plt.show()

## 6. Conclusions and Next Steps

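From our exploratory analysis, we can draw several tentative conclusions. Note that the games analyzed above are randomly generated placeholders, so the points below describe the patterns this analysis is designed to surface and must be re-checked against the actual outputs once real game data is loaded: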

1. There is a strong correlation between point differential and win rate, which makes intuitive sense.
2. Teams tend to perform better at home than away, as evidenced by the home win rate being higher than the away win rate.
3. Some teams show significant differences in their home versus away performance.

Next steps for our analytics pipeline:

1. Build more advanced features that capture team form and momentum.
2. Engineer features that account for head-to-head matchups and historical performance.
3. Develop a predictive model using the engineered features to forecast game outcomes.
4. Evaluate the model's performance and refine it as needed.
5. Create a dashboard for visualizing predictions and team analytics.