# Mount Diablo Challenge: Statistical Analysis

This notebook performs statistical analysis on the race results and weather data.

## Analysis Goals:
1. Calculate yearly race time statistics
2. Analyze weather conditions across years
3. Correlate weather with race performance
4. Identify trends and patterns
5. Summarize participant demographics (gender and age)

In [None]:
# Setup: make the project modules importable, then load the analysis stack.
import sys
import os

# Put the parent of the current working directory, and that parent's 'src'
# subfolder, at the front of the import path ('src' is inserted last, so it
# is searched first). This depends on the kernel's working directory, so the
# imports below only resolve when the notebook is launched from a folder that
# sits directly under that parent.
# NOTE(review): presumably `database`, `analysis` and `config` live in one of
# these two folders - confirm against the repository layout.
sys.path.insert(0, os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Project-local modules, resolved through the sys.path entries added above.
from database import DiabloDatabase
from analysis import DiabloAnalyzer
import config

# Third-party data and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn 'whitegrid' style and a (12, 6)
# default figure size; individual cells pass their own figsize where needed.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data from Database

In [None]:
# Create the analyzer; every dataset used in this notebook is fetched through it.
analyzer = DiabloAnalyzer()

# Load the race results into a DataFrame. Later cells read its 'year',
# 'chip_time_seconds', 'gender' and 'age' columns.
df_race = analyzer.get_race_results_df()

# Quick sanity summary: row count, distinct years, overall shape, column names.
print(f"Loaded {len(df_race)} race results")
print(f"Years available: {sorted(df_race['year'].unique())}")
print(f"\nData shape: {df_race.shape}")
print(f"Columns: {df_race.columns.tolist()}")

In [None]:
# Preview the first rows of the race results (shown as the cell output).
df_race.head()

In [None]:
# Load the weather data through the same analyzer.
df_weather = analyzer.get_weather_data_df()

# Report how many records were loaded and which distinct 'location' values occur.
print(f"\nLoaded {len(df_weather)} weather records")
print(f"Locations: {df_weather['location'].unique()}")

# Preview the first rows (shown as the cell output).
df_weather.head()

## 2. Race Time Statistics by Year

In [None]:
# Per-year race-time summary, computed by the analyzer.
stats = analyzer.calculate_yearly_statistics()

print("Yearly Race Statistics:")
# Bare last expression so the table is rendered as the cell output.
stats

In [None]:
# Box plot of chip-time distributions, one box per race year.
fig, ax = plt.subplots(figsize=(14, 6))

# Convert to minutes for readability. The column is stored on df_race because
# later cells (violin plot, gender averages, key insights) read
# 'chip_time_minutes'; re-running this cell recomputes the same values.
df_race['chip_time_minutes'] = df_race['chip_time_seconds'] / 60

# The figure size is already fixed by plt.subplots above. pandas only uses
# `figsize` when it creates the figure itself, so it is not repeated here
# alongside `ax`.
df_race.boxplot(column='chip_time_minutes', by='year', ax=ax)
ax.set_xlabel('Year', fontsize=12, fontweight='bold')
ax.set_ylabel('Chip Time (minutes)', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Chip Times by Year', fontsize=14, fontweight='bold')

# boxplot(by=...) adds its own "Boxplot grouped by ..." figure title; blank it
# so only the axes title set above is shown.
fig.suptitle('')
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
plt.show()

In [None]:
# Violin plot: the same per-year chip-time distributions, drawn as density
# shapes. Reads the 'chip_time_minutes' column that the box-plot cell adds to
# df_race.
fig, ax = plt.subplots(figsize=(14, 6))
bold_label = {'fontsize': 12, 'fontweight': 'bold'}

sns.violinplot(x='year', y='chip_time_minutes', data=df_race, ax=ax)

ax.set_title('Chip Time Distribution by Year (Violin Plot)', fontsize=14, fontweight='bold')
ax.set_xlabel('Year', **bold_label)
ax.set_ylabel('Chip Time (minutes)', **bold_label)

# Keep the year tick labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
plt.show()

## 3. Weather Analysis

In [None]:
# Per-year weather summary computed by the analyzer. The wind-comparison cell
# that follows pivots it on 'year' and 'location' and reads the
# 'windspeed_10m_mean' and 'wind_gusts_10m_max' values.
weather_stats = analyzer.calculate_weather_statistics_by_year()

print("Weather Statistics by Year and Location:")
# Bare last expression so the table is rendered as the cell output.
weather_stats

In [None]:
# Compare wind conditions across locations (e.g. start vs summit), per year.
# Left panel: mean wind speed. Right panel: maximum gust.
# Each panel is described once and drawn by the same loop so the two plots
# cannot drift apart in styling.
# NOTE(review): the axis labels state mph - confirm this matches the units of
# the stored weather data.
wind_panels = [
    # (value column, marker, y-axis label, panel title)
    ('windspeed_10m_mean', 'o', 'Average Wind Speed (mph)', 'Average Wind Speed Comparison'),
    ('wind_gusts_10m_max', 's', 'Max Wind Gust (mph)', 'Max Wind Gusts Comparison'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (value_col, marker, ylabel, title) in zip(axes, wind_panels):
    ax.set_title(title, fontsize=14, fontweight='bold')

    # pivot_table raises KeyError when the value column is absent, so a
    # missing column is treated the same as an empty result.
    if value_col in weather_stats.columns:
        wind_by_location = weather_stats.pivot_table(
            index='year',
            columns='location',
            values=value_col,
        )
    else:
        wind_by_location = None

    if wind_by_location is None or wind_by_location.empty:
        # Say why the panel is blank instead of leaving bare axes.
        ax.text(0.5, 0.5, 'No data available',
                ha='center', va='center', transform=ax.transAxes)
        continue

    # One line per location, years on the x-axis.
    wind_by_location.plot(ax=ax, marker=marker, linewidth=2, markersize=8)
    ax.set_xlabel('Year', fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.legend(title='Location', fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Performance vs Weather Correlation

In [None]:
# Combined race and weather table from the analyzer. The following cells read
# its 'year', 'median' and 'windspeed_10m_mean_summit' columns.
combined = analyzer.get_combined_statistics()

print("Combined Race and Weather Statistics:")
# Bare last expression so the table is rendered as the cell output.
combined

In [None]:
# Dual-axis chart: median chip time (left axis, minutes) against mean summit
# wind speed (right axis) for each year. Skipped when the combined table is
# empty or has no summit wind column.
if 'windspeed_10m_mean_summit' in combined.columns and not combined.empty:
    years = combined['year']
    median_minutes = combined['median'] / 60
    summit_wind = combined['windspeed_10m_mean_summit']

    time_color, wind_color = 'tab:blue', 'tab:red'
    axis_font = {'fontsize': 12, 'fontweight': 'bold'}

    fig, ax1 = plt.subplots(figsize=(14, 6))

    # Left axis: median chip time, coloured to match its line.
    ax1.plot(years, median_minutes, label='Median Time',
             color=time_color, marker='o', linewidth=2, markersize=8)
    ax1.set_xlabel('Year', **axis_font)
    ax1.set_ylabel('Median Chip Time (minutes)', color=time_color, **axis_font)
    ax1.tick_params(axis='y', labelcolor=time_color)
    ax1.grid(True, alpha=0.3)

    # Right axis: shares the x-axis with ax1 but has its own y scale.
    ax2 = ax1.twinx()
    ax2.plot(years, summit_wind, label='Avg Wind (Summit)',
             color=wind_color, marker='s', linestyle='--',
             linewidth=2, markersize=8)
    ax2.set_ylabel('Wind Speed (mph)', color=wind_color, **axis_font)
    ax2.tick_params(axis='y', labelcolor=wind_color)

    plt.title('Race Performance vs Wind Conditions', fontsize=14, fontweight='bold')

    # Each axes keeps its own legend entries; gather both into one legend.
    handles, labels = [], []
    for axis in (ax1, ax2):
        axis_handles, axis_labels = axis.get_legend_handles_labels()
        handles += axis_handles
        labels += axis_labels
    ax1.legend(handles, labels, loc='best')

    plt.tight_layout()
    plt.show()

In [None]:
# Correlation between median chip time and mean summit wind speed, with a
# labelled scatter plot. Needs at least three years that have BOTH values.
corr_cols = ['median', 'windspeed_10m_mean_summit']

# Count only years where both values are present. A year with missing wind
# (or missing median) contributes nothing to the correlation and cannot be
# placed on the scatter plot, so it must not count toward the minimum.
if set(corr_cols).issubset(combined.columns):
    paired = combined.dropna(subset=corr_cols)
else:
    paired = combined.iloc[0:0]

if len(paired) > 2:
    # 2x2 Pearson correlation matrix of the two columns.
    # With only a handful of yearly points this is indicative at best.
    corr_wind = paired[corr_cols].corr()

    print("Correlation between median chip time and summit wind speed:")
    print(corr_wind)

    # Scatter plot: one point per year.
    fig, ax = plt.subplots(figsize=(10, 6))

    wind = paired['windspeed_10m_mean_summit']
    median_minutes = paired['median'] / 60
    ax.scatter(wind, median_minutes, s=100, alpha=0.6)

    # Label each point with its year, offset slightly so the text does not
    # sit on top of the marker.
    for year, x, y in zip(paired['year'], wind, median_minutes):
        ax.annotate(str(int(year)), (x, y),
                    xytext=(5, 5), textcoords='offset points')

    ax.set_xlabel('Average Wind Speed at Summit (mph)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Median Chip Time (minutes)', fontsize=12, fontweight='bold')
    ax.set_title('Chip Time vs Wind Speed', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Not enough years with both median chip time and summit wind speed "
          "to compute a correlation (need at least 3).")

## 5. Demographic Analysis

In [None]:
# Gender breakdown per year: participation counts (left) and mean chip time
# (right). Guarded on the 'gender' column, matching the check made in the
# key-insights cell.
if 'gender' in df_race.columns:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Derive minutes here rather than relying on the 'chip_time_minutes'
    # column that the box-plot cell adds to df_race, so this cell also works
    # when run on its own. assign() returns a new frame; df_race is untouched.
    race_by_gender = df_race.assign(
        chip_time_minutes=df_race['chip_time_seconds'] / 60
    ).groupby(['year', 'gender'])

    # Count by gender (year/gender pairs with no racers show as 0).
    gender_counts = race_by_gender.size().unstack(fill_value=0)
    gender_counts.plot(kind='bar', ax=ax1, stacked=False)
    ax1.set_xlabel('Year', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Number of Racers', fontsize=12, fontweight='bold')
    ax1.set_title('Participation by Gender', fontsize=14, fontweight='bold')
    ax1.legend(title='Gender')
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0)

    # Average times by gender.
    gender_times = race_by_gender['chip_time_minutes'].mean().unstack()
    gender_times.plot(kind='line', ax=ax2, marker='o', linewidth=2, markersize=8)
    ax2.set_xlabel('Year', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Average Chip Time (minutes)', fontsize=12, fontweight='bold')
    ax2.set_title('Average Times by Gender', fontsize=14, fontweight='bold')
    ax2.legend(title='Gender')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("No 'gender' column in race results; skipping gender analysis.")

In [None]:
# Histogram of racer ages across all years; rows without an age are left out.
fig, ax = plt.subplots(figsize=(12, 6))
label_style = {'fontsize': 12, 'fontweight': 'bold'}

df_race['age'].dropna().hist(ax=ax, bins=30, edgecolor='black')

ax.set_title('Age Distribution of All Racers', fontsize=14, fontweight='bold')
ax.set_xlabel('Age', **label_style)
ax.set_ylabel('Number of Racers', **label_style)
ax.grid(True, axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

## 6. Key Insights

The cell below prints headline figures from the race results: participation totals, chip-time extremes and averages, and the gender split.

In [None]:
# Print headline figures from the race results.
print("KEY INSIGHTS")
print("=" * 60)

# Chip times in minutes, derived here from the raw seconds so this cell does
# not depend on the 'chip_time_minutes' column that the box-plot cell adds to
# df_race as a side effect.
chip_minutes = df_race['chip_time_seconds'] / 60

# Overall statistics: one row of df_race per race result.
total_racers = len(df_race)
years_available = df_race['year'].nunique()
avg_participants = df_race.groupby('year').size().mean()

print(f"\nTotal racers in database: {total_racers}")
print(f"Years analyzed: {years_available}")
print(f"Average participants per year: {avg_participants:.0f}")

# Chip-time extremes and central values across all years.
print(f"\nFastest time overall: {chip_minutes.min():.2f} minutes")
print(f"Slowest time overall: {chip_minutes.max():.2f} minutes")
print(f"Overall average: {chip_minutes.mean():.2f} minutes")
print(f"Overall median: {chip_minutes.median():.2f} minutes")

# Gender breakdown as a percentage of rows with a gender value
# (value_counts leaves out missing values by default).
if 'gender' in df_race.columns:
    gender_pct = df_race['gender'].value_counts(normalize=True) * 100
    print("\nGender distribution:")
    for gender, pct in gender_pct.items():
        print(f"  {gender}: {pct:.1f}%")

In [None]:
# Close the analyzer now that all analysis cells have run. Cells that call
# analyzer methods may need the analyzer re-created before being re-run.
# NOTE(review): presumably this releases the underlying database connection -
# confirm in DiabloAnalyzer.close().
analyzer.close()
print("\nAnalysis complete!")