In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# ---------------------------------------------------------------------------
# Load the raw CSV tables and build one flat, per-result frame named `df`.
# ---------------------------------------------------------------------------
drivers = pd.read_csv('drivers.csv')
constructors = pd.read_csv('constructors.csv')
results = pd.read_csv('results.csv')
races = pd.read_csv('races.csv')
imp_df = [constructors, drivers, results, races]

# This notebook treats the literal text '\N' as the missing-value marker.
# Replace it with NaN *before* any parsing so the placeholder can never reach
# `pd.to_datetime` below. The loop variable is intentionally not called `df`,
# because `df` is the name of the merged frame built at the end of this cell.
for frame in imp_df:
    # In place on purpose: `imp_df` and the named variables keep pointing at
    # the same (now cleaned) objects.
    frame.replace('\\N', np.nan, inplace=True)
    print(frame.dtypes)

# Parse birth dates only after the placeholder has been removed.
drivers['dob'] = pd.to_datetime(drivers['dob'])

# Session-schedule columns are not used by the merge below.
# errors='ignore' keeps the cell runnable in case a `races.csv` does not
# contain some of these columns.
races.drop(
    columns=[
        'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
        'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
    ],
    inplace=True,
    errors='ignore',
)

# Rename the lookup columns *before* merging so the final column names do not
# depend on pandas' implicit `_x` / `_y` suffixes for the clashing `name` and
# `nationality` columns.
race_cols = races[['raceId', 'year', 'name', 'round']].rename(
    columns={'name': 'GP_Name'}
)
driver_cols = drivers[['driverId', 'driverRef', 'nationality', 'dob']].rename(
    columns={'driverRef': 'driver', 'nationality': 'driver_nationality'}
)
constructor_cols = constructors[
    ['constructorId', 'constructorRef', 'name', 'nationality']
].rename(
    columns={'name': 'constructor_name', 'nationality': 'constructors_nationality'}
)

# One row per entry in `results`, enriched with race, driver and constructor
# attributes. Left joins keep every result row even when a lookup is missing.
df = (
    results
    .merge(race_cols, on='raceId', how='left')
    .merge(driver_cols, on='driverId', how='left')
    .merge(constructor_cols, on='constructorId', how='left')
    .drop(columns=[
        'number', 'position', 'positionText', 'laps', 'fastestLap',
        'statusId', 'resultId', 'driverId', 'constructorId',
    ])
    .rename(columns={'rank': 'fastest_lap_rank'})
)




In [None]:
# ---------------------------------------------------------------------------
# Season champions: the driver with the highest points total in each year.
# ---------------------------------------------------------------------------
# Sum points over *every* result row of the season, not only race wins.
# Restricting to `positionOrder == 1` first would rank drivers by the points
# earned from victories alone and pick the wrong driver whenever the champion
# won fewer races than a rival.
# NOTE(review): a plain sum of the `points` column only approximates official
# standings (e.g. seasons with dropped-score rules, or points stored in other
# tables). Confirm against an official standings table if exact champions matter.
season_winners = (
    df.groupby(['year', 'driver', 'dob'], as_index=False)['points'].sum()
)

# Keep one row per year: the driver with the largest season total
# (`idxmax` returns the first row on ties).
season_winners = season_winners.loc[
    season_winners.groupby('year')['points'].idxmax()
].reset_index(drop=True)

# Age in the championship year, computed as calendar-year difference
# (it does not account for whether the birthday had already passed).
season_winners['championship_age'] = (
    season_winners['year'] - pd.to_datetime(season_winners['dob']).dt.year
)

# Create Age Ranges
# With right=True the bins are (20, 24], (24, 29], ...; include_lowest=True
# closes the first bin on the left so that age 20 really lands in '20-24'.
# Ages outside 20-54 get NaN as their range.
age_bins = [20, 24, 29, 34, 39, 44, 49, 54]
age_labels = ['20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54']
season_winners['age_range'] = pd.cut(
    season_winners['championship_age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Add Decade Column (e.g. 1987 -> 1980)
season_winners['decade'] = (season_winners['year'] // 10) * 10

# Preview Data
print(season_winners[['year', 'driver', 'championship_age', 'age_range', 'decade']].head(100))

In [None]:
# Count championships per (decade, age range).
# `age_range` is categorical; observed=False is passed explicitly so that every
# age range appears (with a count of 0 where nobody won) regardless of the
# pandas version's default, and no FutureWarning is emitted.
age_decade_stats = (
    season_winners
    .groupby(['decade', 'age_range'], observed=False)
    .size()
    .reset_index(name='championship_wins')
)

# Wide table for the heatmap: one row per decade, one column per age range.
# fillna(0) guards against any combination missing from the long table.
heatmap_data = age_decade_stats.pivot(
    index='decade', columns='age_range', values='championship_wins'
).fillna(0)

# Plot the heatmap on an explicit Axes so labels are attached to this figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap='YlOrRd', linewidths=.5, ax=ax)
ax.set_title('Championship Wins by Age Range Across Decades')
ax.set_xlabel('Driver Age Range')
ax.set_ylabel('Decade')
plt.show()


In [None]:
# Total championships per age range.
# observed=False is explicit so empty age ranges are kept as zero-height bars
# independent of the pandas default for categorical groupers.
age_range_stats = (
    season_winners
    .groupby('age_range', observed=False)
    .size()
    .reset_index(name='championship_wins')
)

fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so
# the x variable is also mapped to hue. dodge=False keeps one full-width bar
# per category.
sns.barplot(
    data=age_range_stats,
    x='age_range',
    y='championship_wins',
    hue='age_range',
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x tick labels; remove it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Total Championship Wins by Driver Age Range')
ax.set_xlabel('Driver Age Range')
ax.set_ylabel('Number of Championships')
plt.show()

In [None]:
# Prepare data for the stacked bar plot: championships per (decade, age range).
# observed=False is explicit so every age range keeps a column (and a legend
# entry) regardless of the pandas default for categorical groupers.
age_decade_stats = (
    season_winners
    .groupby(['decade', 'age_range'], observed=False)
    .size()
    .reset_index(name='championship_wins')
)
pivot_df = age_decade_stats.pivot(
    index='decade', columns='age_range', values='championship_wins'
).fillna(0)

# Stacked bar plot: one bar per decade, one colored segment per age range.
# DataFrame.plot creates its own figure and returns the Axes, which is used
# for all labelling so nothing depends on pyplot's "current axes" state.
ax = pivot_df.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='plasma')
ax.set_title('Championship Wins by Age Range per Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Number of Championships')
ax.legend(title='Age Range')
plt.show()
