In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Token these CSV files use for a missing value. Declaring it at read time
# lets pandas infer numeric dtypes for columns that contain it, instead of
# leaving them as strings that were patched to NaN after parsing.
MISSING_TOKEN = '\\N'

drivers = pd.read_csv('drivers.csv', na_values=MISSING_TOKEN)
constructors = pd.read_csv('constructors.csv', na_values=MISSING_TOKEN)
results = pd.read_csv('results.csv', na_values=MISSING_TOKEN)
races = pd.read_csv('races.csv', na_values=MISSING_TOKEN)

# Show the inferred column types of every freshly loaded table.
for loaded in (constructors, drivers, results, races):
    print(loaded.dtypes)

# Per-session date/time columns of `races` are not referenced by the cells
# visible in this notebook, so they are removed up front.
races = races.drop(columns=[
    'fp1_date', 'fp1_time',
    'fp2_date', 'fp2_time',
    'fp3_date', 'fp3_time',
    'quali_date', 'quali_time',
    'sprint_date', 'sprint_time',
])

# Built last so the list holds the trimmed `races` table.
imp_df = [constructors, drivers, results, races]




In [None]:
def plot_null_heatmap(frame, label):
    """Show a heatmap of the missing cells in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect; the heatmap is drawn from ``frame.isnull()``.
    label : str
        Table name inserted into the figure title.
    """
    plt.figure(figsize=(16, 10))
    # Row labels are suppressed: there is one row per record, far too many to print.
    sns.heatmap(frame.isnull(), cbar=False, cmap='viridis', yticklabels=False)
    plt.title(f'Heatmap of Null Values in {label} DataFrame', fontsize=16)
    plt.show()


# One figure per raw table, in the same order as before.
for label, frame in [
    ('Drivers', drivers),
    ('Constructors', constructors),
    ('Results', results),
    ('Races', races),
]:
    plot_null_heatmap(frame, label)


In [None]:
# Build one wide table: every result row enriched with its race, driver and
# constructor attributes. Left joins keep every row of `results`.
# The constructor merge collides with the existing `name` and `nationality`
# columns, so pandas' default suffixes produce name_x/name_y and
# nationality_x/nationality_y; the later rename step relies on those names.
df = (
    results
    .merge(races[['raceId', 'year', 'name', 'round']], on='raceId', how='left')
    .merge(drivers[['driverId', 'driverRef', 'nationality']], on='driverId', how='left')
    .merge(
        constructors[['constructorId', 'constructorRef', 'name', 'nationality']],
        on='constructorId',
        how='left',
    )
)

plt.figure(figsize=(16, 10))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False)
# The title names the merged table (it previously said "Races", which this plot does not show).
plt.title('Heatmap of Null Values in Merged Results DataFrame', fontsize=16)
plt.show()


In [None]:
# Remove identifier and bookkeeping columns that the analysis does not use.
# Reassigning (rather than mutating in place) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the columns are already gone.
df = df.drop(
    columns=[
        'number', 'position', 'positionText', 'laps', 'fastestLap',
        'statusId', 'resultId', 'driverId', 'constructorId',
    ],
    errors='ignore',
)

In [None]:

# Give the merged columns analysis-friendly names. The *_x / *_y entries are
# the suffixes pandas added when the constructor merge collided with the
# existing `name` and `nationality` columns.
df = df.rename(columns={
    'rank': 'fastest_lap_rank',
    'name_x': 'GP_Name',
    'nationality_x': 'driver_nationality',
    'name_y': 'constructor_name',
    'nationality_y': 'constructors_nationality',
    'driverRef': 'driver',
})

plt.figure(figsize=(16, 10))

# Visualize which cells of the merged table are still missing.
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False)

# The title names the merged table (it previously said "Races", which this plot does not show).
plt.title('Heatmap of Null Values in Merged Results DataFrame', fontsize=16)
plt.show()

In [None]:
# Columns kept for the analysis, in display order.
analysis_columns = [
    'year', 'raceId', 'GP_Name', 'round', 'driver', 'constructor_name',
    'grid', 'positionOrder', 'points', 'time', 'milliseconds',
    'fastest_lap_rank', 'fastestLapTime', 'fastestLapSpeed',
    'driver_nationality', 'constructors_nationality',
]

df = (
    df[analysis_columns]
    # Newest season first; within a season by round, then finishing order.
    .sort_values(by=['year', 'round', 'positionOrder'], ascending=[False, True, True])
    # Make sure the numeric measures are floats (NaN marks missing values).
    .astype({
        'fastestLapSpeed': float,
        'fastest_lap_rank': float,
        'milliseconds': float,
    })
    .reset_index(drop=True)
)

print(df.shape)
# Call info() to print the summary; `print(df.info)` only printed the bound
# method together with a repr of the whole frame.
df.info()


In [None]:
# Text preview of the first ten rows of the prepared table.
print(df.iloc[:10])

In [None]:
# ============================
# Per-driver career totals
# ============================
# One row per driver: distinct races entered, wins (positionOrder == 1),
# podiums (positionOrder <= 3), summed points, and the number of results
# whose fastest_lap_rank is 1.
driver_stats = df.groupby('driver').agg(
    total_races=('raceId', 'nunique'),
    wins=('positionOrder', lambda x: (x == 1).sum()),
    podiums=('positionOrder', lambda x: (x <= 3).sum()),
    points=('points', 'sum'),
    fastest_laps=('fastest_lap_rank', lambda x: (x == 1).sum())
).reset_index()
driver_stats['win_ratio'] = (driver_stats['wins'] / driver_stats['total_races']) * 100


def _plot_top_drivers(data, metric, title, xlabel, palette):
    """Draw a horizontal bar chart of ``metric`` per driver.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; must contain ``'driver'`` and ``metric`` columns.
    metric : str
        Column shown on the x axis.
    title, xlabel : str
        Figure title and x-axis label.
    palette : str
        Name of the seaborn palette used for the bars.
    """
    plt.figure(figsize=(14, 6))
    sns.barplot(data=data, x=metric, y='driver', palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Driver')
    plt.show()


# ============================
# Top 10 drivers by win ratio
# ============================
# Only drivers with at least this many distinct races are ranked, so a
# handful of starts cannot produce an inflated ratio.
race_threshold = 100
top_win_ratio = driver_stats.sort_values(by='win_ratio', ascending=False)
top_win_ratio = top_win_ratio[top_win_ratio['total_races'] >= race_threshold].head(10)
print(top_win_ratio)

_plot_top_drivers(
    top_win_ratio,
    'win_ratio',
    f'Top 10 Drivers by Win Ratio (Min {race_threshold} Races)',
    'Win Ratio (%)',
    'plasma',
)

# ============================
# Top 10 drivers by wins, podiums and points
# ============================
for metric, title, xlabel, palette in [
    ('wins', 'Top 10 Drivers by Wins', 'Total Wins', 'coolwarm'),
    ('podiums', 'Top 10 Drivers by Podiums', 'Total Podiums', 'magma'),
    ('points', 'Top 10 Drivers by Total Points', 'Total Points', 'viridis'),
]:
    top_drivers = driver_stats.sort_values(by=metric, ascending=False).head(10)
    _plot_top_drivers(top_drivers, metric, title, xlabel, palette)

# ============================
# Top 10 drivers by fastest laps
# ============================
# Counted from the result rows directly, so only drivers with at least one
# fastest lap appear in this table.
fastest_laps = df[df['fastest_lap_rank'] == 1].groupby('driver').size().reset_index(name='fastest_laps')
top_fastest = fastest_laps.sort_values('fastest_laps', ascending=False).head(10)

_plot_top_drivers(
    top_fastest,
    'fastest_laps',
    'Top 10 Drivers by Fastest Laps',
    'Fastest Laps',
    'Spectral',
)


In [None]:

# ============================
# Per-constructor totals
# ============================
# One row per constructor: distinct races entered, wins (positionOrder == 1),
# podiums (positionOrder <= 3) and summed points.
constructor_stats = df.groupby('constructor_name').agg(
    total_races=('raceId', 'nunique'),  # Total number of races per constructor
    wins=('positionOrder', lambda x: (x == 1).sum()),
    podiums=('positionOrder', lambda x: (x <= 3).sum()),
    points=('points', 'sum')
).reset_index()

constructor_stats['win_ratio'] = (constructor_stats['wins'] / constructor_stats['total_races']) * 100


def _plot_top_constructors(data, metric, title, xlabel, palette):
    """Draw a horizontal bar chart of ``metric`` per constructor.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; must contain ``'constructor_name'`` and ``metric`` columns.
    metric : str
        Column shown on the x axis.
    title, xlabel : str
        Figure title and x-axis label.
    palette : str
        Name of the seaborn palette used for the bars.
    """
    plt.figure(figsize=(14, 6))
    sns.barplot(data=data, x=metric, y='constructor_name', palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    # Every chart in this cell ranks constructors, so the axis is labelled accordingly.
    plt.ylabel('Constructor')
    plt.show()


# ============================
# 1. Visualization: Top 10 Constructors by Win Ratio
# ============================
# Minimum number of distinct races a constructor needs to be ranked. It is set
# here explicitly so the result does not depend on which other cell last
# assigned `race_threshold`.
race_threshold = 100
top_win_ratio = constructor_stats.sort_values(by='win_ratio', ascending=False)
top_win_ratio = top_win_ratio[top_win_ratio['total_races'] >= race_threshold].head(10)
print(top_win_ratio)

_plot_top_constructors(
    top_win_ratio,
    'win_ratio',
    f'Top 10 Constructors by Win Ratio (Min {race_threshold} Races)',
    'Win Ratio (%)',
    'plasma',
)

# ============================
# 2-4. Visualization: Top 10 Constructors by Wins, Podiums and Total Points
# ============================
for metric, title, xlabel, palette in [
    ('wins', 'Top 10 Constructors by Wins', 'Total Wins', 'coolwarm'),
    ('podiums', 'Top 10 Constructors by Podiums', 'Total Podiums', 'magma'),
    ('points', 'Top 10 Constructors by Total Points', 'Total Points', 'viridis'),
]:
    top_constructors = constructor_stats.sort_values(by=metric, ascending=False).head(10)
    _plot_top_constructors(top_constructors, metric, title, xlabel, palette)

In [None]:
from scipy.stats import spearmanr

# ============================
# 1. Per-driver career summary
# ============================
# One row per driver: first and last season present in the data, distinct
# races entered, wins (positionOrder == 1), podiums (positionOrder <= 3) and
# summed points.
career_stats = df.groupby('driver').agg(
    first_year=('year', 'min'),
    last_year=('year', 'max'),
    total_races=('raceId', 'nunique'),
    wins=('positionOrder', lambda x: (x == 1).sum()),
    podiums=('positionOrder', lambda x: (x <= 3).sum()),
    points=('points', 'sum')
).reset_index()

# Career longevity in seasons, counting both the first and the last year.
career_stats['career_length'] = career_stats['last_year'] - career_stats['first_year'] + 1

# ============================
# 2. Apply Race Participation Threshold
# ============================

# Keep only drivers with at least this many distinct races.
race_threshold = 50
filtered_stats = career_stats[career_stats['total_races'] >= race_threshold]

print(f"Drivers after applying threshold ({race_threshold} races): {filtered_stats.shape[0]}")

# ============================
# 3. Visualizations: Career Longevity vs. Success
# ============================

sns.set(style="whitegrid")

# Success metrics compared against career length, with the palette of each plot.
success_metrics = [
    ('wins', 'coolwarm'),
    ('podiums', 'magma'),
    ('points', 'viridis'),
]

for metric, palette in success_metrics:
    metric_label = f'Total {metric.title()}'
    plt.figure(figsize=(12, 6))
    # Colour and marker size both encode the number of races entered.
    sns.scatterplot(
        data=filtered_stats,
        x='career_length',
        y=metric,
        hue='total_races',
        size='total_races',
        palette=palette,
        alpha=0.7,
    )
    plt.title(f'Career Longevity vs. {metric_label} (Drivers with ≥ {race_threshold} Races)')
    plt.xlabel('Career Length (Years)')
    plt.ylabel(metric_label)
    plt.legend(title='Total Races')
    plt.show()

# ============================
# 4. Rank correlation between career length and each success metric
# ============================
for metric, _ in success_metrics:
    corr, p_value = spearmanr(filtered_stats['career_length'], filtered_stats[metric])
    print(f"Spearman Correlation between Career Length and {metric.title()} (Threshold Applied): {corr:.2f} (p-value: {p_value:.4f})")
