In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three raw tables. The backslash-N marker is registered as an extra
# NA string, so it is parsed as NaN while reading (instead of being replaced
# in place afterwards) and otherwise-numeric columns can be inferred as numeric.
null_marker = '\\N'

drivers = pd.read_csv('drivers.csv', na_values=null_marker)
results = pd.read_csv('results.csv', na_values=null_marker)
races = pd.read_csv('races.csv', na_values=null_marker)
imp_df = [drivers, results, races]

# Show the inferred dtypes of every raw table. A dedicated loop name is used
# so that `df` is reserved for the merged frame built below.
for raw_table in imp_df:
    print(raw_table.dtypes)

# Columns removed from the merged frame.
unused_columns = [
    'number', 'position', 'positionText', 'laps', 'fastestLap',
    'resultId', 'driverId', 'constructorId',
]

# Only races and drivers are joined onto results: races contributes 'name' and
# drivers contributes 'nationality'. Unless results.csv itself carries columns
# with those names (TODO confirm), pandas adds no _x/_y suffixes, so the plain
# names must be mapped or GP_Name / driver_nationality are never created.
# The suffixed keys are kept as well; rename() ignores keys that are absent.
column_labels = {
    'rank': 'fastest_lap_rank',
    'driverRef': 'driver',
    'name': 'GP_Name',
    'nationality': 'driver_nationality',
    'name_x': 'GP_Name',
    'nationality_x': 'driver_nationality',
    'name_y': 'constructor_name',
    'nationality_y': 'constructors_nationality',
}

# Left joins keep every results row, adding race and driver attributes.
df = (
    results
    .merge(races[['raceId', 'year', 'name', 'round']], on='raceId', how='left')
    .merge(drivers[['driverId', 'driverRef', 'nationality']], on='driverId', how='left')
    .drop(columns=unused_columns)
    .rename(columns=column_labels)
)


In [None]:
# Null-value map of `df`, which at this point is the results table joined with
# race and driver attributes (not the races table alone, as the old title
# claimed). Each cell of the heatmap is True/False for "value is missing".
fig, ax = plt.subplots(figsize=(16, 10))

# Row labels are suppressed because there is one row per result entry.
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)

ax.set_title('Heatmap of Null Values in Merged Results DataFrame', fontsize=16)
plt.show()

In [None]:
# Keep only the columns used by the driver analysis and drop exact duplicate
# rows. drop_duplicates() returns a new frame, so `df` is rebound rather than
# mutated in place.
analysis_columns = ['driver', 'raceId', 'positionOrder', 'points', 'statusId']
df = df[analysis_columns].drop_duplicates()

# Null-value map of the reduced frame; the title names this subset instead of
# reusing the label of the races table.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)

ax.set_title('Heatmap of Null Values in Driver Results Subset', fontsize=16)
plt.show()

In [None]:
# Plain-text peek at the first rows, followed by the (rows, columns) tuple as
# the cell's displayed value.
first_rows = df.head()
print(first_rows)
df.shape

In [None]:


# Keep only rows whose statusId equals 1 and work on an independent copy.
# NOTE(review): presumably statusId 1 is the "finished" status and every other
# id counts as a DNF here — confirm against the status lookup table.
finished_mask = df['statusId'].eq(1)
df_finished = df.loc[finished_mask].copy()

# Row counts before and after the filter, then a preview of what remains
print(f"Original Rows: {len(df)}")
print(f"Rows After Dropping DNFs: {len(df_finished)}")
print(df_finished.head())
df_finished.shape

In [None]:
# Step 3: per-driver performance metrics, computed from df_finished.
# NOTE(review): because the input is df_finished, total_races counts distinct
# raceId values among the retained rows only — confirm that is the intended
# denominator for top_10_rate.


def _wins(positions):
    """Count entries whose finishing order is exactly 1."""
    return (positions == 1).sum()


def _podiums(positions):
    """Count entries whose finishing order is 3 or better."""
    return (positions <= 3).sum()


def _top_tens(positions):
    """Count entries whose finishing order is 10 or better."""
    return (positions <= 10).sum()


# Output column -> (source column, aggregation)
metric_spec = {
    'avg_finish': ('positionOrder', 'mean'),
    'finish_std': ('positionOrder', 'std'),
    'total_races': ('raceId', 'nunique'),
    'total_wins': ('positionOrder', _wins),
    'total_podiums': ('positionOrder', _podiums),
    'top_10_finishes': ('positionOrder', _top_tens),
}
per_driver = df_finished.groupby('driver')
driver_performance = per_driver.agg(**metric_spec).reset_index()

# Share of counted races that ended in the top 10, as a percentage
top_10_share = driver_performance['top_10_finishes'] / driver_performance['total_races']
driver_performance['top_10_rate'] = top_10_share * 100

# Drivers need at least this many counted races to be kept
min_race_threshold = 10
has_enough_races = driver_performance['total_races'] >= min_race_threshold
filtered_driver_performance = driver_performance[has_enough_races].copy()

# Best average finishers first, then the filtered frame's shape
print(filtered_driver_performance.sort_values('avg_finish').head())
filtered_driver_performance.shape


In [None]:

# Scatter of average finish against its standard deviation for the
# threshold-filtered drivers; marker size follows total_races and colour
# follows top_10_rate.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=filtered_driver_performance,
    x='avg_finish',
    y='finish_std',
    hue='top_10_rate',
    size='total_races',
    sizes=(50, 300),
    palette='coolwarm',
    legend='brief',
    ax=ax,
)

# Dashed reference lines: average finish of 10 and standard deviation of 3
ax.axvline(x=10, linestyle='--', color='gray', label='Top 10 Finish Line')
ax.axhline(y=3, linestyle='--', color='gray', label='Consistency Threshold')

ax.set_title('Driver Consistency Analysis (With Race Threshold)')
ax.set_xlabel('Average Finishing Position (Lower = Better)')
ax.set_ylabel('Standard Deviation of Finish (Lower = More Consistent)')
ax.legend(title='Top 10 Finish Rate')
plt.show()


In [None]:
# Drivers with an average finish of 8 or better whose finishing position
# varies by at most 3 (standard deviation).
max_avg_finish = 8
max_finish_std = 3

is_consistent = (
    (filtered_driver_performance['avg_finish'] <= max_avg_finish)
    & (filtered_driver_performance['finish_std'] <= max_finish_std)
)

# sort_values() is chained so it returns a new frame; sorting a boolean-mask
# selection with inplace=True triggers pandas' SettingWithCopyWarning.
consistent_drivers = (
    filtered_driver_performance.loc[is_consistent]
    .sort_values(by=['total_podiums', 'top_10_rate'], ascending=[False, False])
)

print("Consistent Top Finishers:")
print(consistent_drivers[['driver', 'avg_finish', 'finish_std', 'total_podiums', 'top_10_rate']])


In [None]:
# Drivers whose finishing position has a standard deviation above 3.
min_fluctuating_std = 3
is_fluctuating = filtered_driver_performance['finish_std'] > min_fluctuating_std

# sort_values() is chained so it returns a new frame; sorting a boolean-mask
# selection with inplace=True triggers pandas' SettingWithCopyWarning.
fluctuating_drivers = (
    filtered_driver_performance.loc[is_fluctuating]
    .sort_values(by=['total_podiums', 'top_10_rate'], ascending=[False, False])
)

print("Fluctuating Performers:")
# total_podiums is printed alongside total_wins because it is the primary sort
# key; without it the row order of the table is unexplained.
print(fluctuating_drivers[['driver', 'avg_finish', 'finish_std', 'total_wins', 'total_podiums', 'top_10_rate']])


In [None]:

# Step 1: keep drivers that reach the minimum number of counted races
min_race_threshold = 10
meets_threshold = driver_performance['total_races'] >= min_race_threshold
filtered_driver_performance = driver_performance[meets_threshold].copy()

# Step 2: rank the filtered drivers two ways and keep 20 of each
# Fluctuating: largest finish_std first
by_spread = filtered_driver_performance.sort_values('finish_std', ascending=False)
top_fluctuating = by_spread.head(20)

# Consistent: ordered by avg_finish, with finish_std as the secondary key
by_average = filtered_driver_performance.sort_values(['avg_finish', 'finish_std'])
top_consistent = by_average.head(20)

# Preview both rankings with the same three columns
summary_columns = ['driver', 'avg_finish', 'finish_std']

print("Top 20 Consistent Drivers:")
print(top_consistent[summary_columns])

print("\n Top 20 Fluctuating Drivers:")
print(top_fluctuating[summary_columns])


In [None]:
# Horizontal bars: one per driver in top_consistent, bar length = avg_finish
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(
    data=top_consistent,
    x='avg_finish',
    y='driver',
    palette='Greens_r',
    ax=ax,
)
ax.set_title(' Top 20 Consistent Drivers (Lowest Avg Finish)')
ax.set_xlabel('Average Finishing Position')
ax.set_ylabel('Driver')
plt.show()


In [None]:
# Horizontal bars: one per driver in top_fluctuating, bar length = finish_std
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(
    data=top_fluctuating,
    x='finish_std',
    y='driver',
    palette='Reds_r',
    ax=ax,
)
ax.set_title('⚡ Top 20 Fluctuating Drivers (Highest Std Deviation)')
ax.set_xlabel('Standard Deviation of Finish')
ax.set_ylabel('Driver')
plt.show()
