In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the raw tables. The files use the literal string '\N' for missing
# values; declaring it as an NA marker at read time lets pandas infer numeric
# dtypes directly, instead of leaving object columns that merely contain NaN
# after a post-hoc replace.
NA_MARKER = '\\N'
drivers = pd.read_csv('drivers.csv', na_values=NA_MARKER)
constructors = pd.read_csv('constructors.csv', na_values=NA_MARKER)
results = pd.read_csv('results.csv', na_values=NA_MARKER)
races = pd.read_csv('races.csv', na_values=NA_MARKER)

# Session date/time columns are not used by this analysis. errors='ignore'
# keeps the cell runnable when races.csv does not carry some of them.
races = races.drop(
    columns=[
        'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
        'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
    ],
    errors='ignore',
)
imp_df = [constructors, drivers, results, races]

# Attach race, driver and constructor attributes to every result row.
# validate='many_to_one' makes pandas raise if a lookup table has duplicate
# keys, which would otherwise silently duplicate result rows.
# The default merge suffixes are relied on: after the third merge the race
# name/driver nationality become name_x/nationality_x and the constructor
# ones become name_y/nationality_y.
df = (
    results
    .merge(races[['raceId', 'year', 'name', 'round']],
           on='raceId', how='left', validate='many_to_one')
    .merge(drivers[['driverId', 'driverRef', 'nationality']],
           on='driverId', how='left', validate='many_to_one')
    .merge(constructors[['constructorId', 'constructorRef', 'name', 'nationality']],
           on='constructorId', how='left', validate='many_to_one')
)

# Visualise where the merged frame has missing values (one row per result).
plt.figure(figsize=(16, 10))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False)
# The plotted frame is the merged results table, not the races table.
plt.title('Heatmap of Null Values in Merged Results DataFrame', fontsize=16)
plt.show()


In [None]:
# Columns removed from the merged frame before further analysis.
unused_columns = [
    'number', 'position', 'positionText', 'laps',
    'fastestLap', 'resultId', 'driverId', 'constructorId',
]

# Readable names for the suffixed merge columns and a few terse ones.
readable_names = {
    'rank': 'fastest_lap_rank',
    'name_x': 'GP_Name',
    'nationality_x': 'driver_nationality',
    'name_y': 'constructor_name',
    'nationality_y': 'constructors_nationality',
    'driverRef': 'driver',
}

df = df.drop(columns=unused_columns).rename(columns=readable_names)

# Null-value heatmap of the trimmed frame.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Heatmap of Null Values in  DataFrame', fontsize=16)
plt.show()

In [None]:
# Keep only the columns used by the analysis, order rows by most recent
# season first, then by round and finishing order, and store the fastest-lap
# rank as float (the column can contain missing values).
analysis_columns = [
    'raceId', 'year', 'driver', 'constructor_name', 'round',
    'positionOrder', 'points', 'grid', 'statusId', 'fastest_lap_rank',
]
df = (
    df[analysis_columns]
    .sort_values(by=['year', 'round', 'positionOrder'], ascending=[False, True, True])
    .astype({'fastest_lap_rank': float})
)
print(df.shape)
df = df.reset_index(drop=True)

# DataFrame.info() writes its summary itself; printing the bare attribute
# (print(df.info)) only shows the bound method's repr, i.e. a dump of the frame.
df.info()
print(df.dtypes)


In [None]:
# Force the metric columns to numeric; unparseable entries become NaN.
for metric_column in ('positionOrder', 'points', 'fastest_lap_rank'):
    df[metric_column] = pd.to_numeric(df[metric_column], errors='coerce')

# 1 when the row's statusId is anything other than 1, else 0.
# NOTE(review): this assumes statusId 1 is the only "finished" status; if other
# ids also denote classified finishers they are counted as DNFs here - confirm
# against the status lookup table.
df['DNF'] = df['statusId'].ne(1).astype('int64')

# Preview Data
print(df.head())

In [None]:
# Per-row 0/1 indicators so that every count below is a plain grouped sum.
flagged_results = df.assign(
    _is_win=(df['positionOrder'] == 1).astype('int64'),
    _is_podium=(df['positionOrder'] <= 3).astype('int64'),
    _is_pole=(df['grid'] == 1).astype('int64'),
    _is_fastest_lap=(df['fastest_lap_rank'] == 1).astype('int64'),
)

# One row per driver: distinct races entered, counts of wins / podiums /
# grid-1 starts / rank-1 fastest laps / DNFs, and mean points and finish.
driver_stats = (
    flagged_results
    .groupby('driver')
    .agg(
        total_races=('raceId', 'nunique'),
        total_wins=('_is_win', 'sum'),
        total_podiums=('_is_podium', 'sum'),
        avg_points=('points', 'mean'),
        avg_finish=('positionOrder', 'mean'),
        pole_positions=('_is_pole', 'sum'),
        fastest_laps=('_is_fastest_lap', 'sum'),
        dnf_count=('DNF', 'sum'),
    )
    .reset_index()
)

# Rates are expressed per distinct race entered.
races_entered = driver_stats['total_races']
driver_stats['win_rate'] = driver_stats['total_wins'].div(races_entered)
driver_stats['podium_rate'] = driver_stats['total_podiums'].div(races_entered)
driver_stats['dnf_rate'] = driver_stats['dnf_count'].div(races_entered)

# Most wins first; ties broken by higher average points.
driver_stats = driver_stats.sort_values(
    by=['total_wins', 'avg_points'], ascending=[False, False]
)
print(driver_stats.head())


In [None]:
# Minimum number of distinct races a driver needs to be kept.
race_threshold = 100

has_enough_races = driver_stats['total_races'] >= race_threshold

# Independent copy of the qualifying drivers, ordered by wins and then by
# average points (both descending).
filtered_driver_stats = (
    driver_stats.loc[has_enough_races]
    .sort_values(by=['total_wins', 'avg_points'], ascending=[False, False])
    .copy()
)

print(filtered_driver_stats)


In [None]:


# Scatter of total wins against total races for the first 20 rows of the
# filtered table, one colour per driver.
top_rows = filtered_driver_stats.head(20)

fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(
    data=top_rows,
    x='total_races',
    y='total_wins',
    hue='driver',
    s=100,
    palette='Spectral',
    ax=ax,
)
ax.set_title('Wins vs. Total Races (Drivers Above Threshold)')
ax.set_xlabel('Total Races')
ax.set_ylabel('Total Wins')
# Place the legend outside the axes so it does not cover the points.
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()


In [None]:
# Define weights for each metric based on importance
weights = {
    'win_rate': 0.5,
    'podium_rate': 0.4,
    'avg_points': 0.25,
    'dnf_rate': -0.1  # Negative weight for DNFs
}


def _min_max_scale(series):
    """Rescale a numeric Series linearly onto [0, 1].

    Returns a Series of zeros (same index) when the values have no spread,
    or when min/max are undefined, so the metric then contributes nothing
    instead of producing a division by zero.
    """
    lowest, highest = series.min(), series.max()
    spread = highest - lowest
    if not spread > 0:  # also true for NaN spread (empty or all-NaN input)
        return pd.Series(0.0, index=series.index)
    return (series - lowest) / spread


# Calculate composite score.
# The rate metrics lie in [0, 1] while avg_points is on a points scale, so a
# raw weighted sum would be driven almost entirely by avg_points. Each metric
# is therefore min-max scaled across the filtered drivers before weighting,
# so that the weights express relative importance.
filtered_driver_stats['score'] = sum(
    _min_max_scale(filtered_driver_stats[metric]) * weight
    for metric, weight in weights.items()
)

# Sort by score and keep the 10 highest-scoring drivers
best_lineup = filtered_driver_stats.sort_values(by='score', ascending=False).head(10)
print(best_lineup[['driver', 'score', 'win_rate', 'avg_points', 'dnf_rate']])
