In [3]:
import pandas as pd

# Build one row per race-winning driver by joining the driver standings, race
# winners and team standings tables, then save the result to
# 'Sources/merged_data.csv'.

# Load the raw tables (paths are relative to the notebook's working directory).
drivers_df = pd.read_csv('Sources/drivers_updated.csv')
teams_df = pd.read_csv('Sources/teams_updated.csv')
winners_df = pd.read_csv('Sources/winners.csv')

# Split 'Name Code' into an optional leading name and a trailing code token.
# The earlier pattern r'(\w+)\s+(\w+)' required two whitespace-separated words,
# so a value holding a single token yielded NaN for both 'Name' and 'Code'.
# This pattern always captures the last word as 'Code' and whatever precedes
# it (if anything) as 'Name'.
# NOTE(review): assumes the code is the last word of 'Name Code' - confirm
# against winners.csv.
name_code_parts = winners_df['Name Code'].str.extract(r'^\s*(?:(.*\S)\s+)?(\w+)\s*$')

# Derive the season year from the race date, attach the split columns, and
# rename 'Car' to 'Team' so it can be used as the team join key below.
winners_df = (
    winners_df
    .assign(
        year=pd.to_datetime(winners_df['Date']).dt.year,
        Name=name_code_parts[0],
        Code=name_code_parts[1],
    )
    .drop(columns=['Name Code'])
    .rename(columns={'Car': 'Team'})
)

# The teams table calls the constructor 'Team'; rename it to 'Car' so that it
# does not collide with the winners' 'Team' key in the merge below.
teams_df = teams_df.rename(columns={'Team': 'Car'})

# Victory counts over the whole winners table (one row per driver / per team).
driver_victories_df = winners_df.groupby('Winner').size().reset_index(name='Total Driver Victories')
team_victories_df = winners_df.groupby('Team').size().reset_index(name='Total Races won by team')

# Join order matters for the '_x' / '_y' suffixes pandas adds to colliding
# non-key columns:
#   1. drivers x winners on (driver name, year)  -> 'Code_x' / 'Code_y'
#   2. ... x teams on (team name, year)          -> 'Pos_*', 'Car_*', 'PTS_*'
#   3. ... x driver victory counts (left join)   -> 'Winner_x' / 'Winner_y'
#   4. ... x team victory counts (left join)
# 'year' and 'Team' are key columns with the same name on both sides, so they
# are never suffixed.
merged_df = (
    drivers_df
    .merge(winners_df, left_on=['Driver', 'year'], right_on=['Winner', 'year'], how='inner')
    .merge(teams_df, left_on=['Team', 'year'], right_on=['Car', 'year'], how='inner')
    .merge(driver_victories_df, left_on='Driver', right_on='Winner', how='left')
    .merge(team_victories_df, on='Team', how='left')
    # Replace every remaining missing value with the literal string 'NA'.
    .fillna('NA')
)

# Helper / duplicated columns that should not reach the saved file. Names that
# are absent from the frame are skipped (errors='ignore').
columns_to_drop = [
    'Car_x', 'Car_y', 'Code_y', 'Date', 'Laps', 'Name', 'Points', 'Pos',
    'Pos_x', 'Pos_y', 'Position', 'PTS_x', 'PTS_y', 'Team_y', 'Time',
    'Winner', 'Winner_x', 'Winner_y', 'year_y',
]

merged_df = (
    merged_df
    .drop(columns=columns_to_drop, errors='ignore')
    # 'Code_x' (the drivers table's code) is the only suffixed column that survives.
    # NOTE(review): the team and year columns are saved as 'Team' and 'year'.
    # If they should be published as 'Scuderia' / 'Year', add them to this
    # mapping - that changes the CSV schema, so check downstream readers first.
    .rename(columns={'Code_x': 'Code'})
    # Keep only the first row found for each driver, so the per-race columns
    # (e.g. 'Grand Prix') describe that first matching race only.
    .drop_duplicates(subset=['Driver'])
)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('Sources/merged_data.csv', index=False)

# Display the merged dataframe
display(merged_df.head())



Unnamed: 0,Pos_x,Driver,Nationality,Car_x,PTS_x,year,Code_x,Grand Prix,Date,Winner,Team,Laps,Time,Name,Code_y,Pos_y,Car_y,PTS_y
0,6,Bruce McLaren,NZL,Cooper Climax,16.5,1959,MCL,United States,1959-12-12,Bruce McLaren,Cooper Climax,42.0,2:12:35.700,,,1,Cooper Climax,40.0
1,9,Giancarlo Baghetti,ITA,Ferrari,9.0,1961,BAG,France,1961-07-02,Giancarlo Baghetti,Ferrari,52.0,2:14:17.500,,,1,Ferrari,40.0
2,3,Jackie Stewart,GBR,BRM,33.0,1965,STE,Italy,1965-09-12,Jackie Stewart,BRM,76.0,2:04:52.800,,,2,BRM,45.0
3,3,Clay Regazzoni,SUI,Ferrari,33.0,1970,REG,Italy,1970-09-06,Clay Regazzoni,Ferrari,68.0,1:39:06.880,,,2,Ferrari,52.0
4,10,Emerson Fittipaldi,BRA,Lotus Ford,12.0,1970,FIT,United States,1970-10-04,Emerson Fittipaldi,Lotus Ford,108.0,1:57:32.790,,,1,Lotus Ford,59.0
