In [5]:
#subproblem 5
import pandas as pd

# Load datasets
drivers_df = pd.read_csv('drivers.csv')
constructors_df = pd.read_csv('constructors.csv')
driver_standings_df = pd.read_csv('driver_standings.csv')
constructor_standings_df = pd.read_csv('constructor_standings.csv')
final_df = pd.read_excel('final_df.xlsx')  # Merged dataset


# Choose two drivers to swap
driver_1 = 'Max Verstappen'  # From Red Bull
driver_2 = 'Lewis Hamilton'  # From Mercedes

# Get driver IDs (using str.contains for flexible matching)
driver_1_id = drivers_df[drivers_df['surname'].str.contains(driver_1.split()[-1])]['driverId'].values[0]  # Get ID using surname
driver_2_id = drivers_df[drivers_df['surname'].str.contains(driver_2.split()[-1])]['driverId'].values[0]  # Get ID using surname

# Get constructor IDs for Red Bull and Mercedes
red_bull_id = constructors_df[constructors_df['name'] == 'Red Bull']['constructorId'].values[0]
mercedes_id = constructors_df[constructors_df['name'] == 'Mercedes']['constructorId'].values[0]

# Swap constructor affiliations
final_df.loc[final_df['driverId'] == driver_1_id, 'constructorId'] = mercedes_id
final_df.loc[final_df['driverId'] == driver_2_id, 'constructorId'] = red_bull_id

In [6]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Assuming final_df is loaded correctly before this cell

# Define X and y
# 'positionOrder' is the correct column name in your final_df for finishing position
X = final_df.drop(columns=['position'])  # Features, corrected column name
y = final_df['position']                # Target, corrected column name             # Target, corrected column name

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X, y)

# Predict finishing positions and add the column to final_df
final_df['new_predicted_position'] = rf_model.predict(X)

# Now, proceed with updating driver points:
driver_standings_df['new_points'] = driver_standings_df['driverId'].map(
    final_df.groupby('driverId')['new_predicted_position'].mean()
)

# Sort drivers based on new points
driver_standings_df = driver_standings_df.sort_values('new_points', ascending=True)

print("\n Updated Driver Standings:\n", driver_standings_df.head(10))


 Updated Driver Standings:
        driverStandingsId  raceId  driverId  points  position positionText  \
31135              67919     963         1   265.0         2            2   
31847              68770     995         1   120.0         2            2   
33124              70451    1060         1   150.0         2            2   
33125              70471    1061         1   177.0         2            2   
34601              72878    1133         1   125.0         6            6   
28534              64481     846         1    85.0         2            2   
33646              71524    1085         1   127.0         6            6   
30195              66851     910         1   191.0         2            2   
30217              66873     911         1   191.0         2            2   
32507              69690    1028         1   381.0         1            1   

       wins  new_points  
31135     6    3.214925  
31847     2    3.214925  
33124     3    3.214925  
33125     4    3.21