In [19]:
import multiprocessing

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [20]:
#load the cleaned F1 dataset; the CSV's 'driver_name' column is relabelled 'team_name'
#NOTE(review): presumably that column really holds team names ('code' looks like the driver identifier) -- confirm
data = pd.read_csv('f1_cleaned.csv').rename(columns={'driver_name': 'team_name'})
data.dtypes

team_name         object
code              object
driver_nat        object
circuitRef        object
year             float64
round            float64
starting_pos     float64
finishing_pos    float64
laps             float64
quali_mean       float64
driver_age       float64
driver_dnf         int64
car_dnf            int64
dtype: object

In [47]:
#train/test split
#time-based rather than random: fit on seasons before 2024, then try to predict the races that occurred in 2024

train = data[data['year'] < 2024]
test = data[data['year'] == 2024]

#training set: pop removes the target column from the frame in place and returns it,
#so x_train (the same object as train) ends up holding only the features
x_train = train
y_train = train.pop('finishing_pos')

#testing set: same treatment, x_test is the same object as test
x_test = test
y_test = test.pop('finishing_pos')


In [22]:
#column groups for preprocessing: categoricals are one-hot encoded, numerics are standardized

cat_feat = ['team_name', 'code', 'driver_nat', 'circuitRef']
x_num_feat = ['year', 'round', 'starting_pos', 'laps',
              'quali_mean', 'driver_age', 'driver_dnf', 'car_dnf']

#the target is left unscaled; scale y later if a distance-based model needs it

#drop='first' removes one dummy per categorical to avoid the dummy variable trap
#NOTE(review): together with handle_unknown='ignore', a category not seen during fit is encoded as all zeros,
#which is the same encoding as the dropped first level -- confirm that is acceptable for new drivers/teams
one_hot = OneHotEncoder(handle_unknown='ignore', drop='first')

ct = ColumnTransformer(transformers=[
    ('encoder', one_hot, cat_feat),
    ('scx', StandardScaler(), x_num_feat),
])




In [23]:
#Placeholder cell for parameter tuning (grid search) -- nothing implemented yet

In [32]:
#Basic LR model (no tuning)
#preprocessing (ct) and the regressor are fitted together on the training set, then scored on the held-out test set

model = Pipeline(steps=[('preprocessor', ct), ('regressor', LinearRegression())]).fit(x_train, y_train)
y_pred = model.predict(x_test)

#evaluation metrics on the test set
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('R2;', r2)
print('Root Mean Squared Error:', rmse)

R2; 0.6229827414899718
Root Mean Squared Error: 3.5342341857980966




In [30]:
#Basic RF model (no tuning)
#random_state is fixed so the forest's random choices, and therefore the scores printed by this cell,
#are the same on every run

model = Pipeline(steps=[
    ('preprocessor', ct),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

#fit preprocessing + forest on the training set, then predict the held-out test set
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)
print('R2;', r2)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)


R2; 0.7308738885643924
Root Mean Squared Error: 2.9860194071357324




In [48]:
#Basic XGB model (manual preprocess as pipeline was getting error)
#No regularization parameter is set explicitly, so any L2 penalty comes from XGBoost's defaults.
#
#The encoded matrices get their own names (x_train_enc / x_test_enc) so that x_train / x_test
#stay as the raw feature frames. Overwriting them made this cell fail when run a second time and
#broke the pipeline cells, because ct selects its columns by name.

#fit the preprocessor on the training features only, then train on the encoded matrix
x_train_enc = ct.fit_transform(x_train)
regressor = XGBRegressor(objective='reg:squarederror', n_estimators=25, n_jobs=multiprocessing.cpu_count())
regressor.fit(x_train_enc, y_train)

#apply the already-fitted preprocessor to the test features (no refit)
x_test_enc = ct.transform(x_test)
y_pred = regressor.predict(x_test_enc)

r2 = r2_score(y_test, y_pred)
print('R2;', r2)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)


R2; 0.7401179477769638
Root Mean Squared Error: 2.9342887708687915




In [38]:
#side-by-side table of actual vs predicted finishing positions for the test rows
#y_pred is whatever the most recently run model cell produced

comparison_df = (
    test[['code', 'circuitRef']]
    .rename(columns={'code': 'Driver', 'circuitRef': 'Circut'})
    .assign(**{'Actual Pos': y_test, 'Predicted Pos': y_pred})
)