In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#Step 1: Data Extraction and Loading

#Extract Data: Load the necessary tables from the Ergast dataset, which includes drivers, constructors, races, qualifying, results, driverStandings, and constructorStandings.
#Load Data: Use a database client (e.g., MySQL, SQLite) or CSV files to load the data into a pandas DataFrame.
# Load data
constructors_df = pd.read_csv("f1db_csv/constructors.csv")
races_df=pd.read_csv("f1db_csv/races.csv")
quali_df=pd.read_csv("f1db_csv/qualifying.csv")
results_df=pd.read_csv("f1db_csv/results.csv")
driverStandings_df=pd.read_csv("f1db_csv/driver_standings.csv")
constructorStandings_df=pd.read_csv("f1db_csv/constructor_standings.csv")
drivers_df = pd.read_csv('f1db_csv/drivers.csv')

#Step 2: Data Preprocessing

#Merge Tables:
#Merge results with races to get race-specific details.
#Merge the combined DataFrame with qualifying to get qualifying positions.
#Merge with driverStandings and constructorStandings to include standings.
#Merge with drivers and constructors to add driver and constructor details.

# Merge data
data = results_df.merge(races_df, on="raceId")
data1 = data.merge(quali_df, on=["raceId","driverId","constructorId"], how="left")
data2 = data1.merge(driverStandings_df, on=["raceId","driverId"], how="left")
data3 = data2.merge(constructorStandings_df, on=['raceId','constructorId'])
data4 = data3.merge(drivers_df, on=['driverId'], how='left')
data5 = data4.merge(constructors_df, on=['constructorId'], how='left')

In [2]:
#Handle Missing Values:
#Fill or drop missing values as appropriate for the analysis.
#Common strategies include filling with mean/median/mode or dropping rows with missing critical values.
#Convert Data Types:
#Ensure all columns are in the appropriate data type (e.g., integers for IDs, floats for numerical values).

columns=data5.columns

data5.replace('\\N', np.nan, inplace=True)
total_n_values = (data5.applymap(lambda x: x == '\\N')).sum().sum()
print(total_n_values)

data5.to_csv('output.csv', index=False)
f1data=pd.read_csv("output.csv", low_memory=False, index_col=False)
f1data.drop(['drivers_url','const_url'], axis=1, inplace=True)

  total_n_values = (data5.applymap(lambda x: x == '\\N')).sum().sum()


0


In [3]:
#Step 3: Feature Engineering

#Create New Features:
#Qualifying Position: Use the qualifying position as a feature.
#Previous Race Results: Calculate the average finishing position of the driver and constructor in previous races.
#Circuit Characteristics: Extract features such as circuit length, number of turns, and altitude.
#Weather Conditions: If available, include weather conditions like temperature, rain probability, etc.
#Driver and Constructor Standings: Use the current standings points and positions.
#Normalize/Standardize Features: Scale numerical features to ensure they have a mean of 0 and standard deviation of 1 (standardization) or scale between 0 and 1 (normalization).

# Feature engineering
f1data['average_finish'] = f1data.groupby('driverId')['results_positionOrder'].transform('mean')
f1data['date']=pd.to_datetime(f1data['races_date']).apply(lambda x: x.year)
f1data['age'] = f1data['date'] - pd.to_datetime(f1data['drivers_dob']).dt.year
f1data['experience'] = f1data['year'] - f1data.groupby('driverId')['year'].transform('min')
f1data['average_lap_time'] = f1data.groupby(['raceId', 'driverId'])['results_milliseconds'].transform('mean')

# Select features and target
features = ['age','experience','results_grid','quali_position','average_lap_time','average_finish','driver_standings_points','driver_standings_position','const_standing_points','const_standing_position']
target = 'results_positionOrder'
X = f1data[features]
y = f1data[target]

# Handling missing values
X.fillna(X.mean(), inplace=True)

# Encoding categorical variables
X = pd.get_dummies(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


In [4]:
f1data_filtered = f1data[f1data['date'] >= 2004]
f1data_filtered = f1data_filtered.sort_values(by='date', ascending=True)
f1data_filtered.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,results_number,results_grid,results_position,results_positionText,results_positionOrder,results_points,...,drivers_dob,drivers_nationality,constructorRef,const_name,const_nationality,average_finish,date,age,experience,average_lap_time
1590,1601,94,21,15,11.0,12,7.0,7,7,2.0,...,1973-01-14,Italian,sauber,Sauber,Swiss,10.880531,2004,31,8,5329949.0
1618,1627,95,8,1,6.0,5,,R,13,0.0,...,1979-10-17,Finnish,mclaren,McLaren,British,8.491477,2004,25,3,
1619,1632,95,14,1,5.0,8,,R,18,0.0,...,1971-03-27,British,mclaren,McLaren,British,9.386831,2004,33,10,
1620,1630,95,17,19,14.0,11,,R,16,0.0,...,1976-08-27,Australian,jaguar,Jaguar,British,9.691589,2004,28,2,
1621,1634,95,32,19,15.0,14,,R,20,0.0,...,1983-02-07,Austrian,jaguar,Jaguar,British,13.666667,2004,21,0,


In [5]:
f1data_filtered.isna().sum()

resultId               0
raceId                 0
driverId               0
constructorId          0
results_number         0
                    ... 
average_finish         0
date                   0
age                    0
experience             0
average_lap_time    4223
Length: 66, dtype: int64

In [None]:
scaler=MinMaxScaler()
scaler.fit(X)
scaled_data=scaler.transform(X)

In [None]:
scaled_data

In [6]:
column_types = f1data.dtypes

# Filter columns with 'object' type (string columns)
string_columns = column_types[column_types == 'object'].index.tolist()

In [7]:
len(string_columns)

31

In [8]:
from pycaret.regression import *

setup(data=f1data_filtered, 
      target=target, 
      numeric_features=features, 
      ignore_features=string_columns)

Unnamed: 0,Description,Value
0,Session id,2570
1,Target,results_positionOrder
2,Target type,Regression
3,Original data shape,"(8350, 66)"
4,Transformed data shape,"(8350, 35)"
5,Transformed train set shape,"(5845, 35)"
6,Transformed test set shape,"(2505, 35)"
7,Ignore features,31
8,Numeric features,10
9,Rows with missing values,100.0%


<pycaret.regression.oop.RegressionExperiment at 0x16e6492a090>

In [9]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.2838,0.4515,0.6678,0.9882,0.0405,0.0238,9.89
rf,Random Forest Regressor,0.2392,0.4635,0.6775,0.9879,0.0373,0.0135,2.944
lightgbm,Light Gradient Boosting Machine,0.2576,0.4701,0.6813,0.9877,0.0379,0.0167,0.563
xgboost,Extreme Gradient Boosting,0.2699,0.4928,0.6997,0.9871,0.0393,0.0176,0.65
dt,Decision Tree Regressor,0.3285,0.9666,0.9771,0.9748,0.0535,0.0183,0.082
dummy,Dummy Regressor,5.3531,38.3613,6.1931,-0.0016,0.6895,1.2278,0.049


<catboost.core.CatBoostRegressor at 0x16e6cdde9d0>

In [10]:
lgbm_model=create_model('catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2995,0.4931,0.7022,0.9874,0.0439,0.0258
1,0.2851,0.4617,0.6795,0.9879,0.0389,0.0235
2,0.307,0.5119,0.7155,0.986,0.0473,0.0274
3,0.2788,0.4495,0.6704,0.9887,0.0411,0.0236
4,0.3206,0.5602,0.7485,0.9857,0.0422,0.0245
5,0.2607,0.4198,0.6479,0.9886,0.0389,0.0218
6,0.2561,0.304,0.5514,0.9921,0.031,0.0204
7,0.2578,0.3586,0.5988,0.9906,0.0354,0.0214
8,0.3226,0.6327,0.7954,0.984,0.0502,0.0276
9,0.2495,0.3232,0.5685,0.9913,0.0359,0.0219


In [None]:
evaluate_model(lgbm_model)

In [None]:
prediction_df=predict_model(lgbm_model)

In [None]:
new_predictions=predict_model(lgbm_model, f1data_filtered)

In [None]:
plot_model(lgbm_model, plot='feature')

In [None]:
new_predictions.head()

In [None]:
newf1=f1data_filtered.drop(columns=string_columns)
newf1_filtered=newf1.drop(columns='results_positionOrder')


In [None]:
len(newf1_filtered.columns)

In [None]:
#Interpret the model


import shap

# Assuming your model is named 'best_model'
explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(newf1_filtered)

# Plot feature importance
shap.summary_plot(shap_values, newf1_filtered)

In [None]:
from pycaret.classification import tune_model

tuned_model = tune_model(lgbm_model)
print(tuned_model)


In [None]:
import joblib

# Save the model
joblib.dump(tuned_model, 'lgbm_model.pkl')

# Load the model
loaded_model = joblib.load('lgbm_model.pkl')