In [1]:
import pandas as pd
import numpy as np

#Step 1: Data Extraction and Loading

#Extract Data: Load the necessary tables from the Ergast dataset, which includes drivers, constructors, races, qualifying, results, driverStandings, and constructorStandings.
#Load Data: Use a database client (e.g., MySQL, SQLite) or CSV files to load the data into a pandas DataFrame.
# Read every CSV table into its own DataFrame.
# Paths are relative to the notebook's working directory.
drivers_df = pd.read_csv("f1db_csv/drivers.csv")
constructors_df = pd.read_csv("f1db_csv/constructors.csv")
races_df = pd.read_csv("f1db_csv/races.csv")
quali_df = pd.read_csv("f1db_csv/qualifying.csv")
results_df = pd.read_csv("f1db_csv/results.csv")
driverStandings_df = pd.read_csv("f1db_csv/driver_standings.csv")
constructorStandings_df = pd.read_csv("f1db_csv/constructor_standings.csv")

#Step 2: Data Preprocessing

#Merge Tables:
#Merge results with races to get race-specific details.
#Merge the combined DataFrame with qualifying to get qualifying positions.
#Merge with driverStandings and constructorStandings to include standings.
#Merge with drivers and constructors to add driver and constructor details.

# Build the modelling table one join at a time, starting from the race results.
# Each intermediate frame keeps its own name so any step can be inspected.

# Race details: default (inner) join on raceId.
data = pd.merge(results_df, races_df, on="raceId")

# Qualifying: left join, so results without a qualifying row are kept.
data1 = pd.merge(data, quali_df, how="left", on=["raceId", "driverId", "constructorId"])

# Driver standings for the same race: left join.
data2 = pd.merge(data1, driverStandings_df, how="left", on=["raceId", "driverId"])

# NOTE(review): this is the only lookup join that uses the default inner join,
# so result rows with no matching constructor-standings row are dropped here.
# Confirm that is intended (the other lookups all use how="left").
data3 = pd.merge(data2, constructorStandings_df, on=["raceId", "constructorId"])

# Driver and constructor attributes: left joins on their ids.
data4 = pd.merge(data3, drivers_df, how="left", on=["driverId"])
data5 = pd.merge(data4, constructors_df, how="left", on=["constructorId"])

In [2]:
#Handle Missing Values:
#Fill or drop missing values as appropriate for the analysis.
#Common strategies include filling with mean/median/mode or dropping rows with missing critical values.
#Convert Data Types:
#Ensure all columns are in the appropriate data type (e.g., integers for IDs, floats for numerical values).

# Column index of the merged table; not used by the statements below.
columns = data5.columns

# Treat the literal string '\N' as a missing value and convert it to NaN.
# Assigning the result (instead of inplace=True) keeps the step easy to re-run.
data5 = data5.replace('\\N', np.nan)

# Sanity check: count any '\N' strings that survived the replacement (expected 0).
# DataFrame.isin is vectorised and avoids the deprecated DataFrame.applymap.
total_n_values = data5.isin(['\\N']).sum().sum()
print(total_n_values)

# Round-trip through CSV: write the cleaned table and read it back.
# NOTE(review): presumably this is done so pandas re-infers column dtypes now
# that the '\N' strings are gone — confirm; 'output.csv' is left on disk.
data5.to_csv('output.csv', index=False)
f1data = pd.read_csv("output.csv", low_memory=False, index_col=False)

# Drop the two URL columns from the working table.
f1data = f1data.drop(columns=['drivers_url', 'const_url'])

  total_n_values = (data5.applymap(lambda x: x == '\\N')).sum().sum()


0


In [3]:
#Step 3: Feature Engineering

#Create New Features:
#Qualifying Position: Use the qualifying position as a feature.
#Previous Race Results: Calculate the average finishing position of the driver and constructor in previous races.
#Circuit Characteristics: Extract features such as circuit length, number of turns, and altitude.
#Weather Conditions: If available, include weather conditions like temperature, rain probability, etc.
#Driver and Constructor Standings: Use the current standings points and positions.
#Normalize/Standardize Features: Scale numerical features to ensure they have a mean of 0 and standard deviation of 1 (standardization) or scale between 0 and 1 (normalization).

# Feature engineering

# Mean finishing position per driver.
# NOTE(review): the mean is taken over ALL of the driver's rows, including the
# race being predicted and later races, so it contains information from the
# target column. Consider a shifted expanding mean ordered by race date.
f1data['average_finish'] = f1data.groupby('driverId')['results_positionOrder'].transform('mean')

# Despite its name, 'date' holds only the calendar year of the race.
f1data['date'] = pd.to_datetime(f1data['races_date']).dt.year

# Age in whole calendar years: race year minus birth year.
f1data['age'] = f1data['date'] - pd.to_datetime(f1data['drivers_dob']).dt.year

# Seasons elapsed since the driver's first season present in the data.
f1data['experience'] = f1data['year'] - f1data.groupby('driverId')['year'].transform('min')

# Mean of results_milliseconds within each (raceId, driverId) group.
# NOTE(review): results_milliseconds presumably describes the outcome of the
# race being predicted, which would leak the target — confirm before relying
# on this feature.
f1data['average_lap_time'] = f1data.groupby(['raceId', 'driverId'])['results_milliseconds'].transform('mean')

# Select features and target
features = [
    'age',
    'experience',
    'results_grid',
    'quali_position',
    'average_lap_time',
    'average_finish',
    'driver_standings_points',
    'driver_standings_position',
    'const_standing_points',
    'const_standing_position',
]
target = 'results_positionOrder'

# Take an explicit copy so the imputation below never operates on a selection
# of f1data (the original in-place fillna raised SettingWithCopyWarning).
X = f1data[features].copy()
y = f1data[target]

# Handling missing values: fill each column's NaNs with that column's mean.
# NOTE(review): the means are computed on the full table, before the
# train/test split.
X = X.fillna(X.mean())

# Encoding categorical variables (leaves purely numeric columns unchanged).
X = pd.get_dummies(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


In [4]:
from sklearn.model_selection import train_test_split

# Turn the finishing position into five quantile buckets so the problem can be
# treated as classification; labels=False yields integer bucket codes.
y_binned = pd.qcut(y, 5, labels=False)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binned,
    test_size=0.2,
    random_state=42,
)

In [5]:
#Step 5: Model Selection and Training
#Choose a Model:
#Regression models like Linear Regression, Ridge Regression.
#Tree-based models like Decision Trees, Random Forests, Gradient Boosting.
#Ensemble models like XGBoost, LightGBM.

#Train the Model:
#Train the chosen model using the training set.
#Use cross-validation to tune hyperparameters and avoid overfitting.

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

# Baseline CatBoost configuration; verbose=100 logs the training loss every
# 100 iterations.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'random_state': 42,
    'verbose': 100,
}
catboost_model = CatBoostClassifier(**catboost_params)

# Fit on the training split.
catboost_model.fit(X_train, y_train)

# Predict the position bucket for the held-out rows.
y_pred = catboost_model.predict(X_test)

# Report overall accuracy followed by per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

0:	learn: 1.5228697	total: 198ms	remaining: 3m 17s
100:	learn: 1.0141697	total: 5.55s	remaining: 49.4s
200:	learn: 0.9692543	total: 11.3s	remaining: 44.9s
300:	learn: 0.9371114	total: 18.8s	remaining: 43.7s
400:	learn: 0.9074144	total: 23.1s	remaining: 34.5s
500:	learn: 0.8815588	total: 27.8s	remaining: 27.7s
600:	learn: 0.8577650	total: 31.9s	remaining: 21.2s
700:	learn: 0.8385211	total: 37.8s	remaining: 16.1s
800:	learn: 0.8190675	total: 43.8s	remaining: 10.9s
900:	learn: 0.8002419	total: 49s	remaining: 5.39s
999:	learn: 0.7819891	total: 53.8s	remaining: 0us
Accuracy: 0.525086329473898
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      1050
           1       0.47      0.46      0.46      1025
           2       0.38      0.39      0.39      1041
           3       0.39      0.44      0.41       998
           4       0.59      0.51      0.54       809

    accuracy                           0.53      4923
   macro avg       0.53  

In [6]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 3 = 27 candidate settings.
param_grid = dict(
    iterations=[500, 1000, 1500],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[4, 6, 8],
)

# 3-fold cross-validated grid search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=CatBoostClassifier(random_state=42, verbose=100),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Best setting found and the estimator refit with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split.
# NOTE: y_pred and accuracy overwrite the values computed for the baseline model.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Model Accuracy: {accuracy}')
print(f'Best Parameters: {best_params}')
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
0:	learn: 1.5672903	total: 24.6ms	remaining: 12.3s
100:	learn: 1.0752569	total: 2.15s	remaining: 8.51s
200:	learn: 1.0420745	total: 4.18s	remaining: 6.22s
300:	learn: 1.0218483	total: 6.25s	remaining: 4.13s
400:	learn: 1.0096699	total: 7.99s	remaining: 1.97s
499:	learn: 1.0006602	total: 9.79s	remaining: 0us
Best Model Accuracy: 0.5303676619947186
Best Parameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1050
           1       0.47      0.45      0.46      1025
           2       0.39      0.40      0.40      1041
           3       0.40      0.46      0.43       998
           4       0.60      0.51      0.55       809

    accuracy                           0.53      4923
   macro avg       0.54      0.53      0.53      4923
weighted avg       0.54      0.53      0.53      4923

