In [3]:
%pip install -q --upgrade numpy scikit-learn fastf1 keras xgboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
import fastf1
fastf1.Cache.enable_cache("cache") # Local cache for low-latency data retrieval

In [5]:
# clean air race pace from racepace.py
# NOTE: CLEAN AIR RACE PACE IS AVERAGE OF DRIVER LAP RIMES FROM FP2
clean_air_race_pace = {
    "VER": 93.191067, "HAM": 94.020622, "LEC": 93.418667, "NOR": 93.428600, "ALO": 94.784333,
    "PIA": 93.232111, "RUS": 93.833378, "SAI": 94.497444, "STR": 95.318250, "HUL": 95.345455,
    "OCO": 95.682128
}

In [None]:
# KERAS NN CONSTRUCTION:  functional API
# The Functional API, which is an easy-to-use, fully-featured API that supports arbitrary model architectures. For most people and most use cases, this is what you should be using. This is the Keras "industry strength" model.

# KERAS NN CONSTRUCTION: The sequential feedforward network is build using keras_model_sequential() with dense layers. model_1_ANN is built with 1 hidden layer, 5 neurons, 17 predictor variables, and 1 output layer. The activation function used is ReLU - rectified linear unit.

#model_1_ANN, ran with a mini-batch gradient descent size of 32 and the epoch hyperparameter set to 100. The validation data is set to 20% of the training data



In [None]:
#import keras 

#model=keras.Sequential()
#Input= number of columns in dataset
#Output= single target variable: ranking position
# dense layers used for NN construction
# Look into overfitting or bias/variance tradeoff. Generally, more complex models require a lot more data to train on
# Check variance - basically overfitting, bias - underfitting. how to solve? analyse prediction-true value graph


#XGBOOST
# use feature importance visualisation



In [None]:
# #STACKED MODEL: XGBOOST AND ANNS

# from sklearn.ensemble import StackingClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from xgboost import XGBClassifier
# from sklearn.neural_network import MLPClassifier

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define base models
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# nn_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)

# # Define stacking model
# stack_model = StackingClassifier(
#     estimators=[('xgb', xgb_model), ('nn', nn_model)],
#     final_estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# )

# # Train and evaluate
# stack_model.fit(X_train, y_train)
# y_pred = stack_model.predict(X_test)
# print(f"Stacking Model Accuracy: {accuracy_score(y_test, y_pred)}")

In [6]:
import pandas as pd
import datetime
def get_clean_air_race_pace(year, race, session):
   
   cur_session=fastf1.get_session(year,race,session)
   cur_session.load()
    
   laps=cur_session.laps
   laps=laps.drop(columns=['Time', 'DriverNumber', 'LapNumber', 'Stint',
         'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
         'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
         'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
         'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
         'LapStartDate', 'Position', 'DeletedReason',
         'FastF1Generated', 'IsAccurate'])
      #drop laps where trackstatus !=1
      #drop laps where it is deleted

   laps=laps[(laps['TrackStatus']=='1') & (laps['Deleted']==False) & (laps.LapTime.notnull())]
   laps=laps.drop(columns=['Deleted','TrackStatus'])

   #Find Personal Best
   personalbest=laps[laps['IsPersonalBest']==True]
   personalbest['LapTime (s)']=personalbest["LapTime"].dt.total_seconds()
   personalbest=personalbest.drop(columns=['LapTime'])
   personalbest_map=personalbest.sort_values('LapTime (s)').groupby('Driver').apply(pd.DataFrame.head,n=1).reset_index(drop=True)

   laps['LapTime (s)']=laps["LapTime"].dt.total_seconds()
   laps=laps.drop(columns=['LapTime'])
   avg_lap=laps['LapTime (s)'].groupby(laps['Driver']).mean()
   avg_lap 
      
    
   return personalbest_map,avg_lap


In [7]:
def get_session_weather(year, race, session):
    cur_session=fastf1.get_session(year,race,session)
    cur_session.load(weather=True)
    weather_data=cur_session.weather_data
    if weather_data.empty:
        return None  # or return default values
    
    weather_summary = {
        'air_temp': weather_data['AirTemp'].mean(),
        'humidity': weather_data['Humidity'].mean(),
        'pressure': weather_data['Pressure'].mean(),
        'track_temp': weather_data['TrackTemp'].mean(),
        'wind_speed': weather_data['WindSpeed'].mean(),
        'session_wet': weather_data['Rainfall'].any(),  # True if any rainfall
        'rainfall_readings': weather_data['Rainfall'].sum()  #count of True values
    }
    return weather_summary

In [8]:
def race_data():
    all_race_data = []
    races = {2025: ['Suzuka', 'Imola', 'Miami'], 2024: ['Zandvoort']}
    
    for year in races:
        for race in races[year]:
            # Get base race results
            race_session = fastf1.get_session(year,race,'R')
            race_session.load() 
            
            # Keep the Time column this time
            race_results = race_session.results.drop(columns=['BroadcastName','TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName',
       'HeadshotUrl', 'CountryCode', 'Position', 'Q1', 'Q2', 'Q3', 'Status', 'Points', 'Laps'])
            race_results = race_results.rename(columns={'Abbreviation':'Driver','GridPosition':'QualifyingPosition'})
            
            # Convert race time to seconds
            race_results['RaceTime_seconds'] = race_results['Time'].dt.total_seconds()
            
            # Handle DNFs - they have NaN race times
            max_time = race_results['RaceTime_seconds'].max()
            race_results['RaceTime_seconds'] = race_results['RaceTime_seconds'].fillna(max_time + 60)  # DNF penalty
            
            # Drop the original Time column now that we have seconds
            race_results = race_results.drop('Time', axis=1)
            
            # Rest of your existing code...
            for session in ['FP1', 'FP2', 'FP3']:
                try:
                    best_df, avg_series = get_clean_air_race_pace(year, race, session)
                    race_results[f'{session}_avg'] = race_results['Driver'].map(avg_series)
                    best_series = best_df.set_index('Driver')['LapTime (s)']
                    race_results[f'{session}_best'] = race_results['Driver'].map(best_series)
                    
                    weather_dict = get_session_weather(year, race, session)
                    for weather_var, value in weather_dict.items():
                        race_results[f'{session.lower()}_{weather_var}'] = value
                except ValueError as e:
                    print(f"Skipping {session} for {year} {race}: {e}")
                    continue
            
            race_results['Race'] = race
            race_results['Year'] = year
            all_race_data.append(race_results)
            
    combined_data = pd.concat(all_race_data)
    final_data = pd.get_dummies(combined_data, columns=['Race'], prefix='race')
    
    return final_data

In [9]:

data=race_data()
data

core           INFO 	Loading data for Japanese Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '12', '44', '6', '23', '87', '14', '22', '10', '55', '7', '27', '30', '31', '5', '18']
core           INFO 	Loading data for Japanese Grand Prix - Pr

Skipping FP2 for 2025 Miami: Session type 'FP2' does not exist for this event
Skipping FP3 for 2025 Miami: Session type 'FP3' does not exist for this event


core           INFO 	Loading data for Dutch Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '16', '81', '55', '11', '63', '44', '10', '14', '27', '3', '18', '23', '31', '2', '22', '20', '77', '24']
core           INFO 	Loading data for Dutch Grand Prix - Practic

Unnamed: 0,DriverNumber,Driver,DriverId,TeamName,ClassifiedPosition,QualifyingPosition,RaceTime_seconds,FP1_avg,FP1_best,fp1_air_temp,...,fp3_pressure,fp3_track_temp,fp3_wind_speed,fp3_session_wet,fp3_rainfall_readings,Year,race_Imola,race_Miami,race_Suzuka,race_Zandvoort
1,1,VER,max_verstappen,Red Bull Racing,1,1.0,4926.983,105.100833,89.065,13.598765,...,1015.060000,38.548000,3.324000,False,0.0,2025,False,False,True,False
4,4,NOR,norris,McLaren,2,2.0,1.423,100.331235,88.549,13.598765,...,1015.060000,38.548000,3.324000,False,0.0,2025,False,False,True,False
81,81,PIA,piastri,McLaren,3,3.0,2.129,107.272864,89.708,13.598765,...,1015.060000,38.548000,3.324000,False,0.0,2025,False,False,True,False
16,16,LEC,leclerc,Ferrari,4,4.0,16.097,107.001909,88.965,13.598765,...,1015.060000,38.548000,3.324000,False,0.0,2025,False,False,True,False
63,63,RUS,russell,Mercedes,5,5.0,17.362,101.712909,88.712,13.598765,...,1015.060000,38.548000,3.324000,False,0.0,2025,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,2,SAR,sargeant,Williams,16,18.0,64.539,101.000889,75.605,20.233333,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True
22,22,TSU,tsunoda,RB,17,11.0,65.146,96.911125,74.418,20.233333,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True
20,20,MAG,kevin_magnussen,Haas F1 Team,18,20.0,65.707,97.814111,73.597,20.233333,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True
77,77,BOT,bottas,Kick Sauber,19,16.0,3.248,,,20.233333,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True


In [10]:
pd.DataFrame.to_csv(data,'./data/combined_session_data.csv')

In [11]:
data.columns

Index(['DriverNumber', 'Driver', 'DriverId', 'TeamName', 'ClassifiedPosition',
       'QualifyingPosition', 'RaceTime_seconds', 'FP1_avg', 'FP1_best',
       'fp1_air_temp', 'fp1_humidity', 'fp1_pressure', 'fp1_track_temp',
       'fp1_wind_speed', 'fp1_session_wet', 'fp1_rainfall_readings', 'FP2_avg',
       'FP2_best', 'fp2_air_temp', 'fp2_humidity', 'fp2_pressure',
       'fp2_track_temp', 'fp2_wind_speed', 'fp2_session_wet',
       'fp2_rainfall_readings', 'FP3_avg', 'FP3_best', 'fp3_air_temp',
       'fp3_humidity', 'fp3_pressure', 'fp3_track_temp', 'fp3_wind_speed',
       'fp3_session_wet', 'fp3_rainfall_readings', 'Year', 'race_Imola',
       'race_Miami', 'race_Suzuka', 'race_Zandvoort'],
      dtype='object')

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Splitting train and test sets
drivers = data['Driver'].copy()

# Clean positions for reference (keep this for context)
data['ClassifiedPosition_numeric'] = pd.to_numeric(data['ClassifiedPosition'], errors='coerce')
retired_count = data['ClassifiedPosition'].eq('R').sum()
print(f"Found {retired_count} retired drivers")
data['ClassifiedPosition_clean'] = data['ClassifiedPosition_numeric'].fillna(21)

# NEW: Calculate average race lap time from total race time
# Assuming standard race distances (adjust these based on your tracks)
race_laps = {'Suzuka': 53, 'Imola': 63, 'Miami': 57, 'Zandvoort': 72}

# Create a race column to map laps
data['race_name'] = data[['race_Suzuka', 'race_Imola', 'race_Miami', 'race_Zandvoort']].idxmax(axis=1).str.replace('race_', '')

# Calculate average lap time (this is your new target)
data['laps_completed'] = data['race_name'].map(race_laps)
data['avg_race_laptime'] = data['RaceTime_seconds'] / data['laps_completed']

# Handle DNFs - they get a penalty lap time
max_laptime = data['avg_race_laptime'].max()
data['avg_race_laptime'] = data['avg_race_laptime'].fillna(max_laptime + 5)  # +5 second penalty

# Use average race lap time as target (similar scale to practice times)
y = data['avg_race_laptime']

# Select relevant features (similar to working example)
feature_cols = [
    'QualifyingPosition',     # Like QualifyingTime in working example
    'FP2_best',              # Your 'clean air race pace'
    'fp3_air_temp',          # Weather factors
    'fp3_humidity',
    'fp3_session_wet',
    'fp3_pressure',
    'fp3_track_temp',
    'fp3_wind_speed',
    'race_Suzuka',
    'race_Imola',
    'race_Miami',
    'race_Zandvoort'
]

X = data[feature_cols].fillna(0)

# # Add team performance (you'll need to create this mapping)
# team_performance = {
#     'Red Bull Racing': 1.0,
#     'McLaren': 0.85,
#     'Ferrari': 0.75,
#     'Mercedes': 0.70,
#     'Aston Martin': 0.45,
#     # Add other teams...
# }

# data['team_performance'] = data['TeamName'].map(team_performance).fillna(0.3)
# X['team_performance'] = data['team_performance']

# Split data
X_train, X_test, y_train, y_test, drivers_train, drivers_test = train_test_split(
    X, y, drivers, test_size=0.25, random_state=42
)

# Simple model (like the working example)
model = GradientBoostingRegressor(
    n_estimators=50,      # Moderate complexity
    learning_rate=0.1,    # Reasonable learning rate
    max_depth=3,          # Allow some complexity
    random_state=37
)

model.fit(X_train, y_train)

# Make predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Create results (sort by predicted lap time - fastest first)
test_results = pd.DataFrame({
    'Driver': drivers_test,
    'Predicted_LapTime': test_pred,
    'Actual_LapTime': y_test
}).sort_values('Predicted_LapTime')

print("Predicted Top 3 (Fastest Average Lap Times):")
print(test_results.head(3))

print(f"\nModel Error (MAE): {mean_absolute_error(y_test, test_pred):.3f} seconds")

# Feature importance
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(importance_df.head())

Found 6 retired drivers
Predicted Top 3 (Fastest Average Lap Times):
   Driver  Predicted_LapTime  Actual_LapTime
14    ALO         -12.335477        0.432540
14    ALO          -3.148238        1.081755
5     BOR          -1.327874        1.582962

Model Error (MAE): 21.246 seconds

Top 5 Most Important Features:
              feature  importance
0  QualifyingPosition    0.484534
1            FP2_best    0.190356
2        fp3_air_temp    0.098298
7      fp3_wind_speed    0.060091
3        fp3_humidity    0.057449


In [None]:
# # Splitting train and test sets
# drivers = data['Driver'].copy()

# data['ClassifiedPosition_numeric'] = pd.to_numeric(data['ClassifiedPosition'], errors='coerce')

# # Check how many 'R' values we have
# retired_count = data['ClassifiedPosition'].eq('R').sum()
# print(f"Found {retired_count} retired drivers")

# # Assign retired drivers to position 21 (last place)
# data['ClassifiedPosition_clean'] = data['ClassifiedPosition_numeric'].fillna(21)

# # Now use the cleaned target
# y = data['RaceTime_seconds']
# X = data.drop(columns=['ClassifiedPosition', 'RaceTime_seconds','ClassifiedPosition_numeric','ClassifiedPosition_clean','DriverId', 'Driver', 'DriverNumber','TeamName'])

# X_filled = X.fillna(0) 
# X_filled['quali_fp2_diff'] = X_filled['QualifyingPosition'] - X_filled['FP2_avg'].rank()
# X_filled['practice_consistency'] = X_filled[['FP1_avg', 'FP2_avg', 'FP3_avg']].std(axis=1)

# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error

# # Split X, y, AND drivers together
# X_train, X_test, y_train, y_test, drivers_train, drivers_test = train_test_split(
#     X_filled, y, drivers, test_size=0.25, random_state=42
# )

# # XGBOOST
# from sklearn.ensemble import GradientBoostingRegressor

# model = GradientBoostingRegressor(n_estimators=10,     # Much smaller
#     learning_rate=0.01,  # Much slower
#     max_depth=1,         # Very shallow
#     min_samples_leaf=5,  # Require more samples per leaf
#     random_state=37)
# model.fit(X_train, y_train)

# # Make predictions
# train_pred = model.predict(X_train)
# test_pred = model.predict(X_test)

# # Create results DataFrame with driver names
# train_results = pd.DataFrame({
#     'Driver': drivers_train,
#     'Predicted_Position': train_pred,
#     'Actual_Position': y_train
# }).sort_values('Predicted_Position')

# test_results = pd.DataFrame({
#     'Driver': drivers_test,
#     'Predicted_Position': test_pred,
#     'Actual_Position': y_test
# }).sort_values('Predicted_Position')

# print("Predicted Top 3 (Training):")
# print(train_results.head(3))

# print("\nPredicted Top 3 (Test):")
# print(test_results.head(3))

# print(f"\nModel Error (MAE): {mean_absolute_error(y_test, test_pred):.3f}")

Found 6 retired drivers
Predicted Top 3 (Training):
   Driver  Predicted_Position  Actual_Position
87    BEA           731.18641           54.529
16    LEC           731.18641           57.036
77    BOT           731.18641            3.248

Predicted Top 3 (Test):
   Driver  Predicted_Position  Actual_Position
14    ALO           731.18641           27.250
1     VER           731.18641           22.896
23    ALB           731.18641           40.711

Model Error (MAE): 1076.680


In [None]:
# Check what the model learned
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10))

Top 10 most important features:
                  feature  importance
33         quali_fp2_diff         1.0
0      QualifyingPosition         0.0
26        fp3_session_wet         0.0
20               FP3_best         0.0
21           fp3_air_temp         0.0
22           fp3_humidity         0.0
23           fp3_pressure         0.0
24         fp3_track_temp         0.0
25         fp3_wind_speed         0.0
27  fp3_rainfall_readings         0.0


In [None]:
train_mae = mean_absolute_error(y_train, train_pred)
test_mae = mean_absolute_error(y_test, test_pred)
print(f"Training MAE: {train_mae:.3f}")
print(f"Test MAE: {test_mae:.3f}")

# Check prediction ranges
print(f"Training predictions range: {train_pred.min():.2f} to {train_pred.max():.2f}")
print(f"Test predictions range: {test_pred.min():.2f} to {test_pred.max():.2f}")

Training MAE: 1225.013
Test MAE: 1076.680
Training predictions range: 731.19 to 984.08
Test predictions range: 731.19 to 984.08


In [None]:
print("y_train data type:", y_train.dtype)
print("y_train unique values:", y_train.unique())
print("y_train sample:", y_train.head(10))

# Check for non-numeric values in ClassifiedPosition
print("ClassifiedPosition unique values:", data['ClassifiedPosition'].unique())
print("ClassifiedPosition data type:", data['ClassifiedPosition'].dtype)

y_train data type: float64
y_train unique values: [5.452900e+01 3.213700e+01 1.867100e+01 6.018600e+01 3.299300e+01
 2.543900e+01 3.764400e+01 2.097900e+01 8.273400e+01 5.553199e+03
 5.391587e+03 3.601900e+01 3.713400e+01 7.460200e+01 1.155000e+00
 1.292900e+01 4.461700e+01 2.077400e+01 4.806700e+01 7.412900e+01
 6.514600e+01 1.609700e+01 8.343800e+01 4.036700e+01 3.954200e+01
 2.918200e+01 3.380800e+01 2.571200e+01 5.703600e+01 3.248000e+00
 8.195700e+01 2.289800e+01 4.630000e+00 2.203400e+01 6.057700e+01
 1.794500e+01 3.995600e+01 6.570700e+01 5.840100e+01 3.142400e+01
 6.453900e+01 2.733700e+01 1.353300e+01 3.857200e+01 2.644600e+01
 1.423000e+00 9.044500e+01 6.109000e+00 2.129000e+00 1.435600e+01
 4.687800e+01 5.493199e+03 5.445519e+03 2.106700e+01 8.131400e+01
 8.200600e+01]
y_train sample: 87      54.529
55      32.137
12      18.671
44      60.186
18      32.993
16      25.439
63      37.644
14      20.979
30      82.734
31    5553.199
Name: RaceTime_seconds, dtype: float64
Clas

In [None]:
# Check what data types we actually have after all the encoding
print("Data types after encoding:")
print(X_train_clean.dtypes)

# Check for any remaining object columns
object_cols = X_train_clean.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"\nRemaining object columns: {object_cols.tolist()}")
    for col in object_cols:
        print(f"{col} unique values: {X_train_clean[col].unique()}")

# Check if there are any mixed types in supposedly numeric columns
for col in X_train_clean.columns:
    # Try to identify which column has the problematic 'R'
    try:
        temp = pd.to_numeric(X_train_clean[col], errors='raise')
    except:
        print(f"Column {col} has non-numeric values:")
        print(X_train_clean[col].value_counts())
        break

Data types after encoding:


NameError: name 'X_train_clean' is not defined

In [None]:
X_train

Unnamed: 0,Driver,TeamName,QualifyingPosition,FP1_avg,FP1_best,fp1_air_temp,fp1_humidity,fp1_pressure,fp1_track_temp,fp1_wind_speed,...,fp3_pressure,fp3_track_temp,fp3_wind_speed,fp3_session_wet,fp3_rainfall_readings,Year,race_Imola,race_Miami,race_Suzuka,race_Zandvoort
87,BEA,Haas F1 Team,10.0,106.2736,90.077,13.598765,41.654321,1011.761728,35.941975,4.630864,...,1015.06,38.548,3.324,False,0.0,2025,False,False,True,False
55,SAI,Ferrari,10.0,97.269,73.074,20.233333,83.876543,1003.335802,21.725926,7.109877,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True
12,ANT,Mercedes,6.0,103.36881,89.284,13.598765,41.654321,1011.761728,35.941975,4.630864,...,1015.06,38.548,3.324,False,0.0,2025,False,False,True,False
44,HAM,Ferrari,12.0,93.592364,88.556,27.873077,50.217949,1017.666667,41.64359,1.852564,...,,,,,,2025,False,True,False,False
18,STR,Aston Martin,8.0,96.945188,77.032,19.302632,32.421053,1007.957895,39.531579,3.206579,...,1009.070513,42.158974,1.653846,False,0.0,2025,True,False,False,False
16,LEC,Ferrari,6.0,97.472,74.306,20.233333,83.876543,1003.335802,21.725926,7.109877,...,1004.411392,21.572152,1.975949,True,17.0,2024,False,False,False,True
63,RUS,Mercedes,5.0,94.67675,88.058,27.873077,50.217949,1017.666667,41.64359,1.852564,...,,,,,,2025,False,True,False,False
14,ALO,Aston Martin,17.0,109.158278,88.243,27.873077,50.217949,1017.666667,41.64359,1.852564,...,,,,,,2025,False,True,False,False
30,LAW,Racing Bulls,13.0,109.560739,89.536,13.598765,41.654321,1011.761728,35.941975,4.630864,...,1015.06,38.548,3.324,False,0.0,2025,False,False,True,False
31,OCO,Haas F1 Team,18.0,97.328133,77.662,19.302632,32.421053,1007.957895,39.531579,3.206579,...,1009.070513,42.158974,1.653846,False,0.0,2025,True,False,False,False


In [None]:
# Check data types and problematic values
print("Data types:")
print(X_train.dtypes)
print("\nUnique values in object columns:")
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        print(f"{col}: {X_train[col].unique()[:10]}")  # Show first 10 unique values

Data types:
Driver                    object
TeamName                  object
QualifyingPosition       float64
FP1_avg                  float64
FP1_best                 float64
fp1_air_temp             float64
fp1_humidity             float64
fp1_pressure             float64
fp1_track_temp           float64
fp1_wind_speed           float64
fp1_session_wet             bool
fp1_rainfall_readings      int64
FP2_avg                  float64
FP2_best                 float64
fp2_air_temp             float64
fp2_humidity             float64
fp2_pressure             float64
fp2_track_temp           float64
fp2_wind_speed           float64
fp2_session_wet           object
fp2_rainfall_readings    float64
FP3_avg                  float64
FP3_best                 float64
fp3_air_temp             float64
fp3_humidity             float64
fp3_pressure             float64
fp3_track_temp           float64
fp3_wind_speed           float64
fp3_session_wet           object
fp3_rainfall_readings    float6

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate standard metrics
xgb_mae = mean_absolute_error(y_test, xgb_pred)
ann_mae = mean_absolute_error(y_test, ann_pred)

xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
ann_rmse = np.sqrt(mean_squared_error(y_test, ann_pred))

xgb_r2 = r2_score(y_test, xgb_pred)
ann_r2 = r2_score(y_test, ann_pred)

print(f"XGBoost - MAE: {xgb_mae:.3f}, RMSE: {xgb_rmse:.3f}, R²: {xgb_r2:.3f}")
print(f"ANN - MAE: {ann_mae:.3f}, RMSE: {ann_rmse:.3f}, R²: {ann_r2:.3f}")