In [3]:
import pandas as pd

data = pd.read_csv('Combined_final-laps.csv')



In [4]:
def convert_lap_time(lap_time_str):
    minutes, seconds, milliseconds = lap_time_str.split(':')
    return int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000

# Apply the conversion to the FormattedLapTime column
data['LapTimeSeconds'] = data['FormattedLapTime'].apply(convert_lap_time)

# Drop the original FormattedLapTime column as it's no longer needed
data.drop(columns=['FormattedLapTime'], inplace=True)
data.drop(columns=['WindDirection'], inplace=True)
data.drop(columns=['Rainfall'], inplace=True)

One Hot Encoded - Compound


In [5]:
data_encoded = pd.get_dummies(data, columns=[ 'Compound'], drop_first=True)
data_encoded.style.set_sticky(axis="columns")
print(data_encoded.columns)
data_encoded.head(20)

Index(['Driver', 'DriverNumber', 'LapNumber', 'Stint', 'TyreLife', 'Team',
       'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed', 'Track',
       'LapTimeSeconds', 'Compound_MEDIUM', 'Compound_SOFT'],
      dtype='object')


Unnamed: 0,Driver,DriverNumber,LapNumber,Stint,TyreLife,Team,AirTemp,Humidity,Pressure,TrackTemp,WindSpeed,Track,LapTimeSeconds,Compound_MEDIUM,Compound_SOFT
0,GAS,10,2.0,1.0,5.0,Toro Rosso,21.4,23.9,1002.0,28.9,2.5,Austin,104.253,False,True
1,GAS,10,3.0,1.0,6.0,Toro Rosso,21.5,24.7,1002.0,28.9,3.0,Austin,103.246,False,True
2,GAS,10,4.0,1.0,7.0,Toro Rosso,21.1,24.8,1001.8,29.0,1.6,Austin,102.794,False,True
3,GAS,10,5.0,1.0,8.0,Toro Rosso,21.2,25.0,1001.7,29.0,1.4,Austin,102.481,False,True
4,GAS,10,6.0,1.0,9.0,Toro Rosso,21.4,24.9,1001.7,29.1,0.7,Austin,102.402,False,True
5,GAS,10,7.0,1.0,10.0,Toro Rosso,21.8,23.9,1001.8,29.5,2.6,Austin,103.118,False,True
6,GAS,10,8.0,1.0,11.0,Toro Rosso,21.7,23.2,1001.7,29.3,2.9,Austin,103.406,False,True
7,GAS,10,9.0,1.0,12.0,Toro Rosso,21.5,23.2,1001.7,29.3,3.0,Austin,103.19,False,True
8,GAS,10,10.0,1.0,13.0,Toro Rosso,21.4,24.3,1001.6,29.1,1.9,Austin,103.39,False,True
9,GAS,10,11.0,1.0,14.0,Toro Rosso,21.2,24.6,1001.6,29.1,2.0,Austin,103.658,False,True


Featrue Engineering on Tyre Compound


In [6]:
data_encoded['TyreDegradation'] = data_encoded['TyreLife'] * \
    (data_encoded['Compound_SOFT'] if 'Compound_SOFT' in data_encoded else 0) + \
    (data_encoded['TyreLife'] * 0.8 * (data_encoded['Compound_MEDIUM'] if 'Compound_MEDIUM' in data_encoded else 0)) + \
    (data_encoded['TyreLife'] * 0.5 * (data_encoded['Compound_HARD'] if 'Compound_HARD' in data_encoded else 0))

data_encoded.head(10)

Unnamed: 0,Driver,DriverNumber,LapNumber,Stint,TyreLife,Team,AirTemp,Humidity,Pressure,TrackTemp,WindSpeed,Track,LapTimeSeconds,Compound_MEDIUM,Compound_SOFT,TyreDegradation
0,GAS,10,2.0,1.0,5.0,Toro Rosso,21.4,23.9,1002.0,28.9,2.5,Austin,104.253,False,True,5.0
1,GAS,10,3.0,1.0,6.0,Toro Rosso,21.5,24.7,1002.0,28.9,3.0,Austin,103.246,False,True,6.0
2,GAS,10,4.0,1.0,7.0,Toro Rosso,21.1,24.8,1001.8,29.0,1.6,Austin,102.794,False,True,7.0
3,GAS,10,5.0,1.0,8.0,Toro Rosso,21.2,25.0,1001.7,29.0,1.4,Austin,102.481,False,True,8.0
4,GAS,10,6.0,1.0,9.0,Toro Rosso,21.4,24.9,1001.7,29.1,0.7,Austin,102.402,False,True,9.0
5,GAS,10,7.0,1.0,10.0,Toro Rosso,21.8,23.9,1001.8,29.5,2.6,Austin,103.118,False,True,10.0
6,GAS,10,8.0,1.0,11.0,Toro Rosso,21.7,23.2,1001.7,29.3,2.9,Austin,103.406,False,True,11.0
7,GAS,10,9.0,1.0,12.0,Toro Rosso,21.5,23.2,1001.7,29.3,3.0,Austin,103.19,False,True,12.0
8,GAS,10,10.0,1.0,13.0,Toro Rosso,21.4,24.3,1001.6,29.1,1.9,Austin,103.39,False,True,13.0
9,GAS,10,11.0,1.0,14.0,Toro Rosso,21.2,24.6,1001.6,29.1,2.0,Austin,103.658,False,True,14.0


Fuel Load Calculation


In [7]:
data_encoded['Fuel_Load']= 1-(data_encoded['LapNumber']/50)
data_encoded.head(10)

Unnamed: 0,Driver,DriverNumber,LapNumber,Stint,TyreLife,Team,AirTemp,Humidity,Pressure,TrackTemp,WindSpeed,Track,LapTimeSeconds,Compound_MEDIUM,Compound_SOFT,TyreDegradation,Fuel_Load
0,GAS,10,2.0,1.0,5.0,Toro Rosso,21.4,23.9,1002.0,28.9,2.5,Austin,104.253,False,True,5.0,0.96
1,GAS,10,3.0,1.0,6.0,Toro Rosso,21.5,24.7,1002.0,28.9,3.0,Austin,103.246,False,True,6.0,0.94
2,GAS,10,4.0,1.0,7.0,Toro Rosso,21.1,24.8,1001.8,29.0,1.6,Austin,102.794,False,True,7.0,0.92
3,GAS,10,5.0,1.0,8.0,Toro Rosso,21.2,25.0,1001.7,29.0,1.4,Austin,102.481,False,True,8.0,0.9
4,GAS,10,6.0,1.0,9.0,Toro Rosso,21.4,24.9,1001.7,29.1,0.7,Austin,102.402,False,True,9.0,0.88
5,GAS,10,7.0,1.0,10.0,Toro Rosso,21.8,23.9,1001.8,29.5,2.6,Austin,103.118,False,True,10.0,0.86
6,GAS,10,8.0,1.0,11.0,Toro Rosso,21.7,23.2,1001.7,29.3,2.9,Austin,103.406,False,True,11.0,0.84
7,GAS,10,9.0,1.0,12.0,Toro Rosso,21.5,23.2,1001.7,29.3,3.0,Austin,103.19,False,True,12.0,0.82
8,GAS,10,10.0,1.0,13.0,Toro Rosso,21.4,24.3,1001.6,29.1,1.9,Austin,103.39,False,True,13.0,0.8
9,GAS,10,11.0,1.0,14.0,Toro Rosso,21.2,24.6,1001.6,29.1,2.0,Austin,103.658,False,True,14.0,0.78


Boolean Columns to 0/1


In [8]:
data_encoded.loc[:, data_encoded.dtypes == 'bool'] = data_encoded.loc[:, data_encoded.dtypes == 'bool'].astype(int)
data_encoded.head(10)
print(data.columns)


Index(['Driver', 'DriverNumber', 'LapNumber', 'Stint', 'TyreLife', 'Compound',
       'Team', 'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed',
       'Track', 'LapTimeSeconds'],
      dtype='object')


  data_encoded.loc[:, data_encoded.dtypes == 'bool'] = data_encoded.loc[:, data_encoded.dtypes == 'bool'].astype(int)
  data_encoded.loc[:, data_encoded.dtypes == 'bool'] = data_encoded.loc[:, data_encoded.dtypes == 'bool'].astype(int)


In [9]:
from sklearn.preprocessing import StandardScaler

# Assuming df is your DataFrame and you want to scale specific columns, e.g., 'col1' and 'col2'
columns_to_scale = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed']  # Replace with the columns you want to scale

# Initialize the scaler
scaler = StandardScaler()

data_encoded_new = data_encoded.copy()

# Fit and transform only the specified columns
data_encoded_new[columns_to_scale] = scaler.fit_transform(data_encoded[columns_to_scale])

data_encoded_new.style.set_sticky(axis="columns")
# Display the scaled DataFrame
print(data_encoded_new.columns)
data_encoded_new.head()

Index(['Driver', 'DriverNumber', 'LapNumber', 'Stint', 'TyreLife', 'Team',
       'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed', 'Track',
       'LapTimeSeconds', 'Compound_MEDIUM', 'Compound_SOFT', 'TyreDegradation',
       'Fuel_Load'],
      dtype='object')


Unnamed: 0,Driver,DriverNumber,LapNumber,Stint,TyreLife,Team,AirTemp,Humidity,Pressure,TrackTemp,WindSpeed,Track,LapTimeSeconds,Compound_MEDIUM,Compound_SOFT,TyreDegradation,Fuel_Load
0,GAS,10,2.0,1.0,5.0,Toro Rosso,-1.355971,-2.063158,-0.255183,-2.111643,0.327079,Austin,104.253,0,1,5.0,0.96
1,GAS,10,3.0,1.0,6.0,Toro Rosso,-1.326772,-1.988407,-0.255183,-2.111643,0.647242,Austin,103.246,0,1,6.0,0.94
2,GAS,10,4.0,1.0,7.0,Toro Rosso,-1.443568,-1.979063,-0.276359,-2.09237,-0.249213,Austin,102.794,0,1,7.0,0.92
3,GAS,10,5.0,1.0,8.0,Toro Rosso,-1.414369,-1.960376,-0.286947,-2.09237,-0.377278,Austin,102.481,0,1,8.0,0.9
4,GAS,10,6.0,1.0,9.0,Toro Rosso,-1.355971,-1.96972,-0.286947,-2.073097,-0.825506,Austin,102.402,0,1,9.0,0.88


In [10]:
data_encoded_new = pd.get_dummies(data_encoded_new, columns=['Driver', 'Team','Track'], drop_first=False)


print(data_encoded_new.columns)
data_encoded_new.head()

Index(['DriverNumber', 'LapNumber', 'Stint', 'TyreLife', 'AirTemp', 'Humidity',
       'Pressure', 'TrackTemp', 'WindSpeed', 'LapTimeSeconds',
       'Compound_MEDIUM', 'Compound_SOFT', 'TyreDegradation', 'Fuel_Load',
       'Driver_ALB', 'Driver_ALO', 'Driver_BEA', 'Driver_BOT', 'Driver_COL',
       'Driver_DEV', 'Driver_GAS', 'Driver_GIO', 'Driver_GRO', 'Driver_HAM',
       'Driver_HUL', 'Driver_KUB', 'Driver_KVY', 'Driver_LAT', 'Driver_LAW',
       'Driver_LEC', 'Driver_MAG', 'Driver_MAZ', 'Driver_MSC', 'Driver_NOR',
       'Driver_OCO', 'Driver_PER', 'Driver_PIA', 'Driver_RAI', 'Driver_RIC',
       'Driver_RUS', 'Driver_SAI', 'Driver_SAR', 'Driver_STR', 'Driver_TSU',
       'Driver_VER', 'Driver_VET', 'Driver_ZHO', 'Team_Alfa Romeo',
       'Team_Alfa Romeo Racing', 'Team_AlphaTauri', 'Team_Alpine',
       'Team_Aston Martin', 'Team_Ferrari', 'Team_Haas F1 Team',
       'Team_Kick Sauber', 'Team_McLaren', 'Team_Mercedes', 'Team_RB',
       'Team_Racing Point', 'Team_Red Bull Racing

Unnamed: 0,DriverNumber,LapNumber,Stint,TyreLife,AirTemp,Humidity,Pressure,TrackTemp,WindSpeed,LapTimeSeconds,...,Team_McLaren,Team_Mercedes,Team_RB,Team_Racing Point,Team_Red Bull Racing,Team_Renault,Team_Toro Rosso,Team_Williams,Track_Austin,Track_Baku
0,10,2.0,1.0,5.0,-1.355971,-2.063158,-0.255183,-2.111643,0.327079,104.253,...,False,False,False,False,False,False,True,False,True,False
1,10,3.0,1.0,6.0,-1.326772,-1.988407,-0.255183,-2.111643,0.647242,103.246,...,False,False,False,False,False,False,True,False,True,False
2,10,4.0,1.0,7.0,-1.443568,-1.979063,-0.276359,-2.09237,-0.249213,102.794,...,False,False,False,False,False,False,True,False,True,False
3,10,5.0,1.0,8.0,-1.414369,-1.960376,-0.286947,-2.09237,-0.377278,102.481,...,False,False,False,False,False,False,True,False,True,False
4,10,6.0,1.0,9.0,-1.355971,-1.96972,-0.286947,-2.073097,-0.825506,102.402,...,False,False,False,False,False,False,True,False,True,False


In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Assuming df is your DataFrame
X = data_encoded_new.drop(columns=['LapTimeSeconds'])  # Features
y = data_encoded_new['LapTimeSeconds']  # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=500, random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

n_trees = len(rf_model.estimators_)  # Number of trees in the forest
train_predictions = np.zeros((X_train.shape[0], n_trees))
test_predictions = np.zeros((X_test.shape[0], n_trees))

# Store predictions for each tree in the forest
for i, tree in enumerate(rf_model.estimators_):
    train_predictions[:, i] = tree.predict(X_train)
    test_predictions[:, i] = tree.predict(X_test)

# Calculate the average prediction across all trees for both train and test data
avg_train_predictions = np.mean(train_predictions, axis=1)
avg_test_predictions = np.mean(test_predictions, axis=1)

# Calculate Bias: difference between the mean of predictions and the true value
train_bias = np.mean(avg_train_predictions - y_train)
test_bias = np.mean(avg_test_predictions - y_test)

# Calculate Variance: variance of the predictions for the test set
train_variance = np.mean(np.var(train_predictions, axis=1))
test_variance = np.mean(np.var(test_predictions, axis=1))

# Print the results
print(f"Train Bias: {train_bias}")
print(f"Test Bias: {test_bias}")
print(f"Train Variance: {train_variance}")
print(f"Test Variance: {test_variance}")

Mean Squared Error (MSE): 0.6519064630269878
Mean Absolute Error (MAE): 0.46447099961043925
R^2 Score: 0.95556430177968
Train Bias: 0.00314593085017529
Test Bias: 0.016712316322555317
Train Variance: 0.6928785868982715
Test Variance: 1.2440983522624576


In [17]:
import pandas as pd

def predict_lap_time(driver_number, lap_number, stint, tyre_life, air_temp, humidity, pressure, track_temp,
                     wind_speed, compound, team, driver, track):
    # Initialize a dictionary with all features, setting default values for one-hot encoded columns
    feature_dict = {col: 0 for col in data_encoded.columns if col != 'LapTimeSeconds'}

    # Fill in provided values for numeric features
    feature_dict['DriverNumber'] = driver_number
    feature_dict['LapNumber'] = lap_number
    feature_dict['Stint'] = stint
    feature_dict['TyreLife'] = tyre_life
    feature_dict['AirTemp'] = air_temp
    feature_dict['Humidity'] = humidity
    feature_dict['Pressure'] = pressure
    feature_dict['TrackTemp'] = track_temp
    feature_dict['WindSpeed'] = wind_speed

    # One-hot encode Compound, Team, Driver, and Track based on training data
    compound_column = f'Compound_{compound.upper()}'
    if compound_column in feature_dict:
        feature_dict[compound_column] = 1

    team_column = f'Team_{team}'
    if team_column in feature_dict:
        feature_dict[team_column] = 1

    driver_column = f'Driver_{driver}'
    if driver_column in feature_dict:
        feature_dict[driver_column] = 1

    track_column = f'Track_{track}'
    if track_column in feature_dict:
        feature_dict[track_column] = 1

    # Additional engineered features
    feature_dict['TyreDegradation'] = feature_dict['TyreLife'] * (feature_dict.get('Compound_SOFT', 0) +
                                                                  0.8 * feature_dict.get('Compound_MEDIUM', 0) +
                                                                  0.5 * feature_dict.get('Compound_HARD', 0))
    feature_dict['Fuel_Load'] = 1 - lap_number / 50  # Example fuel load calculation

    # Convert feature dictionary to a DataFrame
    input_df = pd.DataFrame([feature_dict])
    print(input_df)
    # Ensure columns align with model expectations
    input_df = input_df.reindex(columns=rf_model.feature_names_in_, fill_value=0)

    # Predict lap time using the model
    predicted_lap_time = rf_model.predict(input_df)[0]

    return predicted_lap_time

# Example usage
predicted_time = predict_lap_time(
    driver_number=10,
    lap_number=25,
    stint=1,
    tyre_life=11,
    air_temp=20.1,
    humidity=51.1,
    pressure=1013.9,
    track_temp=41.3,
    wind_speed=5.1,
    compound="hard",
    team="Mercedes",
    driver="HAM",
    track="Austin"
)

print(f"Predicted Lap Time: {predicted_time:.2f} seconds")




   Driver  DriverNumber  LapNumber  Stint  TyreLife  Team  AirTemp  Humidity  \
0       0            10         25      1        11     0     20.1      51.1   

   Pressure  TrackTemp  WindSpeed  Track  Compound_MEDIUM  Compound_SOFT  \
0    1013.9       41.3        5.1      0                0              0   

   TyreDegradation  Fuel_Load  
0              0.0        0.5  
Predicted Lap Time: 106.75 seconds


In [16]:
import pandas as pd

def predict_race_outcome(driver_number, compound1, compound1_laps, compound2, compound2_laps,
                         air_temp, humidity, pressure, track_temp, wind_speed, team, driver, track):
    lap_times = []
    stint = 1
    current_compound = compound1
    tyre_life = 0

    for lap in range(1, 51):
        # Determine if we need to switch compound
        if lap > compound1_laps:
            current_compound = compound2
            stint = 2
            tyre_life = lap - compound1_laps  # Reset tyre life based on when the second compound starts
        else:
            tyre_life = lap  # Tyre life starts counting from lap 1 for compound1

        # Predict lap time using the existing predict_lap_time function
        lap_time = predict_lap_time(
            driver_number=driver_number,
            lap_number=lap,
            stint=stint,
            tyre_life=tyre_life,
            air_temp=air_temp,
            humidity=humidity,
            pressure=pressure,
            track_temp=track_temp,
            wind_speed=wind_speed,
            compound=current_compound,
            team=team,
            driver=driver,
            track=track
        )

        # Append the lap time to the lap_times list
        lap_times.append(lap_time)

    # Return the list of predicted lap times
    return lap_times

# Example usage of the new function
predicted_lap_times = predict_race_outcome(
    driver_number=10,
    compound1="hard",
    compound1_laps=30,  # Assume the first compound is used for 20 laps
    compound2="medium",
    compound2_laps=20,  # The second compound is used for the remaining 30 laps
    air_temp=20.1,
    humidity=51.1,
    pressure=1013.9,
    track_temp=41.3,
    wind_speed=5.1,
    team="Mercedes",
    driver="HAM",
    track="Austin"
)

# Print predicted lap times
for i, time in enumerate(predicted_lap_times, start=1):
    print(f"Lap {i}: {time:.2f} seconds")


Lap 1: 107.35 seconds
Lap 2: 107.35 seconds
Lap 3: 107.30 seconds
Lap 4: 107.23 seconds
Lap 5: 107.14 seconds
Lap 6: 107.13 seconds
Lap 7: 106.80 seconds
Lap 8: 106.77 seconds
Lap 9: 106.74 seconds
Lap 10: 106.73 seconds
Lap 11: 106.74 seconds
Lap 12: 106.75 seconds
Lap 13: 106.76 seconds
Lap 14: 106.77 seconds
Lap 15: 106.80 seconds
Lap 16: 106.79 seconds
Lap 17: 106.81 seconds
Lap 18: 106.79 seconds
Lap 19: 106.85 seconds
Lap 20: 106.85 seconds
Lap 21: 106.86 seconds
Lap 22: 106.87 seconds
Lap 23: 106.84 seconds
Lap 24: 106.84 seconds
Lap 25: 106.72 seconds
Lap 26: 106.67 seconds
Lap 27: 106.57 seconds
Lap 28: 106.47 seconds
Lap 29: 106.46 seconds
Lap 30: 106.45 seconds
Lap 31: 106.08 seconds
Lap 32: 105.98 seconds
Lap 33: 105.98 seconds
Lap 34: 105.95 seconds
Lap 35: 105.94 seconds
Lap 36: 105.85 seconds
Lap 37: 105.81 seconds
Lap 38: 105.71 seconds
Lap 39: 105.76 seconds
Lap 40: 106.24 seconds
Lap 41: 106.21 seconds
Lap 42: 106.07 seconds
Lap 43: 106.07 seconds
Lap 44: 106.10 secon