In [796]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import fastf1


In [797]:
print(fastf1.__version__)


3.6.1


DATA FETCHING...

In [798]:
# Create the cache directory if it doesn't exist
os.makedirs('fastf1_cache', exist_ok=True)

# Enable FastF1 cache
fastf1.Cache.enable_cache('fastf1_cache')  # stores data locally for faster re-use
# Example: 2023 Bahrain GP, Race session
session = fastf1.get_session(2023, 'Bahrain', 'R')  # 'R' = Race, can also use 'Q', 'FP1', etc.
session.load()


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']


In [799]:
# All drivers, filtered columns
laps_all = session.laps.copy()
laps_all.head()  # Preview the first few rows
session.laps.to_csv('dataset/laps.csv', index=False)
laps_all.shape

(1056, 31)

In [800]:
laps = session.laps
tele_list = []
for _, lap in laps.iterrows():
    tel = lap.get_car_data().copy()          # telemetry for this lap only
    tel['LapNumber'] = lap['LapNumber']
    tel['Driver'] = lap['Driver'] 
    tele_list.append(tel)

tele_all = pd.concat(tele_list, ignore_index=True)
tele_all.to_csv('dataset/telemetry.csv', index=False)

In [801]:
# car data
# lap = session.laps.pick_driver('VER')
# car_data= lap.get_car_data()
# car_data.head()
#car_data.to_csv('dataset/telemetry.csv', index=False)


In [802]:
# Event info
event_info = session.event
event_info.head()
event_info.to_csv('dataset/event_info.csv', index=False)


In [803]:
# Weather
weather = session.weather_data
weather.to_csv('dataset/weather.csv', index=False)
weather.head()

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:00:45.438000,29.8,19.0,1016.5,False,35.1,176,1.2
1,0 days 00:01:45.453000,29.7,19.0,1016.5,False,35.0,182,1.2
2,0 days 00:02:45.452000,29.7,19.0,1016.5,False,34.9,156,1.1
3,0 days 00:03:45.466000,29.6,19.0,1016.5,False,34.9,201,0.8
4,0 days 00:04:45.449000,29.6,19.0,1016.5,False,34.8,219,0.8


In [804]:
# Results
results = session.results
#results.to_csv('dataset/results.csv', index=False)
results.head()

Unnamed: 0,DriverNumber,BroadcastName,Abbreviation,DriverId,TeamName,TeamColor,TeamId,FirstName,LastName,FullName,...,Position,ClassifiedPosition,GridPosition,Q1,Q2,Q3,Time,Status,Points,Laps
1,1,M VERSTAPPEN,VER,max_verstappen,Red Bull Racing,3671C6,red_bull,Max,Verstappen,Max Verstappen,...,1.0,1,1.0,NaT,NaT,NaT,0 days 01:33:56.736000,Finished,25.0,57.0
11,11,S PEREZ,PER,perez,Red Bull Racing,3671C6,red_bull,Sergio,Perez,Sergio Perez,...,2.0,2,2.0,NaT,NaT,NaT,0 days 00:00:11.987000,Finished,18.0,57.0
14,14,F ALONSO,ALO,alonso,Aston Martin,358C75,aston_martin,Fernando,Alonso,Fernando Alonso,...,3.0,3,5.0,NaT,NaT,NaT,0 days 00:00:38.637000,Finished,15.0,57.0
55,55,C SAINZ,SAI,sainz,Ferrari,F91536,ferrari,Carlos,Sainz,Carlos Sainz,...,4.0,4,4.0,NaT,NaT,NaT,0 days 00:00:48.052000,Finished,12.0,57.0
44,44,L HAMILTON,HAM,hamilton,Mercedes,6CD3BF,mercedes,Lewis,Hamilton,Lewis Hamilton,...,5.0,5,7.0,NaT,NaT,NaT,0 days 00:00:50.977000,Finished,10.0,57.0


DATA LOADING...

In [805]:
laps=pd.read_csv("dataset/laps.csv")
laps_needed=['Driver','LapNumber','LapTime','Sector1Time', 'Sector2Time', 'Sector3Time' ,'Stint','Compound' , 'PitInTime', 'PitOutTime', 'TrackStatus','TyreLife','FreshTyre','Position' ]    
laps=laps[laps_needed]
laps

Unnamed: 0,Driver,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,Compound,PitInTime,PitOutTime,TrackStatus,TyreLife,FreshTyre,Position
0,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0
1,VER,2.0,0 days 00:01:37.974000,0 days 00:00:31.342000,0 days 00:00:42.504000,0 days 00:00:24.128000,1.0,SOFT,,,12,5.0,False,1.0
2,VER,3.0,0 days 00:01:38.006000,0 days 00:00:31.388000,0 days 00:00:42.469000,0 days 00:00:24.149000,1.0,SOFT,,,1,6.0,False,1.0
3,VER,4.0,0 days 00:01:37.976000,0 days 00:00:31.271000,0 days 00:00:42.642000,0 days 00:00:24.063000,1.0,SOFT,,,1,7.0,False,1.0
4,VER,5.0,0 days 00:01:38.035000,0 days 00:00:31.244000,0 days 00:00:42.724000,0 days 00:00:24.067000,1.0,SOFT,,,1,8.0,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,PIA,9.0,0 days 00:01:41.534000,0 days 00:00:31.989000,0 days 00:00:44.548000,0 days 00:00:24.997000,1.0,SOFT,,,1,9.0,True,16.0
1052,PIA,10.0,0 days 00:01:41.584000,0 days 00:00:31.779000,0 days 00:00:44.617000,0 days 00:00:25.188000,1.0,SOFT,,,1,10.0,True,16.0
1053,PIA,11.0,0 days 00:01:41.352000,0 days 00:00:31.894000,0 days 00:00:44.557000,0 days 00:00:24.901000,1.0,SOFT,,,1,11.0,True,13.0
1054,PIA,12.0,0 days 00:01:41.156000,0 days 00:00:32.100000,0 days 00:00:44.235000,0 days 00:00:24.821000,1.0,SOFT,,,1,12.0,True,11.0


In [806]:
events=pd.read_csv('dataset/event_info.csv')
events

Unnamed: 0,1
0,1
1,Bahrain
2,Sakhir
3,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2023
4,2023-03-05 00:00:00
5,Bahrain Grand Prix
6,conventional
7,Practice 1
8,2023-03-03 14:30:00+03:00
9,2023-03-03 11:30:00


In [807]:
telemetry=pd.read_csv('dataset/telemetry.csv')
telemetry_needed=['Driver','LapNumber','RPM','Speed','nGear','Throttle','Brake','DRS','SessionTime']
telemetry=telemetry[telemetry_needed]

agg_tele = (
    telemetry
    .groupby(['Driver','LapNumber'])
    .agg({
        'Speed': ['mean','max'],
        'Throttle': 'mean',
        'Brake': 'mean'
    })
)
agg_tele.columns = ['Speed_mean','Speed_max','Throttle_mean','Brake_mean']
agg_tele = agg_tele.reset_index()
telemetry

Unnamed: 0,Driver,LapNumber,RPM,Speed,nGear,Throttle,Brake,DRS,SessionTime
0,VER,1.0,10130.0,0.0,1,16.0,False,1,0 days 01:02:36.762000
1,VER,1.0,9766.0,0.0,1,16.0,False,1,0 days 01:02:36.962000
2,VER,1.0,8086.0,2.0,1,16.0,False,1,0 days 01:02:37.242000
3,VER,1.0,5566.0,16.0,1,16.0,False,1,0 days 01:02:37.602000
4,VER,1.0,4473.0,24.0,1,16.0,False,1,0 days 01:02:37.842000
...,...,...,...,...,...,...,...,...,...
394618,PIA,13.0,4101.0,56.0,8,0.0,False,8,0 days 01:24:58.646000
394619,PIA,13.0,4096.0,56.0,8,0.0,False,8,0 days 01:24:58.806000
394620,PIA,13.0,4075.0,55.0,8,0.0,False,8,0 days 01:24:59.006000
394621,PIA,13.0,4050.0,55.0,8,0.0,False,8,0 days 01:24:59.166000


In [808]:
weather=pd.read_csv('dataset/weather.csv')
weather_needed=['Time','TrackTemp','Humidity','AirTemp','Rainfall','WindSpeed']
weather=weather[weather_needed]
weather


Unnamed: 0,Time,TrackTemp,Humidity,AirTemp,Rainfall,WindSpeed
0,0 days 00:00:45.438000,35.1,19.0,29.8,False,1.2
1,0 days 00:01:45.453000,35.0,19.0,29.7,False,1.2
2,0 days 00:02:45.452000,34.9,19.0,29.7,False,1.1
3,0 days 00:03:45.466000,34.9,19.0,29.6,False,0.8
4,0 days 00:04:45.449000,34.8,19.0,29.6,False,0.8
...,...,...,...,...,...,...
156,0 days 02:36:46.050000,28.7,21.0,26.3,False,0.6
157,0 days 02:37:46.064000,28.7,21.0,26.3,False,0.0
158,0 days 02:38:46.063000,28.7,21.0,26.3,False,0.4
159,0 days 02:39:46.109000,28.7,21.0,26.3,False,0.5


In [809]:
results=pd.read_csv('dataset/results.csv')
results_needed=['Driver','Position','GridPosition','Status','Points','Laps']
results=results[results_needed]
results

Unnamed: 0,Driver,Position,GridPosition,Status,Points,Laps
0,VER,1,1,Finished,25,57
1,PER,2,2,Finished,18,57
2,ALO,3,5,Finished,15,57
3,SAI,4,4,Finished,12,57
4,HAM,5,7,Finished,10,57
5,STR,6,8,Finished,8,57
6,RUS,7,6,Finished,6,57
7,BOT,8,12,Finished,4,57
8,GAS,9,20,Finished,2,57
9,ALB,10,15,Finished,1,57


In [810]:
laps.isna().sum()

Driver            0
LapNumber         0
LapTime           1
Sector1Time      21
Sector2Time       1
Sector3Time       1
Stint             0
Compound          0
PitInTime      1004
PitOutTime     1006
TrackStatus       0
TyreLife          0
FreshTyre         0
Position          1
dtype: int64

In [811]:
# merging the laps and telemetry
df = laps.copy()
df = df.merge(agg_tele, on='LapNumber', how='left')
df

Unnamed: 0,Driver_x,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,Compound,PitInTime,PitOutTime,TrackStatus,TyreLife,FreshTyre,Position,Driver_y,Speed_mean,Speed_max,Throttle_mean,Brake_mean
0,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0,ALB,173.613861,301.0,52.309406,0.269802
1,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0,ALO,176.760101,284.0,61.338384,0.207071
2,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0,BOT,177.396985,284.0,61.726131,0.276382
3,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0,DEV,171.092857,296.0,53.645238,0.309524
4,VER,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,4.0,False,1.0,GAS,171.167064,286.0,55.236277,0.252983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19689,PIA,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,13.0,True,16.0,SAR,162.416092,297.0,57.411494,0.222989
19690,PIA,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,13.0,True,16.0,STR,191.395664,309.0,61.121951,0.189702
19691,PIA,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,13.0,True,16.0,TSU,193.072222,309.0,61.930556,0.219444
19692,PIA,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,13.0,True,16.0,VER,197.473684,304.0,63.545706,0.182825


In [812]:
print("df columns:", df.columns.tolist())
print("results columns:", results.columns.tolist())


df columns: ['Driver_x', 'LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Stint', 'Compound', 'PitInTime', 'PitOutTime', 'TrackStatus', 'TyreLife', 'FreshTyre', 'Position', 'Driver_y', 'Speed_mean', 'Speed_max', 'Throttle_mean', 'Brake_mean']
results columns: ['Driver', 'Position', 'GridPosition', 'Status', 'Points', 'Laps']


In [813]:
df['Driver']=df['Driver_x']
df=df.drop(columns=['Driver_x','Driver_y'])

In [814]:
# merge results into df
df = df.merge(results, on='Driver', how='left')
df


Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,Compound,PitInTime,PitOutTime,TrackStatus,...,Speed_mean,Speed_max,Throttle_mean,Brake_mean,Driver,Position_y,GridPosition,Status,Points,Laps
0,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,173.613861,301.0,52.309406,0.269802,VER,1,1,Finished,25,57
1,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,176.760101,284.0,61.338384,0.207071,VER,1,1,Finished,25,57
2,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,177.396985,284.0,61.726131,0.276382,VER,1,1,Finished,25,57
3,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,171.092857,296.0,53.645238,0.309524,VER,1,1,Finished,25,57
4,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,171.167064,286.0,55.236277,0.252983,VER,1,1,Finished,25,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19689,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,162.416092,297.0,57.411494,0.222989,PIA,20,18,Retired,0,13
19690,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,191.395664,309.0,61.121951,0.189702,PIA,20,18,Retired,0,13
19691,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,193.072222,309.0,61.930556,0.219444,PIA,20,18,Retired,0,13
19692,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,197.473684,304.0,63.545706,0.182825,PIA,20,18,Retired,0,13


In [815]:
df.columns

Index(['LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Stint', 'Compound', 'PitInTime', 'PitOutTime', 'TrackStatus',
       'TyreLife', 'FreshTyre', 'Position_x', 'Speed_mean', 'Speed_max',
       'Throttle_mean', 'Brake_mean', 'Driver', 'Position_y', 'GridPosition',
       'Status', 'Points', 'Laps'],
      dtype='object')

In [816]:
# rename the x and y
df = df.rename(columns={
    'Position_x': 'Position',
    'Laps': 'Totallaps'
})



In [817]:
df.columns

Index(['LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Stint', 'Compound', 'PitInTime', 'PitOutTime', 'TrackStatus',
       'TyreLife', 'FreshTyre', 'Position', 'Speed_mean', 'Speed_max',
       'Throttle_mean', 'Brake_mean', 'Driver', 'Position_y', 'GridPosition',
       'Status', 'Points', 'Totallaps'],
      dtype='object')

In [818]:
df = df.drop(columns=['Position_y']) 


In [819]:
# removing non feature columns
#df_model = df.drop(columns=['Status', 'Points'])


In [820]:
len(df.columns)


22

In [821]:
# drop duplicate
col_drop=['Position_y', 'GridPosition_y', 'Status_y', 'Points_y', 'Laps_y']



In [822]:
df

Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,Compound,PitInTime,PitOutTime,TrackStatus,...,Position,Speed_mean,Speed_max,Throttle_mean,Brake_mean,Driver,GridPosition,Status,Points,Totallaps
0,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,1.0,173.613861,301.0,52.309406,0.269802,VER,1,Finished,25,57
1,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,1.0,176.760101,284.0,61.338384,0.207071,VER,1,Finished,25,57
2,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,1.0,177.396985,284.0,61.726131,0.276382,VER,1,Finished,25,57
3,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,1.0,171.092857,296.0,53.645238,0.309524,VER,1,Finished,25,57
4,1.0,0 days 00:01:39.019000,,0 days 00:00:42.414000,0 days 00:00:23.842000,1.0,SOFT,,,12,...,1.0,171.167064,286.0,55.236277,0.252983,VER,1,Finished,25,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19689,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,16.0,162.416092,297.0,57.411494,0.222989,PIA,18,Retired,0,13
19690,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,16.0,191.395664,309.0,61.121951,0.189702,PIA,18,Retired,0,13
19691,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,16.0,193.072222,309.0,61.930556,0.219444,PIA,18,Retired,0,13
19692,13.0,0 days 00:02:02.071000,0 days 00:00:34.839000,0 days 00:00:53.536000,0 days 00:00:33.696000,1.0,SOFT,0 days 01:24:56.987000,,1,...,16.0,197.473684,304.0,63.545706,0.182825,PIA,18,Retired,0,13


In [823]:
df.isna().sum()

LapNumber            0
LapTime             19
Sector1Time        419
Sector2Time         19
Sector3Time         19
Stint                0
Compound             0
PitInTime        18698
PitOutTime       18745
TrackStatus          0
TyreLife             0
FreshTyre            0
Position            19
Speed_mean           0
Speed_max            0
Throttle_mean        0
Brake_mean           0
Driver               0
GridPosition         0
Status               0
Points               0
Totallaps            0
dtype: int64

Filling missing values



In [824]:
#pit filling
df['is_pit_lap'] = df['PitInTime'].notna().astype(int)


In [825]:
df=df.drop(columns=['PitInTime', 'PitOutTime'])

In [826]:
df.columns

Index(['LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Stint', 'Compound', 'TrackStatus', 'TyreLife', 'FreshTyre', 'Position',
       'Speed_mean', 'Speed_max', 'Throttle_mean', 'Brake_mean', 'Driver',
       'GridPosition', 'Status', 'Points', 'Totallaps', 'is_pit_lap'],
      dtype='object')

In [827]:
df = df.dropna(subset=['LapTime','Sector1Time','Sector2Time','Sector3Time'])


In [828]:
df['Compound']

20       SOFT
21       SOFT
22       SOFT
23       SOFT
24       SOFT
         ... 
19689    SOFT
19690    SOFT
19691    SOFT
19692    SOFT
19693    SOFT
Name: Compound, Length: 19275, dtype: object

In [829]:

df.isna().sum()

LapNumber        0
LapTime          0
Sector1Time      0
Sector2Time      0
Sector3Time      0
Stint            0
Compound         0
TrackStatus      0
TyreLife         0
FreshTyre        0
Position         0
Speed_mean       0
Speed_max        0
Throttle_mean    0
Brake_mean       0
Driver           0
GridPosition     0
Status           0
Points           0
Totallaps        0
is_pit_lap       0
dtype: int64

In [830]:
df.dtypes


LapNumber        float64
LapTime           object
Sector1Time       object
Sector2Time       object
Sector3Time       object
Stint            float64
Compound          object
TrackStatus        int64
TyreLife         float64
FreshTyre           bool
Position         float64
Speed_mean       float64
Speed_max        float64
Throttle_mean    float64
Brake_mean       float64
Driver            object
GridPosition       int64
Status            object
Points             int64
Totallaps          int64
is_pit_lap         int64
dtype: object

In [831]:
# predicting the pit soon (target column)
df = df.sort_values(['Driver', 'LapNumber']).reset_index(drop=True)
N = 3 

def label_pit_soon(group):
    pit = group['is_pit_lap'].values
    y = np.zeros_like(pit)
    for i in range(len(pit)):
        # check laps i+1 ... i+N for a pit (if within bounds)
        if i + 1 < len(pit):
            y[i] = pit[i+1 : i+1+N].max()
        else:
            y[i] = 0
    group['pit_soon'] = y
    return group

df = df.groupby('Driver', group_keys=False).apply(label_pit_soon)



  df = df.groupby('Driver', group_keys=False).apply(label_pit_soon)


In [832]:
# One-hot- encoding to 'Compound','TrackStatus','Driver'
df_enc = df.copy()
df_enc = pd.get_dummies(df_enc,columns=['Compound','TrackStatus','Driver'],drop_first=True)
df_enc

Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,TyreLife,FreshTyre,Position,Speed_mean,...,Driver_OCO,Driver_PER,Driver_PIA,Driver_RUS,Driver_SAI,Driver_SAR,Driver_STR,Driver_TSU,Driver_VER,Driver_ZHO
0,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,192.539474,...,False,False,False,False,False,False,False,False,False,False
1,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,194.203753,...,False,False,False,False,False,False,False,False,False,False
2,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,193.962567,...,False,False,False,False,False,False,False,False,False,False
3,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,189.295812,...,False,False,False,False,False,False,False,False,False,False
4,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,191.226913,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19270,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,190.431525,...,False,False,False,False,False,False,False,False,False,True
19271,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,194.786842,...,False,False,False,False,False,False,False,False,False,True
19272,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,199.569554,...,False,False,False,False,False,False,False,False,False,True
19273,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,199.002703,...,False,False,False,False,False,False,False,False,False,True


In [833]:
df_enc

Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,TyreLife,FreshTyre,Position,Speed_mean,...,Driver_OCO,Driver_PER,Driver_PIA,Driver_RUS,Driver_SAI,Driver_SAR,Driver_STR,Driver_TSU,Driver_VER,Driver_ZHO
0,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,192.539474,...,False,False,False,False,False,False,False,False,False,False
1,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,194.203753,...,False,False,False,False,False,False,False,False,False,False
2,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,193.962567,...,False,False,False,False,False,False,False,False,False,False
3,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,189.295812,...,False,False,False,False,False,False,False,False,False,False
4,2.0,0 days 00:01:40.430000,0 days 00:00:31.765000,0 days 00:00:43.909000,0 days 00:00:24.756000,1.0,2.0,True,12.0,191.226913,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19270,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,190.431525,...,False,False,False,False,False,False,False,False,False,True
19271,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,194.786842,...,False,False,False,False,False,False,False,False,False,True
19272,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,199.569554,...,False,False,False,False,False,False,False,False,False,True
19273,56.0,0 days 00:01:33.996000,0 days 00:00:30.183000,0 days 00:00:40.533000,0 days 00:00:23.280000,4.0,5.0,False,16.0,199.002703,...,False,False,False,False,False,False,False,False,False,True


In [834]:
time_cols = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']

for col in time_cols:
    df_enc[col] = pd.to_timedelta(df[col]).dt.total_seconds()


In [835]:
df_enc.dtypes

LapNumber           float64
LapTime             float64
Sector1Time         float64
Sector2Time         float64
Sector3Time         float64
Stint               float64
TyreLife            float64
FreshTyre              bool
Position            float64
Speed_mean          float64
Speed_max           float64
Throttle_mean       float64
Brake_mean          float64
GridPosition          int64
Status               object
Points                int64
Totallaps             int64
is_pit_lap            int64
pit_soon              int64
Compound_MEDIUM        bool
Compound_SOFT          bool
TrackStatus_12         bool
TrackStatus_21         bool
TrackStatus_126        bool
TrackStatus_671        bool
TrackStatus_2671       bool
Driver_ALO             bool
Driver_BOT             bool
Driver_DEV             bool
Driver_GAS             bool
Driver_HAM             bool
Driver_HUL             bool
Driver_LEC             bool
Driver_MAG             bool
Driver_NOR             bool
Driver_OCO          

In [836]:
x_features=[
    'LapNumber','LapTime','Sector1Time','Sector2Time','Sector3Time',
    'Stint','Compound','TrackStatus','TyreLife','FreshTyre','Position',
    'Speed_mean','Speed_max','Throttle_mean','Brake_mean',
    'GridPosition','TotalLaps'
]

x = df_enc[[c for c in df_enc.columns
            if c in x_features
            or c.startswith('Compound_')
            or c.startswith('TrackStatus_')
            or c.startswith('Driver_')]]

In [854]:
x.columns

Index(['LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Stint', 'TyreLife', 'FreshTyre', 'Position', 'Speed_mean', 'Speed_max',
       'Throttle_mean', 'Brake_mean', 'GridPosition', 'Compound_MEDIUM',
       'Compound_SOFT', 'TrackStatus_12', 'TrackStatus_21', 'TrackStatus_126',
       'TrackStatus_671', 'TrackStatus_2671', 'Driver_ALO', 'Driver_BOT',
       'Driver_DEV', 'Driver_GAS', 'Driver_HAM', 'Driver_HUL', 'Driver_LEC',
       'Driver_MAG', 'Driver_NOR', 'Driver_OCO', 'Driver_PER', 'Driver_PIA',
       'Driver_RUS', 'Driver_SAI', 'Driver_SAR', 'Driver_STR', 'Driver_TSU',
       'Driver_VER', 'Driver_ZHO'],
      dtype='object')

In [837]:
y=df['pit_soon']
y


0        0
1        0
2        0
3        0
4        0
        ..
19270    0
19271    0
19272    0
19273    0
19274    0
Name: pit_soon, Length: 19275, dtype: int64

In [838]:
y.shape


(19275,)

In [839]:
x.shape

(19275, 40)

In [840]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)


In [841]:
x_train

Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,TyreLife,FreshTyre,Position,Speed_mean,...,Driver_OCO,Driver_PER,Driver_PIA,Driver_RUS,Driver_SAI,Driver_SAR,Driver_STR,Driver_TSU,Driver_VER,Driver_ZHO
9210,13.0,99.510,31.740,43.246,24.524,2.0,3.0,True,17.0,194.634877,...,False,False,False,False,False,False,False,False,False,False
15779,34.0,97.168,30.988,42.474,23.706,3.0,4.0,True,7.0,197.657459,...,False,False,False,False,False,False,True,False,False,False
9652,36.0,99.886,32.216,43.423,24.247,4.0,9.0,True,18.0,198.302198,...,False,False,False,False,False,False,False,False,False,False
8200,13.0,101.654,32.197,44.455,25.002,1.0,13.0,True,9.0,191.395664,...,False,False,False,False,False,False,False,False,False,False
18815,31.0,99.670,31.824,43.500,24.346,2.0,19.0,True,9.0,198.942466,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,28.0,97.211,31.052,42.274,23.885,2.0,13.0,False,2.0,147.574737,...,False,True,False,False,False,False,False,False,False,False
11964,9.0,101.534,31.989,44.548,24.997,1.0,9.0,True,16.0,189.329670,...,False,False,True,False,False,False,False,False,False,False
5390,13.0,119.179,52.819,42.559,23.801,2.0,1.0,True,8.0,194.634877,...,False,False,False,False,False,False,False,False,False,False
860,47.0,97.545,31.048,42.675,23.822,4.0,7.0,False,10.0,202.353107,...,False,False,False,False,False,False,False,False,False,False


In [842]:
x_test

Unnamed: 0,LapNumber,LapTime,Sector1Time,Sector2Time,Sector3Time,Stint,TyreLife,FreshTyre,Position,Speed_mean,...,Driver_OCO,Driver_PER,Driver_PIA,Driver_RUS,Driver_SAI,Driver_SAR,Driver_STR,Driver_TSU,Driver_VER,Driver_ZHO
5058,51.0,96.635,30.941,41.896,23.798,4.0,11.0,True,9.0,199.769444,...,False,False,False,False,False,False,False,False,False,False
15117,55.0,98.316,31.122,43.107,24.087,4.0,15.0,True,12.0,155.958763,...,False,False,False,False,False,True,False,False,False,False
19202,52.0,98.713,31.522,42.936,24.255,3.0,23.0,False,13.0,196.128134,...,False,False,False,False,False,False,False,False,False,True
7430,12.0,99.489,31.597,43.406,24.486,1.0,12.0,True,2.0,161.315193,...,False,False,False,False,False,False,False,False,False,False
14160,4.0,100.644,31.421,44.124,25.099,1.0,4.0,True,13.0,189.997409,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9809,44.0,97.219,30.936,42.401,23.882,5.0,7.0,True,17.0,200.877841,...,False,False,False,False,False,False,False,False,False,False
1803,41.0,126.713,39.952,58.704,28.057,3.0,7.0,True,4.0,150.995671,...,False,False,False,False,False,False,False,False,False,False
17391,10.0,98.369,31.361,42.884,24.124,1.0,13.0,False,1.0,187.207895,...,False,False,False,False,False,False,False,False,True,False
13086,2.0,98.933,31.451,43.164,24.318,1.0,5.0,False,4.0,195.975676,...,False,False,False,False,True,False,False,False,False,False


In [843]:
y_train

9210     0
15779    0
9652     0
8200     0
18815    0
        ..
11284    0
11964    0
5390     0
860      0
15795    0
Name: pit_soon, Length: 13492, dtype: int64

In [844]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)


(13492, 40) (5783, 40) (13492,) (5783,)


In [846]:
#model 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200,max_depth=None,random_state=42,n_jobs=-1)
model.fit(x_train,y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [847]:
#predict
y_pred=model.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(5783,))

In [849]:

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

accuracy_score(y_pred,y_test)

0.9891060003458413

In [853]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5470
           1       0.85      0.96      0.91       313

    accuracy                           0.99      5783
   macro avg       0.93      0.98      0.95      5783
weighted avg       0.99      0.99      0.99      5783



In [856]:
confusion_matrix(y_pred,y_test)

array([[5418,   52],
       [  11,  302]])

In [862]:
import pickle
with open("pit_model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("feature_columns.pkl", "wb") as f:
    pickle.dump(x.columns.tolist(), f)
