# Analyzing methods to collect data

In [None]:
import pandas as pd

In [None]:
import requests
import pandas as pd
def fetch_openf1_data(endpoint, params=None):
 base_url = "https://api.openf1.org/v1/"
 url = f"{base_url}{endpoint}"
 response = requests.get(url, params=params)
 response.raise_for_status() # Raise an exception for HTTP errors
 return response.json()

In [None]:
hypothetical_session_key = 'latest'

The cell below was used for testing openF1. Ultimately it was decided to use FastF1 because of it's easier integration with Python

In [None]:
# try:
#  lap_data = fetch_openf1_data('laps', {'session_key': hypothetical_session_key})
#  laps_df = pd.DataFrame(lap_data)
#  print(f"Fetched {len(laps_df)} laps.")
#  print(laps_df.head())
# except requests.exceptions.RequestException as e:
#  print(f"Error fetching data: {e}")
#  laps_df = pd.DataFrame()



# Using FastF1 to collect data and create a dataframe

In [None]:
import fastf1
import fastf1.plotting
import pandas as pd
import os


cache_dir = os.path.expanduser("~/.fastf1_cache")
os.makedirs(cache_dir, exist_ok=True)

fastf1.Cache.enable_cache("~/.fastf1_cache")
 # Example: Load a specific race session (e.g., 2023 Austrian Grand Prix Race)
 # You can find the correct year, GP name, and session type from the FastF1 documentation or by exploring.
 # For the Austrian Grand Prix, the GP name is typically 'Austrian Grand Prix'
try:
     session = fastf1.get_session(2023, 'Austrian Grand Prix', 'Race')
     session.load()
     fastf1_laps = session.laps
     print(f"\nFetched {len(fastf1_laps)} laps using FastF1 API.")
     print(fastf1_laps.head())

     # Access driver data
     fastf1_drivers = session.drivers
     print("\nFastF1 Driver Data:")
     print(fastf1_drivers.head())

     # Access telemetry data for a specific driver and lap (example)
     # Pick a driver (e.g., Max Verstappen, driver number 33)
     driver_telemetry = fastf1_laps.pick_driver(33).telemetry
     print("\nFastF1 Telemetry Data (Driver 33, all laps):")
     print(driver_telemetry.head())

     # Access weather data
     fastf1_weather = session.weather_data
     print("\nFastF1 Weather Data:")
     print(fastf1_weather.head())

except Exception as e:
    print(f"Error fetching data from FastF1: {e}")
    fastf1_laps = pd.DataFrame()
    fastf1_drivers = pd.DataFrame()
    fastf1_telemetry = pd.DataFrame()
    fastf1_weather = pd.DataFrame()

core           INFO 	Loading data for Austrian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No c


Fetched 1354 laps using FastF1 API.
                    Time Driver DriverNumber                LapTime  \
0 0 days 01:03:05.095000    VER            1 0 days 00:01:17.639000   
1 0 days 01:05:00.574000    VER            1 0 days 00:01:55.479000   
2 0 days 01:07:05.295000    VER            1 0 days 00:02:04.721000   
3 0 days 01:08:14.986000    VER            1 0 days 00:01:09.691000   
4 0 days 01:09:25.012000    VER            1 0 days 00:01:10.026000   

   LapNumber  Stint             PitOutTime              PitInTime  \
0        1.0    1.0                    NaT                    NaT   
1        2.0    1.0                    NaT 0 days 01:04:57.200000   
2        3.0    2.0 0 days 01:05:13.560000                    NaT   
3        4.0    2.0                    NaT                    NaT   
4        5.0    2.0                    NaT                    NaT   

             Sector1Time            Sector2Time  ... FreshTyre  \
0                    NaT 0 days 00:00:31.613000  ...   

In [None]:
years = list(range(2018, 2024 + 1))  # adjust range as needed
all_laps = []

for year in years:
    try:
        session = fastf1.get_session(year, 'Austrian Grand Prix', 'Race')
        session.load()
        laps = session.laps
        laps["year"] = year
        all_laps.append(laps)
    except Exception as e:
        print(f"Failed to load {year}: {e}")

df_all_laps = pd.concat(all_laps, ignore_index=True)


core           INFO 	Loading data for Austrian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No c

In [None]:
df_all_laps.to_csv('f1data.csv', index=False)

# Predicting the Austrian Grand Prix

Run the code from here:


In [14]:
import pandas as pd
df = pd.read_csv('f1data.csv')

In [15]:
df

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,year
0,0 days 00:08:03.720000,GAS,10,0 days 00:01:20.158000,1.0,,,,,0 days 00:00:34.389000,...,Toro Rosso,0 days 00:06:43.395000,2018-07-01 13:12:43.611,,15.0,False,,False,False,2018
1,0 days 00:09:16.779000,GAS,10,0 days 00:01:13.059000,2.0,1.0,,,0 days 00:00:17.521000,0 days 00:00:33.053000,...,Toro Rosso,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,False,,False,True,2018
2,0 days 00:10:28.223000,GAS,10,0 days 00:01:11.444000,3.0,1.0,,,0 days 00:00:17.443000,0 days 00:00:31.545000,...,Toro Rosso,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,False,,False,True,2018
3,0 days 00:11:39.388000,GAS,10,0 days 00:01:11.165000,4.0,1.0,,,0 days 00:00:17.161000,0 days 00:00:31.737000,...,Toro Rosso,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,False,,False,True,2018
4,0 days 00:12:51.796000,GAS,10,0 days 00:01:12.408000,5.0,1.0,,,0 days 00:00:17.333000,0 days 00:00:31.559000,...,Toro Rosso,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,False,,False,True,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,0 days 02:06:32.304000,NOR,4,0 days 00:01:09.356000,60.0,3.0,,,0 days 00:00:17.270000,0 days 00:00:31.434000,...,McLaren,0 days 02:05:22.948000,2024-06-30 14:12:54.059,1.0,2.0,False,,False,True,2024
9114,0 days 02:07:42.341000,NOR,4,0 days 00:01:10.037000,61.0,3.0,,,0 days 00:00:17.086000,0 days 00:00:32.125000,...,McLaren,0 days 02:06:32.304000,2024-06-30 14:14:03.415,1.0,2.0,False,,False,True,2024
9115,0 days 02:08:51.850000,NOR,4,0 days 00:01:09.509000,62.0,3.0,,,0 days 00:00:17.182000,0 days 00:00:31.527000,...,McLaren,0 days 02:07:42.341000,2024-06-30 14:15:13.452,1.0,2.0,False,,False,True,2024
9116,0 days 02:10:02.084000,NOR,4,0 days 00:01:10.234000,63.0,3.0,,,0 days 00:00:17.195000,0 days 00:00:32.111000,...,McLaren,0 days 02:08:51.850000,2024-06-30 14:16:22.961,1.0,2.0,False,,False,True,2024


In [16]:
df.columns

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'year'],
      dtype='object')

In [17]:
# prompt: drop columns in the dataframe

df = df.drop(['Time', 'Driver','DeletedReason','FastF1Generated','IsAccurate'], axis=1)
df

Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
0,10,0 days 00:01:20.158000,1.0,,,,,0 days 00:00:34.389000,0 days 00:00:23.283000,,...,,,True,Toro Rosso,0 days 00:06:43.395000,2018-07-01 13:12:43.611,,15.0,False,2018
1,10,0 days 00:01:13.059000,2.0,1.0,,,0 days 00:00:17.521000,0 days 00:00:33.053000,0 days 00:00:22.485000,0 days 00:08:21.241000,...,SUPERSOFT,1.0,True,Toro Rosso,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,False,2018
2,10,0 days 00:01:11.444000,3.0,1.0,,,0 days 00:00:17.443000,0 days 00:00:31.545000,0 days 00:00:22.456000,0 days 00:09:34.222000,...,SUPERSOFT,2.0,True,Toro Rosso,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,False,2018
3,10,0 days 00:01:11.165000,4.0,1.0,,,0 days 00:00:17.161000,0 days 00:00:31.737000,0 days 00:00:22.267000,0 days 00:10:45.384000,...,SUPERSOFT,3.0,True,Toro Rosso,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,False,2018
4,10,0 days 00:01:12.408000,5.0,1.0,,,0 days 00:00:17.333000,0 days 00:00:31.559000,0 days 00:00:23.516000,0 days 00:11:56.721000,...,SUPERSOFT,4.0,True,Toro Rosso,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,False,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,4,0 days 00:01:09.356000,60.0,3.0,,,0 days 00:00:17.270000,0 days 00:00:31.434000,0 days 00:00:20.652000,0 days 02:05:40.272000,...,MEDIUM,9.0,True,McLaren,0 days 02:05:22.948000,2024-06-30 14:12:54.059,1.0,2.0,False,2024
9114,4,0 days 00:01:10.037000,61.0,3.0,,,0 days 00:00:17.086000,0 days 00:00:32.125000,0 days 00:00:20.826000,0 days 02:06:49.444000,...,MEDIUM,10.0,True,McLaren,0 days 02:06:32.304000,2024-06-30 14:14:03.415,1.0,2.0,False,2024
9115,4,0 days 00:01:09.509000,62.0,3.0,,,0 days 00:00:17.182000,0 days 00:00:31.527000,0 days 00:00:20.800000,0 days 02:07:59.577000,...,MEDIUM,11.0,True,McLaren,0 days 02:07:42.341000,2024-06-30 14:15:13.452,1.0,2.0,False,2024
9116,4,0 days 00:01:10.234000,63.0,3.0,,,0 days 00:00:17.195000,0 days 00:00:32.111000,0 days 00:00:20.928000,0 days 02:09:09.099000,...,MEDIUM,12.0,True,McLaren,0 days 02:08:51.850000,2024-06-30 14:16:22.961,1.0,2.0,False,2024


In [19]:
# prompt: Replace NaN in stint, PitOutTime, PitInTime, Sector1Time, Sector1SessionTime, Sector2SessionTime, Sector3Time with 0

cols_to_fill = ['Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3Time']
df[cols_to_fill] = df[cols_to_fill].fillna(0)

df.head()

Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
0,10,0 days 00:01:20.158000,1.0,0.0,0,0,0,0 days 00:00:34.389000,0 days 00:00:23.283000,0,...,,,True,Toro Rosso,0 days 00:06:43.395000,2018-07-01 13:12:43.611,,15.0,False,2018
1,10,0 days 00:01:13.059000,2.0,1.0,0,0,0 days 00:00:17.521000,0 days 00:00:33.053000,0 days 00:00:22.485000,0 days 00:08:21.241000,...,SUPERSOFT,1.0,True,Toro Rosso,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,False,2018
2,10,0 days 00:01:11.444000,3.0,1.0,0,0,0 days 00:00:17.443000,0 days 00:00:31.545000,0 days 00:00:22.456000,0 days 00:09:34.222000,...,SUPERSOFT,2.0,True,Toro Rosso,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,False,2018
3,10,0 days 00:01:11.165000,4.0,1.0,0,0,0 days 00:00:17.161000,0 days 00:00:31.737000,0 days 00:00:22.267000,0 days 00:10:45.384000,...,SUPERSOFT,3.0,True,Toro Rosso,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,False,2018
4,10,0 days 00:01:12.408000,5.0,1.0,0,0,0 days 00:00:17.333000,0 days 00:00:31.559000,0 days 00:00:23.516000,0 days 00:11:56.721000,...,SUPERSOFT,4.0,True,Toro Rosso,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,False,2018


In [22]:
# prompt: Show the following columns Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
#        'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',

relevant_columns = ['Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                      'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest']

df[relevant_columns]


Unnamed: 0,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest
0,0,0 days 00:07:40.460000,0 days 00:08:03.822000,304.0,215.0,272.0,297.0,False
1,0 days 00:08:21.241000,0 days 00:08:54.294000,0 days 00:09:16.779000,306.0,214.0,271.0,290.0,True
2,0 days 00:09:34.222000,0 days 00:10:05.767000,0 days 00:10:28.223000,314.0,224.0,278.0,308.0,True
3,0 days 00:10:45.384000,0 days 00:11:17.121000,0 days 00:11:39.388000,316.0,227.0,279.0,309.0,True
4,0 days 00:11:56.721000,0 days 00:12:28.280000,0 days 00:12:51.796000,310.0,228.0,256.0,303.0,False
...,...,...,...,...,...,...,...,...
9113,0 days 02:05:40.272000,0 days 02:06:11.706000,0 days 02:06:32.358000,309.0,231.0,285.0,316.0,False
9114,0 days 02:06:49.444000,0 days 02:07:21.569000,0 days 02:07:42.395000,321.0,234.0,283.0,312.0,False
9115,0 days 02:07:59.577000,0 days 02:08:31.104000,0 days 02:08:51.904000,310.0,234.0,282.0,316.0,False
9116,0 days 02:09:09.099000,0 days 02:09:41.210000,0 days 02:10:02.138000,313.0,232.0,283.0,316.0,False


In [23]:
# prompt: Replace True with 1 and False with 0 in IsPersonalBest.

df['IsPersonalBest'] = df['IsPersonalBest'].replace({True: 1, False: 0})
df['IsPersonalBest']

  df['IsPersonalBest'] = df['IsPersonalBest'].replace({True: 1, False: 0})


Unnamed: 0,IsPersonalBest
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0
...,...
9113,0.0
9114,0.0
9115,0.0
9116,0.0


In [24]:
df[relevant_columns]

Unnamed: 0,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest
0,0,0 days 00:07:40.460000,0 days 00:08:03.822000,304.0,215.0,272.0,297.0,0.0
1,0 days 00:08:21.241000,0 days 00:08:54.294000,0 days 00:09:16.779000,306.0,214.0,271.0,290.0,1.0
2,0 days 00:09:34.222000,0 days 00:10:05.767000,0 days 00:10:28.223000,314.0,224.0,278.0,308.0,1.0
3,0 days 00:10:45.384000,0 days 00:11:17.121000,0 days 00:11:39.388000,316.0,227.0,279.0,309.0,1.0
4,0 days 00:11:56.721000,0 days 00:12:28.280000,0 days 00:12:51.796000,310.0,228.0,256.0,303.0,0.0
...,...,...,...,...,...,...,...,...
9113,0 days 02:05:40.272000,0 days 02:06:11.706000,0 days 02:06:32.358000,309.0,231.0,285.0,316.0,0.0
9114,0 days 02:06:49.444000,0 days 02:07:21.569000,0 days 02:07:42.395000,321.0,234.0,283.0,312.0,0.0
9115,0 days 02:07:59.577000,0 days 02:08:31.104000,0 days 02:08:51.904000,310.0,234.0,282.0,316.0,0.0
9116,0 days 02:09:09.099000,0 days 02:09:41.210000,0 days 02:10:02.138000,313.0,232.0,283.0,316.0,0.0


In [25]:
# prompt: Replace NaN in stint, PitOutTime, PitInTime, Sector1Time, Sector1SessionTime, Sector2SessionTime, Sector3Time with 0

cols_to_fill = ['SpeedI1','SpeedI2','SpeedFL','SpeedST']
df[cols_to_fill] = df[cols_to_fill].fillna(0)

df.head()

Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
0,10,0 days 00:01:20.158000,1.0,0.0,0,0,0,0 days 00:00:34.389000,0 days 00:00:23.283000,0,...,,,True,Toro Rosso,0 days 00:06:43.395000,2018-07-01 13:12:43.611,,15.0,False,2018
1,10,0 days 00:01:13.059000,2.0,1.0,0,0,0 days 00:00:17.521000,0 days 00:00:33.053000,0 days 00:00:22.485000,0 days 00:08:21.241000,...,SUPERSOFT,1.0,True,Toro Rosso,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,False,2018
2,10,0 days 00:01:11.444000,3.0,1.0,0,0,0 days 00:00:17.443000,0 days 00:00:31.545000,0 days 00:00:22.456000,0 days 00:09:34.222000,...,SUPERSOFT,2.0,True,Toro Rosso,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,False,2018
3,10,0 days 00:01:11.165000,4.0,1.0,0,0,0 days 00:00:17.161000,0 days 00:00:31.737000,0 days 00:00:22.267000,0 days 00:10:45.384000,...,SUPERSOFT,3.0,True,Toro Rosso,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,False,2018
4,10,0 days 00:01:12.408000,5.0,1.0,0,0,0 days 00:00:17.333000,0 days 00:00:31.559000,0 days 00:00:23.516000,0 days 00:11:56.721000,...,SUPERSOFT,4.0,True,Toro Rosso,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,False,2018


In [26]:
# prompt: Show the number of NaN left in the dataframe

print(df.isna().sum())

DriverNumber           0
LapTime               19
LapNumber              0
Stint                  0
PitOutTime             0
PitInTime              0
Sector1Time            0
Sector2Time           25
Sector3Time            0
Sector1SessionTime     0
Sector2SessionTime     0
Sector3SessionTime    32
SpeedI1                0
SpeedI2                0
SpeedFL                0
SpeedST                0
IsPersonalBest        14
Compound              21
TyreLife              21
FreshTyre              0
Team                   0
LapStartTime           0
LapStartDate          14
TrackStatus           20
Position              15
Deleted                0
year                   0
dtype: int64


In [28]:
# prompt: Display the rows where LapNumber is NaN

# Display rows where 'LapNumber' is NaN
nan_lap_number_rows = df[df['LapTime'].isna()]
print("\nRows where 'LapTime' is NaN:")
nan_lap_number_rows


Rows where 'LapTime' is NaN:


Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
495,27,,12.0,1.0,0,0,0,,0,0,...,ULTRASOFT,14.0,False,Renault,0 days 00:19:53.496000,,12.0,,False,2018
550,28,,55.0,1.0,0,0,0,,0,0,...,SUPERSOFT,54.0,True,Toro Rosso,0 days 01:11:16.907000,,1.0,,False,2018
604,3,,54.0,3.0,0,0,0,,0,0,...,SUPERSOFT,21.0,False,Red Bull Racing,0 days 01:08:47.826000,,12.0,,False,2018
877,44,,63.0,3.0,0,0,0,,0,0,...,SUPERSOFT,11.0,True,Mercedes,0 days 01:18:59.214000,,12.0,,False,2018
1103,77,,14.0,1.0,0,0,0,,0,0,...,SUPERSOFT,16.0,False,Mercedes,0 days 00:21:50.780000,,12.0,,False,2018
2877,20,,20.0,1.0,0,0,0,,0,0,...,MEDIUM,20.0,True,Haas F1 Team,0 days 01:01:29.470000,,12.0,,False,2020
2940,23,,63.0,4.0,0,0,0,,0,0,...,SOFT,20.0,False,Red Bull Racing,0 days 01:59:30.113000,,12.0,,False,2020
3005,26,,65.0,4.0,0,0,0,,0,0,...,SOFT,19.0,False,AlphaTauri,0 days 02:01:52.041000,,1.0,,False,2020
3424,6,,67.0,4.0,0,0,0,,0,0,...,SOFT,18.0,False,Williams,0 days 01:58:13.074000,,12.0,,False,2020
3433,63,,9.0,1.0,0,0,0 days 00:00:17.924000,0 days 00:00:31.836000,0 days 00:00:21.905000,0 days 00:50:03.607000,...,MEDIUM,9.0,True,Williams,0 days 00:42:34.246000,2020-07-05 13:22:34.469,1.0,20.0,False,2020


In [29]:
# prompt: Delete the rows with LapTime = NaN

df.dropna(subset=['LapTime'], inplace=True)
print("\nDataFrame after removing rows with NaN 'LapTime':")
df.head()
print("\nNaN counts after dropping 'LapTime' NaN rows:")
print(df.isna().sum())


DataFrame after removing rows with NaN 'LapTime':

NaN counts after dropping 'LapTime' NaN rows:
DriverNumber           0
LapTime                0
LapNumber              0
Stint                  0
PitOutTime             0
PitInTime              0
Sector1Time            0
Sector2Time            9
Sector3Time            0
Sector1SessionTime     0
Sector2SessionTime     0
Sector3SessionTime    16
SpeedI1                0
SpeedI2                0
SpeedFL                0
SpeedST                0
IsPersonalBest         0
Compound              21
TyreLife              21
FreshTyre              0
Team                   0
LapStartTime           0
LapStartDate           0
TrackStatus           20
Position               0
Deleted                0
year                   0
dtype: int64


In [31]:
relevant_columns = ['Sector2Time','Sector3SessionTime','Compound','TyreLife','TrackStatus']

df[relevant_columns]

Unnamed: 0,Sector2Time,Sector3SessionTime,Compound,TyreLife,TrackStatus
0,0 days 00:00:34.389000,0 days 00:08:03.822000,,,
1,0 days 00:00:33.053000,0 days 00:09:16.779000,SUPERSOFT,1.0,21.0
2,0 days 00:00:31.545000,0 days 00:10:28.223000,SUPERSOFT,2.0,1.0
3,0 days 00:00:31.737000,0 days 00:11:39.388000,SUPERSOFT,3.0,1.0
4,0 days 00:00:31.559000,0 days 00:12:51.796000,SUPERSOFT,4.0,1.0
...,...,...,...,...,...
9113,0 days 00:00:31.434000,0 days 02:06:32.358000,MEDIUM,9.0,1.0
9114,0 days 00:00:32.125000,0 days 02:07:42.395000,MEDIUM,10.0,1.0
9115,0 days 00:00:31.527000,0 days 02:08:51.904000,MEDIUM,11.0,1.0
9116,0 days 00:00:32.111000,0 days 02:10:02.138000,MEDIUM,12.0,1.0


In [35]:
df.dropna(subset=['TyreLife'], inplace=True)
print("\nDataFrame after removing rows with NaN 'TyreLife':")
df.head()
print("\nNaN counts after dropping 'TyreLife' NaN rows:")
print(df.isna().sum())


DataFrame after removing rows with NaN 'TyreLife':

NaN counts after dropping 'TyreLife' NaN rows:
DriverNumber           0
LapTime                0
LapNumber              0
Stint                  0
PitOutTime             0
PitInTime              0
Sector1Time            0
Sector2Time            9
Sector3Time            0
Sector1SessionTime     0
Sector2SessionTime     0
Sector3SessionTime    16
SpeedI1                0
SpeedI2                0
SpeedFL                0
SpeedST                0
IsPersonalBest         0
Compound               0
TyreLife               0
FreshTyre              0
Team                   0
LapStartTime           0
LapStartDate           0
TrackStatus            0
Position               0
Deleted                0
year                   0
dtype: int64


In [36]:
# prompt: Replace NaN values of Sector2Time,Sector3SessionTime  with 0

df[['Sector2Time', 'Sector3SessionTime']] = df[['Sector2Time', 'Sector3SessionTime']].fillna(0)
print("\nNaN counts after filling 'Sector2Time' and 'Sector3SessionTime' with 0:")
print(df.isna().sum())


NaN counts after filling 'Sector2Time' and 'Sector3SessionTime' with 0:
DriverNumber          0
LapTime               0
LapNumber             0
Stint                 0
PitOutTime            0
PitInTime             0
Sector1Time           0
Sector2Time           0
Sector3Time           0
Sector1SessionTime    0
Sector2SessionTime    0
Sector3SessionTime    0
SpeedI1               0
SpeedI2               0
SpeedFL               0
SpeedST               0
IsPersonalBest        0
Compound              0
TyreLife              0
FreshTyre             0
Team                  0
LapStartTime          0
LapStartDate          0
TrackStatus           0
Position              0
Deleted               0
year                  0
dtype: int64


In [38]:
# prompt: Replace True with 1 and False with 0 for Deleted, FreshTyre

df['Deleted'] = df['Deleted'].replace({True: 1, False: 0})
df['FreshTyre'] = df['FreshTyre'].replace({True: 1, False: 0})

  df['Deleted'] = df['Deleted'].replace({True: 1, False: 0})
  df['FreshTyre'] = df['FreshTyre'].replace({True: 1, False: 0})


In [39]:
df

Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
1,10,0 days 00:01:13.059000,2.0,1.0,0,0,0 days 00:00:17.521000,0 days 00:00:33.053000,0 days 00:00:22.485000,0 days 00:08:21.241000,...,SUPERSOFT,1.0,1,Toro Rosso,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,0,2018
2,10,0 days 00:01:11.444000,3.0,1.0,0,0,0 days 00:00:17.443000,0 days 00:00:31.545000,0 days 00:00:22.456000,0 days 00:09:34.222000,...,SUPERSOFT,2.0,1,Toro Rosso,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,0,2018
3,10,0 days 00:01:11.165000,4.0,1.0,0,0,0 days 00:00:17.161000,0 days 00:00:31.737000,0 days 00:00:22.267000,0 days 00:10:45.384000,...,SUPERSOFT,3.0,1,Toro Rosso,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,0,2018
4,10,0 days 00:01:12.408000,5.0,1.0,0,0,0 days 00:00:17.333000,0 days 00:00:31.559000,0 days 00:00:23.516000,0 days 00:11:56.721000,...,SUPERSOFT,4.0,1,Toro Rosso,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,0,2018
5,10,0 days 00:01:12.341000,6.0,1.0,0,0,0 days 00:00:18.014000,0 days 00:00:32.137000,0 days 00:00:22.190000,0 days 00:13:09.810000,...,SUPERSOFT,5.0,1,Toro Rosso,0 days 00:12:51.796000,2018-07-01 13:18:52.012,1.0,15.0,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,4,0 days 00:01:09.356000,60.0,3.0,0,0,0 days 00:00:17.270000,0 days 00:00:31.434000,0 days 00:00:20.652000,0 days 02:05:40.272000,...,MEDIUM,9.0,1,McLaren,0 days 02:05:22.948000,2024-06-30 14:12:54.059,1.0,2.0,0,2024
9114,4,0 days 00:01:10.037000,61.0,3.0,0,0,0 days 00:00:17.086000,0 days 00:00:32.125000,0 days 00:00:20.826000,0 days 02:06:49.444000,...,MEDIUM,10.0,1,McLaren,0 days 02:06:32.304000,2024-06-30 14:14:03.415,1.0,2.0,0,2024
9115,4,0 days 00:01:09.509000,62.0,3.0,0,0,0 days 00:00:17.182000,0 days 00:00:31.527000,0 days 00:00:20.800000,0 days 02:07:59.577000,...,MEDIUM,11.0,1,McLaren,0 days 02:07:42.341000,2024-06-30 14:15:13.452,1.0,2.0,0,2024
9116,4,0 days 00:01:10.234000,63.0,3.0,0,0,0 days 00:00:17.195000,0 days 00:00:32.111000,0 days 00:00:20.928000,0 days 02:09:09.099000,...,MEDIUM,12.0,1,McLaren,0 days 02:08:51.850000,2024-06-30 14:16:22.961,1.0,2.0,0,2024


In [40]:
# prompt: show the set of values in Compound, Team

print("Unique Compounds:", df['Compound'].unique())
print("Unique Teams:", df['Team'].unique())

Unique Compounds: ['SUPERSOFT' 'SOFT' 'ULTRASOFT' 'MEDIUM' 'HARD']
Unique Teams: ['Toro Rosso' 'Force India' 'McLaren' 'Sauber' 'Williams' 'Haas F1 Team'
 'Renault' 'Red Bull Racing' 'Mercedes' 'Ferrari' 'Alfa Romeo Racing'
 'Racing Point' 'AlphaTauri' 'Alpine' 'Aston Martin' 'Alfa Romeo' 'RB'
 'Kick Sauber']


In [41]:
# prompt: Replace the values in Compound with numeric values as follows: SUPERSOFT - 1 ' 'SOFT - 2' 'ULTRASOFT'- 3  'MEDIUM - 4' 'HARD - 5'

df['Compound'] = df['Compound'].replace({
    'SUPERSOFT': 1,
    'SOFT': 2,
    'ULTRASOFT': 3,
    'MEDIUM': 4,
    'HARD': 5
})

print("Unique Compounds after replacement:", df['Compound'].unique())

Unique Compounds after replacement: [1 2 3 4 5]


  df['Compound'] = df['Compound'].replace({


The values in Compound has been replaced as follows:
SUPERSOFT - 1 ' 'SOFT - 2' 'ULTRASOFT'- 3  'MEDIUM - 4' 'HARD - 5

In [42]:
# prompt: Replace values in Team as follows - Toro Rosso (1) ' 'Force India (2) ' 'McLaren (3) ' 'Sauber (4) ' 'Williams (5) ' 'Haas F1 Team (6)'
#  'Renault (7) ' 'Red Bull Racing (8) ' 'Mercedes (9) ' 'Ferrari (10) ' 'Alfa Romeo Racing (11)'
#  'Racing Point (12) ' 'AlphaTauri (13) ' 'Alpine (14) ' 'Aston Martin (15) ' 'Alfa Romeo (16) ' 'RB (17)'
#  'Kick Sauber (18)

team_mapping = {
    'Toro Rosso': 1,
    'Force India': 2,
    'McLaren': 3,
    'Sauber': 4,
    'Williams': 5,
    'Haas F1 Team': 6,
    'Renault': 7,
    'Red Bull Racing': 8,
    'Mercedes': 9,
    'Ferrari': 10,
    'Alfa Romeo Racing': 11,
    'Racing Point': 12,
    'AlphaTauri': 13,
    'Alpine': 14,
    'Aston Martin': 15,
    'Alfa Romeo': 16,
    'RB': 17,
    'Kick Sauber': 18
}

df['Team'] = df['Team'].replace(team_mapping)

print("Unique Teams after replacement:", df['Team'].unique())


Unique Teams after replacement: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]


  df['Team'] = df['Team'].replace(team_mapping)


Teams have been replaced with numbers as follows :   
    'Toro Rosso': 1,
    'Force India': 2,
    'McLaren': 3,
    'Sauber': 4,
    'Williams': 5,
    'Haas F1 Team': 6,
    'Renault': 7,
    'Red Bull Racing': 8,
    'Mercedes': 9,
    'Ferrari': 10,
    'Alfa Romeo Racing': 11,
    'Racing Point': 12,
    'AlphaTauri': 13,
    'Alpine': 14,
    'Aston Martin': 15,
    'Alfa Romeo': 16,
    'RB': 17,
    'Kick Sauber': 18

In [43]:
df

Unnamed: 0,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,year
1,10,0 days 00:01:13.059000,2.0,1.0,0,0,0 days 00:00:17.521000,0 days 00:00:33.053000,0 days 00:00:22.485000,0 days 00:08:21.241000,...,1,1.0,1,1,0 days 00:08:03.720000,2018-07-01 13:14:03.936,21.0,14.0,0,2018
2,10,0 days 00:01:11.444000,3.0,1.0,0,0,0 days 00:00:17.443000,0 days 00:00:31.545000,0 days 00:00:22.456000,0 days 00:09:34.222000,...,1,2.0,1,1,0 days 00:09:16.779000,2018-07-01 13:15:16.995,1.0,14.0,0,2018
3,10,0 days 00:01:11.165000,4.0,1.0,0,0,0 days 00:00:17.161000,0 days 00:00:31.737000,0 days 00:00:22.267000,0 days 00:10:45.384000,...,1,3.0,1,1,0 days 00:10:28.223000,2018-07-01 13:16:28.439,1.0,14.0,0,2018
4,10,0 days 00:01:12.408000,5.0,1.0,0,0,0 days 00:00:17.333000,0 days 00:00:31.559000,0 days 00:00:23.516000,0 days 00:11:56.721000,...,1,4.0,1,1,0 days 00:11:39.388000,2018-07-01 13:17:39.604,1.0,15.0,0,2018
5,10,0 days 00:01:12.341000,6.0,1.0,0,0,0 days 00:00:18.014000,0 days 00:00:32.137000,0 days 00:00:22.190000,0 days 00:13:09.810000,...,1,5.0,1,1,0 days 00:12:51.796000,2018-07-01 13:18:52.012,1.0,15.0,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,4,0 days 00:01:09.356000,60.0,3.0,0,0,0 days 00:00:17.270000,0 days 00:00:31.434000,0 days 00:00:20.652000,0 days 02:05:40.272000,...,4,9.0,1,3,0 days 02:05:22.948000,2024-06-30 14:12:54.059,1.0,2.0,0,2024
9114,4,0 days 00:01:10.037000,61.0,3.0,0,0,0 days 00:00:17.086000,0 days 00:00:32.125000,0 days 00:00:20.826000,0 days 02:06:49.444000,...,4,10.0,1,3,0 days 02:06:32.304000,2024-06-30 14:14:03.415,1.0,2.0,0,2024
9115,4,0 days 00:01:09.509000,62.0,3.0,0,0,0 days 00:00:17.182000,0 days 00:00:31.527000,0 days 00:00:20.800000,0 days 02:07:59.577000,...,4,11.0,1,3,0 days 02:07:42.341000,2024-06-30 14:15:13.452,1.0,2.0,0,2024
9116,4,0 days 00:01:10.234000,63.0,3.0,0,0,0 days 00:00:17.195000,0 days 00:00:32.111000,0 days 00:00:20.928000,0 days 02:09:09.099000,...,4,12.0,1,3,0 days 02:08:51.850000,2024-06-30 14:16:22.961,1.0,2.0,0,2024
