In [49]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob
import os
import shutil
import matplotlib.pyplot as plt

# Set project root dynamically
project_root = Path().resolve().parent
data_dir = project_root/'data'

In [52]:
# Load cleaned tyre data
tyre_dir = project_root /'cleaned_tyre_data'
tyre_list = glob.glob(str(tyre_dir / '*.csv'))
dfs_tyre = {Path(file).stem: pd.read_csv(file) for file in tyre_list}

print(f"Loaded {len(dfs_tyre)} cleaned tyre files")
data_list = glob.glob(str(data_dir / '*.csv'))
dfs_data = {Path(file).stem: pd.read_csv(file) for file in data_list}

Loaded 22 cleaned tyre files


In [55]:
# Load core race data
lap_times = dfs_data['lap_times']
races = dfs_data['races']
drivers = dfs_data['drivers']
'''print("Laptimes")
print(lap_times.head())
print("Races")
print(races.head())
print("Drivers")
print(drivers.head())'''

'print("Laptimes")\nprint(lap_times.head())\nprint("Races")\nprint(races.head())\nprint("Drivers")\nprint(drivers.head())'

In [59]:
lap_times_copy = lap_times.copy()
times = lap_times_copy
print(times.keys())

Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds'], dtype='object')


In [35]:
pit_stops = dfs_data['pit_stops']

# Filter for a specific driver and race
race_id = 926  # Example for 2015 Australia
driver_id = 1

pit_data = pit_stops.loc[
    (pit_stops['raceId'] == race_id) & 
    (pit_stops['driverId'] == driver_id)
]

print(pit_data[['stop', 'lap']])


      stop  lap
3834     1   25


In [151]:
def detect_pit_stops(lap_times, threshold=12000):
    # Pit stop → Large increase in lap time relative to previous lap
    pit_laps = []
    for i in range(1, len(lap_times)):
        if int(lap_times[i]) - int(lap_times[i-1]) > threshold:
            pit_laps.append(i + 1)  # Lap number is 1-based
    return pit_laps

In [169]:
def assign_tyre_data(df, tyre_data, threshold=12000):
    # ✅ Step 1: Identify if Supersoft is present for any driver in the race
    supersoft_present = any(
        'Super' in str(tyre_data.get(f'Stint {i}', '')).strip()
        for i in range(1, 5) 
        for idx in tyre_data.index
    )

    # ✅ Step 2: Define compound map based on presence of Supersoft
    if supersoft_present:
        compound_map = {
            'Supersoft': 'Soft',
            'Super': 'Soft',
            'Soft': 'Medium',
            'Medium': 'Hard',
            'Hard': 'Hard'
        }
    else:
        compound_map = {
            'Supersoft': 'Supersoft',
            'Super': 'Super',
            'Soft': 'Soft',
            'Medium': 'Medium',
            'Hard': 'Hard'
        }

    print(f"Compound map for this race: {compound_map}")

    # ✅ Step 3: Loop through each driver ID
    for driver_id in tyre_data['driverId'].unique():
        driver_stints = tyre_data.loc[tyre_data['driverId'] == driver_id].iloc[0]
        
        # ✅ Create a dictionary to hold stint info
        stint_map = {}
        for i in range(1, 5):
            stint_key = f'Stint {i}'
            stint_value = driver_stints.get(stint_key)  
            
            if pd.notna(stint_value) and isinstance(stint_value, str):  # ✅ Skip empty or NaN
                try:
                    compound, length = stint_value.split('(')
                    compound = compound.strip()
                    length = int(length.strip(')'))
                    
                    # ✅ Map using the adjusted compound map
                    compound = compound_map.get(compound.strip(), compound.strip())
                    stint_map[stint_key] = (compound, length)
                except ValueError:
                    print(f"⚠️ Invalid stint data for driver {driver_id} in {stint_key}: {stint_value}")
                    continue

        print(f"✅ Driver {driver_id} stint map: {stint_map}")

        # ✅ Step 4: Filter the lap data for the specific driver
        driver_laps = df.loc[df['driverId'] == driver_id].copy()

        # ✅ Step 5: Detect pit stops using stint length
        pit_laps = []
        current_lap = 1
        for stint_key in stint_map.keys():
            compound, stint_length = stint_map[stint_key]
            pit_lap = current_lap + stint_length
            if pit_lap <= driver_laps['lap'].max():
                # ✅ Search neighborhood of ±2 laps
                neighbor_window = driver_laps.loc[
                    (driver_laps['lap'] >= pit_lap - 2) &
                    (driver_laps['lap'] <= pit_lap + 2)
                ]
                if not neighbor_window.empty:
                    # ✅ Use .loc[] instead of .iloc[] to avoid IndexError
                    best_guess = neighbor_window.loc[neighbor_window['milliseconds'].idxmax(), 'lap']
                    pit_laps.append(best_guess)
                else:
                    pit_laps.append(pit_lap)
            current_lap = pit_lap + 1
        
        print(f"✅ Detected pit stops for driver {driver_id}: {pit_laps}")

        # ✅ Step 6: Assign compound and age using the stint_map
        stint_index = 0
        lap_counter = 1
        
        for idx in driver_laps.index:
            if stint_index < len(pit_laps) and driver_laps.loc[idx, 'lap'] == pit_laps[stint_index]:
                stint_index += 1
                lap_counter = 1
            
            stint_key = f'Stint {stint_index + 1}'
            if stint_key in stint_map:
                compound, stint_length = stint_map[stint_key]
                driver_laps.loc[idx, 'tyre_compound'] = compound
                driver_laps.loc[idx, 'tyre_age'] = lap_counter
            
            lap_counter += 1
        
        # ✅ Step 7: Save back into the original dataframe
        df.loc[df['driverId'] == driver_id] = driver_laps
    
    print(f"✅ `tyre_compound` and `tyre_age` successfully created.")
    return df


In [82]:
tyre_keys = list(dfs_tyre.keys())
tk = tyre_keys[0]
print(tyre_keys)
print(tk)
print(dfs_tyre[tk][dfs_tyre[tk]['driverId'] == 3])


['948', '949', '928', '929', '939', '938', '936', '937', '934', '930', '931', '927', '933', '932', '926', '941', '940', '942', '943', '944', '950', '945']
948
           NAME     Stint 1   Stint 2      Stint 3 Stint 4  driverId lastName
0  Nico Rosberg  Super (12)  Soft (6)  Medium (39)     NaN         3  rosberg


In [89]:
# Convert tyre_keys to integer (if necessary)
tkeys = [int(k) for k in tyre_keys]

# Filter the times dataframe to only include raceIds in tyre_keys
filtered_times = times[times['raceId'].isin(tkeys)].copy()
df = filtered_times
# Show the filtered dataframe
print(filtered_times.head(40))
print(f"Filtered times shape: {filtered_times.shape}")

       raceId  driverId  lap  position      time  milliseconds
90908     926         3   48         2  1:31.332         91332
90909     926         3   47         2  1:31.092         91092
90910     926         3   46         2  1:31.319         91319
90911     926         3   45         2  1:31.231         91231
90912     926         3   44         2  1:32.243         92243
90913     926         3   43         2  1:31.259         91259
90914     926         3   42         2  1:31.493         91493
90915     926         3   41         2  1:31.400         91400
90916     926         3   40         2  1:31.798         91798
90917     926         3   39         2  1:31.565         91565
90918     926         3   38         2  1:31.573         91573
90919     926         3   37         2  1:31.610         91610
90920     926         3   36         2  1:31.355         91355
90921     926         3   35         2  1:31.775         91775
90922     926         3   34         2  1:32.981       

In [91]:
df_copy = df
# Create a mapping from raceId to circuitId using the races dataframe
race_to_circuit_map = races.set_index('raceId')['circuitId'].to_dict()
# Add circuitId to filtered_times by mapping raceId
df_copy['circuitId'] = df_copy['raceId'].map(race_to_circuit_map)
# Show the updated dataframe
print(df_copy.head(10))  


       raceId  driverId  lap  position      time  milliseconds  circuitId
90908     926         3   48         2  1:31.332         91332          1
90909     926         3   47         2  1:31.092         91092          1
90910     926         3   46         2  1:31.319         91319          1
90911     926         3   45         2  1:31.231         91231          1
90912     926         3   44         2  1:32.243         92243          1
90913     926         3   43         2  1:31.259         91259          1
90914     926         3   42         2  1:31.493         91493          1
90915     926         3   41         2  1:31.400         91400          1
90916     926         3   40         2  1:31.798         91798          1
90917     926         3   39         2  1:31.565         91565          1


In [171]:
# Select a single race for testing
test_race_id = '948'

# Get lap data for that race
test_data = df_copy.loc[df_copy['raceId'] == int(test_race_id)].copy()

# ✅ Step 1: Ensure lap time column exists
if 'lapTime' not in test_data.columns:
    for alt_name in ['time', 'laptime', 'milliseconds']:
        if alt_name in test_data.columns:
            test_data.rename(columns={alt_name: 'lapTime'}, inplace=True)
            break

# ✅ Step 2: Get tyre data for that race
if test_race_id in tyre_data.keys():
    tyre_info = tyre_data[test_race_id]
    
    if not tyre_info.empty:
        print(f"Tyre info for race {test_race_id} loaded:")
        print(tyre_info)

        # ✅ Step 3: Apply tyre compound and age logic
        test_data = assign_tyre_data(test_data, tyre_info)

        # ✅ Step 4: Ensure the function actually returned a dataframe
        if test_data is not None and isinstance(test_data, pd.DataFrame):
            # ✅ Check that columns were created correctly
            if 'tyre_compound' in test_data.columns and 'tyre_age' in test_data.columns:
                print(f"✅ Tyre data successfully added for race {test_race_id}")
                print(test_data[['raceId', 'lap', 'tyre_compound', 'tyre_age']])
            else:
                print(f"⚠️ Tyre compound or age columns not created for race {test_race_id}.")
        else:
            print(f"⚠️ Tyre compound or age columns not created for race {test_race_id}.")
    else:
        print(f"⚠️ No tyre data found for race {test_race_id}.")
else:
    print(f"⚠️ No tyre data available for race ID {test_race_id}.")


Tyre info for race 948 loaded:
                 NAME     Stint 1      Stint 2      Stint 3      Stint 4  \
0        Nico Rosberg  Super (12)     Soft (6)  Medium (39)          NaN   
1      Lewis Hamilton  Super (16)  Medium (41)          NaN          NaN   
2    Sebastian Vettel  Super (13)   Super (22)    Soft (22)          NaN   
3    Daniel Ricciardo  Super (12)    Super (6)    Soft (24)   Super (15)   
4        Felipe Massa  Super (11)     Soft (7)  Medium (39)          NaN   
5     Romain Grosjean   Soft (18)  Medium (39)          NaN          NaN   
6     Nico Hulkenberg   Soft (16)  Medium (41)          NaN          NaN   
7     Valtteri Bottas   Soft (17)  Medium (40)          NaN          NaN   
8    Carlos Sainz Jnr   Super (8)    Soft (10)    Soft (13)  Medium (26)   
9      Max Verstappen  Super (13)     Soft (5)    Soft (14)  Medium (25)   
10      Jolyon Palmer  Super (12)     Soft (6)  Medium (39)          NaN   
11    Kevin Magnussen   Super (1)    Soft (17)  Medium (3

In [117]:
print(tyre_data.keys())
tyre_race_ids = tyre_data['948'].keys()
print(tyre_race_ids)
print(df_copy.keys())

dict_keys(['948', '949', '928', '929', '939', '938', '936', '937', '934', '930', '931', '927', '933', '932', '926', '941', '940', '942', '943', '944', '950', '945'])
Index(['NAME', 'Stint 1', 'Stint 2', 'Stint 3', 'Stint 4', 'driverId',
       'lastName'],
      dtype='object')
Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds',
       'circuitId'],
      dtype='object')


In [163]:
print(test_data.head(57))

        raceId  driverId  lap  position    lapTime  milliseconds  circuitId  \
111775     948        20    1         1   1:36.916         96916          1   
111776     948        20    2         1   1:31.664         91664          1   
111777     948        20    3         1   1:32.167         92167          1   
111778     948        20    4         1   1:32.014         92014          1   
111779     948        20    5         1   1:32.273         92273          1   
111780     948        20    6         1   1:32.204         92204          1   
111781     948        20    7         1   1:32.080         92080          1   
111782     948        20    8         1   1:32.289         92289          1   
111783     948        20    9         1   1:32.578         92578          1   
111784     948        20   10         1   1:32.571         92571          1   
111785     948        20   11         1   1:32.960         92960          1   
111786     948        20   12         1   1:33.014  

In [172]:
test_id = '937'
tyre_ex = tyre_data[test_id]
print(tyre_ex)

          Unnamed: 0      Stint 1      Stint 2      Stint 3    Stint 4
0     Lewis Hamilton    Soft (13)  Medium (17)    Soft (13)        NaN
1       Nico Rosberg    Soft (12)  Medium (19)    Soft (12)        NaN
2    Romain Grosjean     Soft (9)    Soft (12)  Medium (22)        NaN
3       Daniil Kvyat     Soft (9)  Medium (18)    Soft (16)        NaN
4       Sergio Perez     Soft (8)    Soft (12)  Medium (23)        NaN
5       Felipe Massa     Soft (9)  Medium (12)  Medium (22)        NaN
6     Kimi Raikkonen    Soft (11)    Soft (10)  Medium (22)        NaN
7     Max Verstappen     Soft (9)    Soft (12)  Medium (11)  Soft (11)
8    Valtteri Bottas     Soft (8)    Soft (13)  Medium (22)        NaN
9    Marcus Ericsson     Soft (9)  Medium (19)    Soft (15)        NaN
10       Felipe Nasr  Medium (10)    Soft (17)    Soft (16)        NaN
11  Sebastian Vettel    Soft (14)  Medium (28)          NaN        NaN
12   Fernando Alonso   Medium (8)    Soft (12)    Soft (12)  Soft (10)
13    

In [174]:
# ✅ Create a dictionary to store processed data by raceId
processed_data_by_race = {}

# ✅ Loop through all tyre datasets
for race_id, tyre_info in tyre_data.items():
    print(f"Processing Race ID {race_id}...")
    
    # ✅ Get lap data for that race
    race_data = lap_times_copy.loc[lap_times_copy['raceId'] == int(race_id)].copy()
    
    if not race_data.empty and not tyre_info.empty:
        result = assign_tyre_data(race_data, tyre_info)
        if result is not None:
            processed_data_by_race[race_id] = result
    else:
        print(f"⚠️ No valid lap data or tyre info for race {race_id}")

# ✅ Final check
for race_id, data in processed_data_by_race.items():
    print(f"✅ Processed data for race {race_id}: {data.shape}")


Processing Race ID 948...
Compound map for this race: {'Supersoft': 'Soft', 'Super': 'Soft', 'Soft': 'Medium', 'Medium': 'Hard', 'Hard': 'Hard'}
✅ Driver 3 stint map: {'Stint 1': ('Soft', 12), 'Stint 2': ('Medium', 6), 'Stint 3': ('Hard', 39)}
✅ Detected pit stops for driver 3: [np.int64(12), np.int64(19)]
✅ Driver 1 stint map: {'Stint 1': ('Soft', 16), 'Stint 2': ('Hard', 41)}
✅ Detected pit stops for driver 1: [np.int64(18)]
✅ Driver 20 stint map: {'Stint 1': ('Soft', 13), 'Stint 2': ('Soft', 22), 'Stint 3': ('Medium', 22)}
✅ Detected pit stops for driver 20: [np.int64(13), np.int64(35)]
✅ Driver 817 stint map: {'Stint 1': ('Soft', 12), 'Stint 2': ('Soft', 6), 'Stint 3': ('Medium', 24), 'Stint 4': ('Soft', 15)}
✅ Detected pit stops for driver 817: [np.int64(12), np.int64(19), np.int64(43)]
✅ Driver 13 stint map: {'Stint 1': ('Soft', 11), 'Stint 2': ('Medium', 7), 'Stint 3': ('Hard', 39)}
✅ Detected pit stops for driver 13: [np.int64(11), np.int64(18)]
✅ Driver 154 stint map: {'Stint 

KeyError: 'driverId'

In [None]:
# ✅ Export each race's data to a separate CSV file
output_dir = project_root/'processed_training_data'
os.makedirs(output_dir, exist_ok=True)

for race_id, data in processed_data_by_race.items():
    output_file = os.path.join(output_dir, f"{race_id}_training_data.csv")
    data.to_csv(output_file, index=False)
    print(f"✅ Training data for race {race_id} saved to {output_file}")