In [9]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob

# Set project root dynamically.
# Path() is the current working directory, so this takes its parent as the
# project root (i.e. the notebook is expected to run from a direct subfolder).
project_root = Path().resolve().parent
data_dir = project_root / 'data'

# Get data file lists from data folder (sorted for a deterministic load order)
file_list = sorted(glob.glob(str(data_dir / '*.csv')))

# Load data into dataframes, keyed by file name without the extension
dfs = {Path(file).stem: pd.read_csv(file) for file in file_list}

# Fail early with a readable message if a table used below is missing
missing_tables = [name for name in ('races', 'lap_times', 'pit_stops') if name not in dfs]
if missing_tables:
    raise FileNotFoundError(f"Missing CSV file(s) in {data_dir}: {missing_tables}")

# Keep only races from 2014 onwards
races = dfs['races']
races = races[races['year'] >= 2014]
valid_race_ids = races['raceId'].unique()

# Filter other datasets based on valid race IDs.
# .copy() makes each filtered frame independent, so the column assignments
# below do not write into a slice (avoids SettingWithCopyWarning).
for key in dfs:
    if 'raceId' in dfs[key].columns:
        dfs[key] = dfs[key][dfs[key]['raceId'].isin(valid_race_ids)].copy()

# Convert milliseconds to seconds.
# NOTE: the column keeps its original name 'milliseconds' but holds seconds
# from this point on.
dfs['lap_times']['milliseconds'] = dfs['lap_times']['milliseconds'] / 1000
dfs['pit_stops']['milliseconds'] = dfs['pit_stops']['milliseconds'] / 1000

# Split the per-race rows once instead of re-filtering the full tables per race
lap_times_by_race = dict(iter(dfs['lap_times'].groupby('raceId')))
pit_stops_by_race = dict(iter(dfs['pit_stops'].groupby('raceId')))

# Create races_cleaned dictionary: raceId -> lap-time matrix, gap matrices, pit stops
races_cleaned = {}

for race_id in valid_race_ids:
    # Get lap times for this race; skip races without any lap data
    lap_times = lap_times_by_race.get(race_id)

    if lap_times is None or lap_times.empty:
        continue

    # Reshape into matrix format (rows: lap, columns: driverId).
    # The NaN-preserving pivot is kept for the gap computations; the stored
    # lap-time matrix has missing laps filled with zeros.
    lap_times_raw = lap_times.pivot(index='lap', columns='driverId', values='milliseconds')
    lap_times_matrix = lap_times_raw.fillna(0)

    # Get cumulative race time for each driver.
    # Computed on the NaN-preserving matrix: a lap with no recorded time stays
    # NaN, so a driver with no further laps is not carried forward with a
    # frozen total that would eventually be the smallest in the row.
    cumulative_times = lap_times_raw.cumsum(axis=0)

    # Compute gap to leader (subtract minimum cumulative time; NaN entries are skipped)
    gaps_to_leader = cumulative_times.subtract(cumulative_times.min(axis=1), axis=0)

    # Compute gap to car ahead: sort each lap's cumulative times, take the
    # difference between neighbours, then scatter the result back to the
    # original driver columns.
    cumulative_values = cumulative_times.to_numpy(dtype=float)
    running_order = np.argsort(cumulative_values, axis=1, kind='stable')  # NaN sorts last
    ordered_times = np.take_along_axis(cumulative_values, running_order, axis=1)
    # Prepending NaN leaves the leader's gap as NaN (there is no car ahead)
    ordered_gaps = np.diff(ordered_times, axis=1, prepend=np.nan)
    gap_values = np.empty_like(cumulative_values)
    np.put_along_axis(gap_values, running_order, ordered_gaps, axis=1)
    gaps_to_car_ahead = pd.DataFrame(
        gap_values, index=cumulative_times.index, columns=cumulative_times.columns
    )

    # Get pit stop information: driverId -> {stop number -> duration in seconds}.
    # Values are cast to builtin int/float so the dict stays JSON-serialisable.
    pit_stops = pit_stops_by_race.get(race_id)
    pit_stop_info = {}

    if pit_stops is not None:
        for driver_id, stop, seconds in zip(
            pit_stops['driverId'], pit_stops['stop'], pit_stops['milliseconds']
        ):
            pit_stop_info.setdefault(int(driver_id), {})[int(stop)] = float(seconds)

    # Store in races_cleaned
    races_cleaned[race_id] = {
        'lap_times': lap_times_matrix,
        'pit_stops': pit_stop_info,
        'gaps_to_leader': gaps_to_leader,
        'gaps_to_car_ahead': gaps_to_car_ahead
    }

# Example access: show the first race that was stored
example_race_id = next(iter(races_cleaned))
print("\nLap Times Matrix:")
print(races_cleaned[example_race_id]['lap_times'].head())

print("\nGaps to Leader:")
print(races_cleaned[example_race_id]['gaps_to_leader'].head())

print("\nGaps to Car Ahead:")
print(races_cleaned[example_race_id]['gaps_to_car_ahead'].head())

print("\nPit Stop Info:")
print(races_cleaned[example_race_id]['pit_stops'])



Lap Times Matrix:
driverId      1        3        4        8        16       18       20   \
lap                                                                       
1         106.128  102.038  108.266  109.387  113.369  110.328  120.977   
2         100.287   97.687   98.611   98.630  101.685   98.968  109.947   
3           0.000   95.765   97.810   97.771  100.024   98.705  111.460   
4           0.000   94.939   95.719   97.013   99.823   97.156    0.000   
5           0.000   95.438   96.205   96.092  101.794   97.120    0.000   

driverId      154      807      813      815      817      818      820  \
lap                                                                       
1         130.795  106.986  116.505  156.707  103.549  108.805  125.520   
2         105.629   98.359  100.456  104.790   97.459   98.579  103.507   
3          98.519   96.856  100.141   97.064   96.321   99.424  100.425   
4          99.157   96.878   99.728   96.510   95.966   96.855   98.928   
5    

In [10]:
import os
import json


def _to_builtin(value):
    """Return the builtin Python value for a NumPy scalar; pass anything else through."""
    # NumPy scalars expose .item(); json.dump rejects them as dict keys otherwise
    return value.item() if hasattr(value, 'item') else value


# Create cleaned data directory within the project
cleaned_data_dir = project_root / 'cleaned_data'
cleaned_data_dir.mkdir(parents=True, exist_ok=True)

# Save each race's data as a separate set of files
for race_id, data in races_cleaned.items():
    # Create a subfolder for each race ID
    race_dir = cleaned_data_dir / f'race_{race_id}'
    race_dir.mkdir(parents=True, exist_ok=True)

    # Save the three per-race matrices as <name>.csv, keeping the index column
    for table_name in ('lap_times', 'gaps_to_leader', 'gaps_to_car_ahead'):
        data[table_name].to_csv(race_dir / f'{table_name}.csv', index=True)

    # Save pit stop info as JSON.
    # Keys and values are normalised to builtin types first; JSON object keys
    # are always written as strings.
    pit_stops_json = {
        _to_builtin(driver_id): {
            _to_builtin(stop): _to_builtin(duration) for stop, duration in stops.items()
        }
        for driver_id, stops in data['pit_stops'].items()
    }
    with open(race_dir / 'pit_stops.json', 'w', encoding='utf-8') as f:
        json.dump(pit_stops_json, f)

print(f"Cleaned data saved to: {cleaned_data_dir}")

Cleaned data saved to: /Users/elliotporter/Desktop/Github/f1-rl-project/cleaned_data
