In [37]:
import logging

from util import convert_timedelta_columns_to_millis, generate_empty_session_df
import os
import pandas as pd
import fastf1
from tqdm.notebook import tqdm
from glob import glob

# Set debug mode
debug = False

# Set logging level
fastf1.set_log_level(logging.DEBUG if debug else logging.ERROR)

# Create debug directory if in debug mode
if debug:
    debug_dir = '../data/processed/fastf1_batches/debug'
    os.makedirs(debug_dir, exist_ok=True)

# Enable FastF1 cache
fastf1.Cache.enable_cache("../data/cache")


In [57]:
# Load original and races data
original_df = pd.read_csv('../data/processed/export_v1.csv', dtype={'q1': str, 'q2': str, 'q3': str, 'number': str})

races_df = pd.read_csv('../data/raw_data/races.csv')

# Filter and merge dataframes
df = original_df[['raceId']].copy()
df.drop_duplicates(inplace=True)
df.reset_index(drop=True)
df = pd.merge(df, races_df, on='raceId', how='left')
df = df[df['year'] >= 2018]

df.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
976,989,2018,1,1,Australian Grand Prix,2018-03-25,05:10:00,http://en.wikipedia.org/wiki/2018_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
977,990,2018,2,3,Bahrain Grand Prix,2018-04-08,15:10:00,http://en.wikipedia.org/wiki/2018_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
978,991,2018,3,17,Chinese Grand Prix,2018-04-15,06:10:00,http://en.wikipedia.org/wiki/2018_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
979,992,2018,4,73,Azerbaijan Grand Prix,2018-04-29,12:10:00,http://en.wikipedia.org/wiki/2018_Azerbaijan_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
980,993,2018,5,4,Spanish Grand Prix,2018-05-13,13:10:00,http://en.wikipedia.org/wiki/2018_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [49]:
# Initialize list to store race sessions
races = []

# Iterate over each race and get session data
for idx, row in df.iterrows():
    try:
        fp1 = fastf1.get_session(year=row['year'], gp=row['name'], identifier='fp1')
    except:
        fp1 = None
    try:
        fp2 = fastf1.get_session(year=row['year'], gp=row['name'], identifier='fp2')
    except:
        fp2 = None
    try:
        fp3 = fastf1.get_session(year=row['year'], gp=row['name'], identifier='fp3')
    except:
        fp3 = None
    try:
        quali = fastf1.get_session(year=row['year'], gp=row['name'], identifier='q')
    except:
        quali = None

    races.append({
        'raceId': row['raceId'],
        'fp1_session': fp1,
        'fp2_session': fp2,
        'fp3_session': fp3,
        'qualifying': quali
    })

In [52]:
def process_session(session, race_id, session_name):
    """
    Process the session data for a given race and session name.

    Parameters:
    session (fastf1.core.Session): The session object to be processed.
    race_id (int): The ID of the race.
    session_name (str): The name of the session (e.g., 'fp1', 'fp2', 'fp3').

    Returns:
    pd.DataFrame: A DataFrame containing the processed session data.
    """
    try:
        # Load session data
        session.load(laps=True, telemetry=False, weather=False, messages=False)
        laps = session.laps
        # Filter out deleted laps
        laps = laps[(laps['Deleted'] == False) | (laps['Deleted'].isnull())]
        # Pick quick laps and make a copy
        quick_laps = laps.pick_quicklaps().copy()
        # Convert timedelta columns to milliseconds
        convert_timedelta_columns_to_millis(quick_laps, ['Sector1Time', 'Sector2Time', 'LapTime'], in_place=True)
        # Group by driver and calculate mean values
        grp = quick_laps.groupby("Driver").mean(numeric_only=True)
        # Create a DataFrame with the processed data
        sess_df = pd.DataFrame({
            'raceId': race_id,
            'driver_code': grp.index,
            f'{session_name}_avg_sector_1': grp['Sector1Time'],
            f'{session_name}_avg_sector_2': grp['Sector2Time'],
            f'{session_name}_avg_lap_time': grp['LapTime'],
            f'{session_name}_avg_speedI1': grp['SpeedI1'],
            f'{session_name}_avg_speedI2': grp['SpeedI2'],
            f'{session_name}_avg_speedFL': grp['SpeedFL'],
            f'{session_name}_avg_speedST': grp['SpeedST'],
            f'{session_name}_avg_tyre_life': grp['TyreLife'],
            f'{session_name}_avg_is_on_fresh_tyres': grp['FreshTyre']
        })
    except:
        # Generate an empty DataFrame if an error occurs
        sess_df = generate_empty_session_df(session_name, race_id)
    return sess_df

In [53]:
def process_qualifying_session(race):
    """
    Process the qualifying session for a given race and return the qualifying DataFrame.

    Parameters:
    race (dict): A dictionary containing race information and session data.

    Returns:
    pd.DataFrame: A DataFrame containing the processed qualifying session data.
    """
    race['qualifying'].load(laps=True, telemetry=False, weather=False, messages=False)
    results = race['qualifying'].results
    if not results.empty:
        convert_timedelta_columns_to_millis(results, ['Q1', 'Q2', 'Q3'], in_place=True)

        qualifying_df = pd.DataFrame({
            'raceId': race['raceId'],
            'driver_code': results['Abbreviation'],
            'q1_time': results['Q1'],
            'q2_time': results['Q2'],
            'q3_time': results['Q3'],
            'q_position': results['Position']
        })
    else:
        qualifying_df = pd.DataFrame(
            columns=['raceId', 'driver_code', 'q1_time', 'q2_time', 'q3_time', 'q_position'],
            data={
                'raceId': race['raceId'],
                'driver_code': [],
                'q1_time': [],
                'q2_time': [],
                'q3_time': [],
                'q_position': []
            }
        )
    return qualifying_df

In [54]:

# Initialize lists to store session data
all_fp1_data = []
all_fp2_data = []
all_fp3_data = []
all_qualifying_data = []

# Get list of completed batches
batch_files = glob('../data/processed/fastf1_batches/batch_*.csv')
completed_batches = [int(os.path.basename(f).split('_')[1].split('.')[0]) for f in batch_files]
completed_batches.sort()

# Determine expected and missing batches
expected_batches = list(range(1, (len(races) + 4) // 5 + 1))
missing_batches = [b for b in expected_batches if b not in completed_batches]

# Process missing batches
batch_size = 5
for batch_num in tqdm(missing_batches, initial=len(completed_batches), total=len(expected_batches)):
    # Calculate the starting index for the current batch
    i = (batch_num - 1) * batch_size
    # Get the races for the current batch
    batch_races = races[i:i + batch_size]
    for race in batch_races:
        # Process FP1 session if available, otherwise generate an empty DataFrame
        if race['fp1_session']:
            all_fp1_data.append(process_session(race['fp1_session'], race['raceId'], 'fp1'))
        else:
            all_fp1_data.append(generate_empty_session_df('fp1', race['raceId']))
        # Process FP2 session if available, otherwise generate an empty DataFrame
        if race['fp2_session']:
            all_fp2_data.append(process_session(race['fp2_session'], race['raceId'], 'fp2'))
        else:
            all_fp2_data.append(generate_empty_session_df('fp2', race['raceId']))
        # Process FP3 session if available, otherwise generate an empty DataFrame
        if race['fp3_session']:
            all_fp3_data.append(process_session(race['fp3_session'], race['raceId'], 'fp3'))
        else:
            all_fp3_data.append(generate_empty_session_df('fp3', race['raceId']))
        # Process qualifying session if available, otherwise generate an empty DataFrame
        if race['qualifying']:
            race['qualifying'].load(laps=True, telemetry=False, weather=False, messages=False)
            results = race['qualifying'].results
            if not results.empty:
                convert_timedelta_columns_to_millis(results, ['Q1', 'Q2', 'Q3'], in_place=True)

                qualifying_df = pd.DataFrame({
                    'raceId': race['raceId'],
                    'driver_code': results['Abbreviation'],
                    'q1_time': results['Q1'],
                    'q2_time': results['Q2'],
                    'q3_time': results['Q3'],
                    'q_position': results['Position']
                })
                all_qualifying_data.append(qualifying_df)
            else:
                all_qualifying_data.append(pd.DataFrame(
                    columns=['raceId', 'driver_code', 'q1_time', 'q2_time', 'q3_time', 'q_position'],
                    data={
                        'raceId': race['raceId'],
                        'driver_code': [],
                        'q1_time': [],
                        'q2_time': [],
                        'q3_time': [],
                        'q_position': []
                    }
                ))
        else:
            all_qualifying_data.append(generate_empty_session_df('q', race['raceId']))

    # Concatenate all FP1 DataFrames
    all_fp1_df = pd.concat(all_fp1_data, ignore_index=True)
    if debug:
        all_fp1_df.to_csv(f'../data/processed/fastf1_batches/debug/fp1_{i//batch_size + 1}.csv')
    # Concatenate all FP2 DataFrames
    all_fp2_df = pd.concat(all_fp2_data, ignore_index=True)
    if debug:
        all_fp2_df.to_csv(f'../data/processed/fastf1_batches/debug/fp2_{i//batch_size + 1}.csv')
    # Concatenate all FP3 DataFrames
    all_fp3_df = pd.concat(all_fp3_data, ignore_index=True)
    if debug:
        all_fp3_df.to_csv(f'../data/processed/fastf1_batches/debug/fp3_{i//batch_size + 1}.csv')
    # Concatenate all qualifying DataFrames
    all_qualifying_df = pd.concat(all_qualifying_data, ignore_index=True)
    if debug:
        all_qualifying_df.to_csv(f'../data/processed/fastf1_batches/debug/q_{i//batch_size + 1}.csv')

    # Merge all session DataFrames on 'raceId' and 'driver_code'
    session_df = pd.merge(all_qualifying_df, all_fp1_df, on=['raceId', 'driver_code'], how='left')
    session_df = pd.merge(session_df, all_fp2_df, on=['raceId', 'driver_code'], how='left')
    session_df = pd.merge(session_df, all_fp3_df, on=['raceId', 'driver_code'], how='left')

    # Save the merged DataFrame to a CSV file
    batch_file = f'../data/processed/fastf1_batches/batch_{i//batch_size + 1}.csv'
    session_df.to_csv(batch_file, index=False)

    # Clear the lists for the next batch
    all_fp1_data.clear()
    all_fp2_data.clear()
    all_fp3_data.clear()
    all_qualifying_data.clear()

100%|##########| 30/30 [00:00<?, ?it/s]

In [61]:
batch_files_path = '../data/processed/fastf1_batches/batch_*.csv'
combined_file_path = '../data/processed/export_2018_v1.csv'
full_file_path = '../data/processed/export_2018_full_v1.csv'

original_df = original_df[original_df['year'] >= 2018]


# Get a list of all batch files
batch_files = glob(batch_files_path)

# Initialize an empty list to store DataFrames
batch_dfs = []

# Load each batch file and append to the list
for batch_file in batch_files:
    batch_df = pd.read_csv(batch_file)
    batch_dfs.append(batch_df)

# Concatenate all batch DataFrames
combined_batch_df = pd.concat(batch_dfs, ignore_index=True)
combined_batch_df.rename({'Driver': 'driver_code'}, axis=1, inplace=True)

# Merge the combined batch DataFrame with the original DataFrame
full_df = pd.merge(original_df, combined_batch_df, on=['raceId', 'driver_code'], how='left')

# Save the combined DataFrame to a CSV file
full_df.to_csv(full_file_path, index=False)

combined_batch_df.to_csv(combined_file_path, index=False)
print(f"Combined file saved to {combined_file_path}")

Combined file saved to ../data/processed/export_2018_v1.csv


In [62]:
combined_batch_df.head()

Unnamed: 0,raceId,driver,q1_time,q2_time,q3_time,q_position,fp1_avg_sector_1,fp1_avg_sector_2,fp1_avg_lap_time,fp1_avg_speedI1,...,fp3_avg_sector_1,fp3_avg_sector_2,fp3_avg_lap_time,fp3_avg_speedI1,fp3_avg_speedI2,fp3_avg_speedFL,fp3_avg_speedST,fp3_avg_tyre_life,fp3_avg_is_on_fresh_tyres,driver_code
0,1024,LEC,98014.0,96650.0,96217.0,1.0,28254.8,40635.2,104875.2,301.8,...,26996.25,38197.75,99164.75,313.0,283.25,261.5,302.0,3.25,1.0,
1,1024,HAM,97565.0,96933.0,96408.0,2.0,28423.333333,40330.111111,104393.555556,294.444444,...,27249.25,38362.75,99744.5,307.25,277.0,258.25,299.5,4.0,1.0,
2,1024,VET,98374.0,96720.0,96437.0,3.0,28084.857143,40368.857143,104383.0,301.857143,...,27014.0,38519.75,99672.75,315.5,285.0,262.75,303.75,3.25,1.0,
3,1024,VER,98540.0,97089.0,96813.0,4.0,28259.555556,39885.888889,103713.0,296.0,...,27260.0,38034.0,99372.5,307.5,278.5,258.5,298.5,3.0,0.5,
4,1024,BOT,97317.0,97142.0,97146.0,5.0,27639.0,39344.5,101909.75,304.75,...,27351.5,38417.75,99924.5,307.0,278.25,257.5,286.5,3.25,1.0,


In [63]:
full_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,year,round,circuitId,race_name,...,fp2_avg_is_on_fresh_tyres,fp3_avg_sector_1,fp3_avg_sector_2,fp3_avg_lap_time,fp3_avg_speedI1,fp3_avg_speedI2,fp3_avg_speedFL,fp3_avg_speedST,fp3_avg_tyre_life,fp3_avg_is_on_fresh_tyres
0,23782,989,20,6,5,3,2018,1,1,Australian Grand Prix,...,,,,,,,,,,
1,23783,989,1,131,44,1,2018,1,1,Australian Grand Prix,...,,,,,,,,,,
2,23784,989,8,6,7,2,2018,1,1,Australian Grand Prix,...,,,,,,,,,,
3,23785,989,817,9,3,8,2018,1,1,Australian Grand Prix,...,,,,,,,,,,
4,23786,989,4,1,14,10,2018,1,1,Australian Grand Prix,...,,,,,,,,,,
