# FastF1 Lap Dataset Builder

Use [FastF1](https://theoehrly.github.io/Fast-F1/) to download official timing data (laps, weather, race control, telemetry) and build a driver/lap-level dataset enriched with tyre compounds, DRS, weather, and safety car indicators.

## Requirements

* `pip install fastf1`
* Configure a cache directory. FastF1 caches the raw livetiming files, so reuse the cache to avoid repeated downloads.
* An internet connection (FastF1 talks to the official F1 API).

In [9]:
%pip install fastf1

Note: you may need to restart the kernel to use updated packages.


In [10]:
import fastf1 as ff1
import pandas as pd
from pathlib import Path

# Resolve the working directory for this notebook: use the current directory
# when the notebook file is found there, otherwise fall back to a 'models'
# subdirectory of it (created if missing).
NOTEBOOK_DIR = Path.cwd()
if not (NOTEBOOK_DIR / 'fastf1_lap_dataset.ipynb').exists():
    NOTEBOOK_DIR = NOTEBOOK_DIR / 'models'
NOTEBOOK_DIR.mkdir(parents=True, exist_ok=True)

# Keep the FastF1 cache inside that directory and hand FastF1 its absolute path.
cache_dir = NOTEBOOK_DIR / '.fastf1_cache'
cache_dir.mkdir(parents=True, exist_ok=True)
ff1.Cache.enable_cache(str(cache_dir.resolve()))

In [None]:
# First and last season to fetch; the collection loop iterates
# range(MIN_YEAR, MAX_YEAR + 1), so both bounds are inclusive.
# NOTE(review): FastF1 documents full timing data only from the 2018 season
# on; confirm that starting at 2012 is intended.
MIN_YEAR = 2012
MAX_YEAR = 2025  # Last season fetched (inclusive)
# Session names requested for every event of each season.
SESSION_TYPES = ['Race', 'Sprint']

# The finished dataset is written here as CSV; make sure the folder exists.
OUTPUT_PATH = NOTEBOOK_DIR / 'fastf1_lap_dataset.csv'
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

In [24]:
def collect_sessions(year):
    """Yield one FastF1 session object per wanted session of a season.

    Walks the event schedule of ``year`` and, for every event, requests one
    session for each name in ``SESSION_TYPES``. Any exception raised while
    requesting a session is printed and that session is skipped.

    Args:
        year: Championship season whose schedule is walked.

    Yields:
        The object returned by ``ff1.get_session``. Nothing is loaded here;
        callers are expected to load the session data themselves.
    """
    # Pre-season testing events have neither a race nor a sprint session, so
    # they are left out of the schedule instead of being requested by name.
    schedule = ff1.get_event_schedule(year, include_testing=False)
    for _, event in schedule.iterrows():
        for session_name in SESSION_TYPES:
            try:
                session = ff1.get_session(year, event['EventName'], session_name)
            except Exception as exc:
                print(f"Skipping {year} {event['EventName']} {session_name}: {exc}")
                continue
            yield session

In [25]:
def build_lap_frame(session):
    """Load a session and derive per-lap timing columns.

    The session is loaded with laps, weather and messages (telemetry is
    skipped). A copy of its laps is extended with:

    * ``LapTimeSeconds``: ``LapTime`` expressed in seconds.
    * ``GapToLeaderSeconds``: seconds between a row's ``Time`` and the
      smallest ``Time`` recorded for the same ``LapNumber``.
    * ``GapToAheadSeconds``: seconds to the previous row of the same
      ``LapNumber`` once rows are ordered by ``Time``; missing values
      (including the first row of each lap number) become 0.

    ``Compound`` is forward- then backward-filled within each
    (``Driver``, ``Stint``) group.

    Args:
        session: Object exposing ``load``, ``laps``, ``weather_data``,
            ``race_control_messages`` and ``track_status``.

    Returns:
        Tuple ``(laps, stint_info, weather, rc_messages, track_status)``.
        ``laps`` is sorted by ``LapNumber`` and ``Time``; ``stint_info`` is a
        column subset of it. The last three are copies of the session tables,
        or empty DataFrames when the session reports ``None``; a non-empty
        ``weather`` gains a ``Timestamp`` column holding ``Time`` in seconds.
    """
    session.load(laps=True, telemetry=False, weather=True, messages=True)

    def _copy_or_empty(frame):
        # A table the session does not provide is replaced by an empty frame.
        return pd.DataFrame() if frame is None else frame.copy()

    laps = session.laps.copy()
    laps['LapTimeSeconds'] = laps['LapTime'].dt.total_seconds()

    first_time_of_lap = laps.groupby('LapNumber')['Time'].transform('min')
    laps['GapToLeaderSeconds'] = (laps['Time'] - first_time_of_lap).dt.total_seconds()

    # The gap-to-ahead diff relies on rows being ordered by time within a lap.
    laps = laps.sort_values(['LapNumber', 'Time'])
    time_to_previous = laps.groupby('LapNumber')['Time'].diff()
    laps['GapToAheadSeconds'] = time_to_previous.dt.total_seconds().fillna(0)

    laps['Compound'] = (
        laps.groupby(['Driver', 'Stint'])['Compound']
        .transform(lambda compounds: compounds.ffill().bfill())
    )
    stint_info = laps[['Driver', 'Stint', 'LapNumber', 'Compound', 'TyreLife']]

    weather = _copy_or_empty(session.weather_data)
    if not weather.empty:
        weather['Timestamp'] = weather['Time'].dt.total_seconds()
    rc_messages = _copy_or_empty(session.race_control_messages)
    track_status = _copy_or_empty(session.track_status)
    return laps, stint_info, weather, rc_messages, track_status


In [26]:
def safety_car_flag(lap_start, lap_end, track_status):
    """Report whether a lap ran under a safety car or virtual safety car.

    ``track_status`` is read as a change log: a status remains in effect
    until a later row replaces it. A lap is therefore flagged when a matching
    status was set between ``lap_start`` and ``lap_end`` (both inclusive) or
    was still the most recent status when the lap started.

    Args:
        lap_start: Lap start in seconds, or ``None`` when unknown.
        lap_end: Lap end in seconds, or ``None`` when unknown.
        track_status: DataFrame with a ``Time`` column whose values provide
            ``total_seconds()`` and, optionally, a ``Message`` column; may be
            ``None`` or empty.

    Returns:
        Tuple ``(sc, vsc)`` of 0/1 integers. ``(0, 0)`` is returned when a lap
        bound or the track status data is missing.
    """
    if lap_start is None or lap_end is None:
        return 0, 0
    if track_status is None or track_status.empty:
        return 0, 0

    def _classify(message):
        # Virtual safety car is checked first: 'VSC...' contains 'SC' and
        # 'VIRTUAL SAFETY CAR' contains 'SAFETY', so testing for the full
        # safety car alone would also flag every virtual safety car period.
        is_vsc = 'VIRTUAL' in message or 'VSC' in message
        is_sc = not is_vsc and any(code in message for code in ('SAFETY', 'SC', 'CODE4'))
        return is_sc, is_vsc

    sc = 0
    vsc = 0
    carried_time = None  # time of the latest status set before the lap began
    carried_message = ''
    for _, row in track_status.iterrows():
        status_time = row['Time'].total_seconds()
        message = str(row.get('Message', '')).upper()
        if status_time < lap_start:
            # Remember the most recent earlier status; rows need not be sorted.
            if carried_time is None or status_time >= carried_time:
                carried_time = status_time
                carried_message = message
            continue
        if status_time <= lap_end:
            is_sc, is_vsc = _classify(message)
            if is_sc:
                sc = 1
            if is_vsc:
                vsc = 1

    # The status in effect when the lap started covers (part of) the lap too.
    if carried_time is not None:
        is_sc, is_vsc = _classify(carried_message)
        if is_sc:
            sc = 1
        if is_vsc:
            vsc = 1
    return sc, vsc


In [27]:
def drs_flag(lap_start, lap_end, rc_messages):
    """Return 1 when DRS counts as enabled during a lap, otherwise 0.

    Only rows whose ``Category`` equals ``'DRS'`` (case-insensitive) are
    considered. Each row's ``Time`` is turned into a number of seconds; rows
    with a missing or unconvertible time are ignored. A message containing
    ``ENABLE`` is an activation, one containing ``DISABLE`` a deactivation.

    The lap counts as DRS-enabled when there is an activation at or before
    ``lap_end`` and the first deactivation at or after the latest such
    activation does not fall before ``lap_start``.

    NOTE(review): FastF1 documents the race control ``Time`` column as a
    datetime. Such values take the ``timestamp()`` branch and yield epoch
    seconds, which would not be comparable with session-relative lap bounds.
    Confirm what the caller passes.

    Args:
        lap_start: Lap start in seconds, or ``None``.
        lap_end: Lap end in seconds, or ``None``.
        rc_messages: DataFrame of race control messages, or ``None``.

    Returns:
        ``1`` or ``0``; ``0`` when a lap bound or the messages are missing.
    """
    if lap_start is None or lap_end is None or rc_messages is None or rc_messages.empty:
        return 0

    def _as_seconds(stamp):
        # Returns None for anything that cannot be expressed in seconds.
        if stamp is None or pd.isna(stamp):
            return None
        if hasattr(stamp, 'total_seconds'):
            return stamp.total_seconds()
        if hasattr(stamp, 'timestamp'):
            return stamp.timestamp()
        try:
            return float(stamp)
        except (TypeError, ValueError):
            return None

    activations = []
    deactivations = []
    for _, entry in rc_messages.iterrows():
        if str(entry.get('Category', '')).upper() != 'DRS':
            continue
        seconds = _as_seconds(entry.get('Time'))
        if seconds is None:
            continue
        text = str(entry.get('Message', '')).upper()
        if 'ENABLE' in text:
            activations.append(seconds)
        elif 'DISABLE' in text:
            deactivations.append(seconds)

    # Activations after the lap finished cannot apply to it.
    relevant = [moment for moment in activations if moment <= lap_end]
    if not relevant:
        return 0
    latest_on = max(relevant)
    following_offs = [moment for moment in deactivations if moment >= latest_on]
    if following_offs and min(following_offs) < lap_start:
        return 0
    return 1

In [29]:
def slugify(value: str) -> str:
    """Turn a label into a lowercase, underscore-separated identifier.

    Runs of alphanumeric characters are lowercased and joined with single
    underscores; every other character only acts as a separator.

    Args:
        value: Text to convert.

    Returns:
        The slug, or an empty string when ``value`` is not a string or holds
        no alphanumeric characters.
    """
    if not isinstance(value, str):
        return ''
    words = []
    pending = []
    for char in value:
        if char.isalnum():
            pending.append(char.lower())
        elif pending:
            # A separator closes the word collected so far.
            words.append(''.join(pending))
            pending = []
    if pending:
        words.append(''.join(pending))
    return '_'.join(words)


def rainfall_indicator(value):
    """Interpret a rainfall reading as a yes/no/unknown flag.

    Strings are trimmed and lowercased first: ``no``/``dry``/``false`` mean
    no rain, ``yes``/``wet``/``rain``/``true`` mean rain, and any other text
    is parsed as a number. Numeric input counts as rain when it is greater
    than zero.

    Args:
        value: Raw rainfall value (bool, number, string or missing).

    Returns:
        ``True``, ``False``, or ``pd.NA`` when the value is missing, blank or
        cannot be interpreted.
    """
    if value is None:
        return pd.NA
    try:
        missing = bool(pd.isna(value))
    except TypeError:
        missing = False
    if missing:
        return pd.NA

    candidate = value
    if isinstance(value, str):
        candidate = value.strip().lower()
        if candidate == '':
            return pd.NA
        if candidate in {'no', 'dry', 'false'}:
            return False
        if candidate in {'yes', 'wet', 'rain', 'true'}:
            return True

    # Whatever is left must be numeric, or text holding a number.
    try:
        return float(candidate) > 0
    except (TypeError, ValueError):
        return pd.NA


def timedelta_to_seconds(value):
    """Convert a duration-like value to seconds.

    Args:
        value: Object with a ``total_seconds()`` method, something ``float``
            accepts, or a missing value.

    Returns:
        The result of ``value.total_seconds()`` when that method exists,
        otherwise ``float(value)``; ``None`` when the value is missing or
        cannot be converted.
    """
    if value is None:
        return None
    try:
        missing = bool(pd.isna(value))
    except TypeError:
        missing = False
    if missing:
        return None

    if not hasattr(value, 'total_seconds'):
        # Plain numbers and numeric strings are taken as seconds already.
        try:
            return float(value)
        except (TypeError, ValueError):
            return None
    return value.total_seconds()


# Build one output row per driver lap for every season and session requested.
rows = []
for year in range(MIN_YEAR, MAX_YEAR + 1):
    for session in collect_sessions(year):
        # These values are constant for the whole session, so read them once.
        event_name = session.event['EventName']
        print(f"Loading {event_name} {session.name}")
        try:
            laps, _, weather, rc_messages, track_status = build_lap_frame(session)
        except Exception as exc:
            print(f"Failed to load {event_name} {session.name}: {exc}")
            continue

        # Without laps there is nothing to emit, and the LapNumber maximum
        # used below as the total-laps fallback would be NaN and break int().
        if laps.empty:
            print(f"Failed to load {event_name} {session.name}: no lap data")
            continue

        # Attach each driver's grid position from the session results, when
        # the results table provides the required columns.
        driver_meta = getattr(session, 'results', None)
        has_driver_meta = isinstance(driver_meta, pd.DataFrame) and {'DriverNumber', 'GridPosition'}.issubset(driver_meta.columns)
        if has_driver_meta:
            driver_meta = driver_meta[['DriverNumber', 'GridPosition']].copy()
            # Compare driver numbers as strings on both sides of the merge.
            driver_meta['DriverNumber'] = driver_meta['DriverNumber'].astype(str)
            laps['DriverNumber'] = laps['DriverNumber'].astype(str)
            driver_meta = driver_meta.rename(columns={'GridPosition': 'GridPositionResult'})
            laps = laps.merge(driver_meta, on='DriverNumber', how='left')
            laps['grid_position'] = laps['GridPositionResult']
            laps = laps.drop(columns=['GridPositionResult'])
        else:
            laps['grid_position'] = pd.NA

        # Fallback tyre age: running lap count within each driver's stint.
        if 'StintLap' not in laps.columns:
            laps['StintLap'] = laps.groupby(['Driver', 'Stint']).cumcount() + 1

        total_laps = session.total_laps or int(laps['LapNumber'].max())
        session_year = session.date.year
        session_identifier = getattr(session, 'session_key', None)
        if not session_identifier:
            session_identifier = slugify(f"{session_year}_{event_name}_{session.name}")
        circuit_source = session.event.get('Location') or session.event.get('EventName') or ''
        circuit_slug = slugify(circuit_source)
        team_slug_cache = {}

        # Weather lookup tables are built once per session instead of sorting
        # the whole weather frame and slicing a one-row frame for every lap.
        has_weather = weather is not None and not weather.empty
        if has_weather:
            weather_seconds = weather['Timestamp'].to_numpy()
            weather_records = weather.to_dict('records')

        for _, lap in laps.iterrows():
            team_name = lap.get('Team')
            if team_name not in team_slug_cache:
                team_slug_cache[team_name] = slugify(team_name) if isinstance(team_name, str) else ''
            lap_start = timedelta_to_seconds(lap.get('LapStartTime'))
            lap_end = timedelta_to_seconds(lap.get('Time'))
            sc_flag, vsc_flag = safety_car_flag(lap_start, lap_end, track_status)
            drs_active = drs_flag(lap_start, lap_end, rc_messages)
            # Use the weather sample recorded closest to the end of the lap.
            weather_slice = {}
            if has_weather and lap_end is not None:
                nearest = abs(weather_seconds - lap_end).argmin()
                weather_slice = weather_records[nearest]
            rainfall_value = weather_slice.get('Rainfall', pd.NA)
            rows.append({
                'session_key': session_identifier,
                'race_name': event_name,
                'year': session_year,
                'session_name': session.name,
                'circuit_id': circuit_slug,
                'driver_id': lap['Driver'],
                'team_id': team_slug_cache.get(team_name, ''),
                'team_name': team_name,
                'lap_number': lap['LapNumber'],
                'total_race_laps': total_laps,
                'grid_position': lap.get('grid_position'),
                'current_position': lap.get('Position'),
                'lap_time_s': lap['LapTimeSeconds'],
                'gap_to_leader_s': lap['GapToLeaderSeconds'],
                'gap_to_ahead_s': lap['GapToAheadSeconds'],
                # Prefer the reported tyre life; fall back to the stint lap count.
                'laps_on_current_tyre': lap['TyreLife'] if pd.notna(lap['TyreLife']) else lap['StintLap'],
                'tyre_compound': lap.get('Compound'),
                'drs_enabled': drs_active,
                'safety_car_this_lap': sc_flag,
                'virtual_sc_this_lap': vsc_flag,
                'air_temperature': weather_slice.get('AirTemp'),
                'track_temperature': weather_slice.get('TrackTemp'),
                'humidity': weather_slice.get('Humidity'),
                'pressure': weather_slice.get('Pressure'),
                'rainfall': weather_slice.get('Rainfall'),
                'wind_speed': weather_slice.get('WindSpeed'),
                'wind_direction': weather_slice.get('WindDirection'),
                'has_rain': rainfall_indicator(rainfall_value),
            })

# Turn the collected row dictionaries into a single frame; stop early when
# nothing was collected.
fastf1_df = pd.DataFrame(rows)
if fastf1_df.empty:
    raise ValueError('No laps collected. Adjust MIN_YEAR/MAX_YEAR or SESSION_TYPES.')

# Count-like columns become nullable integers; values that cannot be parsed
# as numbers are coerced to missing.
integer_columns = ['lap_number', 'total_race_laps', 'grid_position', 'current_position', 'laps_on_current_tyre']
for col in integer_columns:
    fastf1_df[col] = pd.to_numeric(fastf1_df[col], errors='coerce').astype('Int64')

# Fill remaining compound gaps from the neighbouring rows of the same driver
# within the same session (forward first, then backward).
fastf1_df['tyre_compound'] = fastf1_df.groupby(['session_key', 'driver_id'])['tyre_compound'].transform(lambda s: s.ffill().bfill())

# Flag columns use the nullable boolean dtype.
boolean_columns = ['drs_enabled', 'safety_car_this_lap', 'virtual_sc_this_lap', 'has_rain']
for col in boolean_columns:
    fastf1_df[col] = fastf1_df[col].astype('boolean')

fastf1_df = fastf1_df.sort_values(['year', 'session_key', 'driver_id', 'lap_number']).reset_index(drop=True)

# Keep only the first row for each session/driver/lap combination and report
# how many rows were dropped.
dedup_subset = ['session_key', 'driver_id', 'lap_number']
before_dedup = len(fastf1_df)
fastf1_df = fastf1_df.drop_duplicates(subset=dedup_subset, keep='first')
after_dedup = len(fastf1_df)
if after_dedup != before_dedup:
    print(f'Removed {before_dedup - after_dedup} duplicate driver/lap rows')
fastf1_df = fastf1_df.reset_index(drop=True)

# Output column order: the base columns always come first, followed by the
# extra columns that are actually present in the frame.
base_columns = [
    'driver_id',
    'team_id',
    'circuit_id',
    'total_race_laps',
    'year',
    'session_name',
    'grid_position',
    'current_position',
    'gap_to_leader_s',
    'gap_to_ahead_s',
    'lap_time_s',
    'laps_on_current_tyre',
    'tyre_compound',
    'safety_car_this_lap',
    'lap_number',
    'drs_enabled',
    'track_temperature',
    'air_temperature',
    'has_rain',
]
extra_columns = [
    'session_key',
    'race_name',
    'team_name',
    'virtual_sc_this_lap',
    'humidity',
    'pressure',
    'rainfall',
    'wind_speed',
    'wind_direction',
]
ordered_columns = base_columns + [col for col in extra_columns if col in fastf1_df.columns]
fastf1_df = fastf1_df[ordered_columns]

# Write the dataset to disk without the index, then preview the first rows.
fastf1_df.to_csv(OUTPUT_PATH, index=False)
fastf1_df.head()


KeyboardInterrupt: 