In [1]:
import fastf1
import pandas as pd
from fastf1.req import RateLimitExceededError
import time
import logging
import concurrent.futures
from threading import Lock
import os

In [1]:


# Session identifiers requested for each event format found in the schedule.
_SESSIONS_BY_FORMAT = {
    'conventional': ('FP1', 'FP2', 'FP3', 'Q', 'R'),
    'sprint': ('FP1', 'Q', 'FP2', 'S', 'R'),
    'sprint_shootout': ('FP1', 'Q', 'SS', 'S', 'R'),
    'sprint_qualifying': ('FP1', 'SQ', 'S', 'Q', 'R'),
}
# Used for any event format not listed above.
# NOTE(review): pre-season testing events presumably end up here — confirm
# that fastf1.get_session can actually resolve them by round number.
_FALLBACK_SESSIONS = ('FP1', 'FP2', 'FP3')


def _tag_session_frame(frame, year, event, session_name):
    """Return a copy of *frame* with the year/event/session columns added."""
    tagged = frame.copy()
    tagged['Year'] = year
    tagged['EventName'] = event.EventName
    tagged['SessionName'] = session_name
    tagged['EventFormat'] = event.EventFormat
    tagged['RoundNumber'] = event.RoundNumber
    return tagged


def _load_session_frames(year, event, session_name, rate_limit_wait,
                         max_rate_limit_retries):
    """Load one session and return ``(laps, weather)``, or None if skipped.

    A rate-limit error triggers a sleep of *rate_limit_wait* seconds and a
    retry; any other error is printed and the session is skipped.
    """
    retries = 0
    while True:
        try:
            session = fastf1.get_session(year, event.RoundNumber, session_name)
            session.load()
            laps_df = _tag_session_frame(session.laps, year, event, session_name)
            weather_df = _tag_session_frame(
                session.weather_data, year, event, session_name
            )
            return laps_df, weather_df

        except RateLimitExceededError:
            print(f"\nRate limit hit on {session_name} for {event.EventName} in {year}")
            # With the default (None) the loop retries indefinitely.
            if max_rate_limit_retries is not None and retries >= max_rate_limit_retries:
                print(f"Giving up on {session_name} for {event.EventName} "
                      f"in {year} after {retries} retries")
                return None
            retries += 1
            print(f"Waiting {rate_limit_wait} seconds...")
            time.sleep(rate_limit_wait)

        except Exception as e:
            # Non-rate-limit error: report it and skip this session.
            print(f"Error processing {session_name} for {event.EventName} in {year}: {e}")
            return None


def process_year(year, rate_limit_wait=60, max_rate_limit_retries=None):
    """Collect lap and weather data for every session of one season.

    Each event in the season's schedule is walked, the sessions matching its
    format are loaded, and the resulting frames are tagged with Year,
    EventName, SessionName, EventFormat and RoundNumber. When at least one
    session was loaded, the concatenated data is written to
    ``data/lap_data_<year>.csv`` and ``data/weather_data_<year>.csv``.

    Args:
        year: Season to process.
        rate_limit_wait: Seconds to sleep after a rate-limit error before
            retrying the same session.
        max_rate_limit_retries: Maximum retries per session after rate-limit
            errors; ``None`` retries without limit.

    Returns:
        A dict with keys ``'year'``, ``'laps'`` and ``'weather'``, or ``None``
        when no session could be loaded or a year-level error occurred.
    """
    print(f"\nStarting to process year {year}")
    year_lap_dfs = []
    year_weather_dfs = []

    try:
        schedule = fastf1.get_event_schedule(year)

        for event in schedule.itertuples():
            print(f"\nProcessing event: {event.EventName} ({year})")
            sessions = _SESSIONS_BY_FORMAT.get(event.EventFormat, _FALLBACK_SESSIONS)

            for session_name in sessions:
                frames = _load_session_frames(
                    year, event, session_name,
                    rate_limit_wait, max_rate_limit_retries,
                )
                if frames is None:
                    continue
                laps_df, weather_df = frames
                year_lap_dfs.append(laps_df)
                year_weather_dfs.append(weather_df)
                print(f"Completed {year} - {event.EventName} - {session_name}")

        # Laps and weather are always appended together, so one check suffices
        # in practice; both are tested to mirror the saved outputs.
        if not (year_lap_dfs and year_weather_dfs):
            return None

        year_laps = pd.concat(year_lap_dfs, ignore_index=True)
        year_weather = pd.concat(year_weather_dfs, ignore_index=True)

        # Create data directory if it doesn't exist
        os.makedirs('data', exist_ok=True)

        year_laps.to_csv(f'data/lap_data_{year}.csv', index=False)
        year_weather.to_csv(f'data/weather_data_{year}.csv', index=False)
        print(f"\nYear {year} data saved!")

        return {'year': year, 'laps': year_laps, 'weather': year_weather}

    except Exception as e:
        # Worker boundary: report the failure and let the caller skip this year.
        print(f"Error processing year {year}: {e}")
        return None

def collect_f1_data(start_year=2018, end_year=2024, max_workers=5):
    """Collect several seasons in parallel and write the combined CSV files.

    One ``process_year`` task is submitted per season to a thread pool. The
    results of the seasons that completed are concatenated and written to
    ``all_lap_data_final.csv`` and ``all_weather_data_final.csv`` in the
    current directory. This also happens when the run is interrupted with
    Ctrl+C or fails part-way, so finished seasons are not lost.

    Args:
        start_year: First season to collect.
        end_year: Season at which to stop; exclusive, so the last season
            collected is ``end_year - 1``.
        max_workers: Maximum number of seasons processed concurrently.

    Returns:
        A dict with keys ``'laps'`` and ``'weather'`` holding the combined
        DataFrames, or ``None`` when no season produced data.
    """
    # Enable cache; enable_cache requires the directory to exist already.
    os.makedirs('cache', exist_ok=True)
    fastf1.Cache.enable_cache('cache')
    fastf1.set_log_level(logging.DEBUG)

    years = list(range(start_year, end_year))
    all_results = []
    combined = None

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all years to be processed in parallel
            future_to_year = {executor.submit(process_year, year): year for year in years}

            # Process completed years as they finish
            for future in concurrent.futures.as_completed(future_to_year):
                year = future_to_year[future]
                try:
                    result = future.result()
                    if result is not None:
                        all_results.append(result)
                        print(f"Year {year} completely processed!")
                except Exception as e:
                    print(f"Year {year} failed: {e}")

    except KeyboardInterrupt:
        print("\nScript interrupted by user. Saving progress...")

    finally:
        # Save whatever finished. There is deliberately no `return` in this
        # block: returning from `finally` would discard an exception that is
        # still propagating.
        if all_results:
            # Combine all years
            all_lap_data = pd.concat(
                [r['laps'] for r in all_results], ignore_index=True
            )
            all_weather_data = pd.concat(
                [r['weather'] for r in all_results], ignore_index=True
            )

            # Save final combined data
            all_lap_data.to_csv('all_lap_data_final.csv', index=False)
            all_weather_data.to_csv('all_weather_data_final.csv', index=False)

            print("\nData collection complete. Data saved to:")
            print("- 'all_lap_data_final.csv'")
            print("- 'all_weather_data_final.csv'")
            combined = {'laps': all_lap_data, 'weather': all_weather_data}
        else:
            print("No data was collected!")

    return combined

# Script entry point: collect seasons 2022 through 2024 (end_year is exclusive).
if __name__ == "__main__":
    # Three seasons are submitted, so at most three of the five workers are busy.
    data = collect_f1_data(2022, 2025, max_workers=5)

Completed 2022 - Saudi Arabian Grand Prix - Q


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['1', '55', '11', '4', '16', '81', '63', '22', '18', '14', '44', '23', '77', '20', '31', '27', '10', '3', '24']
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '14

Completed 2024 - Australian Grand Prix - Q
Completed 2023 - Australian Grand Prix - R

Processing event: Azerbaijan Grand Prix (2023)


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
req            INFO 	Using cached data for car_data
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '55', '11', '63', '31', '4', '10', '20', '44', '24', '27', '18', '23', '77', '14', '3', '6', '22', '47']
core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for position_data


Completed 2022 - Saudi Arabian Grand Prix - R

Processing event: Australian Grand Prix (2022)


req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '10', '11', '14', '16', '18', '2', '20', '21', '22', '23', '24', '27', '31', '4', '44', '55', '63', '77', '81']
core           INFO 	Loading data for Azerbaijan Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['55', '16', '4', '81', '11', '18', '22', '14', '27', '20', '23', '3', '10', '77', '24', '31', '63', '44', '1']
core           INFO 	Loading data for Japanese Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Usi

Completed 2023 - Azerbaijan Grand Prix - FP1
Completed 2024 - Australian Grand Prix - R

Processing event: Japanese Grand Prix (2024)


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 21 drivers: ['1', '10', '11', '14', '16', '18', '2', '20', '22', '23', '24', '27', '3', '31', '4', '40', '44', '55', '63', '77', '81']
core           INFO 	Loading data for Japanese Grand Prix - Practice 2 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data 

Completed 2024 - Japanese Grand Prix - FP1


req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '10', '11', '14', '16', '18', '2', '20', '22', '23', '24', '27', '3', '31', '4', '44', '55', '63', '77', '81']
core           INFO 	Loading data for Japanese Grand Prix - Practice 3 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Completed 2024 - Japanese Grand Prix - FP2


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '10', '11', '14', '16', '18', '2', '20', '22', '23', '24', '27', '3', '31', '4', '44', '55', '63', '77', '81']
core           INFO 	Loading data for Japanese Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Completed 2024 - Japanese Grand Prix - FP3


In [2]:
# Load the combined lap and weather CSVs (the file names match those written
# by collect_f1_data) plus a second lap file used for comparison below.
laps = pd.read_csv('all_lap_data_final.csv')
# NOTE(review): the cell output echoes this line, which looks like a pandas
# warning raised while reading the file (possibly a DtypeWarning about
# mixed-type columns) — confirm, and consider passing explicit dtypes.
old_laps = pd.read_csv('../../data/raw_data/ff1_laps.csv')
weather = pd.read_csv('all_weather_data_final.csv')

  old_laps = pd.read_csv('../../data/raw_data/ff1_laps.csv')


In [3]:
# Show the column index and the (rows, columns) shape of the new lap data.
print(laps.columns, laps.shape, sep="\n")

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'Year', 'EventName', 'SessionName',
       'EventFormat', 'RoundNumber'],
      dtype='object')
(150847, 36)


In [4]:
# Show the column index and the (rows, columns) shape of the comparison lap file.
print(old_laps.columns, old_laps.shape, sep="\n")

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'Year', 'EventName', 'SessionName'],
      dtype='object')
(281857, 34)
