In [1]:
import fastf1
import pandas as pd
from fastf1.req import RateLimitExceededError
import time
import logging
import concurrent.futures
from threading import Lock
import os

def process_year(year):
    """Process an entire year of F1 data"""
    print(f"\nStarting to process year {year}")
    year_lap_dfs = []
    year_weather_dfs = []
    
    try:
        schedule = fastf1.get_event_schedule(year)
        
        for event in schedule.itertuples():
            print(f"\nProcessing event: {event.EventName} ({year})")
            
            # Get all session types based on event format
            if event.EventFormat == 'conventional':
                sessions = ['FP1', 'FP2', 'FP3', 'Q', 'R']
            elif event.EventFormat == 'sprint':
                sessions = ['FP1', 'Q', 'FP2', 'S', 'R']
            elif event.EventFormat == 'sprint_shootout':
                sessions = ['FP1', 'Q', 'SS', 'S', 'R']
            elif event.EventFormat == 'sprint_qualifying':
                sessions = ['FP1', 'SQ', 'S', 'Q', 'R']
            else:
                sessions = ['FP1', 'FP2', 'FP3']
            
            for session_name in sessions:
                while True:  # Keep trying until success or non-rate-limit error
                    try:
                        session = fastf1.get_session(year, event.RoundNumber, session_name)
                        session.load()
                        
                        # Get the laps data
                        laps_df = session.laps.copy()
                        laps_df['Year'] = year
                        laps_df['EventName'] = event.EventName
                        laps_df['SessionName'] = session_name
                        laps_df['EventFormat'] = event.EventFormat
                        laps_df['RoundNumber'] = event.RoundNumber
                        
                        # Get weather data
                        weather_df = session.weather_data.copy()
                        weather_df['Year'] = year
                        weather_df['EventName'] = event.EventName
                        weather_df['SessionName'] = session_name
                        weather_df['EventFormat'] = event.EventFormat
                        weather_df['RoundNumber'] = event.RoundNumber
                        
                        year_lap_dfs.append(laps_df)
                        year_weather_dfs.append(weather_df)
                        print(f"Completed {year} - {event.EventName} - {session_name}")
                        break  # Success, exit the while loop
                        
                    except RateLimitExceededError:
                        print(f"\nRate limit hit on {session_name} for {event.EventName} in {year}")
                        print("Waiting 60 seconds...")
                        time.sleep(60)  # Wait a minute and retry
                        continue
                        
                    except Exception as e:
                        print(f"Error processing {session_name} for {event.EventName} in {year}: {e}")
                        break  # Non-rate-limit error, skip this session
        
        # Save year data
        if year_lap_dfs and year_weather_dfs:
            year_laps = pd.concat(year_lap_dfs, ignore_index=True)
            year_weather = pd.concat(year_weather_dfs, ignore_index=True)
            
            # Create data directory if it doesn't exist
            os.makedirs('data', exist_ok=True)
            
            year_laps.to_csv(f'data/lap_data_{year}.csv', index=False)
            year_weather.to_csv(f'data/weather_data_{year}.csv', index=False)
            print(f"\nYear {year} data saved!")
            
            return {'year': year, 'laps': year_laps, 'weather': year_weather}
            
    except Exception as e:
        print(f"Error processing year {year}: {e}")
        return None

def collect_f1_data(start_year=2018, end_year=2024, max_workers=5):
    # Enable cache
    fastf1.Cache.enable_cache('cache')
    fastf1.set_log_level(logging.DEBUG)
    
    years = list(range(start_year, end_year))
    all_results = []
    
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all years to be processed in parallel
            future_to_year = {executor.submit(process_year, year): year for year in years}
            
            # Process completed years as they finish
            for future in concurrent.futures.as_completed(future_to_year):
                year = future_to_year[future]
                try:
                    result = future.result()
                    if result is not None:
                        all_results.append(result)
                        print(f"Year {year} completely processed!")
                except Exception as e:
                    print(f"Year {year} failed: {e}")
    
    except KeyboardInterrupt:
        print("\nScript interrupted by user. Saving progress...")
    
    finally:
        if all_results:
            # Combine all years
            all_lap_dfs = [r['laps'] for r in all_results]
            all_weather_dfs = [r['weather'] for r in all_results]
            
            all_lap_data = pd.concat(all_lap_dfs, ignore_index=True)
            all_weather_data = pd.concat(all_weather_dfs, ignore_index=True)
            
            # Save final combined data
            all_lap_data.to_csv('all_lap_data_final.csv', index=False)
            all_weather_data.to_csv('all_weather_data_final.csv', index=False)
            
            print("\nData collection complete. Data saved to:")
            print("- 'all_lap_data_final.csv'")
            print("- 'all_weather_data_final.csv'")
            return {'laps': all_lap_data, 'weather': all_weather_data}
        else:
            print("No data was collected!")
            return None

# Run the collection
if __name__ == "__main__":
    # Process up to 3 years simultaneously
    data = collect_f1_data(start_year=2019, end_year=2024, max_workers=3)

core           INFO 	Loading data for Bahrain Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for session_info
core           INFO 	Loading data for Austrian Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info



Starting to process year 2019
Starting to process year 2020


Starting to process year 2021

Processing event: Pre-Season Test (2021)

Processing event: Pre-Season Test 1 (2020)
Error processing FP1 for Pre-Season Test 1 in 2020: Cannot get testing event by round number!
Error processing FP1 for Pre-Season Test in 2021: Cannot get testing event by round number!
Error processing FP2 for Pre-Season Test 1 in 2020: Cannot get testing event by round number!
Error processing FP2 for Pre-Season Test in 2021: Cannot get testing event by round number!
Error processing FP3 for Pre-Season Test 1 in 2020: Cannot get testing event by round number!

Processing event: Pre-Season Test 2 (2020)
Error processing FP3 for Pre-Season Test in 2021: Cannot get testing event by round number!

Processing event: Bahrain Grand Prix (2021)
Error processing FP1 for Pre-Season Test 2 in 2020: Cannot get testing event by round number!
Error processing FP2 for Pre-Season Test 2 in 2020: Cannot get testing event by 

logger        DEBUG 	Traceback for failure in FastF1 schedule
Traceback (most recent call last):
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 976, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror


Processing event: Australian Grand Prix (2019)


Request for URL https://ergast.com/api/f1/2021/1/results.json failed; using cached response
Traceback (most recent call last):
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 976, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Completed 2021 - Bahrain Grand Prix - FP1
Completed 2020 - Austrian Grand Prix - FP1


core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
structure      DEBUG 	Failed to parse timestamp '66.004' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '67.100' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '85.692' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '86.713' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '88.864' in Ergastresponse.
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req     

Completed 2021 - Bahrain Grand Prix - FP2
Completed 2019 - Australian Grand Prix - FP1


req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '11', '16', '18', '20', '23', '26', '3', '31', '33', '4', '44', '5', '55', '6', '63', '7', '77', '8', '99']
core           INFO 	Loading data for Austrian Grand Prix - Practice 3 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Completed 2020 - Austrian Grand Prix - FP2


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '11', '14', '16', '18', '22', '3', '31', '33', '4', '44', '47', '5', '55', '6', '63', '7', '77', '9', '99']
core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '11', '16', '18', '20', '23', '26', '27', '3', '33', 

Completed 2021 - Bahrain Grand Prix - FP3
Completed 2019 - Australian Grand Prix - FP2


req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '11', '16', '18', '20', '23', '26', '3', '31', '33', '4', '44', '5', '55', '6', '63', '7', '77', '8', '99']
core           INFO 	Loading data for Austrian Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data


Completed 2020 - Austrian Grand Prix - FP3


req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '11', '16', '18', '20', '23', '26', '27', '3', '33', '4', '44', '5', '55', '63', '7', '77', '8', '88', '99']
core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Completed 2019 - Australian Grand Prix - FP3


Request for URL https://ergast.com/api/f1/2021/1/qualifying.json failed; using cached response
Traceback (most recent call last):
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 507, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/http/client.py", line 1428, in getresponse
    response.begin()
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = self._read

Completed 2021 - Bahrain Grand Prix - Q


structure      DEBUG 	Failed to parse timestamp '66.004' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '67.100' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '85.692' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '86.713' in Ergastresponse.
structure      DEBUG 	Failed to parse timestamp '88.864' in Ergastresponse.
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['77', '44', '33', '4', '23', '11',

Completed 2020 - Austrian Grand Prix - Q
Completed 2019 - Australian Grand Prix - Q


req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
Request for URL https://ergast.com/api/f1/2021/1/laps/1.json failed; using cached response
Traceback (most recent call last):
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/soc

Completed 2021 - Bahrain Grand Prix - R

Processing event: Emilia Romagna Grand Prix (2021)


Request for URL https://ergast.com/api/f1/2019/1/laps/1.json failed; using cached response
Traceback (most recent call last):
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551659/Library/Caches/pypoetry/virtualenvs/ie500-data-mining-group7-LKR-OXJO-py3.12/lib/python3.12/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 976, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Completed 2019 - Australian Grand Prix - R

Processing event: Bahrain Grand Prix (2019)


core           INFO 	Finished loading data for 20 drivers: ['77', '16', '4', '44', '55', '11', '10', '31', '99', '5', '6', '26', '23', '7', '63', '8', '20', '18', '3', '33']
core           INFO 	Loading data for Styrian Grand Prix - Practice 1 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Completed 2020 - Austrian Grand Prix - R

Processing event: Styrian Grand Prix (2020)
