<a href="https://colab.research.google.com/github/danielmendesandrei/F1Analysis/blob/main/F1_Data_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
"""
Code cell to extract the desired data from the jolpica-f1 API:

https://github.com/jolpica/jolpica-f1/
https://api.jolpi.ca/ergast/


The dataset has approximately 8000 entries (as of June 2025)
and consists of data from the 2006 season onwards, since 2006 was the first season
to use a qualifying structure similar to the current one.

Each entry row has the fields:
  season, round, circuitId, driverId, constructorId,
  qualifyPosition, Q1, Q2, Q3, racePosition
Where Q1, Q2, Q3 are the driver's best lap times in each qualifying session,
  or None if the driver did not participate in that session.

TODO:
1. Combine helper functions so they take the desired API page and info to extract
  e.g. fetch_for_season(season: int, page: str, info: dict) -> list
2. Experiment with sleep timings to reduce runtime while still avoiding HTTP 429 Too Many Requests
"""

import requests
import pandas as pd
import time

BASE_URL = "https://api.jolpi.ca/ergast/f1"
PAGE_SIZE = 100

def fetch_all_qualifying_rows_for_season(season: int, *, timeout: float = 30.0) -> list:
    """
    Returns a list of all QualifyingResults entries for the desired season.

    The endpoint is paginated: pages of up to PAGE_SIZE rows are requested
    until the total reported by the API has been covered, pausing briefly
    between pages.

    Args:
        season: Season (year) to fetch, e.g. 2006.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole collection run.

    Returns:
        A list of dicts, one per QualifyingResults row, with fields:
          season, round, circuitId, driverId, constructorId,
          qualifyPosition, Q1, Q2, Q3
        Q1/Q2/Q3 are passed through as returned by the API and are None
        when the key is absent from the row.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If a request takes longer than `timeout`.
    """
    all_entries = []
    offset = 0
    url = f"{BASE_URL}/{season}/qualifying/"

    while True:
        # API call to url = {BASE_URL}/{season}/qualifying/?limit={PAGE_SIZE}&offset={offset}&format=json
        resp = requests.get(
            url,
            params={"limit": PAGE_SIZE, "offset": offset, "format": "json"},
            timeout=timeout,
        )
        resp.raise_for_status()

        # Extract the list of races and number of total entries for that season
        mr_data = resp.json()["MRData"]
        total_entries = int(mr_data["total"])
        races = mr_data["RaceTable"]["Races"]

        # Extract every nested QualifyingResults row
        for race in races:
            # Named round_number so the builtin round() is not shadowed
            round_number = int(race["round"])
            circuit_id = race["Circuit"]["circuitId"]
            for res in race.get("QualifyingResults", []):
                all_entries.append({
                    "season":           season,
                    "round":            round_number,
                    "circuitId":        circuit_id,

                    "driverId":         res["Driver"]["driverId"],
                    "constructorId":    res["Constructor"]["constructorId"],

                    "qualifyPosition":  int(res["position"]),
                    "Q1":               res.get("Q1"),
                    "Q2":               res.get("Q2"),
                    "Q3":               res.get("Q3"),
                })

        # Advance by the page size the server reports it applied, so rows are
        # not skipped if it caps the limit below PAGE_SIZE. Fall back to the
        # requested size when the field is missing or zero.
        page_size = int(mr_data.get("limit", PAGE_SIZE)) or PAGE_SIZE

        # If fetched >= total entries, stop
        if offset + page_size >= total_entries:
            break

        # Otherwise move to the next page, pausing to stay under the rate limit
        offset += page_size
        time.sleep(0.5)

    return all_entries

def fetch_all_result_rows_for_season(season: int, *, timeout: float = 30.0) -> list:
    """
    Returns a list of all Results entries for the desired season.

    The endpoint is paginated: pages of up to PAGE_SIZE rows are requested
    until the total reported by the API has been covered, pausing briefly
    between pages.

    Args:
        season: Season (year) to fetch, e.g. 2006.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole collection run.

    Returns:
        A list of dicts, one per Results row, with fields:
          season, round, circuitId, driverId, constructorId, racePosition

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If a request takes longer than `timeout`.
    """
    all_entries = []
    offset = 0
    url = f"{BASE_URL}/{season}/results/"

    while True:
        # API call to url = {BASE_URL}/{season}/results/?limit={PAGE_SIZE}&offset={offset}&format=json
        resp = requests.get(
            url,
            params={"limit": PAGE_SIZE, "offset": offset, "format": "json"},
            timeout=timeout,
        )
        resp.raise_for_status()

        # Extract the list of races and number of total entries for that season
        mr_data = resp.json()["MRData"]
        total_entries = int(mr_data["total"])
        races = mr_data["RaceTable"]["Races"]

        # Extract every nested Results row
        for race in races:
            # Named round_number so the builtin round() is not shadowed
            round_number = int(race["round"])
            circuit_id = race["Circuit"]["circuitId"]
            for res in race.get("Results", []):
                all_entries.append({
                    "season":         season,
                    "round":          round_number,
                    "circuitId":      circuit_id,

                    "driverId":       res["Driver"]["driverId"],
                    "constructorId":  res["Constructor"]["constructorId"],
                    "racePosition":   int(res["position"]),
                })

        # Advance by the page size the server reports it applied, so rows are
        # not skipped if it caps the limit below PAGE_SIZE. Fall back to the
        # requested size when the field is missing or zero.
        page_size = int(mr_data.get("limit", PAGE_SIZE)) or PAGE_SIZE

        # If fetched >= total entries, stop
        if offset + page_size >= total_entries:
            break

        # Otherwise move to the next page, pausing to stay under the rate limit
        offset += page_size
        time.sleep(0.5)

    return all_entries

# Main loop over the desired seasons.
# Both bounds are inclusive; the output file name and the final message are
# derived from them so the three can never drift apart.
FIRST_SEASON = 2006
LAST_SEASON = 2025

qualifying_rows = []
result_rows = []

for season in range(FIRST_SEASON, LAST_SEASON + 1):
    quals = fetch_all_qualifying_rows_for_season(season)
    qualifying_rows.extend(quals)

    results = fetch_all_result_rows_for_season(season)
    result_rows.extend(results)

    # Sleep timer between seasons to avoid HTTP 429 Too Many Requests.
    # No request follows the last season, so the wait is skipped there.
    if season < LAST_SEASON:
        time.sleep(60)

# Build DataFrames
df_qual = pd.DataFrame(qualifying_rows)
df_res = pd.DataFrame(result_rows)

# Merge on (season, round, driverId).
# Left join: every qualifying row is kept, and racePosition is left missing
# where no result row matches the same season/round/driver.
df_full = pd.merge(
    df_qual,
    df_res[["season", "round", "driverId", "racePosition"]],
    how="left",
    on=["season", "round", "driverId"],
)

# Re-order columns & sort
cols = [
    "season", "round", "circuitId", "driverId", "constructorId",
    "qualifyPosition", "Q1", "Q2", "Q3", "racePosition",
]
df_full = df_full[cols]
df_full = df_full.sort_values(by=["season", "round", "qualifyPosition"])

# Save data to CSV to avoid re-running it
df_full.to_csv(f"f1_{FIRST_SEASON}_{LAST_SEASON}.csv", index=False)
print(f"Saved all seasons ({FIRST_SEASON}–{LAST_SEASON}) to csv")

Saved all seasons (2006–2025) to csv
