In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import certifi
import pandas as pd
import concurrent.futures

# Request header carrying a desktop Chrome User-Agent string; sent with every request below
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}
base_url = "https://www.formula1.com"

# Single Session object shared by all scraping functions
session = requests.Session()

# Seasons to scrape: 1950 through the current calendar year, inclusive
current_year = datetime.now().year
years = list(range(1950, current_year + 1))

# Helper function to save tabular data as JSON Lines
def save_to_json(data, headers, filename):
    """Write rows to ``filename`` as JSON Lines and print the resulting table.

    Args:
        data: Row data accepted by ``pd.DataFrame`` (e.g. a list of lists).
        headers: Column names for the frame.
        filename: Destination path, used exactly as given (no extension is added).
    """
    frame = pd.DataFrame(data, columns=headers)
    # orient='records' + lines=True emits one JSON object per row, one row per line
    frame.to_json(filename, orient='records', lines=True)
    print(frame)

In [None]:

def scrape_races_year(year):
    """Scrape the season overview table for one year.

    Requests ``{base_url}/en/results/{year}/races`` and reads the first
    ``<table class="f1-table-with-data">`` found on the page.

    Args:
        year: Season to fetch; interpolated into the URL as-is.

    Returns:
        A ``(data, headers, race_links)`` tuple:
          * ``data`` - list of rows, each a list of stripped cell texts.
          * ``headers`` - list of column titles read from ``<thead>``.
          * ``race_links`` - list of ``(first_cell_text, url)`` pairs, one for
            every row whose first cell contains a link.
        All three are empty lists when the response status is not 200 or the
        table is missing, so callers can always unpack the result.
    """
    # Bind the results up front: previously they were only assigned inside the
    # success branch, so a failed request raised UnboundLocalError at `return`.
    data, headers, race_links = [], [], []

    url = f"{base_url}/en/results/{year}/races"
    response = session.get(url, headers=head, verify=certifi.where())
    if response.status_code != 200:
        return data, headers, race_links

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='f1-table-with-data')
    if table is None:
        return data, headers, race_links

    thead = table.find('thead')
    if thead is not None:
        headers = [th.text.strip() for th in thead.find_all('th')]

    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else []

    for row in rows:
        cols = row.find_all('td')
        if not cols:
            # A row without cells carries neither data nor a link.
            continue

        row_data = []
        for i, col in enumerate(cols):
            text = col.text.strip()
            if i == 2:
                # Third column: turn non-breaking spaces into plain spaces and
                # drop the trailing three characters
                # (presumably the driver's three-letter code - TODO confirm).
                text = text.replace("\xa0", " ")[:-3]
            row_data.append(text)
        data.append(row_data)

        # The first cell is expected to link to the race page; rows without
        # a link still contribute their data but no race_links entry.
        anchor = cols[0].find('a')
        if anchor is not None and anchor.get('href'):
            race_link = anchor['href']
            full_link = f"{base_url}/en/results/{year}/{race_link}"
            race_links.append((row_data[0], full_link))

    return data, headers, race_links

In [None]:
def scrape_race_location(race_url):
    """Scrape date, circuit and city from a race page header.

    Looks for ``<div class="max-tablet:flex-col flex gap-xs">`` and reads its
    ``<p>`` children: the first holds the date, the second a
    ``"<circuit>, <city>"`` string.

    Args:
        race_url: Full URL of the race page.

    Returns:
        A ``(race_date, circuit, city)`` tuple. Any element that cannot be
        determined (non-200 response, missing header section, missing
        paragraph, no ``", "`` separator) is ``None``.
    """
    # Defaults make the return safe on every path; previously these names
    # were unbound whenever the page or the header section was missing.
    race_date = circuit = city = None

    response = session.get(race_url, headers=head, verify=certifi.where())
    if response.status_code != 200:
        return race_date, circuit, city

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the header block that carries the date and track paragraphs.
    # NOTE(review): the lookup depends on this exact class string - confirm
    # it still matches the live page markup.
    header_section = soup.find('div', class_='max-tablet:flex-col flex gap-xs')
    if header_section is None:
        return race_date, circuit, city

    location_info = header_section.find_all('p')

    if location_info:
        # Drop the last five characters of the date text
        # (presumably a trailing " YYYY" - TODO confirm).
        race_date = location_info[0].text.strip()[:-5]

    if len(location_info) > 1:
        track = location_info[1].text.strip().split(", ")
        circuit = track[0]
        if len(track) > 1:
            city = track[1]

    return race_date, circuit, city
            

In [None]:
def scrape_race_results(race_link, type):
    """Scrape one result table (race result, qualifying, ...) for a race.

    The last path segment of ``race_link`` is replaced by ``type`` to build
    the page URL, then the first ``<table class="f1-table-with-data">`` on
    that page is read.

    Args:
        race_link: URL of any result page of the race; only the part before
            the final ``/`` is used.
        type: Result page to fetch, e.g. ``'qualifying'`` or ``'practice/1'``.
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        A ``(headers, data)`` tuple: the column titles and a list of rows,
        each a list of stripped cell texts. Both are empty lists when the
        response status is not 200 or the table is not found.
    """
    # Bind the results up front: previously they were only assigned inside the
    # success branch, so a missing page raised UnboundLocalError at `return`.
    headers, data = [], []

    # Local name chosen so the module-level `base_url` is not shadowed.
    results_root = race_link.rsplit('/', 1)[0]
    response = session.get(f'{results_root}/{type}', headers=head, verify=certifi.where())
    if response.status_code != 200:
        return headers, data

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the results table
    table = soup.find('table', class_='f1-table-with-data')
    if table is None:
        return headers, data

    thead = table.find('thead')
    if thead is not None:
        headers = [th.text.strip() for th in thead.find_all('th')]

    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else []

    for row in rows:
        row_data = []
        for i, col in enumerate(row.find_all('td')):
            text = col.text.strip()
            if i == 2:
                # Third column: turn non-breaking spaces into plain spaces and
                # drop the trailing three characters
                # (presumably the driver's three-letter code - TODO confirm).
                text = text.replace("\xa0", " ")[:-3]
            row_data.append(text)
        data.append(row_data)

    return headers, data

In [22]:
# Season overview rows plus their column titles (taken from the first season that yields any)
headers_race = []
races = []

headers_race_location = ['Grand Prix', 'Circuit', 'Country/City', 'Year', 'Date']
race_location = []

race_result_type = [
    'race-result', 'fastest-laps', 'pit-stop-summary', 'starting-grid',
    'qualifying', 'practice/3', 'practice/2', 'practice/1',
]

# Walk every season, gathering the overview rows and one (grand prix, url) pair per race
all_race_links = []
for year in years:
    race, header_race, race_links = scrape_races_year(year)
    races += race
    all_race_links += [(entry[0], entry[1]) for entry in race_links]

    if not headers_race:
        headers_race = header_race

# One (url, result type) task for every race / result-type combination
all_result_tasks = [
    (race_link[1], result_type)
    for race_link in all_race_links
    for result_type in race_result_type
]

In [30]:
# Process functions in parallel to get data
def process_race_location(race_link_tuple):
    """Build one race-location row from a ``(grand_prix, url)`` pair.

    The season is read from the URL path segment that follows ``/results/``.
    Any exception raised while scraping is printed and converted to ``None``.

    Returns:
        ``[grand_prix, circuit, city, year, race_date]``, or ``None`` on error.
    """
    grand_prix, url = race_link_tuple
    year = url.split('/results/', 1)[1].split('/', 1)[0]

    try:
        location = scrape_race_location(url)
        race_date, circuit, city = location
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

    return [grand_prix, circuit, city, year, race_date]
    
def process_race_results(args):
    """Scrape one result table and tag every row with its race.

    Args:
        args: ``((grand_prix, url), result_type)``.

    Returns:
        ``(header, rows, file_name)`` where ``header`` and every row are
        prefixed with the grand prix name and the season (read from the URL
        segment after ``/results/``), and ``file_name`` is ``result_type``
        with ``/`` and ``-`` replaced by ``_``. Returns ``None`` when the
        scrape yields no header or no rows, or when it raises (the error is
        printed).
    """
    race_link_tuple, result_type = args
    grand_prix, url = race_link_tuple
    year = url.split('/results/', 1)[1].split('/', 1)[0]

    try:
        header_result, data_result = scrape_race_results(url, result_type)

        if not (header_result and data_result):
            print(f"No data found for {url}, {result_type}")
            return None

        prefix = [grand_prix, year]
        new_header = ["Grand Prix", "Year"] + header_result
        new_data = [prefix + row for row in data_result]
        file_name = result_type.replace("/", "_").replace("-", "_")
        return new_header, new_data, file_name
    except Exception as e:
        print(f"Error processing {url}, {result_type}: {e}")
        return None

In [None]:
# Fan the per-race scraping out over a thread pool
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
    # One location task per (grand prix, url) pair
    location_futures = {executor.submit(process_race_location, link): link for link in all_race_links}

    # One result task per race and result type. The wrapper
    # process_race_results is submitted (not scrape_race_results directly)
    # because it returns the (header, rows, file_name) triple consumed below,
    # tags each row with grand prix and year, and turns errors into None.
    result_futures = {
        executor.submit(process_race_results, (link, result_type)): (link, result_type)
        for link in all_race_links
        for result_type in race_result_type
    }

    # Collect location rows as they complete
    race_location = []
    for future in concurrent.futures.as_completed(location_futures):
        result = future.result()
        if result is not None:
            race_location.append(result)

    # Merge result rows per result type. Rows of every race are accumulated
    # (previously each race overwrote the one before it). Tables may not share
    # the same columns, so rows are aligned to the union of all column titles
    # seen so far, with None for absent columns.
    # Assumes column titles are unique within a single table.
    race_result = {}
    for future in concurrent.futures.as_completed(result_futures):
        result = future.result()
        if result is None:
            continue

        header, rows, file_name = result
        bucket = race_result.setdefault(file_name, {"header": [], "data": []})
        columns = bucket["header"]

        # Widen the merged header, back-filling rows that were stored earlier
        for column in header:
            if column not in columns:
                columns.append(column)
                for stored_row in bucket["data"]:
                    stored_row.append(None)

        position = {column: idx for idx, column in enumerate(header)}
        for row in rows:
            bucket["data"].append([
                row[position[column]]
                if column in position and position[column] < len(row)
                else None
                for column in columns
            ])

In [None]:
# Write one JSON Lines file per result type, named after the result type
for file_stem, table in race_result.items():
    save_to_json(table['data'], table['header'], file_stem)

# Write the season overview and the race location tables
for rows, columns, target in (
    (races, headers_race, "races"),
    (race_location, headers_race_location, "race_location"),
):
    save_to_json(rows, columns, target)

NameError: name 'race_result' is not defined

In [None]:
# Spot check: fetch the qualifying table of a single race.
# scrape_race_results swaps the last URL segment for the requested result type.
sample_race_url = "https://www.formula1.com/en/results/2025/races/1254/australia/race-result"
print(scrape_race_results(sample_race_url, "qualifying"))

(['Pos', 'No', 'Driver', 'Car', 'Q1', 'Q2', 'Q3', 'Laps'], [['1', '4', 'Lando Norris', 'McLaren Mercedes', '1:15.912', '1:15.415', '1:15.096', '20'], ['2', '81', 'Oscar Piastri', 'McLaren Mercedes', '1:16.062', '1:15.468', '1:15.180', '18'], ['3', '1', 'Max Verstappen', 'Red Bull Racing Honda RBPT', '1:16.018', '1:15.565', '1:15.481', '17'], ['4', '63', 'George Russell', 'Mercedes', '1:15.971', '1:15.798', '1:15.546', '21'], ['5', '22', 'Yuki Tsunoda', 'Racing Bulls Honda RBPT', '1:16.225', '1:16.009', '1:15.670', '18'], ['6', '23', 'Alexander Albon', 'Williams Mercedes', '1:16.245', '1:16.017', '1:15.737', '21'], ['7', '16', 'Charles Leclerc', 'Ferrari', '1:16.029', '1:15.827', '1:15.755', '20'], ['8', '44', 'Lewis Hamilton', 'Ferrari', '1:16.213', '1:15.919', '1:15.973', '23'], ['9', '10', 'Pierre Gasly', 'Alpine Renault', '1:16.328', '1:16.112', '1:15.980', '21'], ['10', '55', 'Carlos Sainz', 'Williams Mercedes', '1:16.360', '1:15.931', '1:16.062', '21'], ['11', '6', 'Isack Hadjar',