In [155]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import certifi
import pandas as pd
import concurrent.futures

# Request header sent with every call; it presents the scraper as a desktop Chrome browser.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Site root that all results URLs are built on.
base_url = "https://www.formula1.com"

# Seasons to scrape: 1950 through the current calendar year, inclusive.
current_year = datetime.now().year
years = list(range(1950, current_year + 1))

# Helper function to save data to CSV
def save_to_csv(data, headers, filename):
    """Write tabular rows to a CSV file and echo the resulting table.

    Args:
        data: Row data accepted by ``pandas.DataFrame`` (callers in this
            notebook pass a list of row lists).
        headers: Column labels, one per column of ``data``.
        filename: Destination path handed to ``DataFrame.to_csv``.
    """
    frame = pd.DataFrame(data=data, columns=headers)
    # index=False keeps the positional row index out of the file.
    frame.to_csv(filename, index=False)
    print(frame)

In [156]:

def scrape_races_year(year):
    """Scrape the race results overview table for one season.

    Args:
        year: Season to fetch; interpolated into the results URL.

    Returns:
        A ``(data, headers, race_links)`` tuple. ``data`` is a list of rows
        (each a list of cell texts), ``headers`` is the list of column
        titles, and ``race_links`` is a list of ``(first_cell_text, url)``
        pairs, one per row whose first cell contains a link. All three are
        empty lists when the request does not return HTTP 200 or the results
        table is not present in the page.
    """
    # Start from empty results so every exit path returns a well-formed
    # tuple; a failed request or a missing table used to raise
    # UnboundLocalError at the return statement.
    data = []
    headers = []
    race_links = []

    url = f"{base_url}/en/results/{year}/races"
    response = requests.get(url, headers=head, verify=certifi.where())
    if response.status_code != 200:
        return data, headers, race_links

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='f1-table-with-data')
    if table is None:
        return data, headers, race_links

    thead = table.find('thead')
    if thead is not None:
        headers = [header.text.strip() for header in thead.find_all('th')]

    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else []

    for row in rows:
        cols = row.find_all('td')
        if not cols:
            # A row without data cells has nothing to record or link to.
            continue

        row_data = []
        for i, col in enumerate(cols):
            text = col.text.strip()
            if i == 2:
                # Driver column: turn non-breaking spaces into plain spaces
                # and drop the last three characters (presumably the
                # three-letter driver code appended to the name -- confirm
                # against the live markup).
                text = text.replace("\xa0", " ")[:-3]
            row_data.append(text)

        # One entry per table row.
        data.append(row_data)

        # The first cell links to the individual race page; skip the link
        # (but keep the row) when the anchor or its href is missing.
        anchor = cols[0].find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            full_link = f"{base_url}/en/results/{year}/{href}"
            race_links.append((row_data[0], full_link))

    return data, headers, race_links

In [157]:
def scrape_race_location(race_url):
    """Fetch a race result page and extract its date and venue.

    Args:
        race_url: Absolute URL of the race result page.

    Returns:
        A ``(race_date, circuit, city)`` tuple of strings. Any element that
        cannot be extracted is ``None``; all three are ``None`` when the
        request does not return HTTP 200 or the header section is missing.
    """
    # Defaults for anything that is not found, so the return below never
    # hits an unbound local (previously an UnboundLocalError).
    race_date = None
    circuit = None
    city = None

    response = requests.get(race_url, headers=head, verify=certifi.where())
    if response.status_code != 200:
        return race_date, circuit, city

    soup = BeautifulSoup(response.content, 'html.parser')

    # Header block holding the date and location paragraphs.
    header_section = soup.find('div', class_='max-tablet:flex-col flex gap-xs')
    if header_section is None:
        return race_date, circuit, city

    location_info = header_section.find_all('p')

    if len(location_info) >= 1:
        # Drop the last five characters (presumably a trailing " YYYY"
        # year suffix -- confirm against the live markup).
        race_date = location_info[0].text.strip()[:-5]

    if len(location_info) >= 2:
        # Second paragraph is split on ", " into circuit and city/country.
        track = location_info[1].text.strip().split(", ")
        circuit = track[0]
        if len(track) > 1:
            city = track[1]

    return race_date, circuit, city
            

In [None]:
headers_race = []
races = []

headers_race_location = ['Grand Prix', 'Circuit', 'Country/City', 'Year', 'Date']
race_location = []

# Step 1: collect the season tables and every race link, remembering which
# season each link belongs to.
all_race_links = []
for year in years:
    race, header_race, race_links = scrape_races_year(year)
    races.extend(race)

    # Carry the season alongside each link. The worker below must not read
    # the loop variable `year`: by the time it runs, the loop has finished
    # and `year` holds only the last season.
    all_race_links.extend((link[0], link[1], year) for link in race_links)

    if len(headers_race) == 0:
        headers_race = header_race


# Step 2: fetch the location data for every race link in parallel.
def process_race_link(race_link_tuple):
    """Build one race-location row from a ``(grand_prix, url, year)`` tuple.

    Returns:
        ``[grand_prix, circuit, city, year, race_date]``, or ``None`` when
        scraping the race page raised an exception (the error is printed).
    """
    grand_prix, url, race_year = race_link_tuple
    try:
        race_date, circuit, city = scrape_race_location(url)
        return [grand_prix, circuit, city, race_year, race_date]
    except Exception as e:
        # Best effort: report the failure and drop this race from the output.
        print(f"Error processing {url}: {e}")
        return None


# Use ThreadPoolExecutor for parallel requests; leaving the `with` block
# waits for all submitted work to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
    results = executor.map(process_race_link, all_race_links)

# Executor.map yields results in input order; keep only the successful rows.
race_location = [r for r in results if r is not None]

# Persist both tables.
save_to_csv(races, headers_race, "races")
save_to_csv(race_location, headers_race_location, "race_location")

In [None]:
# Spot check: scrape a single known race page and print the extracted tuple.
print(
    scrape_race_location(
        "https://www.formula1.com/en/results/1950/races/94/great-britain/race-result"
    )
)


('13 May', 'Silverstone Circuit', 'Great Britain')
