In [17]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os

# Configuration and setup

In [8]:
# configuration and setup 
class Config:
    """Central, read-only settings for the scraping notebook.

    All values are plain class attributes and are read as ``Config.NAME``;
    the class is never instantiated in this notebook.
    """

    # --- Scraping window: first and last date, as ISO "YYYY-MM-DD" strings ---
    START_DATE = "2023-01-01"
    END_DATE = "2025-07-14"

    # --- Request pacing ---
    RATE_LIMIT = 2   # seconds to wait between requests
    BATCH_SIZE = 10  # number of races handled per batch

    # --- Selenium / browser behaviour ---
    HEADLESS = True         # False opens a visible browser window
    IMPLICIT_WAIT = 10      # passed to driver.implicitly_wait()
    PAGE_LOAD_TIMEOUT = 30  # passed to driver.set_page_load_timeout()

    # --- Output location for files written by the notebook ---
    OUTPUT_DIR = "./data/"

# Create the output directory configured in Config.OUTPUT_DIR
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)  # exist_ok=True: do not raise if the directory already exists
print(f"✅ Configuration loaded. Output directory: {Config.OUTPUT_DIR}")

✅ Configuration loaded. Output directory: ./data/


In [13]:
# WebDriver Setup
def setup_webdriver():
    """Setup Chrome WebDriver with optimal settings.

    Builds the Chrome options (headless when ``Config.HEADLESS`` is true,
    sandbox/GPU/window-size flags, automation-hiding switches and a fixed
    desktop user agent), starts Chrome with a driver binary resolved by
    ``ChromeDriverManager``, applies the timeouts from ``Config`` and masks
    ``navigator.webdriver``.

    NOTE: a later cell in this notebook defines ``setup_webdriver`` again;
    when the notebook is run top to bottom that later definition replaces
    this one.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and
        must call ``quit()`` when finished.

    Raises:
        Any exception raised by Selenium or webdriver_manager is propagated.
        If it happens after the browser has started, the browser is closed
        first so no Chrome process is left behind.
    """
    chrome_options = Options()

    if Config.HEADLESS:
        chrome_options.add_argument("--headless")

    # Performance options
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")

    # Reduce the automation fingerprint exposed to the page
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Setup driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # From here on a browser process exists: close it if configuration fails,
    # otherwise the caller never receives a handle to quit it.
    try:
        # Configure timeouts
        driver.implicitly_wait(Config.IMPLICIT_WAIT)
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)

        # Hide webdriver property
        hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        # execute_script only patches the document that is currently loaded,
        # so also register the script to run at the start of every new
        # document; otherwise the mask is lost on the first navigation.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
        driver.execute_script(hide_webdriver_js)
    except BaseException:
        driver.quit()
        raise

    return driver

# Test WebDriver setup: start a browser, report its name/version, close it.
print("🚗 Testing WebDriver setup...")
try:
    test_driver = setup_webdriver()
    try:
        print(f"✅ WebDriver setup successful! Browser: {test_driver.capabilities['browserName']} {test_driver.capabilities['browserVersion']}")
    finally:
        # Always close the browser, even if reading the capabilities fails,
        # so a failed check does not leave a Chrome process running.
        test_driver.quit()
    print("✅ WebDriver test completed.")
except Exception as e:
    # Report instead of raising so the notebook keeps running. The hint below
    # is only a guess; the printed error message shows the actual cause.
    print(f"❌ WebDriver setup failed: {e}")
    print("💡 Make sure Chrome browser is installed on your system.")


🚗 Testing WebDriver setup...
❌ WebDriver setup failed: name 'Service' is not defined
💡 Make sure Chrome browser is installed on your system.


In [25]:
# setup webdriver
def setup_webdriver():
    """Create and configure a Chrome WebDriver instance.

    Builds the Chrome options (headless when ``Config.HEADLESS`` is true,
    sandbox/GPU/window-size flags, automation-hiding switches and a fixed
    desktop user agent), starts Chrome with a driver binary resolved by
    ``ChromeDriverManager``, applies the timeouts from ``Config`` and masks
    ``navigator.webdriver``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and
        must call ``quit()`` when finished.

    Raises:
        Any exception raised by Selenium or webdriver_manager is propagated.
        If it happens after the browser has started, the browser is closed
        first so no Chrome process is left behind.
    """
    chrome_options = Options()

    if Config.HEADLESS:  # HEADLESS = True in Config by default
        chrome_options.add_argument('--headless')

    # Performance options
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")

    # Reduce the automation fingerprint exposed to the page
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent -- present as a regular desktop browser to reduce blocking/detection
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Setup driver
    service = Service(ChromeDriverManager().install())  # resolve a chromedriver binary for the browser
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # From here on a browser process exists: close it if configuration fails,
    # otherwise the caller never receives a handle to quit it.
    try:
        # Configure timeouts
        driver.implicitly_wait(Config.IMPLICIT_WAIT)  # how long element lookups wait for content
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)  # maximum time allowed for a page load

        # Hide webdriver property -- hide the automation signature to avoid bot detection
        hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        # execute_script only patches the document that is currently loaded,
        # so also register the script to run at the start of every new
        # document; otherwise the mask is lost on the first navigation.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
        driver.execute_script(hide_webdriver_js)
    except BaseException:
        driver.quit()
        raise

    return driver


In [26]:
# Test WebDriver setup: start a browser, report its name/version, close it.
print("Testing WebDriver setup...")

try:
    test_driver = setup_webdriver()
    try:
        print(f"WebDriver setup successful! Browser: {test_driver.capabilities['browserName']} {test_driver.capabilities['browserVersion']}")
    finally:
        # Always close the browser, even if reading the capabilities fails,
        # so a failed check does not leave a Chrome process running.
        test_driver.quit()
    print("WebDriver test completed.")

# Python raises an exception object when an error occurs; catching Exception
# here reports the failure instead of stopping the notebook. The hint below is
# only a guess; the printed error message shows the actual cause.
except Exception as e:
    print(f"WebDriver setup failed: {e}")
    print("Make sure Chrome browser is installed on your system.")

Testing WebDriver setup...
WebDriver setup successful! Browser: chrome 138.0.7204.169
WebDriver test completed.
