In [None]:
#%pip install selenium requests pandas beautifulsoup4 webdriver-manager python-dateutil

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import os

# Configuration and setup

In [5]:
# configuration and setup 
class Config:
    """Namespace of tunable constants shared by the cells of this notebook.

    The values are plain class attributes; read them as ``Config.NAME``.
    """

    # --- Scraping window: dates written as YYYY-MM-DD strings ---
    START_DATE = "2023-01-01"
    END_DATE = "2025-07-14"

    # --- Request pacing ---
    # Pause, in seconds, between consecutive requests.
    RATE_LIMIT = 2
    # Number of races grouped into one batch.
    BATCH_SIZE = 10

    # --- Browser (Selenium) behaviour ---
    # True runs Chrome without a visible window; flip to False to watch it.
    HEADLESS = True
    # Passed to driver.implicitly_wait() by the webdriver setup code.
    IMPLICIT_WAIT = 10
    # Passed to driver.set_page_load_timeout() by the webdriver setup code.
    PAGE_LOAD_TIMEOUT = 30

    # --- Output ---
    # Directory (relative path) where output files are written.
    OUTPUT_DIR = "data_container/"

# Create the output directory up front so later cells can write into it.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)  # exist_ok=True: no error is raised if the directory already exists
print(f"Configuration loaded. Output directory: {Config.OUTPUT_DIR}")

Configuration loaded. Output directory: data_container/


In [None]:
# setup webdriver
def setup_webdriver():
    """Launch a Chrome WebDriver configured from ``Config``.

    Builds the Chrome options (headless mode when ``Config.HEADLESS`` is true,
    fixed window size, custom user agent, automation switches removed),
    resolves a chromedriver binary through ``ChromeDriverManager``, starts the
    browser and applies the implicit-wait and page-load timeouts.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised by ``ChromeDriverManager`` or Selenium. If the
        failure happens after the browser has been launched, the browser is
        shut down before the exception propagates so no process is leaked.
    """
    chrome_options = Options()

    if Config.HEADLESS:  # run without a visible browser window
        chrome_options.add_argument('--headless')

    # Chrome launch flags
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent -- present the session as a regular desktop Chrome browser
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Resolve a chromedriver binary and start the browser
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # From here on a browser process exists. If any of the configuration calls
    # below fails, close it before re-raising; otherwise the caller never
    # receives the driver and the process would be left running.
    try:
        # Timeouts
        driver.implicitly_wait(Config.IMPLICIT_WAIT)  # element-lookup wait
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)  # max time a page load may take

        # Hide the navigator.webdriver property.
        # NOTE(review): this script runs once, against whatever document is
        # loaded right after startup -- confirm whether the override is still
        # in effect after driver.get() navigates to another page.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except BaseException:
        driver.quit()
        raise

    return driver


In [10]:
# Test WebDriver setup: start a browser, report its name/version, close it.
print("Testing WebDriver setup...")

try:
    test_driver = setup_webdriver()
    try:
        print(f"WebDriver setup successful! Browser: {test_driver.capabilities['browserName']} {test_driver.capabilities['browserVersion']}")
    finally:
        # Close the browser even if reading the capabilities failed, so a
        # failed check does not leave a Chrome process running.
        test_driver.quit()
    print("WebDriver test completed.")

# Any failure above (driver download, browser launch, capability lookup,
# shutdown) is reported here instead of stopping the notebook.
except Exception as e:
    print(f"WebDriver setup failed: {e}")
    print("Make sure Chrome browser is installed on your system.")

Testing WebDriver setup...
WebDriver setup failed: Could not reach host. Are you offline?
Make sure Chrome browser is installed on your system.


# Define scraper

In [None]:
# Main HKJC Scraper Class
class HKJCScraper:
    def __init__(self): # when starting a new scraper instance
        self.driver = None 
        self.all_data = [] # start a empty list to collect data
        self.processed_count = 0 # counting how many race pages have been processed
        self.errors = [] # start a empty list to log errors for debugging

    # browser management
    def start_browser(self):
        """Start the browser"""
        if self.driver is None: # check if browser is already running
            self.driver = setup_webdriver() # setup the webdriver by using function defined above
            print("Browser started")
    
    def stop_browser(self):
        """Stop the browser"""
        if self.driver: # is not none means browser is running
            self.driver.quit() # close the browser
            self.driver = None # reset the driver to None
            print("Browser stopped")

    # define main scraper
    def scrape_single_race(self, date, venue, race_no):
        try:
            # build the url from info provided
            date_str = date.strftime('%Y/%m/%d')
            url = f'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date_str}&Racecourse={venue}&RaceNo={race_no}'

            print(f'scraping: {date.strftime("%Y-%m-%d")}, {venue}, Race:{race_no}')

            # get page content
            self.driver.get(url)
            time.sleep(2) # wait for the page to load
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # check if race exists (look for results table)
            results_table = soup.find('table', class_='table_bd')
            if not results_table:
                print(f"No results found for {date.strftime('%Y-%m-%d')}")
                return False

            # Extract basic race info
            race_info = {
                'date': date.strftime('%Y-%m-%d'),
                'venue': venue,
                'race_no': race_no,
                'data_type': 'race_info',
                'scrape_time': datetime.now().isoformat()
            }

            # Try to extract race details by finding 'Class', 'HANDICAP', or 'M' in text
            try:
                # Find all relevant divs, spans, or table with filters with lambda function, selecting those only with 'Class', 'HANDICAP', or 'M' in text
                race_detail_divs = soup.find_all(['div', 'span', 'td'], string = lambda text: text and ('Class' in text or 'HANDICAP' in text or 'M' in text))




            
