In [17]:
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import os
import re

In [2]:
# configuration and setup 
class Config:
    """Settings for the scraper: request pacing, Selenium options, output location."""

    # --- File output ---
    OUTPUT_DIR = "data_container/"  # directory created right after this class

    # --- Selenium ---
    PAGE_LOAD_TIMEOUT = 30  # passed to driver.set_page_load_timeout()
    IMPLICIT_WAIT = 10  # passed to driver.implicitly_wait()
    HEADLESS = True  # flip to False to watch the browser window

    # --- Scraping ---
    BATCH_SIZE = 10  # races per batch
    RATE_LIMIT = 2  # seconds between requests

# Create the output directory up front.
# exist_ok=True keeps this call from raising when the directory already exists.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
print(f"Configuration loaded. Output directory: {Config.OUTPUT_DIR}")

# setup webdriver
def setup_webdriver():
    """Create and configure a Chrome WebDriver for scraping.

    Builds the Chrome options (headless when ``Config.HEADLESS`` is true, plus
    flags and a user agent meant to make the session look like a regular
    browser), starts Chrome with the driver binary resolved by
    ``ChromeDriverManager``, and applies the implicit-wait and page-load
    timeouts from ``Config``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and
        should call ``driver.quit()`` when finished.

    Raises:
        Any exception raised while installing the driver, launching Chrome, or
        configuring the session. If configuration fails after Chrome has
        started, the browser is quit before the exception propagates.
    """
    chrome_options = Options()

    if Config.HEADLESS:
        chrome_options.add_argument('--headless')

    # Browser flags
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent -- present as a regular desktop browser to reduce blocking/detection
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Resolve a chromedriver binary and start the browser
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Timeouts: element lookup wait and maximum page load time
        driver.implicitly_wait(Config.IMPLICIT_WAIT)
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)

        # Hide the navigator.webdriver automation signature. execute_script()
        # would only patch the document loaded right now and be lost on the
        # first navigation, so register the script through CDP to have it
        # evaluated on every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave a stray Chrome process behind when setup fails.
        driver.quit()
        raise

    return driver


Configuration loaded. Output directory: data_container/


# Scraping race information by horse

In [21]:
# Start the configured Chrome session; keep the handle in `driver` for later cells.
driver = setup_webdriver()

In [63]:
# Horse listing page, filtered by Location=HK.
url = 'https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=HK'

# Load the page in the Selenium-driven browser.
driver.get(url)

# Parse the HTML as the browser currently has it.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [64]:
# Collect every <table class="bigborder"> found in the parsed page.
result_table = soup.find_all('table', class_ = 'bigborder')

In [68]:
# Work with the second 'bigborder' table from the listing page.
table = result_table[1]

# Every anchor tag inside that table.
links = table.find_all('a')

# Keep only the non-empty href values (missing or blank hrefs are dropped).
hrefs = list(filter(None, (anchor.get('href') for anchor in links)))


In [69]:
# Keep only the first link. Slice rather than index so `hrefs` stays a list:
# indexing yields a bare string, and looping over a string walks it one
# character at a time. Slicing also makes this cell safe to re-run.
hrefs = hrefs[:1]

In [56]:
# Fetch the first horse's page here so the cell works on a fresh kernel
# (`response` is not defined by any cell above this one).
# `hrefs` may be a single href string or a list of hrefs.
first_href = hrefs if isinstance(hrefs, str) else hrefs[0]
response = requests.get(
    "https://racing.hkjc.com" + first_href + '&Option=1',
    timeout=Config.PAGE_LOAD_TIMEOUT,
)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# All <table class="bigborder"> elements on the horse page, for inspection.
horse_table = soup.find_all('table', class_ = 'bigborder')

In [61]:
# Scrape the first 'bigborder' table from each horse page.
# `header` holds the cell texts of the first table row seen; `rows` accumulates
# the cell texts of every later row across all pages, unfiltered.
header = None
rows = []

base_url = "https://racing.hkjc.com"

# `hrefs` may hold one href string or a list of them. Wrap a bare string so the
# loop never iterates over its individual characters.
horse_hrefs = [hrefs] if isinstance(hrefs, str) else list(hrefs)

for position, href in enumerate(horse_hrefs):
    # Pace requests: wait the configured delay before every request but the first.
    if position > 0:
        time.sleep(Config.RATE_LIMIT)

    full_url = base_url + href + '&Option=1'

    # The timeout stops one stalled connection from hanging the notebook; a
    # network error is reported and the page is skipped instead of aborting.
    try:
        response = requests.get(full_url, timeout=Config.PAGE_LOAD_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {full_url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve {full_url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    horse_table = soup.find('table', class_ = 'bigborder')

    if horse_table is None:
        print(f'Table not found in {full_url}')
        continue

    table_rows = horse_table.find_all('tr')
    if not table_rows:
        # A table without any <tr> has neither header nor data to collect.
        continue

    # Take the header once, from the first page that has a table.
    if header is None:
        header = [td.get_text(strip = True) for td in table_rows[0].find_all('td')]

    for tr in table_rows[1:]:
        cols = [td.get_text(strip = True) for td in tr.find_all('td')]
        rows.append(cols)


In [62]:
# Display the scraped rows as this cell's output.
rows

[['24/25Season'],
 ['Overseas',
  '02',
  '05/04/25',
  'Meydan / Turf',
  '1800',
  'G',
  'G1',
  '9',
  '134',
  'C S Shum',
  'J McDonald',
  'NOSE',
  '1.3',
  '126',
  '--',
  '1.45.84',
  '--',
  'TT',
  '--'],
 ['Overseas',
  '02',
  '22/02/25',
  'King Abdulaziz / Dirt',
  '1800',
  'FT',
  'G1',
  '3',
  '134',
  'C S Shum',
  'J McDonald',
  'N',
  '1.8',
  '126',
  '--',
  '1.49.14',
  '--',
  'TT',
  '--'],
 ['Overseas',
  '01',
  '24/01/25',
  'Meydan / Turf',
  '1800',
  'G',
  'G1',
  '1',
  '134',
  'C S Shum',
  'J McDonald',
  '4-1/2',
  '1.5',
  '126',
  '--',
  '1.45.10',
  '--',
  'TT',
  '--'],
 ['247',
  '01',
  '08/12/24',
  'ST / Turf / "A"',
  '2000',
  'G',
  'G1',
  '1',
  '133',
  'C S Shum',
  'J McDonald',
  '1-1/2',
  '1.1',
  '126',
  '4 4 4 3 1',
  '2.00.51',
  '1183',
  'TT',
  '',
  ''],
 [''],
 ['190',
  '01',
  '17/11/24',
  'ST / Turf / "B+2"',
  '2000',
  'G',
  'G2',
  '1',
  '133',
  'C S Shum',
  'J McDonald',
  '4-1/4',
  '1.1',
  '128',
  '