In [2]:
#%pip install selenium requests pandas beautifulsoup4 webdriver-manager python-dateutil

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import os
import re

# Configuration and setup

In [None]:
# Scraper-wide settings, collected in one place
class Config:
    """Tunable constants read by the cells below."""

    # --- File output ---
    OUTPUT_DIR = "data_container/"  # folder created right after this class

    # --- Selenium ---
    PAGE_LOAD_TIMEOUT = 30  # handed to driver.set_page_load_timeout()
    IMPLICIT_WAIT = 10  # handed to driver.implicitly_wait()
    HEADLESS = True  # flip to False to watch the browser window

    # --- Scraping ---
    BATCH_SIZE = 10  # races per batch
    RATE_LIMIT = 2  # seconds between requests


# Make sure the output folder exists. exist_ok=True turns the call into a
# no-op when the directory is already there, so this cell can be re-run safely.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
print(f"Configuration loaded. Output directory: {Config.OUTPUT_DIR}")

Configuration loaded. Output directory: data_container/


In [3]:
# setup webdriver
def setup_webdriver():
    """Launch a Chrome WebDriver configured from ``Config``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.

    Raises:
        Any exception raised while installing, launching or configuring the
        driver. If configuration fails *after* the browser has been launched,
        the browser is closed before the exception propagates so no Chrome
        process is left behind.
    """
    chrome_options = Options()

    if Config.HEADLESS:  # run without a visible window
        chrome_options.add_argument('--headless')

    # Browser launch flags
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent -- present the session as a regular desktop Chrome browser
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Setup driver: webdriver-manager supplies the chromedriver binary path
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Configure timeouts
        driver.implicitly_wait(Config.IMPLICIT_WAIT)  # element lookup wait
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)  # max page load time

        # Hide the webdriver property -- masks the automation signature.
        # NOTE(review): execute_script runs against the currently loaded
        # document; confirm the override is still in place after driver.get().
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except Exception:
        # The browser is already running at this point; close it rather than
        # leaking the process, then let the caller see the original error.
        driver.quit()
        raise

    return driver


In [4]:
# Test WebDriver setup: launch a browser, report its version, close it again.
print("Testing WebDriver setup...")

try:
    test_driver = setup_webdriver()
    try:
        print(f"WebDriver setup successful! Browser: {test_driver.capabilities['browserName']} {test_driver.capabilities['browserVersion']}")
    finally:
        # Close the browser even if reading the capabilities failed, so the
        # smoke test never leaves a Chrome process running.
        test_driver.quit()
    print("WebDriver test completed.")

# Any failure (driver download, launch, capability lookup) is reported here
# instead of stopping the notebook.
except Exception as e:
    print(f"WebDriver setup failed: {e}")
    print("Make sure Chrome browser is installed on your system.")

Testing WebDriver setup...
WebDriver setup successful! Browser: chrome 138.0.7204.184
WebDriver test completed.


In [5]:
# Parameters of the single race used by the exploratory cells below; all three
# are interpolated into the results-page URL.
date = '2025/07/16'  # RaceDate query parameter
venue = 'HV'  # Racecourse query parameter
race_no = '1'  # RaceNo query parameter (kept as a string here)

In [8]:
# Browser session for the exploratory cells.
# NOTE(review): no visible cell calls driver.quit() on this session.
driver = setup_webdriver()

In [9]:
# Results page for the race selected above (date / venue / race_no).
url = f'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue}&RaceNo={race_no}'

In [10]:
# Load the results page in the browser session.
driver.get(url)

In [11]:
# Parse the HTML currently held by the browser so it can be searched offline.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [12]:
# First <div class="race_tab"> on the page.
# NOTE(review): result_table is not referenced by any later visible cell.
result_table = soup.find('div', class_ = 'race_tab')

In [13]:
# Module-level dict that the find_* helpers below write their findings into.
race_info ={}

In [14]:
# Seed the record with the identifiers of the race being explored.
race_info['date'] = date
race_info['venue'] = venue
race_info['race_no'] = race_no

In [23]:
def find_class(soup, info=None):
    """Extract the race class and distance from the parsed results page.

    Every ``<td>`` whose string contains ``'Class'`` is inspected. A leading
    ``Class 1`` .. ``Class 5`` label is stored under ``'race_class'`` and a
    3-4 digit distance followed by ``M`` (e.g. ``1200M``) under ``'length'``;
    each stored value is also printed. When several cells match, later cells
    overwrite earlier ones.

    Args:
        soup: Parsed page; only its ``find_all`` method is used.
        info: Dict that receives the values. Defaults to the notebook-level
            ``race_info`` dict.

    Returns:
        None. Results are written into ``info`` / ``race_info``.
    """
    target = race_info if info is None else info

    try:
        cells = soup.find_all(['td'], string=lambda text: text and ('Class' in text))
    except Exception as exc:
        # Best-effort helper: report the problem instead of hiding it.
        print(f'find_class: could not search the page: {exc}')
        return

    for cell in cells:
        text = cell.get_text()

        # A cell can mention 'Class' without starting with "Class N"; a failed
        # match must not abort the scan of the remaining cells.
        class_match = re.search(r"^(Class [1-5])", text)
        if class_match:
            target['race_class'] = class_match.group(0)
            print(class_match.group(0))

        length_match = re.search(r"\b(\d{3,4})M\b", text)
        if length_match:
            target['length'] = length_match.group(0)
            print(length_match.group(0))

# Find the condition of the course
def find_going(soup, info=None, goings=('GOOD TO FIRM',)):
    """Record the going (course condition) found on the parsed results page.

    Every ``<td>`` whose string contains one of ``goings`` is stored under
    ``'course condition'`` and printed; when several cells match, the last one
    wins.

    Args:
        soup: Parsed page; only its ``find_all`` method is used.
        info: Dict that receives the value. Defaults to the notebook-level
            ``race_info`` dict.
        goings: Going descriptions to look for (substring match). The default
            keeps the original behaviour of recognising only
            ``'GOOD TO FIRM'``; pass further descriptions to recognise them.
            A single string is accepted as well.

    Returns:
        None. The result is written into ``info`` / ``race_info``.
    """
    target = race_info if info is None else info
    if isinstance(goings, str):
        # Avoid iterating a lone string character by character.
        goings = (goings,)

    try:
        cells = soup.find_all(
            ['td'],
            string=lambda text: text and any(going in text for going in goings),
        )
    except Exception as exc:
        # Best-effort helper: report the problem instead of hiding it.
        print(f'find_going: could not search the page: {exc}')
        return

    for cell in cells:
        response = cell.get_text()
        target['course condition'] = response
        print(f'Course condition: {response}')

def find_course(soup, info=None):
    """Record the course description found on the parsed results page.

    Locates the first ``<td>`` whose string contains ``'Course'`` (treated as
    the label cell), reads the stripped text of the element that follows it,
    stores that under ``'course'`` and prints it.

    Args:
        soup: Parsed page; only its ``find`` method is used.
        info: Dict that receives the value. Defaults to the notebook-level
            ``race_info`` dict.

    Returns:
        None. The result is written into ``info`` / ``race_info``; nothing is
        written when the label or the element after it is missing.
    """
    target = race_info if info is None else info

    try:
        course_label = soup.find(['td'], string=lambda text: text and 'Course' in text)
    except Exception as exc:
        # Best-effort helper: report the problem instead of hiding it.
        print(f'find_course: could not search the page: {exc}')
        return

    if course_label is None:
        return

    value_node = course_label.find_next()
    if value_node is None:
        # The label was the last element; there is no value to read.
        return

    course_info = value_node.get_text(strip=True)
    target['course'] = course_info
    print(f'course info: {course_info}')

In [45]:
def find_result(soup):
    """Parse the race result table into a list of row dicts.

    Looks up the ``<table class="f_tac table_bd draggable">`` element, uses the
    ``<td>`` cells of its ``<thead>`` as column names and turns every non-empty
    ``<tbody>`` row into ``{column name: cell text}``.

    Args:
        soup: Parsed page; only its ``find`` method is used.

    Returns:
        The list of row dicts (also printed, as before). An empty list is
        returned when the table, its head or its body cannot be found, or
        when parsing fails.
    """
    try:
        table = soup.find('table', class_='f_tac table_bd draggable')
        if table is None:
            print('find_result: results table not found')
            return []

        head = table.find('thead')
        body = table.find('tbody')
        if head is None or body is None:
            print('find_result: results table has no thead/tbody')
            return []

        headers = [td.get_text(strip=True) for td in head.find_all('td')]

        table_data = []
        for row in body.find_all('tr'):
            cols = [td.get_text(strip=True) for td in row.find_all('td')]
            if any(cols):  # skip rows with no cells or only blank cells
                table_data.append(dict(zip(headers, cols)))
    except Exception as exc:
        # Best-effort helper: report the problem instead of hiding it.
        print(f'find_result: could not parse the results table: {exc}')
        return []

    print(table_data)
    return table_data


In [46]:
# Run each extractor against the parsed page. The first three record their
# findings in the module-level race_info dict; find_result prints the table.
find_going(soup)
find_class(soup)
find_course(soup)
find_result(soup)

Course condition: GOOD TO FIRM
Class 5
1200M
course info: TURF - "B" Course
[{'Pla.': '1', 'Horse No.': '11', 'Horse': 'SPEEDY SMARTIE(H108)', 'Jockey': 'L Ferraris', 'Trainer': 'T P Yung', 'Act. Wt.': '123', 'Declar. Horse Wt.': '1125', 'Dr.': '7', 'LBW': '-', 'RunningPosition': '111', 'Finish Time': '1:09.96', 'Win Odds': '7.7'}, {'Pla.': '2', 'Horse No.': '5', 'Horse': 'RAGNARR(H297)', 'Jockey': 'M F Poon', 'Trainer': 'D J Hall', 'Act. Wt.': '130', 'Declar. Horse Wt.': '1049', 'Dr.': '4', 'LBW': 'NOSE', 'RunningPosition': '432', 'Finish Time': '1:09.97', 'Win Odds': '6.2'}, {'Pla.': '3', 'Horse No.': '4', 'Horse': 'DAN ATTACK(H317)', 'Jockey': 'H Bowman', 'Trainer': 'D J Whyte', 'Act. Wt.': '131', 'Declar. Horse Wt.': '1206', 'Dr.': '2', 'LBW': 'HD', 'RunningPosition': '223', 'Finish Time': '1:10.00', 'Win Odds': '5.5'}, {'Pla.': '4', 'Horse No.': '9', 'Horse': "YOU'REMYEVERYTHING(E413)", 'Jockey': 'K C Leung', 'Trainer': 'C W Chang', 'Act. Wt.': '126', 'Declar. Horse Wt.': '996', '

# Define scraper

In [68]:
# Main HKJC Scraper Class
class HKJCScraper:
    """Selenium-backed collector of HKJC local race results.

    Records are accumulated in ``all_data`` as flat dicts distinguished by
    their ``'data_type'`` key (``'race_info'`` or ``'performance'``) and written
    to CSV by ``save_data``.
    """

    # (record key, header label, legacy positional index) for the plain-text
    # performance columns. The header labels are the ones printed by the
    # exploratory find_result cell of this notebook; the positional index is
    # used only when a label is absent from the table's first row.
    _PERFORMANCE_COLUMNS = (
        ('position', 'Pla.', 0),
        ('horse_no', 'Horse No.', 1),
        ('jockey', 'Jockey', 3),
        ('trainer', 'Trainer', 4),
        ('weight', 'Act. Wt.', 5),
        ('draw', 'Dr.', 6),
        ('margin', 'LBW', 7),
        ('time', 'Finish Time', 8),
        ('odds', 'Win Odds', 9),
    )

    def __init__(self):
        self.driver = None  # WebDriver instance, None while no browser is running
        self.all_data = []  # collected race_info / performance records
        self.processed_count = 0  # number of race pages scraped successfully
        self.errors = []  # error messages kept for debugging

    # browser management
    def start_browser(self):
        """Start the browser unless one is already running."""
        if self.driver is None:
            self.driver = setup_webdriver()
            print("Browser started")

    def stop_browser(self):
        """Close the browser if one is running and forget the driver."""
        if self.driver:
            self.driver.quit()
            self.driver = None
            print("Browser stopped")

    @staticmethod
    def _cell_text(cells, index):
        """Return the stripped text of ``cells[index]``, or '' if out of range."""
        if 0 <= index < len(cells):
            return cells[index].get_text(strip=True)
        return ''

    @staticmethod
    def _header_positions(header_row):
        """Map each header label of ``header_row`` to its column index."""
        positions = {}
        for index, cell in enumerate(header_row.find_all(['td', 'th'])):
            # setdefault: the first occurrence wins if a label is repeated
            positions.setdefault(cell.get_text(strip=True), index)
        return positions

    # define main scraper
    def scrape_single_race(self, date, venue, race_no):
        """Scrape one race page and append its records to ``all_data``.

        Args:
            date: ``datetime`` of the meeting (formatted via ``strftime``).
            venue: Racecourse code placed in the URL (e.g. ``'ST'``, ``'HV'``).
            race_no: Race number placed in the URL.

        Returns:
            True when a results table was found and processed, False when the
            page has no results table or an error occurred (errors are also
            appended to ``self.errors``).
        """
        try:
            # build the url from info provided
            date_str = date.strftime('%Y/%m/%d')
            url = f'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date_str}&Racecourse={venue}&RaceNo={race_no}'

            print(f'scraping: {date.strftime("%Y-%m-%d")}, {venue}, Race:{race_no}')

            # get page content
            self.driver.get(url)
            time.sleep(2)  # wait for the page to load
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # check if race exists (look for results table)
            results_table = soup.find('table', class_='table_bd')
            if results_table is None:
                print(f"No results found for {date.strftime('%Y-%m-%d')}")
                return False

            # Extract basic race info
            race_info = {
                'date': date.strftime('%Y-%m-%d'),
                'venue': venue,
                'race_no': race_no,
                'data_type': 'race_info',
                'scrape_time': datetime.now().isoformat()
            }

            # Best-effort race details: look at the first three elements whose
            # text mentions 'Class', 'HANDICAP' or 'M'.
            try:
                detail_nodes = soup.find_all(
                    ['div', 'span', 'td'],
                    string=lambda text: text and ('Class' in text or 'HANDICAP' in text or 'M' in text),
                )
                for node in detail_nodes[:3]:
                    text = node.get_text()
                    if 'Class' in text:
                        race_info['race_class'] = text.strip()
                    # digits immediately followed by 'M', e.g. '1200M'
                    distance_match = re.search(r'(\d+)M', text)
                    if distance_match:
                        race_info['distance'] = distance_match.group(0)
            except Exception as e:
                print(f'error extracting race details: {e}')

            # Extract the horse performances
            performances = []
            try:
                table_rows = results_table.find_all('tr')
                # The first row is the header row; use its labels to locate
                # columns so that e.g. 'draw' is read from 'Dr.' and not from
                # whichever column happens to sit at a fixed position.
                header_at = self._header_positions(table_rows[0]) if table_rows else {}
                horse_col = header_at.get('Horse', 2)

                for row in table_rows[1:]:  # skip header row
                    cells = row.find_all(['td', 'th'])
                    if len(cells) < 6:  # not a runner row
                        continue

                    # Extract horse ID from the horse link if available
                    horse_id = None
                    horse_link = cells[horse_col].find('a') if horse_col < len(cells) else None
                    if horse_link and horse_link.get('href'):
                        match = re.search(r'HorseId=([^&]+)', horse_link['href'])
                        if match:
                            horse_id = match.group(1)

                    fields = {
                        key: self._cell_text(cells, header_at.get(label, fallback))
                        for key, label, fallback in self._PERFORMANCE_COLUMNS
                    }

                    performances.append({
                        'date': date.strftime('%Y-%m-%d'),
                        'venue': venue,
                        'race_no': race_no,
                        'data_type': 'performance',
                        'position': fields['position'],
                        'horse_no': fields['horse_no'],
                        'horse_name': self._cell_text(cells, horse_col),
                        'horse_id': horse_id,
                        'jockey': fields['jockey'],
                        'trainer': fields['trainer'],
                        'weight': fields['weight'],
                        'draw': fields['draw'],
                        'margin': fields['margin'],
                        'time': fields['time'],
                        'odds': fields['odds'],
                        'scrape_time': datetime.now().isoformat()
                    })
            except Exception as e:
                # Keep whatever rows were parsed before the failure.
                print(f'error extracting performances: {e}')

            self.all_data.append(race_info)
            # Extend with the LIST of runner records (not a single record dict,
            # which would add that dict's keys as stray string entries).
            self.all_data.extend(performances)

            self.processed_count += 1
            print(f"Extracted {len(performances)} horses")

            return True

        except Exception as e:
            error_msg = f"Error scraping {date.strftime('%Y-%m-%d')} {venue} R{race_no}: {str(e)}"
            print(f" {error_msg}")
            self.errors.append(error_msg)
            return False

    def save_data(self, filename_prefix="hkjc_data"):
        """Save all collected data to CSV.

        Writes one combined CSV, one CSV per ``data_type`` value and, when
        errors were recorded, a text file with one error per line. All files go
        to ``Config.OUTPUT_DIR`` and share a timestamp suffix.

        Args:
            filename_prefix: Leading part of every file name written.

        Returns:
            Path of the combined CSV, or None when there is nothing to save.
        """
        if not self.all_data:
            print("No data to save!")
            return

        # Convert to DataFrame from dictionaries
        df = pd.DataFrame(self.all_data)

        # Save main data file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{Config.OUTPUT_DIR}{filename_prefix}_{timestamp}.csv"
        df.to_csv(filename, index=False)
        print(f"✅ Saved {len(df)} records to {filename}")

        # Save separate files by data type
        for data_type in df['data_type'].unique():
            type_df = df[df['data_type'] == data_type]
            type_filename = f"{Config.OUTPUT_DIR}{filename_prefix}_{data_type}_{timestamp}.csv"
            type_df.to_csv(type_filename, index=False)
            print(f"  📊 {data_type}: {len(type_df)} records → {type_filename}")

        # Save errors if any
        if self.errors:
            error_filename = f"{Config.OUTPUT_DIR}{filename_prefix}_errors_{timestamp}.txt"
            with open(error_filename, 'w', encoding='utf-8') as f:
                for error in self.errors:
                    f.write(error + '\n')  # real newline: one error per line
            print(f"  ⚠️ Saved {len(self.errors)} errors to {error_filename}")

        return filename

# Confirmation that the class-definition cell ran to the end.
print("✅ HKJCScraper class ready!")







            


✅ HKJCScraper class ready!


In [69]:
# Utility Functions
def generate_race_dates(start_date, end_date, race_weekdays=(2, 5, 6)):
    """Generate the candidate race dates between two dates (inclusive).

    Args:
        start_date: First day, as a ``'YYYY-MM-DD'`` string or a ``datetime``.
        end_date: Last day, as a ``'YYYY-MM-DD'`` string or a ``datetime``.
        race_weekdays: ``datetime.weekday()`` numbers to keep. Defaults to
            Wednesday (2), Saturday (5) and Sunday (6).

    Returns:
        List of ``datetime`` objects in ascending order; empty when
        ``start_date`` is after ``end_date``.

    Raises:
        ValueError: If a string argument is not in ``YYYY-MM-DD`` format.
    """
    def _as_datetime(value):
        # Already-parsed values pass through so callers may hand in datetimes.
        if isinstance(value, datetime):
            return value
        return datetime.strptime(value, '%Y-%m-%d')

    start = _as_datetime(start_date)
    end = _as_datetime(end_date)
    wanted = frozenset(race_weekdays)

    race_dates = []
    current = start

    while current <= end:
        if current.weekday() in wanted:
            race_dates.append(current)
        current += timedelta(days=1)

    return race_dates

def get_race_venues():
    """Return the racecourse codes, Sha Tin first and Happy Valley second."""
    sha_tin, happy_valley = 'ST', 'HV'
    return [sha_tin, happy_valley]

def get_race_numbers():
    """Return the race numbers tried for each meeting: 1 through 11."""
    first_race, last_race = 1, 11
    return [*range(first_race, last_race + 1)]

# Main scraping function
def run_hkjc_scraper(start_date=None, end_date=None, venues=None, max_races=None):
    """Run the HKJC scraper over a date range.

    For every candidate race date and venue, race numbers from
    ``get_race_numbers()`` are scraped one by one with ``Config.RATE_LIMIT``
    seconds between requests. Data is saved after every
    ``Config.BATCH_SIZE`` successfully scraped races and once more at the end.

    Args:
        start_date: First day of the range (``'YYYY-MM-DD'``). Required.
        end_date: Last day of the range (``'YYYY-MM-DD'``). Required.
        venues: Venue codes to visit; a single code string is accepted.
            Defaults to ``get_race_venues()``.
        max_races: Stop after this many successfully scraped races;
            ``None`` (or 0) means no limit.

    Returns:
        The ``HKJCScraper`` holding the collected data, or ``None`` when the
        arguments are incomplete or a fatal error occurred.
    """
    # Validate before launching a browser: missing dates used to surface only
    # as a TypeError after Chrome had already been started.
    if start_date is None or end_date is None:
        print("💥 Fatal error: start_date and end_date are required (YYYY-MM-DD)")
        return None
    if venues is None:
        venues = get_race_venues()
    elif isinstance(venues, str):
        venues = [venues]  # a bare 'ST' would otherwise be iterated per character

    print(f"🚀 Starting HKJC scraper")
    print(f"📅 Date range: {start_date} to {end_date}")
    print(f"🏟️ Venues: {venues}")
    print(f"🎯 Max races: {max_races if max_races else 'No limit'}")

    # Initialize scraper
    scraper = HKJCScraper()

    try:
        # Start browser
        scraper.start_browser()

        # Generate race dates
        race_dates = generate_race_dates(start_date, end_date)
        print(f"📊 Found {len(race_dates)} potential race dates")

        total_processed = 0

        # Process each date
        for race_date in race_dates:
            if max_races and total_processed >= max_races:
                print(f"🛑 Reached maximum races limit ({max_races})")
                break

            print(f"\n📅 Processing {race_date.strftime('%Y-%m-%d %A')}")

            # Process each venue
            for venue in venues:
                if max_races and total_processed >= max_races:
                    break

                print(f"  🏟️ Venue: {venue}")

                # Process each race number
                for race_no in get_race_numbers():
                    if max_races and total_processed >= max_races:
                        break

                    success = scraper.scrape_single_race(race_date, venue, race_no)

                    if success:
                        total_processed += 1
                        print(f"    📈 Progress: {total_processed}/{max_races if max_races else '∞'}")

                        # Save data periodically. Checked only right after a
                        # success so the same batch is not re-written for every
                        # failed race while the counter sits on a multiple of
                        # BATCH_SIZE.
                        if total_processed % Config.BATCH_SIZE == 0:
                            print(f"\n💾 Saving batch at {total_processed} races...")
                            scraper.save_data(f"batch_{total_processed//Config.BATCH_SIZE:03d}")

                    # Rate limiting
                    time.sleep(Config.RATE_LIMIT)

        # Final save
        print(f"\n💾 Final save...")
        final_file = scraper.save_data("final")

        # Summary
        print(f"\n📊 === SCRAPING SUMMARY ===")
        print(f"✅ Total races processed: {scraper.processed_count}")
        print(f"📁 Total data records: {len(scraper.all_data)}")
        print(f"❌ Errors encountered: {len(scraper.errors)}")
        print(f"💾 Final data file: {final_file}")

        return scraper

    except Exception as e:
        print(f"💥 Fatal error: {e}")
        return None

    finally:
        # Always clean up, including on KeyboardInterrupt
        scraper.stop_browser()
        print("🧹 Cleanup completed")

# Test the utility functions: a quick smoke test that prints what each helper
# returns for a known month, so the output can be checked by eye.
print("🔧 Testing utility functions...")
test_dates = generate_race_dates('2024-01-01', '2024-01-31')
print(f"✅ Generated {len(test_dates)} race dates for January 2024")
# Show only the first three dates, with the weekday name for verification.
print(f"📅 Sample dates: {[d.strftime('%Y-%m-%d %A') for d in test_dates[:3]]}")
print(f"🏟️ Venues: {get_race_venues()}")
print(f"🏇 Race numbers: {get_race_numbers()}")
print("✅ All utility functions ready!")


🔧 Testing utility functions...
✅ Generated 13 race dates for January 2024
📅 Sample dates: ['2024-01-03 Wednesday', '2024-01-06 Saturday', '2024-01-07 Sunday']
🏟️ Venues: ['ST', 'HV']
🏇 Race numbers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
✅ All utility functions ready!


In [70]:
# 🧪 TEST RUN - Small sample
print("🧪 === TEST RUN ===")
print("Running test with small sample to verify everything works...")

# Test with just a few races from January 2024
test_result = run_hkjc_scraper(
    start_date='2024-01-03',  # A Wednesday
    end_date='2024-01-07',    # A Sunday
    venues=['ST'],            # Just Sha Tin
    max_races=3               # Only 3 races total
)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\n🧪 === TEST COMPLETED ===")
# run_hkjc_scraper returns the scraper object on success and None on failure
if test_result:
    print("✅ Test successful! The scraper is working correctly.")
    print("💡 You can now run larger scraping jobs.")
else:
    print("❌ Test failed. Check the error messages above.")
    print("💡 Make sure Chrome browser is installed and internet connection is stable.")


🧪 === TEST RUN ===
Running test with small sample to verify everything works...
🚀 Starting HKJC scraper
📅 Date range: 2024-01-03 to 2024-01-07
🏟️ Venues: ['ST']
🎯 Max races: 3
Browser started
📊 Found 3 potential race dates
\n📅 Processing 2024-01-03 Wednesday
  🏟️ Venue: ST
scraping: 2024-01-03, ST, Race:1
No results found for 2024-01-03
scraping: 2024-01-03, ST, Race:2
No results found for 2024-01-03
scraping: 2024-01-03, ST, Race:3
Browser stopped
🧹 Cleanup completed


KeyboardInterrupt: 