In [213]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Dictionaries mapping track condition (going) to short codes

In [92]:
# Full going description -> short code for turf tracks.
# get_race_info looks the scraped condition text up here when the track is 'turf'.
turf_going_dict = {
    'Firm': 'F',
    'Good To Firm': 'G/F',
    'Good': 'G',
    'Good To Yielding': 'G/Y',
    'Yielding': 'Y',
    'Yielding To Soft': 'Y/S',
    'Soft': 'S',
    'Heavy': 'H',
}

# Full going description -> short code for the all-weather track ('awt').
# Note that 'Good' exists in both tables but maps to a different code here
# ('GD' versus 'G' on turf), so the two tables are not interchangeable.
awt_going_dict = {
    'Fast': 'FT',
    'Slow': 'SL',
    'Wet': 'WE',
    'Good': 'GD',
    'Sealed': 'SE',
    'Rain affected': 'RA',
    'Normal watering': 'NW'
}


# Utility

In [222]:
def get_race_info(soup):
    """Extract the race-level details from a parsed race-card page.

    The details are read from the text lines of the first ``div`` with class
    ``f_fs13``; each line is split on commas and the fields are picked out
    by position.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find(name, class_=...)`` (a BeautifulSoup
        document in this notebook).

    Returns
    -------
    tuple of str
        ``(venue, track, course, distance, condition, race_class)``.
        ``venue`` is ``'ST'``, ``'HV'`` or ``'not found'``; ``condition`` is a
        short code from ``turf_going_dict`` / ``awt_going_dict`` or
        ``'Unknown'``.

    Raises
    ------
    ValueError
        If the ``div.f_fs13`` information block is not present on the page.
    IndexError
        If the block has fewer lines or comma-separated fields than expected.
    """
    result = soup.find('div', class_='f_fs13')
    if result is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError('race information block (div.f_fs13) not found')
    lines = list(result.stripped_strings)

    # find venue: fourth comma-separated field of the second line
    venue_name = lines[1].split(',')[3].strip()
    venue = {'Sha Tin': 'ST', 'Happy Valley': 'HV'}.get(venue_name, 'not found')

    # find track, course, distance, and condition
    info = lines[2].split(',')

    track = info[0].strip()
    course = info[1].replace('"', '').replace('Course', '').strip()
    distance = info[2].replace('M', '').strip()
    condition = info[3].strip()

    if track.lower() == 'turf':
        condition = turf_going_dict.get(condition, 'Unknown')
    elif track.lower() == 'awt':
        condition = awt_going_dict.get(condition, 'Unknown')
    else:
        condition = 'Unknown'
        print('track condition unknown')

    # find class: the prize money on this line contains thousands separators,
    # so the position of the class field depends on how large the prize is.
    # Prefer the field that actually mentions 'Class'; fall back to the
    # historical fixed position when no field does.
    class_fields = lines[3].split(',')
    class_field = next((field for field in class_fields if 'Class' in field), None)
    if class_field is None:
        class_field = class_fields[3]
    race_class = class_field.replace('Class', '').strip()

    return venue, track, course, distance, condition, race_class

In [98]:
def get_horse_info(soup):
    """Extract the per-runner race-card columns into a DataFrame.

    Locates the starters table, finds the needed columns by their header
    text, and copies the matching cell text from every data row.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find(name, class_=...)`` (a BeautifulSoup
        document in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per runner with columns ``Act.Wt.``, ``Jockey``,
        ``gate_position``, ``Trainer``, ``Rtg.`` and ``Declar.Horse Wt.``.
        The frame is empty (but has these columns) when the starters table
        is not found, so callers always receive the same type.

    Raises
    ------
    ValueError
        If one of the expected header labels is missing from the table.
    """
    # output column name -> header text used on the race card (case-sensitive)
    source_headers = {
        'Act.Wt.': 'Wt.',
        'Jockey': 'Jockey',
        'gate_position': 'Draw',
        'Trainer': 'Trainer',
        'Rtg.': 'Rtg.',
        'Declar.Horse Wt.': 'Horse Wt. (Declaration)',
    }
    header = list(source_headers)

    table = soup.find('table', class_='starter f_tac f_fs13 draggable hiddenable')

    if table is None:
        print('no table found')
        # Return an empty frame rather than a bare list so the caller's
        # pd.concat keeps working.
        return pd.DataFrame(columns=header)

    header_row = table.find('tr')
    headers = [td.get_text(strip=True) for td in header_row.find_all('td')]

    # Column positions, in the same order as the output columns
    indices = [headers.index(label) for label in source_headers.values()]
    last_needed = max(indices)

    rows = []
    for row in table.find_all('tr')[1:]:  # skip header row
        cells = row.find_all('td')
        # Ignore rows too short to contain every needed column
        if len(cells) > last_needed:
            rows.append([cells[i].get_text(strip=True) for i in indices])

    return pd.DataFrame(rows, columns=header)


In [84]:
def get_race_horse(soup):
    """Collect the profile-page URL of every horse listed on the race card.

    Finds the 'Horse' column of the starters table and takes the first link
    in that cell for each data row. Site-relative links (starting with '/')
    are prefixed with ``https://racing.hkjc.com``.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find(name, class_=...)`` (a BeautifulSoup
        document in this notebook).

    Returns
    -------
    list of str
        The horse URLs in table order. Empty when the table or its 'Horse'
        column cannot be found, or when no row carries a link.
    """
    all_links = []

    # find the table containing the horses url
    parti_horses = soup.find('table', class_='starter f_tac f_fs13 draggable hiddenable')

    if parti_horses is None:
        print('horses table not found')
        # Nothing to scan; stop here instead of failing on None below
        return all_links

    header_row = parti_horses.find('tr')
    headers = (
        [td.get_text(strip=True) for td in header_row.find_all('td')]
        if header_row is not None
        else []
    )

    if 'Horse' not in headers:
        print('no horse column found')
        return all_links

    # get the column index of the horse column
    horse_col_index = headers.index('Horse')

    for row in parti_horses.find_all('tr')[1:]:
        cells = row.find_all('td')  # cells in the row
        if len(cells) <= horse_col_index:
            continue

        a_tag = cells[horse_col_index].find('a', href=True)
        if not a_tag:
            continue

        href = a_tag['href']
        if href.startswith('/'):
            href = 'https://racing.hkjc.com' + href
        all_links.append(href)

    return all_links


In [218]:
def _lookup_horse_field(soup, label):
    """Return the text two elements after the first <td> whose string contains `label`, or 'na'."""
    label_cell = soup.find('td', string=lambda text: text and label in text)
    if not label_cell:
        return 'na'
    return label_cell.find_next().find_next().get_text(strip=True)


def scrape_horses(all_links):
    """Download each horse's profile page and collect its basic attributes.

    Parameters
    ----------
    all_links : list of str
        Profile-page URLs, one per horse, in race-card order.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per entry of ``all_links`` (in the same order) with
        columns ``Horse_name``, ``Origin / Age``, ``Colour / Sex``,
        ``Import type``, ``Sire``, ``Dam`` and ``Dam sire``. Values that
        cannot be found are the string ``'na'``; a URL whose request fails
        yields a row of ``'na'`` so later rows stay aligned with the input.
    """
    header = [
        'Horse_name',
        'Origin / Age',
        'Colour / Sex',
        'Import type',
        'Sire',
        'Dam',
        "Dam sire",
    ]
    # Label text searched for on the page, in the same order as header[1:].
    # Matching is by substring and takes the first hit, so 'Sire' and 'Dam'
    # rely on appearing before "Dam's Sire" in the page
    # NOTE(review): confirm this ordering holds for every profile page.
    field_labels = [
        'Country of Origin',
        'Colour / Sex',
        'Import Type',
        'Sire',
        'Dam',
        "Dam's Sire",
    ]
    rows = []

    for ref in all_links:
        try:
            response = requests.get(ref, timeout=10)
        except Exception as e:
            print(f'Request exception for {ref}: {str(e)}')
            # Keep a placeholder row: the caller joins this frame to the
            # race-card frame by row position, so skipping a horse would
            # shift every later horse onto the wrong runner.
            rows.append(['na'] * len(header))
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # name of the horse: the title text up to the first '('
        name_tag = soup.find('span', class_='title_text')
        if name_tag:
            name_text = name_tag.get_text()
            match = re.match(r'^([^\(]+)', name_text)
            horse_name = match.group(1).strip() if match else name_text
        else:
            print('horse name not found')
            horse_name = 'na'  # handle missing name

        rows.append(
            [horse_name] + [_lookup_horse_field(soup, label) for label in field_labels]
        )

    return pd.DataFrame(rows, columns=header)

In [86]:
def clean_origin_age(text):
    """Split an 'origin / age' value into its two parts.

    The value is converted to ``str`` and split on '/'. The first piece is
    the origin and the second piece is the age; any further pieces are
    ignored. Both are stripped of surrounding whitespace.

    Returns
    -------
    tuple
        ``(origin, age)`` where ``age`` is ``None`` when there is no '/'.
    """
    origin, *remainder = (piece.strip() for piece in str(text).split('/'))
    age = remainder[0] if remainder else None
    return origin, age

In [87]:
def clean_colour_sex(text):
    """Split a 'colour / sex' value into its two parts.

    The value is converted to ``str``. The colour is everything before the
    first '/', the sex is everything after the last '/'; both are stripped
    of surrounding whitespace. Without a '/' both parts are the whole value.

    Returns
    -------
    tuple of str
        ``(colour, sex)``.
    """
    raw = str(text)
    before_first_slash = raw.partition('/')[0]
    after_last_slash = raw.rpartition('/')[2]
    return before_first_slash.strip(), after_last_slash.strip()

In [203]:
def obtain_odds(odds_url):
    """Read the 'Win' odds column from the odds page using headless Chrome.

    Opens ``odds_url``, waits up to 20 seconds for the odds table
    (``table.rc-odds-table.pr``) to be present, locates the 'Win' column
    from the header row and reads the link text of that cell for every
    ``tr.rc-odds-row``.

    Parameters
    ----------
    odds_url : str
        URL of the odds page.

    Returns
    -------
    pandas.DataFrame
        A single column ``'Win Odds'`` holding the odds text, one row per
        odds row, in page order.

    Raises
    ------
    ValueError
        If the header row has no 'Win' column.
    selenium exceptions
        Timeouts or missing elements propagate to the caller; the browser is
        shut down first in every case.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    # Always close the browser, even when the wait times out or an element
    # is missing; otherwise every failed call leaves a Chrome process behind.
    try:
        driver.get(odds_url)
        wait = WebDriverWait(driver, 20)
        odd_table = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'table.rc-odds-table.pr'))
        )

        # Find header row
        header_row = odd_table.find_element(By.CSS_SELECTOR, 'tr.rc-odds-table-header')
        headers = [cell.text.strip() for cell in header_row.find_elements(By.TAG_NAME, 'td')]

        if 'Win' not in headers:
            raise ValueError("can't find win odds in odds url")
        odds_index = headers.index('Win')

        # Element text must be read while the driver session is still alive
        odds = []
        for row in odd_table.find_elements(By.CSS_SELECTOR, 'tr.rc-odds-row'):
            win_cell = row.find_elements(By.TAG_NAME, 'td')[odds_index]
            odds.append(win_cell.find_element(By.TAG_NAME, 'a').text.strip())
    finally:
        driver.quit()

    return pd.DataFrame(odds, columns=['Win Odds'])

# Main

In [None]:
def scrape_current_race(race_url, odds_url):
    """Build the model feature table for one race.

    Downloads the race card, extracts race-level and per-runner details,
    scrapes each runner's profile page, appends the current win odds and
    returns the columns in the order used for training.

    The per-runner frames are joined side by side by row position, so they
    are assumed to list the runners in the same order.

    Parameters
    ----------
    race_url : str
        URL of the race-card page.
    odds_url : str
        URL of the odds page for the same race.

    Returns
    -------
    pandas.DataFrame
        One row per runner with the feature columns listed in
        ``feature_cols``; columns that could not be produced are filled
        with NaN. Empty strings, ``0`` and ``'0'`` are converted to NaN.

    Raises
    ------
    requests.HTTPError
        If the race-card request returns a 4xx/5xx status.
    """
    http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}
    # A timeout keeps a stalled connection from hanging the notebook
    response = requests.get(race_url, headers=http_headers, timeout=10)

    if response.status_code != 200:
        print('races not found')
        # Stop on an error status instead of parsing an error page and
        # failing later with an unrelated exception.
        response.raise_for_status()
    else:
        print('race_url found')

    soup = BeautifulSoup(response.text, 'html.parser')

    # from soup, find the race specific information
    venue, track, course, distance, condition, race_class = get_race_info(soup)

    # from soup, find the horse specific information
    df_horse_in_race = get_horse_info(soup)

    # get the horse race_url for scraping horse related information
    participating_horse_links = get_race_horse(soup)

    # scrape horse data from each horse's page
    df_basic_info = scrape_horses(participating_horse_links)

    # join both df about the horse (row i of each frame is the same runner)
    df = pd.concat([df_horse_in_race, df_basic_info], axis=1)

    # race-level values are identical for every runner
    df['rc'] = venue
    df['track'] = track
    df['course'] = course
    df['Dist.'] = distance
    df['track_condition'] = condition
    df['RaceClass'] = race_class

    # clean origin, age
    origin_age = [clean_origin_age(value) for value in df['Origin / Age']]
    df['origin'] = [origin for origin, _ in origin_age]
    df['age'] = [age for _, age in origin_age]
    df = df.drop(columns=['Origin / Age'])

    # clean colour, sex
    colour_sex = [clean_colour_sex(value) for value in df['Colour / Sex']]
    df['colour'] = [colour for colour, _ in colour_sex]
    df['sex'] = [sex for _, sex in colour_sex]
    df = df.drop(columns=['Colour / Sex'])

    # obtain odds
    odds_df = obtain_odds(odds_url)

    final_df = pd.concat([df, odds_df], axis=1)

    # Replace empty strings and zeros with NaN, then drop fully-empty rows
    final_df = final_df.replace(['', 0, '0'], np.nan)
    final_df = final_df.dropna(how='all')

    # Define feature columns order same as training
    feature_cols = [
        'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type',
        'Sire', 'Dam', "Dam sire", 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex',
        'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.', 'Horse_name'
    ]

    # Reorder columns to match training order; reindex adds any missing
    # column as NaN rather than raising.
    return final_df.reindex(columns=feature_cols)

In [223]:
# Race card to scrape, and the odds page for the same date (2025-09-07),
# racecourse (ST) and race number (2). Both URLs must point at the same race.
race_url = 'https://racing.hkjc.com/racing/information/English/Racing/RaceCard.aspx?RaceDate=2025/09/07&Racecourse=ST&RaceNo=2'
odds_url = 'https://bet.hkjc.com/en/racing/wp/2025-09-07/ST/2'

# Performs live network requests and launches headless Chrome for the odds
df = scrape_current_race(race_url, odds_url)

race_url found


In [224]:
# Show the assembled per-runner feature table returned by scrape_current_race
df

Unnamed: 0,Dist.,track_condition,RaceClass,gate_position,Trainer,Jockey,Import type,Sire,Dam,Dam sire,...,track,course,origin,age,colour,sex,Rtg.,Win Odds,Act.Wt.,Declar.Horse Wt.
0,1600,G,5,3,A S Cruz,Y L Chung (-2),ISG,Dark Angel,Layla Jamil,Exceed And Excel,...,Turf,A,IRE,6,Grey,Gelding,37,9.4,132,1123
1,1600,G,5,11,W Y So,K De Melo,PPG,Holy Roman Emperor,Justice Angel,Dark Angel,...,Turf,A,FR,5,Grey,Gelding,36,28.0,131,1059
2,1600,G,5,10,W Y So,H Bentley,PPG,Burgundy,Steal,Red Ransom,...,Turf,A,NZ,7,Bay,Gelding,35,28.0,130,1024
3,1600,G,5,8,C S Shum,A Atzeni,PP,Massaat,Kalia Asha,Dark Angel,...,Turf,A,GB,5,Bay,Gelding,35,6.5,130,1132
4,1600,G,5,12,P F Yiu,K C Leung,PPG,Deep Field,Discreet,Show A Heart,...,Turf,A,AUS,5,Bay,Gelding,34,8.5,129,1187
5,1600,G,5,4,K L Man,A Badel,PPG,Iffraaj,Bellazeel,Zabeel,...,Turf,A,NZ,6,Chestnut,Gelding,34,12.0,129,1127
6,1600,G,5,9,D Eustace,B Avdulla,PP,Not A Single Doubt,Courtesan,Mastercraftsman,...,Turf,A,AUS,5,Bay,Gelding,31,6.3,126,1051
7,1600,G,5,2,W Y So,J Orman,PPG,Charm Spirit,Guessed,Snippetson,...,Turf,A,NZ,6,Bay,Gelding,31,21.0,126,1100
8,1600,G,5,14,J Size,M F Poon,PPG,Churchill,Pure Choice,Choisir,...,Turf,A,AUS,5,Chestnut,Gelding,31,21.0,126,1024
9,1600,G,5,7,C W Chang,Z Purton,ISG,Sebring,Fashion,Anabaa,...,Turf,A,AUS,7,Chestnut,Gelding,29,6.4,124,1092
