In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# Utility

In [2]:
# get race information
def get_race_info(soup):
    """Extract venue, track, course, distance and class from a race-card page.

    The information is read from the first ``<div class="f_fs13">`` of the
    page, whose text lines look like::

        Race 1 - GRASSY HILL HANDICAP
        Sunday, September 07, 2025, Sha Tin, 13:00
        Turf, "A" Course, 1200M
        Prize Money: $875,000, Rating: 40-0, Class 5

    Args:
        soup: Parsed race-card page (a BeautifulSoup object).

    Returns:
        tuple: ``(venue, track, course, distance, race_class)`` as strings.
        ``venue`` is ``'ST'``, ``'HV'`` or ``'not found'``; ``course`` is
        ``'na'`` when the track line carries no course segment.

    Raises:
        ValueError: If the information block is missing or has fewer text
            lines / fields than the parser needs.
    """
    result = soup.find('div', class_ = 'f_fs13')
    if result is None:
        raise ValueError('race information block (div.f_fs13) not found')

    lines = list(result.stripped_strings)
    if len(lines) < 4:
        raise ValueError(
            f'expected at least 4 lines of race information, got {len(lines)}'
        )

    # find venue: look for a known course name among the comma-separated
    # fields of the date line instead of trusting a fixed position
    venue_codes = {'Sha Tin': 'ST', 'Happy Valley': 'HV'}
    venue = 'not found'
    for segment in lines[1].split(','):
        code = venue_codes.get(segment.strip())
        if code is not None:
            venue = code
            break

    # find track, course, and distance
    info = [part.strip() for part in lines[2].split(',')]
    if len(info) >= 3:
        track = info[0]
        course = info[1].replace('"', '').replace('Course', '').strip()
        distance = info[2]
    elif len(info) == 2:
        # NOTE(review): a two-field line has no course segment; presumably
        # this is how all-weather races are listed - confirm against the site.
        track, distance = info
        course = 'na'
    else:
        raise ValueError(f'cannot parse track/course/distance from {lines[2]!r}')

    # find class: the prize-money figure contains a variable number of
    # thousands separators (e.g. $875,000 vs $1,170,000), so a fixed index
    # into the comma split is unreliable; pick the field that mentions
    # "Class" and fall back to the last field when none does.
    segments = [part.strip() for part in lines[3].split(',')]
    class_segments = [part for part in segments if 'Class' in part]
    class_text = class_segments[0] if class_segments else segments[-1]
    race_class = class_text.replace('Class', '').strip()

    return venue, track, course, distance, race_class

In [3]:
# get all horse href for characteristic
def get_race_horse(soup):
    """Collect the profile-page URL of every horse listed on a race card.

    Looks up the starters table, finds the column whose header cell reads
    ``Horse`` and gathers the ``href`` of the first link in that column for
    each following row. Site-relative links (starting with ``/``) are
    prefixed with ``https://racing.hkjc.com``.

    Args:
        soup: Parsed race-card page (a BeautifulSoup object).

    Returns:
        list[str]: Horse page URLs in table order. An empty list is returned
        (after printing a diagnostic) when the table or its ``Horse`` column
        cannot be found.
    """
    all_links = []

    # find the table containing the horses url
    parti_horses = soup.find('table', class_ = 'starter f_tac f_fs13 draggable hiddenable')
    if parti_horses is None:
        print('horses table not found')
        return all_links

    header_row = parti_horses.find('tr')
    if header_row is None:
        print('no horse column found')
        return all_links

    headers = [td.get_text(strip = True) for td in header_row.find_all('td')]
    if 'Horse' not in headers:
        print('no horse column found')
        return all_links

    # get the column index of the horse column
    horse_col_index = headers.index('Horse')

    for row in parti_horses.find_all('tr')[1:]:
        cells = row.find_all('td') # cell in the row
        if len(cells) <= horse_col_index:
            # row is too short to have a horse cell
            continue

        a_tag = cells[horse_col_index].find('a', href = True)
        if a_tag is None:
            continue

        href = a_tag['href']
        if href.startswith('/'):
            href = 'https://racing.hkjc.com' + href
        all_links.append(href)

    return all_links


In [51]:
def _find_labelled_value(soup, label, exclude=None):
    """Return the text found two elements after the first matching label cell.

    A ``<td>`` matches when its string contains ``label`` and, if ``exclude``
    is given, does not contain ``exclude``. Returns ``'na'`` when no cell
    matches or when nothing follows the matching cell.
    """
    def _is_label(text):
        if not text or label not in text:
            return False
        return exclude is None or exclude not in text

    label_cell = soup.find('td', string=_is_label)
    if label_cell is None:
        return 'na'

    # The value is taken two find_next() steps after the label cell.
    # NOTE(review): presumably a separator cell sits in between - confirm
    # against the page markup.
    value_node = label_cell.find_next()
    if value_node is not None:
        value_node = value_node.find_next()
    if value_node is None:
        return 'na'
    return value_node.get_text(strip=True)


def _parse_horse_name(soup):
    """Return the horse name from the page title span, or ``'na'`` if absent."""
    name_tag = soup.find('span', class_='title_text')
    if name_tag is None:
        print('horse name not found')
        return 'na'  # handle missing name

    name_text = name_tag.get_text()
    # keep only what precedes the first '(' in the title text
    match = re.match(r'^([^\(]+)', name_text)
    return match.group(1).strip() if match else name_text


def scrape_horses(all_links):
    """Download each horse page and tabulate its profile fields.

    Args:
        all_links: Iterable of horse profile URLs.

    Returns:
        pandas.DataFrame: One row per page that was fetched successfully,
        with the columns ``Horse_name``, ``Origin / Age``, ``Colour / Sex``,
        ``Import type``, ``Sire``, ``Dam``, ``Dam sire`` and ``Rtg.``.
        A field that cannot be located on a page is stored as ``'na'``.
        Pages that cannot be fetched (network failure or HTTP error status)
        are reported with ``print`` and skipped.
    """
    header = [
        'Horse_name',
        'Origin / Age',
        'Colour / Sex',
        'Import type',
        'Sire',
        'Dam',
        "Dam sire",
        "Rtg."
    ]
    rows = []

    for ref in all_links:
        try:
            response = requests.get(ref, timeout=10)
            # an error page would otherwise be parsed into a row of 'na'
            response.raise_for_status()
        except requests.RequestException as e:
            print(f'Request exception for {ref}: {str(e)}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        rows.append([
            _parse_horse_name(soup),
            _find_labelled_value(soup, 'Country of Origin'),
            _find_labelled_value(soup, 'Colour / Sex'),
            _find_labelled_value(soup, 'Import Type'),
            # 'Sire' and 'Dam' are both substrings of "Dam's Sire"; the
            # exclusions keep each lookup on its own label regardless of the
            # order in which the cells appear.
            _find_labelled_value(soup, 'Sire', exclude='Dam'),
            _find_labelled_value(soup, 'Dam', exclude='Sire'),
            _find_labelled_value(soup, "Dam's Sire"),
            _find_labelled_value(soup, "Current Rating"),
        ])

    return pd.DataFrame(rows, columns = header)

In [43]:
# clean origin, age
def clean_origin_age(text):
    """Split an ``'ORIGIN / AGE'`` value into its two stripped components.

    Args:
        text: Raw cell value; it is converted with ``str`` before splitting.

    Returns:
        tuple: ``(origin, age)``. ``age`` is ``None`` when the value holds no
        ``/`` separator; fields beyond the second are ignored.
    """
    fields = [piece.strip() for piece in str(text).split('/')]

    origin = fields[0]
    age = fields[1] if len(fields) >= 2 else None

    return origin, age

In [50]:
# clean colour, sex
def clean_colour_sex(text):
    """Split a ``'COLOUR / SEX'`` value into its two stripped components.

    Args:
        text: Raw cell value; it is converted with ``str`` before splitting.

    Returns:
        tuple: ``(colour, sex)``. ``colour`` is the first ``/``-separated
        field and ``sex`` the last one. ``sex`` is ``None`` when the value
        holds no ``/`` separator (e.g. the ``'na'`` placeholder), matching
        how ``clean_origin_age`` reports a missing second field.
    """
    parts = [piece.strip() for piece in str(text).split('/')]

    colour = parts[0]
    # without a separator the last field IS the first field, so reporting it
    # as the sex would just duplicate the colour
    sex = parts[-1] if len(parts) > 1 else None

    return colour, sex

# Main

In [52]:
def scrape_current_race(url):
    """Scrape one race card and return a table with one row per horse.

    Fetches ``url``, reads the race attributes with ``get_race_info``,
    collects the horse links with ``get_race_horse``, scrapes every horse
    page with ``scrape_horses`` and finally splits the combined
    ``Origin / Age`` and ``Colour / Sex`` columns.

    Args:
        url: Address of the race-card page.

    Returns:
        pandas.DataFrame: Columns ``Horse_name``, ``Import type``, ``Sire``,
        ``Dam``, ``Dam sire``, ``Rtg.``, ``rc``, ``track``, ``course``,
        ``Dist.``, ``RaceClass``, ``origin``, ``age``, ``colour``, ``sex``.

    Raises:
        requests.HTTPError: If the race-card request does not return
            status 200 (parsing an error page would only fail later with a
            less helpful message).
    """
    http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}
    # same timeout as the per-horse requests so a stalled server cannot hang the kernel
    response = requests.get(url, headers = http_headers, timeout = 10)

    if response.status_code != 200:
        print('races not found')
        raise requests.HTTPError(
            f'unexpected HTTP status {response.status_code} for {url}',
            response = response,
        )
    print('url found')

    soup = BeautifulSoup(response.text, 'html.parser')

    # from soup, find the race specific information
    venue, track, course, distance, race_class = get_race_info(soup)

    # get the horse url for scraping horse related information
    participating_horse_links = get_race_horse(soup)

    # scrape horse data from url
    df = scrape_horses(participating_horse_links)

    df['rc'] = venue
    df['track'] = track
    df['course'] = course
    df['Dist.'] = distance
    df['RaceClass'] = race_class

    # clean origin, age
    # (plain lists instead of .apply(pd.Series): the latter yields a frame
    # with no columns when df is empty, which makes the assignment fail)
    origin_age = [clean_origin_age(value) for value in df['Origin / Age']]
    df['origin'] = [origin for origin, _ in origin_age]
    df['age'] = [age for _, age in origin_age]
    df = df.drop(columns = ['Origin / Age'])

    # clean colour, sex
    colour_sex = [clean_colour_sex(value) for value in df['Colour / Sex']]
    df['colour'] = [colour for colour, _ in colour_sex]
    df['sex'] = [sex for _, sex in colour_sex]
    df = df.drop(columns = ['Colour / Sex'])

    return df

In [53]:
# Race card to scrape; the query string selects the date (2025/09/07),
# the racecourse code (ST) and the race number (1).
url = 'https://racing.hkjc.com/racing/information/English/Racing/RaceCard.aspx?RaceDate=2025/09/07&Racecourse=ST&RaceNo=1'

# Fetch the card and every horse page, returning one row per horse.
df = scrape_current_race(url)

url found


In [54]:
# Display the scraped race table.
df

Unnamed: 0,Horse_name,Import type,Sire,Dam,Dam sire,Rtg.,rc,track,course,Dist.,RaceClass,origin,age,colour,sex
0,FLYING AKEED,PPG,Akeed Mofeed,Dream Beauty,Lacryma Cristi,40,ST,Turf,A,1200M,5,AUS,5,Bay,Gelding
1,COLONEL,PPG,Ferlax,Outrageous Fortune,Volksraad,38,ST,Turf,A,1200M,5,NZ,9,Brown,Gelding
2,NOBLE DELUXE,PPG,Toronado,Big Spirit,Invincible Spirit,38,ST,Turf,A,1200M,5,AUS,4,Bay,Gelding
3,ORIENTAL SURPRISE,PPG,Mikki Isle,Palatine Hill,Palace Music,38,ST,Turf,A,1200M,5,AUS,5,Chestnut,Gelding
4,SUNNY DARLING,PP,Shalaa,Honesty Prevails,Redoute's Choice,38,ST,Turf,A,1200M,5,AUS,6,Bay,Gelding
5,BRILLIANT FIRE,PPG,Justify,So You Merge,So You Think,37,ST,Turf,A,1200M,5,AUS,5,Chestnut,Gelding
6,BINGO BABE,PPG,Russian Revolution,One Mansini,Nicconi,35,ST,Turf,A,1200M,5,AUS,5,Bay,Gelding
7,MANYTHANKS FOREVER,PPG,Star Witness,Bukzel,Snitzel,33,ST,Turf,A,1200M,5,AUS,5,Chestnut,Gelding
8,SPEEDY SMARTIE,PPG,Satono Aladdin,Lemonade,Bertolini,33,ST,Turf,A,1200M,5,NZ,6,Bay,Gelding
9,THE CONCENTRATION,PPG,Hellbent,Ivy Blue,Arlington,33,ST,Turf,A,1200M,5,AUS,5,Brown,Gelding


# Testing (step-by-step scratch cells for the functions above)


In [4]:
# Fetch the sample race card that the scratch cells below work on.
# The timeout (same value scrape_horses uses) keeps an unresponsive
# server from hanging the kernel.
http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}
response = requests.get('https://racing.hkjc.com/racing/information/English/Racing/RaceCard.aspx?RaceDate=2025/09/07&Racecourse=ST&RaceNo=1', headers = http_headers, timeout = 10)

if response.status_code != 200:
    print('races not found')
else:
    print('url found')

url found


In [5]:
# Parse the fetched race-card HTML; later cells query this soup object.
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
# Run the race-information parser on the sample page.
venue, track, course, distance, race_class = get_race_info(soup)

In [7]:
# Show the five parsed race attributes.
print(venue, track, course, distance, race_class)

ST Turf A 1200M 5


In [8]:
# Collect the horse profile links from the sample page.
parti_links = get_race_horse(soup)

In [9]:
# Inspect the collected links.
parti_links

['https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J471',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2019_D090',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K052',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K028',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J171',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J441',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K115',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J065',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H108',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H465',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?

In [10]:
# Locate the race-information block (the same lookup get_race_info performs).
result = soup.find('div', class_ = 'f_fs13')

In [11]:
# Materialise the block's stripped text strings as a list.
lines = list(result.stripped_strings)

In [12]:
# Inspect the extracted text lines.
lines

['Race 1 - GRASSY HILL HANDICAP',
 'Sunday, September 07, 2025, Sha Tin, 13:00',
 'Turf, "A" Course, 1200M',
 'Prize Money: $875,000, Rating: 40-0, Class 5']

In [13]:
# Take the fourth comma-separated field of the date line and translate the
# course name into its short code ('not found' for anything unrecognised).
venue = lines[1].split(',')[3].strip()
venue = {'Sha Tin': 'ST', 'Happy Valley': 'HV'}.get(venue, 'not found')


In [14]:
# find track, course, and distance
# The third text line looks like: Turf, "A" Course, 1200M
info = lines[2].split(',')

track = info[0].strip()
# drop the quotes and the word "Course" so only the course label remains
course = info[1].replace('"', '').replace('Course', '').strip()
# same field that get_race_info returns under the name `distance`
length = info[2].strip()

In [15]:
# find class
# NOTE(review): index 3 is the "Class N" field only because the prize-money
# figure in the sample line ($875,000) contains exactly one comma; a figure
# with a different number of thousands separators shifts the field position.
race_class = lines[3].split(',')
race_class = race_class[3].replace('Class', '').strip()
race_class

'5'

In [16]:
# accumulator for the horse profile URLs (filled by the loop in the next cell)
all_links = []

# find the table containing the horses url
parti_horses = soup.find('table', class_ = 'starter f_tac f_fs13 draggable hiddenable')

# This only reports the problem: execution continues, so the .find call
# below raises AttributeError when the table is missing.
if not parti_horses:
    print('horses table not found')

# first row of the table; its cell texts are read as column names in the next cell
header_row = parti_horses.find('tr')

In [17]:
# Column names are the cell texts of the header row.
headers = [td.get_text(strip = True) for td in header_row.find_all('td')]
if 'Horse' not in headers:
    print('no horse column found')

# Position of the "Horse" column within each row.
horse_col_index = headers.index('Horse')

# Walk the data rows (everything after the header row) and keep the link
# found in the horse cell; rows without that cell or without a link are skipped.
for row in parti_horses.find_all('tr')[1:]:
    cells = row.find_all('td')
    if len(cells) <= horse_col_index:
        continue

    td = cells[horse_col_index]
    a_tag = td.find('a', href = True)
    if not a_tag:
        continue

    href = a_tag['href']
    if href.startswith('/'):
        # site-relative link: make it absolute
        href = 'https://racing.hkjc.com' + href
    all_links.append(href)
            

In [18]:
# Inspect the links gathered by the loop above.
all_links

['https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J471',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2019_D090',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K052',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K028',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J171',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J441',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K115',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J065',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H108',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H465',
 'https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?

In [34]:
# Scratch version of the per-horse scrape: for every URL in `all_links` it
# downloads the horse page and collects seven profile fields into `rows`.
# Unlike scrape_horses, it does not collect the rating ("Rtg.") column.
# NOTE: the loop rebinds `response` and `soup`, so after this cell they no
# longer refer to the race-card page loaded in the earlier cells.
header = [
    'Horse_name',
    'Origin / Age',
    'Colour / Sex',
    'Import type',
    'Sire',
    'Dam',
    "Dam sire"
]  # column names, defined once before the loop
rows = []

for ref in all_links:
    # a failed request is reported and that horse is skipped
    try:
        response = requests.get(ref, timeout=10)
    except Exception as e:
        print(f'Request exception for {ref}: {str(e)}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # name of the horse: keep the part of the title text before the first '('
    name_tag = soup.find('span', class_='title_text')
    if name_tag:
        name_text = name_tag.get_text()
        match = re.match(r'^([^\(]+)', name_text)
        horse_name = match.group(1).strip() if match else name_text
    else:
        print('horse name not found')
        horse_name = 'na'  # handle missing name

    # Each lookup below finds the first <td> whose string contains the label,
    # then reads the text two find_next() steps later; 'na' marks a missing label.

    # find origin and age
    origin_label = soup.find('td', string=lambda text: text and 'Country of Origin' in text)
    origin_info = origin_label.find_next().find_next().get_text(strip=True) if origin_label else 'na'

    # find colour and sex
    colour_label = soup.find('td', string=lambda text: text and 'Colour / Sex' in text)
    colour_info = colour_label.find_next().find_next().get_text(strip=True) if colour_label else 'na'

    # find import type
    import_label = soup.find('td', string=lambda text: text and 'Import Type' in text)
    import_info = import_label.find_next().find_next().get_text(strip=True) if import_label else 'na'

    # find sire
    # NOTE(review): 'Sire' is also a substring of "Dam's Sire"; only the first
    # matching cell is used, so this relies on the plain sire label coming first.
    sire_label = soup.find('td', string=lambda text: text and 'Sire' in text)
    sire_info = sire_label.find_next().find_next().get_text(strip=True) if sire_label else 'na'

    # find dam
    # NOTE(review): 'Dam' likewise matches "Dam's Sire"; same ordering assumption.
    dam_label = soup.find('td', string=lambda text: text and 'Dam' in text)
    dam_info = dam_label.find_next().find_next().get_text(strip=True) if dam_label else 'na'

    # find dam's sire
    dam_sire_label = soup.find('td', string=lambda text: text and "Dam's Sire" in text)
    dam_sire_info = dam_sire_label.find_next().find_next().get_text(strip=True) if dam_sire_label else 'na'

    # one record per horse, in the same order as `header`
    cols = [
        horse_name,
        origin_info,
        colour_info,
        import_info,
        sire_info,
        dam_info,
        dam_sire_info
    ]
    rows.append(cols)


In [36]:
# Show the column names collected by the scratch scrape.
print(header)

['Horse_name', 'Origin / Age', 'Colour / Sex', 'Import type', 'Sire', 'Dam', 'Dam sire']


In [35]:
# Show the scraped records (one inner list per horse).
print(rows)

[['FLYING AKEED', 'AUS / 5', 'Bay / Gelding', 'PPG', 'Akeed Mofeed', 'Dream Beauty', 'Lacryma Cristi'], ['COLONEL', 'NZ / 9', 'Brown / Gelding', 'PPG', 'Ferlax', 'Outrageous Fortune', 'Volksraad'], ['NOBLE DELUXE', 'AUS / 4', 'Bay / Gelding', 'PPG', 'Toronado', 'Big Spirit', 'Invincible Spirit'], ['ORIENTAL SURPRISE', 'AUS / 5', 'Chestnut / Gelding', 'PPG', 'Mikki Isle', 'Palatine Hill', 'Palace Music'], ['SUNNY DARLING', 'AUS / 6', 'Bay / Gelding', 'PP', 'Shalaa', 'Honesty Prevails', "Redoute's Choice"], ['BRILLIANT FIRE', 'AUS / 5', 'Chestnut / Gelding', 'PPG', 'Justify', 'So You Merge', 'So You Think'], ['BINGO BABE', 'AUS / 5', 'Bay / Gelding', 'PPG', 'Russian Revolution', 'One Mansini', 'Nicconi'], ['MANYTHANKS FOREVER', 'AUS / 5', 'Chestnut / Gelding', 'PPG', 'Star Witness', 'Bukzel', 'Snitzel'], ['SPEEDY SMARTIE', 'NZ / 6', 'Bay / Gelding', 'PPG', 'Satono Aladdin', 'Lemonade', 'Bertolini'], ['THE CONCENTRATION', 'AUS / 5', 'Brown / Gelding', 'PPG', 'Hellbent', 'Ivy Blue', 'Arlin