In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import re
import time

# Functions

In [2]:
def scrape_horse_links(date, venue, max_race_no = 14):

    data = [[date, venue]]

    fixture_df = pd.DataFrame(data, columns = ['Date', 'Venue'])
    
    base_url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
    http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}

    all_links = []

    # loop for date and venue
    for idx, row in fixture_df.iterrows():
        date = row['Date']
        venue = row['Venue']

        print(f'scraping date: {date} @ {venue}')

        # loop for race numbers
        for race_no in range(1, max_race_no + 1):
            params = {
                "RaceDate": date,
                "Racecourse": venue,
                "RaceNo": race_no   
            }

            print(f'scraping race no: {race_no}')
            time.sleep(1)

            # get the response from the url
            response = requests.get(base_url, params = params, headers = http_headers)
            if response.status_code != 200:
                print(f"  Race {race_no}: Failed to fetch (status {response.status_code}). Stopping.")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            table = soup.find('table', class_ = 'table_bd')
            if not table:
                print('table not found')
                break

            header_row = table.find('tr') 
            headers = [td.get_text(strip = True) for td in header_row.find_all('td')]
            if 'Horse' not in headers:
                print("no 'horse' column found")
                break

            # get position of the horse column
            horse_col_index = headers.index('Horse')

            for row in table.find_all('tr')[1:]:  # skip header row
                cells = row.find_all('td') # get all cells in the row
                if len(cells) > horse_col_index:
                    td = cells[horse_col_index]
                    a_tag = td.find('a', href = True)
                    if a_tag:
                        href = a_tag['href']
                        if href.startswith('/'):
                            href = "https://racing.hkjc.com" + href
                        all_links.append(href) 
    
    return list(set(all_links))



In [3]:
def scraping_races(list, race_date):

    # Initiate header and rows for storage
    header = None
    rows = []

    for href in list:
        full_url = href + '&Option=1'

        try:
            response = requests.get(full_url, timeout = 10)
            print(f'scraping {full_url}')
        except Exception as e:
            print(f'Request exception for {full_url}: {str(e)}')
            continue

        match = re.search(r'([A-Z]\d{3})$', href)
        horse_id = match.group(1) if match else None

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # find origin and age
            origin_label = soup.find('td', string = lambda text : text and 'Country of Origin' in text)
            if origin_label:
                origin_info = origin_label.find_next().find_next().get_text(strip = True)
            else:
                origin_info = 'na'

            # find colour and sex
            colour_label = soup.find('td', string = lambda text : text and 'Colour / Sex' in text)
            if colour_label:
                colour_info = colour_label.find_next().find_next().get_text(strip = True)
            else:
                colour_info = 'na'

            # find import type
            import_label = soup.find('td', string = lambda text : text and 'Import Type' in text)
            if import_label:
                import_info = import_label.find_next().find_next().get_text(strip = True)
            else:
                import_info = 'na'

            # find sire
            sire_label = soup.find('td', string = lambda text : text and 'Sire' in text)
            if sire_label:
                sire_info = sire_label.find_next().find_next().get_text(strip = True)
            else:
                sire_info = 'na'

            # find dam
            dam_label = soup.find('td', string = lambda text : text and 'Dam' in text)
            if dam_label:
                dam_info = dam_label.find_next().find_next().get_text(strip = True)
            else:
                dam_info = 'na'

            # find dam's sire
            dam_sire_label = soup.find('td', string = lambda text : text and 'Dam\'s Sire' in text)
            if dam_sire_label:
                dam_sire_info = dam_sire_label.find_next().find_next().get_text(strip = True)
            else:
                dam_sire_info = 'na'
            
            # find horse per race info
            horse_table = soup.find('table', class_='bigborder')
            if horse_table:
                if header is None:
                    header_row = horse_table.find('tr')
                    header = [th.get_text(strip=True) for th in header_row.find_all('td')]
                    header = header[:-1]
                    header.append('Horse_id')
                    header.append('Origin / Age')
                    header.append('Colour / Sex')
                    header.append('Import type')
                    header.append('Sire')
                    header.append('Dam')
                    header.append('Dam sire')

                for tr in horse_table.find_all('tr')[1:]:
                    cols = [td.get_text(strip=True) for td in tr.find_all('td')]

                    if all(not col.strip() for col in cols):
                        continue
                    first_col = cols[0].strip()
                    if 'Season' in first_col or first_col == 'Overseas':
                        continue

                    # append all info into the rows 
                    cols = cols[:-2]
                    cols.append(horse_id)
                    cols.append(origin_info)
                    cols.append(colour_info)
                    cols.append(import_info)
                    cols.append(sire_info)
                    cols.append(dam_info)
                    cols.append(dam_sire_info)
                    rows.append(cols)
            else:
                print(f'Table not found in {full_url}')


        else:
            print(f"Failed to retrieve {full_url}")

    if not header:
        raise RuntimeError("No data table header found! Check the structure of the page.")

    df = pd.DataFrame(rows, columns=header)
    df['Date'] = pd.to_datetime(df['Date'], format = 'mixed',  dayfirst = True).dt.strftime('%d/%m/%y')
    race_date = pd.to_datetime(race_date, format = 'mixed', dayfirst=True).strftime('%d/%m/%y')

    df = df[df['Date'] == race_date]

    return df


In [None]:
def append_with_new(new_df, old_df):
    """
    old_df : link for old dataframe with historical data
    """

    old_df = pd.read_csv(old_df)
    df = pd.concat([old_df, new_df])

    current_time = datetime.now().strftime('%Y%m%d')

    file_path = f'../data/race_data/race_data_{current_time}.csv'

    df.to_csv(file_path, index=False)

    return df

# Workflow

In [5]:
links = scrape_horse_links('08/10/2025', 'HV')

scraping date: 08/10/2025 @ HV
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
scraping race no: 10
table not found


In [6]:
df = scraping_races(links, '08/10/2025')

scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J266&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J371&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J421&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G471&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J164&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2019_D241&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J317&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J305&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J423&Option=1
scraping https://racing.hkjc.com/raci

In [7]:
df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,081,04,08/10/25,"HV / Turf / ""C+3""",1650,GF,3,6,67,C S Shum,...,1.38.89,1028,TT,J266,BRZ / 5,Chestnut / Gelding,PP,Courtier,Zealous Vixen,Inexplicable
20,077,05,08/10/25,"HV / Turf / ""C+3""",1650,GF,4,3,43,K H Ting,...,1.40.33,1168,V/XB,J371,AUS / 4,Brown / Gelding,PPG,Capitalist,Meri Rani,Stratum
33,083,07,08/10/25,"HV / Turf / ""C+3""",1200,GF,3,1,62,C Fownes,...,1.09.69,1095,B/TT,J421,NZ / 5,Bay / Gelding,ISG,Per Incanto,Monarch,Volksraad
44,082,06,08/10/25,"HV / Turf / ""C+3""",1200,GF,4,8,57,C S Shum,...,1.10.16,1140,B2/TT,G471,AUS / 6,Chestnut / Gelding,ISG,Deep Field,Bousquet,More Than Ready
74,084,09,08/10/25,"HV / Turf / ""C+3""",1200,GF,3,9,64,C W Chang,...,1.09.68,1115,CP-/XB/TT,J164,NZ / 5,Brown / Gelding,PPG,What's The Story,Citycenta,Elusive City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,080,07,08/10/25,"HV / Turf / ""C+3""",1000,GF,4,1,50,Y S Tsui,...,0.57.58,1147,--,E432,AUS / 8,Bay / Gelding,PP,I Am Invincible,Resonates,Teofilo
2057,080,06,08/10/25,"HV / Turf / ""C+3""",1000,GF,4,10,44,M Newnham,...,0.57.56,1022,XB,J545,AUS / 4,Brown / Gelding,PPG,Capitalist,Muchas Coronas,Onemorenomore
2065,078,08,08/10/25,"HV / Turf / ""C+3""",1200,GF,4,1,57,C W Chang,...,1.09.83,1123,--,H325,NZ / 5,Bay / Gelding,PPG,Swiss Ace,River Shannon,Mossman
2090,078,09,08/10/25,"HV / Turf / ""C+3""",1200,GF,4,9,57,A S Cruz,...,1.10.28,1097,B/TT,G451,AUS / 6,Bay / Gelding,PP,Deep Field,Murtle Turtle,Murtajill


In [8]:
new = append_with_new(df, '../data/race_data/race_data_20251007.csv')

  old_df = pd.read_csv(old_df)


In [9]:
new

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,238,12,10/12/2023,"ST / Turf / ""A""",1600,G,G1,10,--,T Yasuda,...,1.35.46,1187,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
1,623,05,30/04/2023,"ST / Turf / ""A""",2000,G,G1,7,--,T Yasuda,...,2.02.71,1179,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
2,240,02,11/12/2022,"ST / Turf / ""A""",2000,G,G1,6,--,T Yasuda,...,2.00.44,1150,--,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
3,402,11,06/02/2021,"ST / Turf / ""C""",1200,G,5,2,18,C H Yip,...,1.10.66,1045,CP-/TT-,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
4,296,05,26/12/2020,"ST / Turf / ""A+3""",1200,G,5,14,18,C H Yip,...,1.10.42,1058,CP/TT,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,080,07,08/10/25,"HV / Turf / ""C+3""",1000,GF,4,1,50,Y S Tsui,...,0.57.58,1147,--,E432,AUS / 8,Bay / Gelding,PP,I Am Invincible,Resonates,Teofilo
2057,080,06,08/10/25,"HV / Turf / ""C+3""",1000,GF,4,10,44,M Newnham,...,0.57.56,1022,XB,J545,AUS / 4,Brown / Gelding,PPG,Capitalist,Muchas Coronas,Onemorenomore
2065,078,08,08/10/25,"HV / Turf / ""C+3""",1200,GF,4,1,57,C W Chang,...,1.09.83,1123,--,H325,NZ / 5,Bay / Gelding,PPG,Swiss Ace,River Shannon,Mossman
2090,078,09,08/10/25,"HV / Turf / ""C+3""",1200,GF,4,9,57,A S Cruz,...,1.10.28,1097,B/TT,G451,AUS / 6,Bay / Gelding,PP,Deep Field,Murtle Turtle,Murtajill
