In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import re
import time

# Functions

In [2]:
def scrape_horse_links(date, venue, max_race_no = 14):

    data = [[date, venue]]

    fixture_df = pd.DataFrame(data, columns = ['Date', 'Venue'])
    
    base_url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
    http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}

    all_links = []

    # loop for date and venue
    for idx, row in fixture_df.iterrows():
        date = row['Date']
        venue = row['Venue']

        print(f'scraping date: {date} @ {venue}')

        # loop for race numbers
        for race_no in range(1, max_race_no + 1):
            params = {
                "RaceDate": date,
                "Racecourse": venue,
                "RaceNo": race_no   
            }

            print(f'scraping race no: {race_no}')
            time.sleep(1)

            # get the response from the url
            response = requests.get(base_url, params = params, headers = http_headers)
            if response.status_code != 200:
                print(f"  Race {race_no}: Failed to fetch (status {response.status_code}). Stopping.")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            table = soup.find('table', class_ = 'table_bd')
            if not table:
                print('table not found')
                break

            header_row = table.find('tr') 
            headers = [td.get_text(strip = True) for td in header_row.find_all('td')]
            if 'Horse' not in headers:
                print("no 'horse' column found")
                break

            # get position of the horse column
            horse_col_index = headers.index('Horse')

            for row in table.find_all('tr')[1:]:  # skip header row
                cells = row.find_all('td') # get all cells in the row
                if len(cells) > horse_col_index:
                    td = cells[horse_col_index]
                    a_tag = td.find('a', href = True)
                    if a_tag:
                        href = a_tag['href']
                        if href.startswith('/'):
                            href = "https://racing.hkjc.com" + href
                        all_links.append(href) 
    
    return list(set(all_links))



In [3]:
def scraping_races(list, race_date):

    # Initiate header and rows for storage
    header = None
    rows = []

    for href in list:
        full_url = href + '&Option=1'

        try:
            response = requests.get(full_url, timeout = 10)
            print(f'scraping {full_url}')
        except Exception as e:
            print(f'Request exception for {full_url}: {str(e)}')
            continue

        match = re.search(r'([A-Z]\d{3})$', href)
        horse_id = match.group(1) if match else None

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # find origin and age
            origin_label = soup.find('td', string = lambda text : text and 'Country of Origin' in text)
            if origin_label:
                origin_info = origin_label.find_next().find_next().get_text(strip = True)
            else:
                origin_info = 'na'

            # find colour and sex
            colour_label = soup.find('td', string = lambda text : text and 'Colour / Sex' in text)
            if colour_label:
                colour_info = colour_label.find_next().find_next().get_text(strip = True)
            else:
                colour_info = 'na'

            # find import type
            import_label = soup.find('td', string = lambda text : text and 'Import Type' in text)
            if import_label:
                import_info = import_label.find_next().find_next().get_text(strip = True)
            else:
                import_info = 'na'

            # find sire
            sire_label = soup.find('td', string = lambda text : text and 'Sire' in text)
            if sire_label:
                sire_info = sire_label.find_next().find_next().get_text(strip = True)
            else:
                sire_info = 'na'

            # find dam
            dam_label = soup.find('td', string = lambda text : text and 'Dam' in text)
            if dam_label:
                dam_info = dam_label.find_next().find_next().get_text(strip = True)
            else:
                dam_info = 'na'

            # find dam's sire
            dam_sire_label = soup.find('td', string = lambda text : text and 'Dam\'s Sire' in text)
            if dam_sire_label:
                dam_sire_info = dam_sire_label.find_next().find_next().get_text(strip = True)
            else:
                dam_sire_info = 'na'
            
            # find horse per race info
            horse_table = soup.find('table', class_='bigborder')
            if horse_table:
                if header is None:
                    header_row = horse_table.find('tr')
                    header = [th.get_text(strip=True) for th in header_row.find_all('td')]
                    header = header[:-1]
                    header.append('Horse_id')
                    header.append('Origin / Age')
                    header.append('Colour / Sex')
                    header.append('Import type')
                    header.append('Sire')
                    header.append('Dam')
                    header.append('Dam sire')

                for tr in horse_table.find_all('tr')[1:]:
                    cols = [td.get_text(strip=True) for td in tr.find_all('td')]

                    if all(not col.strip() for col in cols):
                        continue
                    first_col = cols[0].strip()
                    if 'Season' in first_col or first_col == 'Overseas':
                        continue

                    # append all info into the rows 
                    cols = cols[:-2]
                    cols.append(horse_id)
                    cols.append(origin_info)
                    cols.append(colour_info)
                    cols.append(import_info)
                    cols.append(sire_info)
                    cols.append(dam_info)
                    cols.append(dam_sire_info)
                    rows.append(cols)
            else:
                print(f'Table not found in {full_url}')


        else:
            print(f"Failed to retrieve {full_url}")

    if not header:
        raise RuntimeError("No data table header found! Check the structure of the page.")

    df = pd.DataFrame(rows, columns=header)
    df['Date'] = pd.to_datetime(df['Date'], format = 'mixed',  dayfirst = True).dt.strftime('%d/%m/%y')
    race_date = pd.to_datetime(race_date, format = 'mixed', dayfirst=True).strftime('%d/%m/%y')

    df = df[df['Date'] == race_date]

    return df


In [4]:
def append_with_new(new_df, old_df):

    old_df = pd.read_csv(old_df)
    df = pd.concat([old_df, new_df])

    current_time = datetime.now().strftime('%Y%m%d')

    file_path = f'../data/race_data/race_data_{current_time}.csv'

    df.to_csv(file_path, index=False)

    return df

# Workflow

In [9]:
links = scrape_horse_links('04/10/2025', 'ST')

scraping date: 04/10/2025 @ ST
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
scraping race no: 10
scraping race no: 11
table not found


In [10]:
df = scraping_races(links, '04/10/2025')

scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J502&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H476&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E392&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G230&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2023_J338&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H041&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K101&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G312&Option=1
scraping https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2024_K291&Option=1
scraping https://racing.hkjc.com/raci

In [11]:
df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,071,05,04/10/25,"ST / Turf / ""B+2""",1400,GF,4,9,49,D J Whyte,...,1.22.01,1175,CP-/SR1,J502,NZ / 5,Bay / Gelding,PPG,Embellish,Preetha Varma,Keeper
7,069,05,04/10/25,"ST / Turf / ""B+2""",1400,GF,4,9,42,K L Man,...,1.21.88,1204,TT,H476,AUS / 6,Bay / Gelding,ISG,I Am Invincible,Solar Moon,Pivotal
29,072,09,04/10/25,"ST / Turf / ""B+2""",1000,GF,3,3,84,A S Cruz,...,0.57.53,1162,V/TT,E392,AUS / 7,Bay / Gelding,PPG,Shalaa,Santa Rocks,Fastnet Rock
77,068,10,04/10/25,ST / AWT,1650,GD,4,1,42,K L Man,...,1.41.58,1221,B/TT,G230,IRE / 6,Chestnut / Gelding,PP,Dawn Approach,Teoirim,Teofilo
113,067,13,04/10/25,"ST / Turf / ""B+2""",1400,GF,5,7,27,C Fownes,...,1.23.62,1046,PC-/TT-,J338,AUS / 5,Chestnut / Gelding,PPG,Russian Revolution,Virani,Bernardini
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2469,066,06,04/10/25,ST / AWT,1650,GD,5,11,40,K H Ting,...,1.41.37,1113,B/TT,K069,AUS / 4,Bay / Gelding,PPG,Deep Field,Nakataan,Zabeel
2475,072,07,04/10/25,"ST / Turf / ""B+2""",1000,GF,3,6,62,C H Yip,...,0.57.11,1089,TT,K377,GB / 3,Bay / Gelding,PPG,Land Force,Phantom Spirit,Invincible Spirit
2480,069,03,04/10/25,"ST / Turf / ""B+2""",1400,GF,4,5,51,C Fownes,...,1.21.82,1157,TT,K398,NZ / 4,Bay / Gelding,PPG,Ardrossan,Dolce Amore,Sebring
2483,066,10,04/10/25,ST / AWT,1650,GD,5,6,37,P C Ng,...,1.41.52,1201,B/TT,J218,AUS / 5,Bay / Gelding,PPG,Deep Field,Alberton Park,Thorn Park


In [12]:
new = append_with_new(df, '../data/race_data/race_data_20251007.csv')

  old_df = pd.read_csv(old_df)


In [13]:
new

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,238,12,10/12/2023,"ST / Turf / ""A""",1600,G,G1,10,--,T Yasuda,...,1.35.46,1187,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
1,623,05,30/04/2023,"ST / Turf / ""A""",2000,G,G1,7,--,T Yasuda,...,2.02.71,1179,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
2,240,02,11/12/2022,"ST / Turf / ""A""",2000,G,G1,6,--,T Yasuda,...,2.00.44,1150,--,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
3,402,11,06/02/2021,"ST / Turf / ""C""",1200,G,5,2,18,C H Yip,...,1.10.66,1045,CP-/TT-,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
4,296,05,26/12/2020,"ST / Turf / ""A+3""",1200,G,5,14,18,C H Yip,...,1.10.42,1058,CP/TT,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2469,066,06,04/10/25,ST / AWT,1650,GD,5,11,40,K H Ting,...,1.41.37,1113,B/TT,K069,AUS / 4,Bay / Gelding,PPG,Deep Field,Nakataan,Zabeel
2475,072,07,04/10/25,"ST / Turf / ""B+2""",1000,GF,3,6,62,C H Yip,...,0.57.11,1089,TT,K377,GB / 3,Bay / Gelding,PPG,Land Force,Phantom Spirit,Invincible Spirit
2480,069,03,04/10/25,"ST / Turf / ""B+2""",1400,GF,4,5,51,C Fownes,...,1.21.82,1157,TT,K398,NZ / 4,Bay / Gelding,PPG,Ardrossan,Dolce Amore,Sebring
2483,066,10,04/10/25,ST / AWT,1650,GD,5,6,37,P C Ng,...,1.41.52,1201,B/TT,J218,AUS / 5,Bay / Gelding,PPG,Deep Field,Alberton Park,Thorn Park
