In [24]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

In [25]:
# Ensure the output folder for scraped data exists (no error if it is already there).
os.makedirs(name='../data/', exist_ok=True)

# Scraping race information for each horse

In [36]:
# Listing pages queried by location (Location=HK and Location=CH).
hk_horse = 'https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=HK'
conghua = 'https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=CH'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed and fail confusingly in a later cell.
hk_horses = requests.get(hk_horse, timeout=10)
hk_horses.raise_for_status()

soup = BeautifulSoup(hk_horses.content, 'html.parser')

In [37]:
# Collect every <table class="bigborder"> in the page currently held in `soup`.
result_table = soup.find_all('table', attrs={'class': 'bigborder'})

In [40]:
# Grab every <td> cell in the page currently held in `soup`.
course_label = soup.find_all('td')


In [8]:
# Work with the second "bigborder" table found on the page.
table = result_table[1]

# Every anchor tag inside that table ...
links = table.find_all('a')

# ... reduced to its href value, dropping anchors with a missing or empty href.
hrefs = list(filter(None, (anchor.get('href') for anchor in links)))


In [12]:
# Keep only the first four links as a small sample.
hrefs = hrefs[0:4]

In [13]:
# Display the sampled links.
hrefs

['/racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E486',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G180',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H302',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H399']

In [14]:
# Accumulators: `header` is filled once from the first table found, `rows`
# collects one list per race record across all horses.
header = None
rows = []

base_url = "https://racing.hkjc.com"

for href in hrefs:
    full_url = base_url + href + '&Option=1'

    # A timeout keeps one stalled connection from hanging the whole cell; a
    # failed request is reported and skipped so the remaining horses still load.
    try:
        response = requests.get(full_url, timeout=10)
    except requests.RequestException as e:
        print(f"Request exception for {full_url}: {str(e)}")
        continue

    # match the horse ID (one capital letter + three digits) at the end of the href
    match = re.search(r'([A-Z]\d{3})$', href)
    horse_id = match.group(1) if match else None

    if response.status_code != 200:
        print(f"Failed to retrieve {full_url}")
        continue

    # Parsed into its own name so the listing-page `soup` used by earlier
    # cells is not overwritten by this loop.
    horse_soup = BeautifulSoup(response.content, 'html.parser')
    horse_table = horse_soup.find('table', class_ = 'bigborder')

    if horse_table is None:
        print(f'Table not found in {full_url}')
        continue

    table_rows = horse_table.find_all('tr')
    if not table_rows:
        # A table without any <tr> has neither a header nor data to read.
        print(f'No rows in table for {full_url}')
        continue

    # extract header from the first row of the first table found
    if header is None:
        header = [td.get_text(strip = True) for td in table_rows[0].find_all('td')]
        header = header[:-1]
        header.append('Horse_id') # append horse id to the header

    # extract table content from the remaining rows
    for tr in table_rows[1:]:
        cols = [td.get_text(strip = True) for td in tr.find_all('td')]

        # skip rows that have no cells or only blank cells
        if not any(cols):
            continue

        first_col = cols[0]

        # skip rows whose first cell mentions 'Season' or is exactly 'Overseas'
        if 'Season' in first_col or first_col == 'Overseas':
            continue

        # NOTE(review): data rows drop two trailing cells while the header drops
        # one — presumably data rows carry one extra trailing cell; confirm
        # against the page markup.
        cols = cols[:-2]
        cols.append(horse_id) # append horse id to the rows
        rows.append(cols)


In [15]:
# Assemble the scraped rows into a DataFrame labelled with the scraped header.
df = pd.DataFrame(data=rows, columns=header)

In [16]:
# Display the assembled race-record frame.
df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,Jockey,LBW,Win Odds,Act.Wt.,RunningPosition,Finish Time,Declar.Horse Wt.,Gear,Horse_id
0,247,01,08/12/24,"ST / Turf / ""A""",2000,G,G1,1,133,C S Shum,J McDonald,1-1/2,1.1,126,4 4 4 3 1,2.00.51,1183,TT,E486
1,190,01,17/11/24,"ST / Turf / ""B+2""",2000,G,G2,1,133,C S Shum,J McDonald,4-1/4,1.1,128,4 6 5 2 1,1.59.70,1174,TT,E486
2,624,01,28/04/24,"ST / Turf / ""A""",2000,Y,G1,10,132,C S Shum,J McDonald,N,1.9,126,4 5 7 7 1,2.01.02,1178,TT,E486
3,452,01,25/02/24,"ST / Turf / ""A+3""",2000,G,G1,11,132,C S Shum,J McDonald,N,1.6,126,3 3 4 4 1,2.00.31,1173,TT,E486
4,239,01,10/12/23,"ST / Turf / ""A""",2000,G,G1,7,130,C S Shum,J McDonald,SH,2.3,126,4 4 4 2 1,2.02.00,1157,TT,E486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,339,01,13/01/24,"ST / Turf / ""C+3""",1200,G,2,5,86,P C Ng,K Teetan,1-1/2,1.2,123,6 5 1,1.08.68,1150,B,H399
77,240,01,10/12/23,"ST / Turf / ""A""",1200,G,3,3,76,P C Ng,K Teetan,2-3/4,3.1,135,9 6 1,1.09.08,1155,B,H399
78,185,01,19/11/23,"ST / Turf / ""B+2""",1200,GF,3,10,69,P C Ng,K Teetan,1-3/4,3.5,125,9 7 1,1.08.65,1168,B,H399
79,118,05,25/10/23,ST / AWT,1200,GD,3,3,69,P C Ng,K Teetan,6-3/4,5.1,124,7 6 5,1.09.29,1174,B,H399


In [None]:
# Write the race records to the data folder as CSV.
df.to_csv(path_or_buf='../data/horse_racing.csv')

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,Jockey,LBW,Win Odds,Act.Wt.,RunningPosition,Finish Time,Declar.Horse Wt.,Gear,Horse_id
0,247,01,08/12/24,"ST / Turf / ""A""",2000,G,G1,1,133,C S Shum,J McDonald,1-1/2,1.1,126,4 4 4 3 1,2.00.51,1183,TT,E486
1,190,01,17/11/24,"ST / Turf / ""B+2""",2000,G,G2,1,133,C S Shum,J McDonald,4-1/4,1.1,128,4 6 5 2 1,1.59.70,1174,TT,E486
2,624,01,28/04/24,"ST / Turf / ""A""",2000,Y,G1,10,132,C S Shum,J McDonald,N,1.9,126,4 5 7 7 1,2.01.02,1178,TT,E486
3,452,01,25/02/24,"ST / Turf / ""A+3""",2000,G,G1,11,132,C S Shum,J McDonald,N,1.6,126,3 3 4 4 1,2.00.31,1173,TT,E486
4,239,01,10/12/23,"ST / Turf / ""A""",2000,G,G1,7,130,C S Shum,J McDonald,SH,2.3,126,4 4 4 2 1,2.02.00,1157,TT,E486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,339,01,13/01/24,"ST / Turf / ""C+3""",1200,G,2,5,86,P C Ng,K Teetan,1-1/2,1.2,123,6 5 1,1.08.68,1150,B,H399
77,240,01,10/12/23,"ST / Turf / ""A""",1200,G,3,3,76,P C Ng,K Teetan,2-3/4,3.1,135,9 6 1,1.09.08,1155,B,H399
78,185,01,19/11/23,"ST / Turf / ""B+2""",1200,GF,3,10,69,P C Ng,K Teetan,1-3/4,3.5,125,9 7 1,1.08.65,1168,B,H399
79,118,05,25/10/23,ST / AWT,1200,GD,3,3,69,P C Ng,K Teetan,6-3/4,5.1,124,7 6 5,1.09.29,1174,B,H399


In [42]:
# Profile page of a single horse, used below to prototype the profile-field lookup.
example = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E486"

In [43]:
# Fetch the example page. The timeout prevents an indefinite hang and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
response = requests.get(example, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

In [47]:
# Find the first <td> whose text contains the "Country of Origin" label.
origin_label = soup.find(
    'td',
    string=lambda cell_text: bool(cell_text) and 'Country of Origin' in cell_text,
)
print(origin_label)

<td>Country of Origin / Age</td>


In [50]:
# The value is read from the element two steps after the label in document order.
if origin_label is not None:
    origin_info = (
        origin_label
        .find_next()
        .find_next()
        .get_text(strip=True)
    )
    print(origin_info)

IRE / 7


# Scraping function

In [73]:
# (label text searched for on the horse page, column name used in the result)
_PROFILE_FIELDS = (
    ('Country of Origin', 'Origin / Age'),
    ('Colour / Sex', 'Colour / Sex'),
    ('Import Type', 'Import type'),
    ('Sire', 'Sire'),
    ('Dam', 'Dam'),
    ("Dam's Sire", 'Dam sire'),
)


def _find_profile_value(soup, label):
    """Return the text found two elements after the ``label`` cell, or 'na'."""
    # Prefer a cell whose text IS the label: 'Sire' and 'Dam' are also
    # substrings of "Dam's Sire", so a pure substring search only picks the
    # right cell when it happens to come first in the document.
    label_cell = soup.find('td', string=lambda text: text and text.strip() == label)
    if label_cell is None:
        # Fall back to substring matching for labels such as
        # 'Country of Origin', which the page renders with extra text.
        label_cell = soup.find('td', string=lambda text: text and label in text)
    if label_cell is None:
        return 'na'

    first_next = label_cell.find_next()
    value_cell = first_next.find_next() if first_next is not None else None
    if value_cell is None:
        return 'na'
    return value_cell.get_text(strip=True)


def get_hkjc_race_records(starting_url, max_horses=3, timeout=10):
    """Scrape race records and profile details for horses linked from a listing page.

    The listing page at ``starting_url`` is fetched, the links in its second
    ``bigborder`` table are followed (with ``&Option=1`` appended), and every
    data row of the first ``bigborder`` table on each horse page becomes one
    row of the result, extended with the horse id and six profile fields.

    Args:
        starting_url: URL of the listing page to start from.
        max_horses: Maximum number of horse links to follow. Defaults to 3,
            the sample size that used to be hard-coded; pass ``None`` to
            follow every link.
        timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        pandas.DataFrame with the scraped table columns followed by
        'Horse_id', 'Origin / Age', 'Colour / Sex', 'Import type', 'Sire',
        'Dam' and 'Dam sire'. Profile fields that cannot be found hold 'na'.

    Raises:
        RuntimeError: If the listing page does not return HTTP 200, has fewer
            than two ``bigborder`` tables, or no horse page yields a header.
        requests.RequestException: If the listing-page request itself fails.
            Per-horse request failures are printed and skipped instead.
    """
    base_url = "https://racing.hkjc.com"

    response = requests.get(starting_url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to retrieve main page: {starting_url}")

    soup = BeautifulSoup(response.content, 'html.parser')
    result_table = soup.find_all('table', class_='bigborder')
    if len(result_table) < 2:
        raise RuntimeError("Result table not found or unexpected format")

    links = result_table[1].find_all('a')
    hrefs = [link.get('href') for link in links if link.get('href')]
    if max_horses is not None:
        hrefs = hrefs[:max_horses]

    header = None
    rows = []

    for href in hrefs:
        full_url = base_url + href + '&Option=1'
        try:
            response = requests.get(full_url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request exception for {full_url}: {str(e)}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve {full_url}")
            continue

        # Horse id: one capital letter + three digits at the end of the href.
        match = re.search(r'([A-Z]\d{3})$', href)
        horse_id = match.group(1) if match else None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Profile values are the same for every race row of this horse.
        profile_values = [_find_profile_value(soup, label) for label, _ in _PROFILE_FIELDS]

        # find horse per race info
        horse_table = soup.find('table', class_='bigborder')
        if horse_table is None:
            print(f'Table not found in {full_url}')
            continue

        table_rows = horse_table.find_all('tr')
        if not table_rows:
            # A table without any <tr> has neither a header nor data to read.
            print(f'No rows in table for {full_url}')
            continue

        if header is None:
            header = [td.get_text(strip=True) for td in table_rows[0].find_all('td')]
            header = header[:-1]
            header.append('Horse_id')
            header.extend(column for _, column in _PROFILE_FIELDS)

        for tr in table_rows[1:]:
            cols = [td.get_text(strip=True) for td in tr.find_all('td')]

            # Skip rows with no cells or only blank cells.
            if not any(cols):
                continue
            first_col = cols[0]
            if 'Season' in first_col or first_col == 'Overseas':
                continue

            # NOTE(review): data rows drop two trailing cells while the header
            # drops one — presumably data rows carry one extra trailing cell;
            # confirm against the page markup.
            rows.append(cols[:-2] + [horse_id] + profile_values)

    if not header:
        raise RuntimeError("No data table header found! Check the structure of the page.")

    return pd.DataFrame(rows, columns=header)


In [74]:
# Example usage: define the Location=HK and Location=CH listing URLs,
# then scrape the HK one.
hk_horse_url = "https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=HK"
conghua_url = "https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=CH"

hk_race_df = get_hkjc_race_records(starting_url=hk_horse_url)

In [75]:
# Display the frame returned for the HK listing page.
hk_race_df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,247,01,08/12/24,"ST / Turf / ""A""",2000,G,G1,1,133,C S Shum,...,2.00.51,1183,TT,E486,IRE / 7,Bay / Gelding,ISG,Acclamation,Folk Melody,Street Cry
1,190,01,17/11/24,"ST / Turf / ""B+2""",2000,G,G2,1,133,C S Shum,...,1.59.70,1174,TT,E486,IRE / 7,Bay / Gelding,ISG,Acclamation,Folk Melody,Street Cry
2,624,01,28/04/24,"ST / Turf / ""A""",2000,Y,G1,10,132,C S Shum,...,2.01.02,1178,TT,E486,IRE / 7,Bay / Gelding,ISG,Acclamation,Folk Melody,Street Cry
3,452,01,25/02/24,"ST / Turf / ""A+3""",2000,G,G1,11,132,C S Shum,...,2.00.31,1173,TT,E486,IRE / 7,Bay / Gelding,ISG,Acclamation,Folk Melody,Street Cry
4,239,01,10/12/23,"ST / Turf / ""A""",2000,G,G1,7,130,C S Shum,...,2.02.00,1157,TT,E486,IRE / 7,Bay / Gelding,ISG,Acclamation,Folk Melody,Street Cry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,241,03,10/12/23,"ST / Turf / ""A""",1400,G,2,12,91,J Size,...,1.22.42,1104,--,H302,AUS / 6,Bay / Gelding,PP,Toronado,Paris Texas,Hinchinbrook
58,179,01,19/11/23,"ST / Turf / ""B+2""",1200,GF,2,4,85,J Size,...,1.08.49,1102,--,H302,AUS / 6,Bay / Gelding,PP,Toronado,Paris Texas,Hinchinbrook
59,816,01,09/07/23,"ST / Turf / ""C+3""",1200,GF,3,4,77,J Size,...,1.08.76,1098,--,H302,AUS / 6,Bay / Gelding,PP,Toronado,Paris Texas,Hinchinbrook
60,769,01,25/06/23,"ST / Turf / ""A""",1200,G,3,1,67,J Size,...,1.08.66,1105,--,H302,AUS / 6,Bay / Gelding,PP,Toronado,Paris Texas,Hinchinbrook


In [None]:
# Run the same scrape against the Location=CH listing page.
conghua_df = get_hkjc_race_records(starting_url=conghua_url)