In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

In [2]:
# Create the output folder up front; exist_ok avoids an error on re-runs
# when the folder is already there.
os.makedirs(name='../data/', exist_ok=True)

# Scraping race information for each horse

In [3]:
# Download and parse the HKJC horse listing page for Location=HK.
hk_horse = 'https://racing.hkjc.com/racing/information/English/Horse/ListByLocation.aspx?Location=HK'

# A timeout keeps this cell from blocking forever on a stalled connection.
hk_horses = requests.get(hk_horse, timeout=30)

# Fail right here on an HTTP error instead of parsing an error page and
# hitting a confusing IndexError when the tables are looked up afterwards.
hk_horses.raise_for_status()

soup = BeautifulSoup(hk_horses.content, 'html.parser')

In [4]:
# Collect every <table> carrying the 'bigborder' CSS class from the parsed page.
result_table = soup.find_all(
    'table',
    class_='bigborder',
)

In [5]:
# Index 1 selects the second 'bigborder' table found on the page.
table = result_table[1]

# Every anchor tag inside that table.
links = table.find_all('a')

# Keep only the non-empty href values; anchors without an href yield None
# from .get() and are filtered out.
hrefs = list(filter(None, (link.get('href') for link in links)))


In [6]:
# Restrict the run to the first four links so the scrape below stays small.
hrefs = hrefs[0:4]

In [7]:
# Display the retained links to confirm what will be scraped.
hrefs

['/racing/information/English/Horse/Horse.aspx?HorseId=HK_2020_E486',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2021_G180',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H302',
 '/racing/information/English/Horse/Horse.aspx?HorseId=HK_2022_H399']

In [None]:
# Scrape the first 'bigborder' table of every horse page listed in `hrefs`.
#
# Results:
#   header - column names taken from the first row of the first table found,
#            with 'Horse_id' appended (stays None if no table was found)
#   rows   - one list of cell texts per retained table row, each ending with
#            the horse id parsed from the link
#
# Pages that cannot be fetched or have no table are reported and skipped, so
# one bad page does not throw away the rows already collected.
header = None
rows = []

base_url = "https://racing.hkjc.com"
request_timeout = 30  # seconds; without a timeout requests.get can block forever

for href in hrefs:
    full_url = base_url + href + '&Option=1'

    # match the horse ID from the href: a capital letter followed by three
    # digits at the very end (e.g. "E486"); None when the href does not match
    match = re.search(r'([A-Z]\d{3})$', href)
    horse_id = match.group(1) if match else None

    try:
        response = requests.get(full_url, timeout=request_timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are treated like a bad status code:
        # report the page and carry on with the next horse.
        print(f"Failed to retrieve {full_url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve {full_url}")
        continue

    # Use a dedicated name here: rebinding `soup` would clobber the
    # listing-page soup that the earlier table-lookup cell reads, which breaks
    # re-running that cell after this one.
    horse_soup = BeautifulSoup(response.content, 'html.parser')
    horse_table = horse_soup.find('table', class_='bigborder')

    if horse_table is None:
        print(f'Table not found in {full_url}')
        continue

    # Walk the table once; the first <tr> is the header row, the rest are data.
    table_rows = horse_table.find_all('tr')

    # extract header from the table (only once, from the first table seen)
    if header is None and table_rows:
        header = [td.get_text(strip=True) for td in table_rows[0].find_all('td')]
        header = header[:-1]  # the last header cell is not kept
        header.append('Horse_id')  # append horse id to the header

    # extract table content from the table
    for tr in table_rows[1:]:
        cols = [td.get_text(strip=True) for td in tr.find_all('td')]

        # get_text(strip=True) already strips, so a blank row (or a row with
        # no <td> at all) is simply one with no truthy cell.
        if not any(cols):
            continue

        first_col = cols[0]

        # Skip the season separator rows and the 'Overseas' marker row.
        if 'Season' in first_col or first_col == 'Overseas':
            continue

        # Data rows drop their last two cells (the header drops only one).
        # NOTE(review): this relies on the current page layout - confirm that
        # the row and header lengths still agree if the site changes.
        cols = cols[:-2]
        cols.append(horse_id)  # append horse id to the rows
        rows.append(cols)


In [15]:
# Assemble the scraped rows into a single frame, labelled with the scraped header.
df = pd.DataFrame(data=rows, columns=header)

In [None]:
# Persist the scraped races. index=True is the pandas default, spelled out
# here because it writes the row index as an extra, unnamed first column.
# NOTE(review): pass index=False instead if downstream readers do not need it.
df.to_csv(path_or_buf='../data/horse_racing.csv', index=True)

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,Jockey,LBW,Win Odds,Act.Wt.,RunningPosition,Finish Time,Declar.Horse Wt.,Gear,Horse_id
0,247,01,08/12/24,"ST / Turf / ""A""",2000,G,G1,1,133,C S Shum,J McDonald,1-1/2,1.1,126,4 4 4 3 1,2.00.51,1183,TT,E486
1,190,01,17/11/24,"ST / Turf / ""B+2""",2000,G,G2,1,133,C S Shum,J McDonald,4-1/4,1.1,128,4 6 5 2 1,1.59.70,1174,TT,E486
2,624,01,28/04/24,"ST / Turf / ""A""",2000,Y,G1,10,132,C S Shum,J McDonald,N,1.9,126,4 5 7 7 1,2.01.02,1178,TT,E486
3,452,01,25/02/24,"ST / Turf / ""A+3""",2000,G,G1,11,132,C S Shum,J McDonald,N,1.6,126,3 3 4 4 1,2.00.31,1173,TT,E486
4,239,01,10/12/23,"ST / Turf / ""A""",2000,G,G1,7,130,C S Shum,J McDonald,SH,2.3,126,4 4 4 2 1,2.02.00,1157,TT,E486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,339,01,13/01/24,"ST / Turf / ""C+3""",1200,G,2,5,86,P C Ng,K Teetan,1-1/2,1.2,123,6 5 1,1.08.68,1150,B,H399
77,240,01,10/12/23,"ST / Turf / ""A""",1200,G,3,3,76,P C Ng,K Teetan,2-3/4,3.1,135,9 6 1,1.09.08,1155,B,H399
78,185,01,19/11/23,"ST / Turf / ""B+2""",1200,GF,3,10,69,P C Ng,K Teetan,1-3/4,3.5,125,9 7 1,1.08.65,1168,B,H399
79,118,05,25/10/23,ST / AWT,1200,GD,3,3,69,P C Ng,K Teetan,6-3/4,5.1,124,7 6 5,1.09.29,1174,B,H399
