In [42]:
import os
import re
import time
import warnings
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [49]:
# Load the fixtures CSV; its first column is used as the index.
df = pd.read_csv(filepath_or_buffer = '../data/fixtures/fixtures.csv', index_col = 0)

In [51]:
def scrape_horse_links(fixture_df, max_race_no = 14, request_timeout = 30):
    """Collect the unique horse links found on HKJC local-results pages.

    For every row of ``fixture_df`` (one race meeting) the results page of
    race 1, 2, ... ``max_race_no`` is requested, with a one-second pause
    before each request. Scraping of a meeting stops at the first race whose
    request fails, whose page has no ``table_bd`` table, or whose table has
    no 'Horse' column; the function then continues with the next meeting.

    Parameters
    ----------
    fixture_df : pandas.DataFrame
        Needs 'Date' and 'Venue' columns. Their values are sent unchanged as
        the ``RaceDate`` and ``Racecourse`` query parameters.
    max_race_no : int, default 14
        Highest race number tried for each meeting.
    request_timeout : float or tuple, default 30
        Timeout (seconds) passed to ``requests.get`` so that a stalled
        connection cannot block the scrape indefinitely.

    Returns
    -------
    list of str
        De-duplicated links in the order they were first seen. Relative
        hrefs are resolved against the results-page URL.
    """
    base_url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
    http_headers = {"User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"}

    all_links = []

    # loop for date and venue
    for _, fixture in fixture_df.iterrows():
        date = fixture['Date']
        venue = fixture['Venue']

        print(f'scraping date: {date} @ {venue}')

        # loop for race numbers
        for race_no in range(1, max_race_no + 1):
            params = {
                "RaceDate": date,
                "Racecourse": venue,
                "RaceNo": race_no
            }

            print(f'scraping race no: {race_no}')
            time.sleep(1)  # throttle: at most one request per second

            # A network error is handled like a bad status code: give up on
            # this meeting but keep the links gathered so far.
            try:
                response = requests.get(
                    base_url,
                    params = params,
                    headers = http_headers,
                    timeout = request_timeout,
                )
            except requests.RequestException as exc:
                print(f"  Race {race_no}: Failed to fetch ({exc}). Stopping.")
                break
            if response.status_code != 200:
                print(f"  Race {race_no}: Failed to fetch (status {response.status_code}). Stopping.")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            table = soup.find('table', class_ = 'table_bd')
            if table is None:
                print('table not found')
                break

            # The first row is the header; a table without rows simply has
            # no 'Horse' column.
            table_rows = table.find_all('tr')
            headers = []
            if table_rows:
                headers = [td.get_text(strip = True) for td in table_rows[0].find_all('td')]
            if 'Horse' not in headers:
                print("no 'horse' column found")
                break

            # get position of the horse column
            horse_col_index = headers.index('Horse')

            for result_row in table_rows[1:]:  # skip header row
                cells = result_row.find_all('td')
                if len(cells) <= horse_col_index:
                    continue
                a_tag = cells[horse_col_index].find('a', href = True)
                if a_tag is None:
                    continue
                href = a_tag['href']
                if href:
                    # urljoin leaves absolute URLs untouched and resolves
                    # both root-relative and page-relative hrefs.
                    all_links.append(urljoin(base_url, href))

    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # repeated runs over the same pages give the same list.
    return list(dict.fromkeys(all_links))



In [52]:
# Scrape the horse links for every fixture in df.
links = scrape_horse_links(fixture_df = df)

scraping date: 10/09/2023 @ ST
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
scraping race no: 10
scraping race no: 11
table not found
scraping date: 13/09/2023 @ HV
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
table not found
scraping date: 17/09/2023 @ ST
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
scraping race no: 10
scraping race no: 11
table not found
scraping date: 20/09/2023 @ HV
scraping race no: 1
scraping race no: 2
scraping race no: 3
scraping race no: 4
scraping race no: 5
scraping race no: 6
scraping race no: 7
scraping race no: 8
scraping race no: 9
table not found
scraping

In [60]:
# Wrap the scraped links in a one-column DataFrame named 'links'.
horses = pd.DataFrame(data = links, columns = ['links'])

In [63]:
# Target folder for the horse data; created if it does not exist yet.
output_dir = '../data/horses_data'
os.makedirs(output_dir, exist_ok = True)

# Full path of the CSV file inside that folder.
output_path = os.path.join(output_dir, 'horses.csv')

In [64]:
# Write the links to CSV without the row index.
horses.to_csv(path_or_buf = output_path, index = False)