In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import camelot
from datetime import datetime
import numpy as np

# Download the fixture PDFs

In [25]:
# Fixture PDFs to fetch from res.hkjc.com: local file name -> source URL.
urls = {
    'fixture_23.pdf': 'https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2023/07/fixture_23-24_en.pdf',
    'fixture_24.pdf': 'https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2024/07/fixture_24-25_e.pdf',
    'fixture_25.pdf': 'https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2025/07/fixture_25-26.pdf',
}

# Relative directory the downloaded PDFs are written to.
out_dir = '../data/fixtures/'

In [26]:
def download_pdf(urls, out_dir):
    """Download every URL in ``urls`` into ``out_dir``.

    Parameters
    ----------
    urls : dict
        Maps the local file name to save as -> the URL to fetch.
    out_dir : str
        Destination directory. It is created when missing, and a trailing
        path separator is optional.

    Returns
    -------
    None
        Progress and failures are reported with ``print``. Request errors
        and non-200 responses are reported and skipped so that one bad URL
        does not stop the remaining downloads.
    """
    os.makedirs(out_dir, exist_ok=True)

    for filename, url in urls.items():
        print(f'downloading {filename} from {url}')

        try:
            # The body is written to disk in one piece, so a plain
            # (non-streaming) request is used; the body is read inside this
            # try, which keeps transfer errors covered by the handler below.
            response = requests.get(url, timeout=15)
        except requests.RequestException as e:
            print(f'error downloading {url}: {e}')
            continue

        if response.status_code != 200:
            print(f'file download failed with url: {url}')
            continue

        # Join directory and file name as separate components so the file
        # lands inside out_dir even when out_dir has no trailing separator.
        out_path = os.path.join(out_dir, filename)
        with open(out_path, 'wb') as f:
            f.write(response.content)
        print(f'file downloaded to {out_path}')

In [27]:
# Fetch every configured fixture PDF into the output directory.
download_pdf(urls=urls, out_dir=out_dir)

downloading fixture_23.pdf from https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2023/07/fixture_23-24_en.pdf
file downloaded to ../data/fixtures/fixture_23.pdf
downloading fixture_24.pdf from https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2024/07/fixture_24-25_e.pdf
file downloaded to ../data/fixtures/fixture_24.pdf
downloading fixture_25.pdf from https://res.hkjc.com/racingnews/wp-content/uploads/sites/3/2025/07/fixture_25-26.pdf
file downloaded to ../data/fixtures/fixture_25.pdf


# Functions to clean the fixture tables

In [20]:
# normalise a fixture date cell to DD/MM/YYYY
def format_date(date_str, year):
    """Turn a fixture date cell into a ``DD/MM/YYYY`` string.

    The cell is split on whitespace (line breaks count as whitespace); the
    second token is read as the day of month and the third as an abbreviated
    month name. The first token is ignored (presumably the weekday; verify
    against the PDFs). Any further tokens are ignored as well.

    Parameters
    ----------
    date_str : str
        Raw cell text.
    year : str or int
        Year to combine with the day and month.

    Returns
    -------
    str
        The formatted date. When the cell has fewer than three tokens, or
        parsing fails, the cell text with line breaks turned into spaces is
        returned instead. When ``date_str`` is not a string it is returned
        unchanged.
    """
    parse_layout = "%d %b %Y"
    output_layout = "%d/%m/%Y"

    try:
        # collapse the line break inside the cell
        date_str = date_str.replace('\n', ' ')

        tokens = date_str.split()
        if len(tokens) < 3:
            return date_str

        # separate the cell into its parts; only day and month are used
        _, day, month = tokens[:3]

        parsed = datetime.strptime(f"{day} {month} {year}", parse_layout)
        return parsed.strftime(output_layout)

    except Exception:
        # best effort: hand back the text rather than failing the whole table
        return date_str

In [21]:
def clean_fixture(fixture):
    """Parse fixture PDFs into one table of race meetings.

    For each PDF the tables are extracted with camelot (``stream`` flavor).
    One table is selected: index 0 when the path contains ``'fixture_23'``,
    index 1 otherwise. Its columns 0-1 and 2-3 are treated as two
    side-by-side (Date, Venue) lists and stacked. Cells that consist solely of
    a four-digit ``20xx`` number are treated as year headings: they are
    removed and their value is attached to the rows that follow until the
    next heading. Dates are then passed through ``format_date``, blank rows
    and repeated ``DATE`` header rows are dropped, and venue names are
    shortened to ``ST`` / ``HV``.

    Parameters
    ----------
    fixture : iterable of str
        Paths of the fixture PDFs, processed in the order given.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` and ``Venue`` with a fresh 0..n-1 index. When
        ``fixture`` is empty an empty frame with those two columns is
        returned.
    """
    # A cell that is exactly a 20xx number is a year heading. Matching the
    # whole cell (not just a prefix) keeps longer digit strings from being
    # mistaken for a year, and the open-ended pattern keeps working for
    # seasons beyond a fixed list of years.
    year_heading = re.compile(r'20\d{2}')

    # Collect one cleaned frame per PDF and concatenate once at the end.
    frames = []

    for pdf_path in fixture:
        pdf_path = str(pdf_path)

        # read each pdf
        tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')

        # extract table 0 for fixture_23, table 1 for others
        raw = tables[0].df if 'fixture_23' in pdf_path else tables[1].df

        left = raw[[0, 1]].rename(columns={0: 'Date', 1: 'Venue'})
        right = raw[[2, 3]].rename(columns={2: 'Date', 3: 'Venue'})

        # stack the two halves into one list and drop fully-missing rows
        combined = (
            pd.concat([left, right], ignore_index=True)
            .dropna(how='all')
            .reset_index(drop=True)
        )

        # Walk the Date column top to bottom: remember the most recent year
        # heading and keep every other non-missing row together with it.
        keep_positions = []
        row_years = []
        current_year = None
        for position, value in enumerate(combined['Date']):
            if pd.isna(value):
                continue
            text = str(value).strip()
            if year_heading.fullmatch(text):
                current_year = text
                continue
            keep_positions.append(position)
            row_years.append(current_year)

        combined = combined.iloc[keep_positions].reset_index(drop=True)

        # A plain list avoids DataFrame.apply(axis=1), which does not return
        # a Series when the frame has no rows.
        combined['Date'] = [
            format_date(date_text, year)
            for date_text, year in zip(combined['Date'], row_years)
        ]

        # blank / whitespace-only cells count as missing
        combined = combined.replace(r'^\s*$', np.nan, regex=True)
        combined = combined.dropna(subset=['Date', 'Venue']).reset_index(drop=True)

        # drop the repeated column-header rows
        is_header = combined['Date'].astype(str).str.contains('DATE', case=False, na=False)
        combined = combined[~is_header].reset_index(drop=True)

        # Normalise venue column
        combined['Venue'] = combined['Venue'].astype(str).str.strip().replace(
            {
                r'(?i)^.*sha\s*tin.*$': 'ST',
                r'(?i)^.*happy\s*valley.*$': 'HV',
            },
            regex=True,
        )

        frames.append(combined)

    if not frames:
        return pd.DataFrame(columns=['Date', 'Venue'])

    return pd.concat(frames, ignore_index=True)

In [22]:
# Local fixture PDFs to parse, in the order they are stacked in the result.
pdf_list = [
    '../data/fixtures/fixture_23.pdf',
    '../data/fixtures/fixture_24.pdf',
    '../data/fixtures/fixture_25.pdf',
]

# One combined Date / Venue table covering every PDF above.
df = clean_fixture(fixture=pdf_list)

# Scrape race results by date

In [11]:
# Meeting date(s) to scrape, written as year/month/day.
date = ["2025/07/16"]
# Racecourse codes to try for each date.
venue = ["HV", "ST"]

In [12]:
# Bundle the scrape inputs into one mapping.
# NOTE(review): not read by any of the cells visible here - confirm it is still needed.
params = dict(dates=date, venues=venue)

In [None]:
def _fetch_result_page(base_url, headers, date, venue, race_no, timeout):
    """Request one results page and parse it.

    Returns a ``(soup, failure)`` pair:

    * ``(soup, None)``   - page fetched and it contains a ``table_bd`` table;
    * ``(None, None)``   - page fetched but no such table was found;
    * ``(None, reason)`` - the request failed; ``reason`` is a short text
      (``"status: <code>"`` or ``"error: <exception>"``).
    """
    params = {
        "RaceDate": date,
        "Racecourse": venue,
        "RaceNo": race_no,
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        return None, f"error: {exc}"

    if response.status_code != 200:
        return None, f"status: {response.status_code}"

    soup = BeautifulSoup(response.text, "html.parser")
    # The presence of a table with class 'table_bd' is used as the marker
    # that the page carries race data.
    if not soup.find('table', class_='table_bd'):
        return None, None
    return soup, None


def fetch_race_data_by_venue(date, venues, max_race_no=9, timeout=15):
    """Fetch the local-results pages for one date across several venues.

    Race 1 is requested first for each venue. If that request fails or the
    page has no result table, the venue is skipped entirely and gets no
    entries in the result. Otherwise races 2..``max_race_no`` are requested
    as well.

    Parameters
    ----------
    date : str
        Sent as the ``RaceDate`` query parameter (e.g. ``'2025/07/16'``).
    venues : iterable of str
        Sent one at a time as the ``Racecourse`` query parameter.
    max_race_no : int, default 9
        Highest race number requested per venue.
        NOTE(review): races numbered above this are never requested -
        confirm 9 is enough for every meeting.
    timeout : float, default 15
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    dict
        ``{(venue, race_no): soup_or_None}``. The value is the parsed page
        when a result table was found, and ``None`` when that race (2 or
        later) could not be fetched or had no table.

    Notes
    -----
    Request errors (timeouts, connection failures) are reported with
    ``print`` and handled like a non-200 response instead of propagating.
    """
    base_url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; HKJCScraper/1.0)"
    }
    results = {}

    for venue in venues:
        print(f"Checking venue {venue} race no 1...")
        soup, failure = _fetch_result_page(base_url, headers, date, venue, 1, timeout)

        if failure is not None:
            print(f"Failed to fetch page for venue {venue} race 1, {failure}")
            continue

        if soup is None:
            print(f"No race data found for venue {venue} race 1; skipping rest of races for this venue.")
            continue  # Skip the remaining races for this venue

        # Save race 1 result
        results[(venue, 1)] = soup
        print(f"Race data found for venue {venue} race 1; fetching races 2 to {max_race_no}...")

        # Now fetch the rest of the races for this venue
        for race_no in range(2, max_race_no + 1):
            soup, failure = _fetch_result_page(base_url, headers, date, venue, race_no, timeout)

            if failure is not None:
                print(f"Failed to fetch page for venue {venue} race {race_no}, {failure}")
            elif soup is None:
                print(f"No race data found for venue {venue} race {race_no}.")

            # soup is None for both kinds of miss
            results[(venue, race_no)] = soup

    return results

# Example run: fetch the result pages for one meeting date
if __name__ == "__main__":
    race_date, venue_list = '2025/07/16', ['HV', 'ST']

    # 'data' maps (venue, race number) to the parsed page, or to None for a miss
    data = fetch_race_data_by_venue(date=race_date, venues=venue_list)
