In [2]:
# %% 
import os
print("Current working directory:", os.getcwd())

from datetime import date, timedelta
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# ─── CONFIG ────────────────────────────────────────────────────────────────
START_YEAR = 2025

# Selenium: headless Chrome (change path if needed)
chrome_opts = Options()
chrome_opts.add_argument("--headless")
chrome_opts.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=chrome_opts)

# XPath to the results table on each page
TABLE_XPATH = '//*[@id="innerContent"]/div[2]/div[5]/table'

# ─── DETERMINE LAST FULL YEAR ──────────────────────────────────────────────
today = date.today()
today = date(2025, 1, 2)
LAST_FULL_YEAR = today.year - 1
END_YEAR = today.year  # includes the current year

# ─── YEAR LOOP ──────────────────────────────────────────────────────────────
for year in range(START_YEAR, END_YEAR + 1):
    print(f"\n===== SCRAPING YEAR {year} =====")
    df_year = None

    # define 1-Jan and 31-Dec for this year
    day_cursor = date(year, 1, 1)
    year_end   = date(year, 12, 31)
    one_day    = timedelta(days=1)

    # LOOP ALL DATES IN YEAR
    while day_cursor <= year_end:
        ds = day_cursor.strftime("%Y/%m/%d")
        print(f"→ Date: {ds}")

        # for each racecourse
        for course in ("HV", "ST"):
            print(f"  • Course: {course}")

            # Scraping Rule: Race 1 must exist
            url1 = (
              "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
              f"?RaceDate={ds}&Racecourse={course}&RaceNo=1"
            )
            driver.get(url1)

            try:
                tbl1 = driver.find_element(By.XPATH, TABLE_XPATH)
            except:
                print(f"    – no Race#1 → skipping {course} on {ds}")
                continue   # next course

            # scrape Race#1
            html1 = tbl1.get_attribute("outerHTML")
            tables = pd.read_html(StringIO(html1))
            if not tables:
                print(f"    – Race#1 empty → skipping {course} on {ds}")
                continue
            df = tables[0]
            df["Date"]       = ds
            df["Course"]     = course
            df["RaceNumber"] = 1
            df_year = df if df_year is None else pd.concat([df_year, df], ignore_index=True)
            print(f"    ✔ Race#1: {len(df)} rows")

            # scrape Race#2…Race#12 until one is missing
            for race_no in range(2, 13):
                url = (
                  "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
                  f"?RaceDate={ds}&Racecourse={course}&RaceNo={race_no}"
                )
                driver.get(url)
                # time.sleep(1)

                try:
                    tbl = driver.find_element(By.XPATH, TABLE_XPATH)
                except:
                    print(f"    – Race#{race_no} missing → stop this course")
                    break

                html = tbl.get_attribute("outerHTML")
                tbls = pd.read_html(StringIO(html))
                if not tbls:
                    print(f"    – Race#{race_no} empty → stop this course")
                    break

                df = tbls[0]
                df["Date"]       = ds
                df["Course"]     = course
                df["RaceNumber"] = race_no
                try:
                    extra_tbl_xpath = '//*[@id="innerContent"]/div[2]/div[4]/table'
                    extra_tbl = driver.find_element(By.XPATH, extra_tbl_xpath)
                    extra_html = extra_tbl.get_attribute("outerHTML")
                    extra_tables = pd.read_html(StringIO(extra_html))
                    if extra_tables:
                        meta_df = extra_tables[0]
                        meta_texts = meta_df.astype(str).values.flatten()
                        meta_dict = {}

                        for i, text in enumerate(meta_texts):
                            if "Class" in text and "-" in text:
                                parts = text.split(" - ")
                                meta_dict["Class"] = parts[0].replace("Class", "").strip()
                                meta_dict["Distance"] = parts[1].split()[0].strip()
                                meta_dict["Score range"] = parts[1].split("-")[-1].strip("()")
                            if "Going" in text:
                                meta_dict["Going"] = meta_texts[i+1].strip()
                            if "Course" in text:
                                meta_dict["Course Detail"] = meta_texts[i+1].strip()
                            if "Time :" in text:
                                times = [t.strip("()") for t in meta_texts[i+1:i+6] if t.startswith("(")]
                                for idx, val in enumerate(times, start=1):
                                    meta_dict[f"Time {idx}"] = val
                                for idx in range(len(times)+1, 6):
                                    meta_dict[f"Time {idx}"] = float("nan")
                            if "Sectional Time" in text:
                                sects = [meta_texts[i+j].strip() for j in range(1, 6) if i+j < len(meta_texts) and meta_texts[i+j].strip()]
                                for idx, val in enumerate(sects, start=1):
                                    meta_dict[f"Sectional Time {idx}"] = val
                                for idx in range(len(sects)+1, 6):
                                    meta_dict[f"Sectional Time {idx}"] = float("nan")

                        for key, val in meta_dict.items():
                            df[key] = val
                except:
                    print("    – metadata extraction failed → continuing")
                df_year = df if df_year is None else pd.concat([df_year, df], ignore_index=True)
                print(f"    ✔ Race#{race_no}: {len(df)} rows")
        # Stop scraping if today is reached
        if day_cursor == today:
            print(f"\n Reached today's date ({today}), stopping.")
            break
            
        # next day
        day_cursor += one_day

    # after finishing the year, write it out
    if df_year is not None:
        out_fn = f"RacePlaceData_{year}.csv"
        df_year.to_csv(out_fn, index=False)
        print(f"\n Year {year} complete: saved {len(df_year)} rows → {out_fn}")
    else:
        print(f"\n Year {year} yielded no data, skipping file.")

# cleanup
driver.quit()
print("\nAll done!")


===== SCRAPING YEAR 2025 =====
→ Date: 2025/01/01
  • Course: HV
    – no Race#1 → skipping HV on 2025/01/01
  • Course: ST
    ✔ Race#1: 12 rows
    ✔ Race#2: 14 rows
    ✔ Race#3: 12 rows
    ✔ Race#4: 6 rows
    ✔ Race#5: 14 rows
    ✔ Race#6: 14 rows
    ✔ Race#7: 14 rows
    ✔ Race#8: 11 rows
    ✔ Race#9: 12 rows
    ✔ Race#10: 12 rows
    ✔ Race#11: 15 rows
    – Race#12 missing → stop this course
→ Date: 2025/01/02
  • Course: HV
    – no Race#1 → skipping HV on 2025/01/02
  • Course: ST
    – no Race#1 → skipping ST on 2025/01/02

 Reached today's date (2025-01-02), stopping.

 Year 2025 complete: saved 136 rows → RacePlaceData_2025.csv

All done!
