In [3]:
# Install and import required packages
%pip install selenium requests pandas beautifulsoup4 webdriver-manager python-dateutil

import glob
import os
import re
import time
import warnings
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# NOTE(review): with no category given this hides every warning for the rest
# of the session, deprecation notices included — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Confirmation that the install/import cell ran to the end.
print("✅ All packages installed and imported successfully!")


In [4]:
# Configuration and Setup
class Config:
    """Tunable settings for the scraper, grouped in one namespace."""

    # --- Scrape window ('YYYY-MM-DD' strings) ---
    START_DATE, END_DATE = "2023-01-01", "2025-07-14"

    # --- Pacing ---
    RATE_LIMIT = 2    # pause between requests, in seconds
    BATCH_SIZE = 10   # number of races per batch

    # --- Browser (Selenium) ---
    HEADLESS = True          # flip to False to watch the browser window
    IMPLICIT_WAIT = 10       # passed to driver.implicitly_wait
    PAGE_LOAD_TIMEOUT = 30   # passed to driver.set_page_load_timeout

    # --- Output ---
    OUTPUT_DIR = "./output/"

# Create the output directory up front; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
print(f"✅ Configuration loaded. Output directory: {Config.OUTPUT_DIR}")

In [20]:
# WebDriver Setup
def setup_webdriver():
    """Create and configure a Chrome WebDriver instance.

    Browser options and timeouts come from ``Config`` (``HEADLESS``,
    ``IMPLICIT_WAIT``, ``PAGE_LOAD_TIMEOUT``). The chromedriver binary is
    resolved through ``ChromeDriverManager().install()``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and
        is responsible for calling ``quit()``.

    Raises:
        Exception: anything raised while installing the driver, starting the
            browser or configuring it. If configuration fails after the
            browser has started, the browser is shut down before re-raising
            so no orphaned Chrome process is left behind.
    """
    chrome_options = Options()

    if Config.HEADLESS:
        chrome_options.add_argument("--headless")

    # Performance / environment flags
    for argument in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # User agent
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Setup driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Configure timeouts
        driver.implicitly_wait(Config.IMPLICIT_WAIT)
        driver.set_page_load_timeout(Config.PAGE_LOAD_TIMEOUT)

        # Hide webdriver property.
        # NOTE(review): this runs once against the document loaded at this
        # point; confirm whether it is expected to survive later navigation.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except Exception:
        # The browser is already running: close it before propagating.
        driver.quit()
        raise

    return driver

# Test WebDriver setup: start a browser, report its name/version, shut it down.
print("🚗 Testing WebDriver setup...")
try:
    test_driver = setup_webdriver()
    try:
        capabilities = test_driver.capabilities
        print(f"✅ WebDriver setup successful! Browser: {capabilities['browserName']} {capabilities['browserVersion']}")
    finally:
        # Close the browser even if reporting the capabilities fails.
        test_driver.quit()
    print("✅ WebDriver test completed.")
except Exception as e:
    print(f"❌ WebDriver setup failed: {e}")
    print("💡 Make sure Chrome browser is installed on your system.")


In [21]:
# Main HKJC Scraper Class
class HKJCScraper:
    """Collect HKJC local race results page by page using a Selenium browser.

    Scraped rows are accumulated in memory (``all_data``) and written out by
    ``save_data``. Every row is a dict tagged with ``data_type``: one
    ``'race_info'`` row per race plus one ``'performance'`` row per result
    table row that has enough cells.
    """

    # Result page URL; RaceDate is filled in as YYYY/MM/DD.
    RESULTS_URL = (
        "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx"
        "?RaceDate={date}&Racecourse={venue}&RaceNo={race_no}"
    )
    # Seconds to pause after navigation before reading the page source.
    PAGE_SETTLE_SECONDS = 2
    # A run of digits immediately followed by "M", e.g. "1200M".
    _DISTANCE_RE = re.compile(r'(\d+)M')
    # Captures the HorseId query parameter of a horse link.
    _HORSE_ID_RE = re.compile(r'HorseId=([^&]+)')
    # Table rows with fewer cells than this are skipped.
    _MIN_RESULT_CELLS = 6
    # Only the first few matching elements are inspected for class/distance.
    _MAX_DETAIL_CANDIDATES = 3

    def __init__(self):
        self.driver = None        # Selenium driver; None while no browser is running
        self.all_data = []        # race_info and performance dicts, in scrape order
        self.processed_count = 0  # number of race pages scraped successfully
        self.errors = []          # messages describing failed scrapes

    # browser management
    def start_browser(self):
        """Start the browser unless one is already attached to this scraper."""
        if self.driver is None:
            self.driver = setup_webdriver()
            print("Browser started")

    def stop_browser(self):
        """Quit the browser if one is running and forget the driver."""
        if self.driver:
            self.driver.quit()
            self.driver = None
            print("Browser stopped")

    @classmethod
    def _extract_distance(cls, text):
        """Return the first "<digits>M" token in ``text`` (e.g. "1200M"), or None."""
        match = cls._DISTANCE_RE.search(text)
        return match.group(0) if match else None

    @classmethod
    def _extract_race_details(cls, soup):
        """Best-effort lookup of race class and distance; returns only the keys found."""
        details = {}
        # NOTE(review): the 'M' test matches any text containing a capital M,
        # so the inspected candidates may be unrelated elements — verify
        # against the live page before relying on these two fields.
        candidates = soup.find_all(
            ['div', 'span', 'td'],
            string=lambda text: text and ('Class' in text or 'HANDICAP' in text or 'M' in text),
        )
        for element in candidates[:cls._MAX_DETAIL_CANDIDATES]:
            text = element.get_text()
            if 'Class' in text:
                details['race_class'] = text.strip()
            distance = cls._extract_distance(text)
            if distance:
                details['distance'] = distance
        return details

    @classmethod
    def _parse_result_row(cls, cells, date_label, venue, race_no):
        """Build one performance dict from the cells of a result table row."""
        def cell_text(index):
            # Columns past the end of a short row come back as ''.
            return cells[index].get_text(strip=True) if index < len(cells) else ''

        # Extract horse ID from the link in the horse-name cell, if available
        horse_id = None
        horse_link = cells[2].find('a')
        if horse_link and horse_link.get('href'):
            match = cls._HORSE_ID_RE.search(horse_link['href'])
            if match:
                horse_id = match.group(1)

        # NOTE(review): the column positions below are assumed, not read from
        # the table header — confirm them against the live results table.
        return {
            'date': date_label,
            'venue': venue,
            'race_no': race_no,
            'data_type': 'performance',
            'position': cell_text(0),
            'horse_no': cell_text(1),
            'horse_name': cell_text(2),
            'horse_id': horse_id,
            'jockey': cell_text(3),
            'trainer': cell_text(4),
            'weight': cell_text(5),
            'draw': cell_text(6),
            'margin': cell_text(7),
            'time': cell_text(8),
            'odds': cell_text(9),
            'scrape_time': datetime.now().isoformat()
        }

    def scrape_single_race(self, date, venue, race_no):
        """Scrape one race result page and append its rows to ``all_data``.

        Args:
            date: ``datetime`` (or ``date``) of the meeting.
            venue: racecourse code placed in the URL (e.g. 'ST', 'HV').
            race_no: race number placed in the URL.

        Returns:
            True when a results table was found and its rows were stored;
            False when no table was found or an error occurred. Errors are
            also recorded in ``self.errors``.
        """
        date_label = date.strftime('%Y-%m-%d')
        try:
            # Build URL
            url = self.RESULTS_URL.format(
                date=date.strftime('%Y/%m/%d'), venue=venue, race_no=race_no
            )

            print(f"🏇 Scraping: {date_label} {venue} R{race_no}")

            if self.driver is None:
                raise RuntimeError("Browser not started - call start_browser() first")

            # Load page and give it a moment to settle
            self.driver.get(url)
            time.sleep(self.PAGE_SETTLE_SECONDS)

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Check if race exists (look for results table)
            results_table = soup.find('table', class_='table_bd')
            if not results_table:
                print(f"  ⚠️ No results table found - race may not exist")
                return False

            # Basic race info
            race_info = {
                'date': date_label,
                'venue': venue,
                'race_no': race_no,
                'data_type': 'race_info',
                'scrape_time': datetime.now().isoformat()
            }

            # Race details are optional: report a failure but keep the race.
            try:
                race_info.update(self._extract_race_details(soup))
            except Exception as e:
                print(f"  ⚠️ Could not extract race details: {e}")

            # Horse performances; rows parsed before a failure are kept.
            performances = []
            try:
                for row in results_table.find_all('tr')[1:]:  # Skip header
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= self._MIN_RESULT_CELLS:
                        performances.append(
                            self._parse_result_row(cells, date_label, venue, race_no)
                        )
            except Exception as e:
                print(f"  ⚠️ Error extracting performances: {e}")

            self.all_data.append(race_info)
            self.all_data.extend(performances)

            self.processed_count += 1
            print(f"  ✅ Extracted {len(performances)} horses")

            return True

        except Exception as e:
            error_msg = f"Error scraping {date_label} {venue} R{race_no}: {str(e)}"
            print(f"  ❌ {error_msg}")
            self.errors.append(error_msg)
            return False

    def save_data(self, filename_prefix="hkjc_data", output_dir=None):
        """Write everything collected so far to CSV files.

        Writes one combined CSV, one CSV per ``data_type`` value and, when
        errors were recorded, a text file with one error per line. All file
        names share the same timestamp.

        Args:
            filename_prefix: leading part of every file name written.
            output_dir: target directory; defaults to ``Config.OUTPUT_DIR``.
                It is created if missing.

        Returns:
            Path of the combined CSV, or None when there is no data to save.
        """
        if not self.all_data:
            print("⚠️ No data to save!")
            return None

        if output_dir is None:
            output_dir = Config.OUTPUT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Convert to DataFrame
        df = pd.DataFrame(self.all_data)

        # Save main data file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = os.path.join(output_dir, f"{filename_prefix}_{timestamp}.csv")
        df.to_csv(filename, index=False)
        print(f"✅ Saved {len(df)} records to {filename}")

        # Save separate files by data type
        for data_type in df['data_type'].unique():
            type_df = df[df['data_type'] == data_type]
            type_filename = os.path.join(
                output_dir, f"{filename_prefix}_{data_type}_{timestamp}.csv"
            )
            type_df.to_csv(type_filename, index=False)
            print(f"  📊 {data_type}: {len(type_df)} records → {type_filename}")

        # Save errors if any (explicit UTF-8: messages may contain non-ASCII text)
        if self.errors:
            error_filename = os.path.join(
                output_dir, f"{filename_prefix}_errors_{timestamp}.txt"
            )
            with open(error_filename, 'w', encoding='utf-8') as f:
                for error in self.errors:
                    f.write(error + '\n')
            print(f"  ⚠️ Saved {len(self.errors)} errors to {error_filename}")

        return filename

# Signals that the cell defining HKJCScraper has been executed.
print("✅ HKJCScraper class ready!")


In [22]:
# Utility Functions
def generate_race_dates(start_date, end_date):
    """Return every Wednesday, Saturday and Sunday in a date range.

    Both bounds are 'YYYY-MM-DD' strings and are included in the range. The
    result is a chronologically ordered list of ``datetime`` objects; it is
    empty when ``end_date`` precedes ``start_date``.
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')
    racing_weekdays = (2, 5, 6)  # weekday() counts from Monday=0: Wed, Sat, Sun

    span_in_days = (last_day - first_day).days
    every_day = (first_day + timedelta(days=offset) for offset in range(span_in_days + 1))
    return [day for day in every_day if day.weekday() in racing_weekdays]

def get_race_venues():
    """Return the racecourse codes used by the scraper, as a new list."""
    sha_tin, happy_valley = 'ST', 'HV'
    return [sha_tin, happy_valley]

def get_race_numbers():
    """Return the race numbers to try, 1 through 11, as a list."""
    first_race, last_race = 1, 11
    return [number for number in range(first_race, last_race + 1)]

# Main scraping function
def run_hkjc_scraper(start_date=None, end_date=None, venues=None, max_races=None):
    """Scrape every candidate race in a date range and save the results.

    For each date from ``generate_race_dates`` and each venue, races from
    ``get_race_numbers()`` are attempted one by one, pausing
    ``Config.RATE_LIMIT`` seconds after every attempt. Collected data is
    saved each time ``Config.BATCH_SIZE`` further races have succeeded, and
    once more at the end.

    Args:
        start_date: 'YYYY-MM-DD' string; defaults to ``Config.START_DATE``.
        end_date: 'YYYY-MM-DD' string; defaults to ``Config.END_DATE``.
        venues: iterable of racecourse codes (a single code string is also
            accepted); defaults to ``get_race_venues()``.
        max_races: stop after this many successfully scraped races; a falsy
            value means no limit.

    Returns:
        The ``HKJCScraper`` holding the collected data, or None if a fatal
        error interrupted the run. The browser is stopped in either case.
    """
    # Fall back to the configured window / full venue list
    if start_date is None:
        start_date = Config.START_DATE
    if end_date is None:
        end_date = Config.END_DATE
    if venues is None:
        venues = get_race_venues()
    elif isinstance(venues, str):
        # A bare 'ST' would otherwise be iterated character by character.
        venues = [venues]

    print(f"🚀 Starting HKJC scraper")
    print(f"📅 Date range: {start_date} to {end_date}")
    print(f"🏟️ Venues: {venues}")
    print(f"🎯 Max races: {max_races if max_races else 'No limit'}")

    # Initialize scraper
    scraper = HKJCScraper()
    total_processed = 0

    def limit_reached():
        """True once max_races (when set) successful scrapes have been made."""
        return bool(max_races) and total_processed >= max_races

    try:
        # Start browser
        scraper.start_browser()

        # Generate race dates
        race_dates = generate_race_dates(start_date, end_date)
        print(f"📊 Found {len(race_dates)} potential race dates")

        # Process each date
        for race_date in race_dates:
            if limit_reached():
                print(f"🛑 Reached maximum races limit ({max_races})")
                break

            print(f"\n📅 Processing {race_date.strftime('%Y-%m-%d %A')}")

            # Process each venue
            for venue in venues:
                if limit_reached():
                    break

                print(f"  🏟️ Venue: {venue}")

                for race_no in get_race_numbers():
                    if limit_reached():
                        break

                    success = scraper.scrape_single_race(race_date, venue, race_no)

                    if success:
                        total_processed += 1
                        print(f"    📈 Progress: {total_processed}/{max_races if max_races else '∞'}")

                        # Save periodically. This is checked only right after
                        # a success: checking after failed attempts too would
                        # rewrite the same batch while the counter sits on a
                        # multiple of BATCH_SIZE.
                        if total_processed % Config.BATCH_SIZE == 0:
                            print(f"\n💾 Saving batch at {total_processed} races...")
                            scraper.save_data(f"batch_{total_processed // Config.BATCH_SIZE:03d}")

                    # Rate limiting (applies to failed attempts as well)
                    time.sleep(Config.RATE_LIMIT)

        # Final save
        print(f"\n💾 Final save...")
        final_file = scraper.save_data("final")

        # Summary
        print(f"\n📊 === SCRAPING SUMMARY ===")
        print(f"✅ Total races processed: {scraper.processed_count}")
        print(f"📁 Total data records: {len(scraper.all_data)}")
        print(f"❌ Errors encountered: {len(scraper.errors)}")
        print(f"💾 Final data file: {final_file}")

        return scraper

    except Exception as e:
        print(f"💥 Fatal error: {e}")
        return None

    finally:
        # Always clean up
        scraper.stop_browser()
        print("🧹 Cleanup completed")

# Test the utility functions: none of these calls touch the network or the
# browser, they only exercise the pure helpers defined above.
print("🔧 Testing utility functions...")
test_dates = generate_race_dates('2024-01-01', '2024-01-31')
print(f"✅ Generated {len(test_dates)} race dates for January 2024")
# Show the first three dates together with their weekday names
print(f"📅 Sample dates: {[d.strftime('%Y-%m-%d %A') for d in test_dates[:3]]}")
print(f"🏟️ Venues: {get_race_venues()}")
print(f"🏇 Race numbers: {get_race_numbers()}")
print("✅ All utility functions ready!")


In [23]:
# 🧪 TEST RUN - Small sample
# Smoke test: scrape at most three Sha Tin races from one week of January 2024.
print("🧪 === TEST RUN ===")
print("Running test with small sample to verify everything works...")

# Test with just a few races from January 2024
test_result = run_hkjc_scraper(
    start_date='2024-01-03',  # A Wednesday
    end_date='2024-01-07',    # A Sunday
    venues=['ST'],            # Just Sha Tin
    max_races=3               # Only 3 races total
)

print("\n🧪 === TEST COMPLETED ===")
# run_hkjc_scraper returns None after a fatal error; a run that finished but
# scraped no race at all is not counted as a working scraper either.
if test_result is not None and test_result.processed_count > 0:
    print("✅ Test successful! The scraper is working correctly.")
    print("💡 You can now run larger scraping jobs.")
else:
    print("❌ Test failed. Check the error messages above.")
    print("💡 Make sure Chrome browser is installed and internet connection is stable.")


In [26]:
# 🚀 FULL SCRAPE
# This cell launches a long-running job. Set the switch below to False to
# skip it, e.g. when re-running the whole notebook.
RUN_FULL_SCRAPE = True

print("🚀 === FULL SCRAPING ===")
print("Set RUN_FULL_SCRAPE = False in this cell to skip the long-running job.")

result3 = None
if RUN_FULL_SCRAPE:
    print("\n📊 Example 3: Large dataset...")
    result3 = run_hkjc_scraper(
        start_date='2024-05-26',  # start from here
        end_date='2025-07-14',
        venues=['ST', 'HV'],
        max_races=None
    )
else:
    print("\n⏭️ Full scrape skipped (RUN_FULL_SCRAPE is False)")

print("\n📋 === USAGE INSTRUCTIONS ===")
print("1. ✅ Run the test above first to verify everything works")
print("2. 🔧 Modify the date ranges and venues as needed")
print("3. 🚀 Set RUN_FULL_SCRAPE = True and run this cell")
print("4. ⏰ Be patient - scraping takes time due to rate limiting")
print("5. 📁 All data will be saved to CSV files in the output/ directory")

# File names follow the prefixes run_hkjc_scraper passes to save_data.
print("\n📊 === DATA OUTPUT ===")
print("• Main data file: final_TIMESTAMP.csv (all data combined)")
print("• Race info: final_race_info_TIMESTAMP.csv")
print("• Performance data: final_performance_TIMESTAMP.csv")
print("• Error log: final_errors_TIMESTAMP.txt (if any errors)")
print("• Periodic snapshots: batch_NNN_*_TIMESTAMP.csv")

print("\n✨ === FEATURES ===")
print("✅ No CORS issues (uses Selenium)")
print("✅ Race results and horse data")


print("✅ Automatic CSV export")
print("✅ Error handling and recovery")
print("✅ Rate limiting (respectful scraping)")
print("✅ Progress tracking")

print("\n🎯 Adjust the parameters above and re-run this cell to scrape a different range.")


In [7]:
# 📊 DATA ANALYSIS UTILITIES
def analyze_scraped_data(output_dir="./output/"):
    """Print a short summary of every CSV file found in ``output_dir``.

    For each file the row and column counts, the first column names and one
    sample row are printed. Files whose name contains 'performance' also get
    unique horse / race counts when the needed columns are present. A file
    that cannot be read is reported and skipped.

    Args:
        output_dir: directory to scan; a trailing separator is optional.

    Returns:
        None. All results are printed.
    """
    print("📊 === DATA ANALYSIS ===")

    # Find all CSV files (sorted so the report order is stable)
    csv_files = sorted(glob.glob(os.path.join(output_dir, "*.csv")))

    if not csv_files:
        print("⚠️ No CSV files found in output directory!")
        print(f"📁 Looking in: {output_dir}")
        return

    print(f"📁 Found {len(csv_files)} CSV files:")

    race_keys = ['date', 'venue', 'race_no']

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            # basename rather than split('/') so Windows paths work as well
            filename = os.path.basename(file)
            print(f"\n📄 {filename}:")
            print(f"  📊 Rows: {len(df):,}")
            print(f"  📊 Columns: {len(df.columns)}")

            # Extra counts for performance files
            if 'performance' in filename:
                if 'horse_name' in df.columns:
                    unique_horses = df['horse_name'].nunique()
                else:
                    unique_horses = 'N/A'
                if all(col in df.columns for col in race_keys):
                    unique_races = df.groupby(race_keys).ngroups
                else:
                    unique_races = 'N/A'
                print(f"  🏇 Unique horses: {unique_horses}")
                print(f"  🏁 Unique races: {unique_races}")

            # Show weather data summary
            if 'race_info' in filename and 'weather' in df.columns:
                print(f"  🌤️ Weather data available: Yes")

            # Show column names (first few)
            print(f"  📋 Sample columns: {list(df.columns[:5])}")

            # Show sample data
            if len(df) > 0:
                print(f"  📝 Sample data:")
                sample_cols = df.columns[:4]  # First 4 columns
                print(f"    {df[sample_cols].head(1).to_string(index=False, max_cols=4)}")

        except Exception as e:
            print(f"❌ Error reading {file}: {e}")

    print(f"\n✅ Analysis complete!")

def combine_data_files(output_dir="./output/"):
    """Merge the per-type CSV files in ``output_dir`` into combined files.

    Files matching ``*performance*.csv`` are merged into
    ``combined_performance_data.csv`` and files matching ``*race_info*.csv``
    into ``combined_race_info.csv``. A combined file is written only when
    more than one source file of that type exists.

    The ``combined_*`` outputs of earlier calls are not used as inputs, and
    rows that are identical in every column are written once, so calling the
    function repeatedly, or over overlapping snapshot files, does not
    multiply records. Values are read as text so they are written back
    exactly as they appear in the source files.

    Args:
        output_dir: directory to scan and write to; a trailing separator is
            optional.

    Returns:
        None. Progress is printed.
    """
    print("🔗 === COMBINING DATA FILES ===")

    def source_files(pattern):
        """Sorted matches for ``pattern``, minus this function's own outputs."""
        matches = glob.glob(os.path.join(output_dir, pattern))
        return sorted(
            path for path in matches
            if not os.path.basename(path).startswith("combined_")
        )

    def merge(paths):
        """Concatenate the CSVs as text and drop fully identical rows."""
        frames = [pd.read_csv(path, dtype=str) for path in paths]
        merged = pd.concat(frames, ignore_index=True)
        return merged.drop_duplicates().reset_index(drop=True)

    # Look for batch files
    performance_files = source_files("*performance*.csv")
    race_info_files = source_files("*race_info*.csv")

    if len(performance_files) > 1:
        print(f"🔗 Combining {len(performance_files)} performance files...")
        all_performance = merge(performance_files)
        combined_file = os.path.join(output_dir, "combined_performance_data.csv")
        all_performance.to_csv(combined_file, index=False)
        print(f"✅ Combined performance data: {len(all_performance):,} records → {combined_file}")

    if len(race_info_files) > 1:
        print(f"🔗 Combining {len(race_info_files)} race info files...")
        all_race_info = merge(race_info_files)
        combined_file = os.path.join(output_dir, "combined_race_info.csv")
        all_race_info.to_csv(combined_file, index=False)
        print(f"✅ Combined race info: {len(all_race_info):,} records → {combined_file}")

    print("✅ Combining complete!")

# Run analysis on any existing data: summarise whatever CSV files are already
# in the default output directory, then merge the per-type files.

print("🔍 Checking for existing data files...")
analyze_scraped_data()
combine_data_files()

print("\n📋 === ANALYSIS FUNCTIONS READY ===")
print("• analyze_scraped_data() - Analyze all CSV files")
print("• combine_data_files() - Combine multiple batch files")
print("\n💡 Run these functions after scraping to analyze your data!")
