In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

In [9]:
# Load the horse table; its 'links' column is what scraping_races iterates over.
df = pd.read_csv(filepath_or_buffer='../data/horses_data/horses.csv')

In [12]:
def _labelled_value(soup, label, default='na'):
    """Return the text found two elements after the ``<td>`` that names ``label``.

    A ``<td>`` whose text is exactly ``label`` is preferred, so that short
    labels such as 'Dam' or 'Sire' are not satisfied by a "Dam's Sire" cell
    that happens to come first. If no exact match exists, the first ``<td>``
    whose text merely contains ``label`` is used.

    Args:
        soup: Parsed page (BeautifulSoup object).
        label: Label text to look for inside a ``<td>``.
        default: Value returned when the label or its value cannot be located.

    Returns:
        The stripped text of the value element, or ``default``.
    """
    candidates = soup.find_all('td', string=lambda text: text and label in text)
    if not candidates:
        return default

    label_cell = next(
        (cell for cell in candidates if cell.get_text(strip=True) == label),
        candidates[0],
    )

    # The value sits two elements after the label cell.
    # NOTE(review): presumably label -> separator cell -> value cell; confirm
    # against the page markup.
    first_hop = label_cell.find_next()
    value_cell = first_hop.find_next() if first_hop is not None else None
    if value_cell is None:
        return default
    return value_cell.get_text(strip=True)


def scraping_races(df):
    """Scrape the per-race table of every horse page listed in ``df['links']``.

    For each link the page ``<link>&Option=1`` is requested, the horse profile
    fields (origin/age, colour/sex, import type, sire, dam, dam's sire) are
    read, and every data row of the ``table.bigborder`` element is collected
    together with those profile fields and the horse id parsed from the link.

    Links that are missing (NaN/None) or not strings are reported and skipped
    instead of aborting the whole run. Request failures, non-200 responses and
    pages without the table are also reported and skipped.

    Args:
        df: DataFrame with a 'links' column holding the horse page URLs.

    Returns:
        A new DataFrame with one row per scraped race row. Columns are the
        table's header cells (minus the last one) followed by 'Horse_id',
        'Origin / Age', 'Colour / Sex', 'Import type', 'Sire', 'Dam' and
        'Dam sire'.

    Raises:
        RuntimeError: If no page yielded a table header.
    """
    # Labels to look up on every page, in the order their values are appended
    # to each row; this order matches the extra header columns added below.
    profile_labels = (
        'Country of Origin',
        'Colour / Sex',
        'Import Type',
        'Sire',
        'Dam',
        'Dam\'s Sire',
    )
    extra_columns = [
        'Horse_id',
        'Origin / Age',
        'Colour / Sex',
        'Import type',
        'Sire',
        'Dam',
        'Dam sire',
    ]

    # Initiate header and rows for storage
    header = None
    rows = []

    for href in df['links'].tolist():
        # A blank cell in the CSV arrives as NaN (a float); concatenating it
        # with a string would raise TypeError and lose everything scraped so far.
        if not isinstance(href, str) or not href.strip():
            print(f'Skipping missing or invalid link: {href!r}')
            continue

        full_url = href + '&Option=1'

        try:
            response = requests.get(full_url, timeout=10)
        except Exception as e:
            # Best effort: one unreachable page must not stop the whole scrape.
            print(f'Request exception for {full_url}: {str(e)}')
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve {full_url}")
            continue

        # Horse id: one capital letter plus three digits at the end of the link.
        match = re.search(r'([A-Z]\d{3})$', href)
        horse_id = match.group(1) if match else None

        soup = BeautifulSoup(response.content, 'html.parser')
        profile_values = [_labelled_value(soup, label) for label in profile_labels]

        # find horse per race info
        horse_table = soup.find('table', class_='bigborder')
        if not horse_table:
            print(f'Table not found in {full_url}')
            continue

        table_rows = horse_table.find_all('tr')
        if not table_rows:
            print(f'Table has no rows in {full_url}')
            continue

        if header is None:
            # The first row of the first usable table defines the columns.
            header = [td.get_text(strip=True) for td in table_rows[0].find_all('td')]
            header = header[:-1] + extra_columns

        for tr in table_rows[1:]:
            cols = [td.get_text(strip=True) for td in tr.find_all('td')]

            # Skip spacer rows (no cells or only blank cells).
            if all(not col.strip() for col in cols):
                continue
            # Skip section rows (season captions and the 'Overseas' marker).
            first_col = cols[0].strip()
            if 'Season' in first_col or first_col == 'Overseas':
                continue

            # NOTE(review): the header drops 1 trailing cell while data rows
            # drop 2 - presumably data rows carry one extra trailing cell.
            # Confirm against the page; a row of a different length will be
            # padded or rejected by pandas when the frame is built.
            rows.append(cols[:-2] + [horse_id] + profile_values)

    if not header:
        raise RuntimeError("No data table header found! Check the structure of the page.")

    return pd.DataFrame(rows, columns=header)


In [13]:
# Run the scrape over every link in the horse table (one HTTP request per link).
race_df = scraping_races(df=df)

In [20]:
# Write the scraped race rows to CSV, creating the target folder if needed.
output_dir = '../data/race_data/'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'race_data.csv')
race_df.to_csv(path_or_buf=output_path, index=False)

The dates for some horses are not in a consistent format and still need to be corrected.