In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [2]:
df = pd.read_csv('../data/race_data/race_data_20251007.csv')

  df = pd.read_csv('../data/race_data/race_data_20251007.csv')


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66472 entries, 0 to 66471
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   RaceIndex         66472 non-null  int64 
 1   Pla.              66472 non-null  object
 2   Date              66472 non-null  object
 3   RC/Track/Course   66472 non-null  object
 4   Dist.             66472 non-null  int64 
 5   G                 66472 non-null  object
 6   RaceClass         66472 non-null  object
 7   Dr.               66472 non-null  object
 8   Rtg.              66472 non-null  object
 9   Trainer           66472 non-null  object
 10  Jockey            66423 non-null  object
 11  LBW               66472 non-null  object
 12  Win Odds          66472 non-null  object
 13  Act.Wt.           66472 non-null  int64 
 14  RunningPosition   66472 non-null  object
 15  Finish Time       66472 non-null  object
 16  Declar.Horse Wt.  66472 non-null  object
 17  Gear        

In [4]:
df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,238,12,10/12/2023,"ST / Turf / ""A""",1600,G,G1,10,--,T Yasuda,...,1.35.46,1187,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
1,623,05,30/04/2023,"ST / Turf / ""A""",2000,G,G1,7,--,T Yasuda,...,2.02.71,1179,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
2,240,02,11/12/2022,"ST / Turf / ""A""",2000,G,G1,6,--,T Yasuda,...,2.00.44,1150,--,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
3,402,11,06/02/2021,"ST / Turf / ""C""",1200,G,5,2,18,C H Yip,...,1.10.66,1045,CP-/TT-,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
4,296,05,26/12/2020,"ST / Turf / ""A+3""",1200,G,5,14,18,C H Yip,...,1.10.42,1058,CP/TT,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66467,66,06,04/10/25,ST / AWT,1650,GD,5,11,40,K H Ting,...,1.41.37,1113,B/TT,K069,AUS / 4,Bay / Gelding,PPG,Deep Field,Nakataan,Zabeel
66468,72,07,04/10/25,"ST / Turf / ""B+2""",1000,GF,3,6,62,C H Yip,...,0.57.11,1089,TT,K377,GB / 3,Bay / Gelding,PPG,Land Force,Phantom Spirit,Invincible Spirit
66469,69,03,04/10/25,"ST / Turf / ""B+2""",1400,GF,4,5,51,C Fownes,...,1.21.82,1157,TT,K398,NZ / 4,Bay / Gelding,PPG,Ardrossan,Dolce Amore,Sebring
66470,66,10,04/10/25,ST / AWT,1650,GD,5,6,37,P C Ng,...,1.41.52,1201,B/TT,J218,AUS / 5,Bay / Gelding,PPG,Deep Field,Alberton Park,Thorn Park


# Basic Cleaning

## utility function

In [5]:
# combine race index with date to form new index
def combine_index_date(df, col1, col2, new_col):

    df['race_index'] = df[col1].astype(str) + df[col2].dt.strftime('%d%m%y')

    df = df.drop(columns = ['RaceIndex'])

    return df


In [6]:
# convert to new race index format
def convert_year(date_str):
    day, month, year = date_str.split('/')

    if len(year) == 4:
        year = year[2:]

    date = f'{day}/{month}/{year}'
    
    return pd.to_datetime(date, format = '%d/%m/%y')

In [7]:
# clean RC, Track, Course
def clean_rc_track_course(text):
    parts = str(text).split('/')

    if len(parts) < 3:
        rc = parts[0].strip()
        track = parts[1].strip()
        course = None
    
    else:
        rc, track, course = parts[0].strip(), parts[1].strip(), parts[2].strip().replace('"', '')

    return rc, track, course

In [8]:
# clean origin, age
def clean_origin_age(text):
    parts = str(text).split('/')

    if len(parts) < 2:
        origin = parts[0].strip()
        age = None
    
    else:
        origin = parts[0].strip()
        age = parts[1].strip()
    
    return origin, age

In [9]:
# clean colour, sex
def clean_colour_sex(text):
    parts = str(text).split('/')

    colour = parts[0].strip()
    sex = parts[-1].strip()

    return colour, sex

## Workflow

In [10]:
exclude = ['WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
df = df[~df['Pla.'].isin(exclude)].copy()

In [11]:
df['Date'] = df['Date'].apply(convert_year)

In [12]:
df = combine_index_date(df, 'RaceIndex', 'Date', 'race_index')

In [13]:
# exponential decay for top finishers
df['target'] = df.groupby('race_index')['Pla.'].transform(
    lambda x: x.apply(lambda place: np.exp(-(int(place)-1)/2) if str(place).isdigit() and int(place) <= 4 else 0)
)

In [14]:
df[['rc', 'track', 'course']] = df['RC/Track/Course'].apply(clean_rc_track_course).apply(pd.Series)
df = df.drop(columns=['RC/Track/Course'])

In [15]:
df[['origin', 'age']] = df['Origin / Age'].apply(clean_origin_age).apply(pd.Series)
df = df.drop(columns = ['Origin / Age'])

In [16]:
# clean colour, sex
df[['colour', 'sex']] = df['Colour / Sex'].apply(clean_colour_sex).apply(pd.Series)
df = df.drop(columns = ['Colour / Sex'])

In [17]:
# clean rating
df['Rtg.'] = pd.to_numeric(df['Rtg.'], errors = 'coerce')

In [18]:
# rename columns
df = df.rename(columns = {'Dr.' : 'gate_position', 'G' : 'track_condition'})

In [19]:
df = df[['Date', 'race_index', 'RaceClass', 'rc', 'track', 'course', 'Dist.', 'track_condition', 'Horse_id', 'Declar.Horse Wt.', 'Act.Wt.', 'gate_position', 'Rtg.', 'age', 'colour', 'sex', 'origin', 'Import type', 'Trainer', 'Jockey', 'Sire', 'Dam', 'Dam sire', 'Finish Time', 'Gear', 'target']]

In [20]:
for col in ['race_index', 'Declar.Horse Wt.', 'age']:
    df[col] = pd.to_numeric(df[col], errors = 'coerce')

In [21]:
df = df[df['Date'] > pd.to_datetime('04/09/20', format = '%d/%m/%y')]

In [22]:
df

Unnamed: 0,Date,race_index,RaceClass,rc,track,course,Dist.,track_condition,Horse_id,Declar.Horse Wt.,...,origin,Import type,Trainer,Jockey,Sire,Dam,Dam sire,Finish Time,Gear,target
0,2023-12-10,238101223,G1,ST,Turf,A,1600,G,H811,1187,...,JPN,VIS,T Yasuda,Y Kitamura,Just A Way,Epic Love,Dansili,1.35.46,H,0.000000
1,2023-04-30,623300423,G1,ST,Turf,A,2000,G,H811,1179,...,JPN,VIS,T Yasuda,C Y Ho,Just A Way,Epic Love,Dansili,2.02.71,H,0.000000
2,2022-12-11,240111222,G1,ST,Turf,A,2000,G,H811,1150,...,JPN,VIS,T Yasuda,Y Kitamura,Just A Way,Epic Love,Dansili,2.00.44,--,0.606531
3,2021-02-06,402060221,5,ST,Turf,C,1200,G,C017,1045,...,AUS,PPG,C H Yip,C Wong,Smart Missile,Pyrography,Danzero,1.10.66,CP-/TT-,0.000000
4,2020-12-26,296261220,5,ST,Turf,A+3,1200,G,C017,1058,...,AUS,PPG,C H Yip,M F Poon,Smart Missile,Pyrography,Danzero,1.10.42,CP/TT,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66467,2025-10-04,66041025,5,ST,AWT,,1650,GD,K069,1113,...,AUS,PPG,K H Ting,P N Wong,Deep Field,Nakataan,Zabeel,1.41.37,B/TT,0.000000
66468,2025-10-04,72041025,3,ST,Turf,B+2,1000,GF,K377,1089,...,GB,PPG,C H Yip,Y L Chung,Land Force,Phantom Spirit,Invincible Spirit,0.57.11,TT,0.000000
66469,2025-10-04,69041025,4,ST,Turf,B+2,1400,GF,K398,1157,...,NZ,PPG,C Fownes,L Ferraris,Ardrossan,Dolce Amore,Sebring,1.21.82,TT,0.367879
66470,2025-10-04,66041025,5,ST,AWT,,1650,GD,J218,1201,...,AUS,PPG,P C Ng,K Teetan,Deep Field,Alberton Park,Thorn Park,1.41.52,B/TT,0.000000


# Feature engineering

In [23]:
df = df.sort_values(['Horse_id', 'Date'])

n = 3

df['recent_3_win_rate_horse'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=n, min_periods = 3).mean())
)

In [24]:
df = df.sort_values(['Jockey', 'Date'])

df['recent_3_win_rate_jockey'] = (
    df.groupby('Jockey')['target']
    .transform(lambda x: x.shift().rolling(window=n, min_periods = 3).mean())
)

In [25]:
# Expand your rolling statistics beyond just win rate
df['recent_5_avg_finish_pos'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=5, min_periods=2).mean())
)

df['recent_3_consistency'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=3, min_periods=2).std())
)

# Jockey-Trainer combination performance
df['jockey_trainer_combo_rate'] = (
    df.groupby(['Jockey', 'Trainer'])['target']
    .transform(lambda x: x.shift().expanding(min_periods=5).mean())
)

# Horse performance on specific track/distance combinations
df['horse_track_distance_rate'] = (
    df.groupby(['Horse_id', 'track', 'Dist.'])['target']
    .transform(lambda x: x.shift().expanding(min_periods=2).mean())
)

In [26]:
overall_mean_win_rate = np.mean(df['target'] == 4)
df['recent_3_win_rate_horse'] = df['recent_3_win_rate_horse'].fillna(overall_mean_win_rate)
df['recent_3_win_rate_jockey'] = df['recent_3_win_rate_jockey'].fillna(overall_mean_win_rate)
df['recent_5_avg_finish_pos'] = df['recent_5_avg_finish_pos'].fillna(overall_mean_win_rate)
df['recent_3_consistency'] = df['recent_3_consistency'].fillna(overall_mean_win_rate)
df['jockey_trainer_combo_rate'] = df['jockey_trainer_combo_rate'].fillna(overall_mean_win_rate)
df['horse_track_distance_rate'] = df['horse_track_distance_rate'].fillna(overall_mean_win_rate)

In [27]:
df.sort_values('race_index', ascending=False)

Unnamed: 0,Date,race_index,RaceClass,rc,track,course,Dist.,track_condition,Horse_id,Declar.Horse Wt.,...,Dam sire,Finish Time,Gear,target,recent_3_win_rate_horse,recent_3_win_rate_jockey,recent_5_avg_finish_pos,recent_3_consistency,jockey_trainer_combo_rate,horse_track_distance_rate
61481,2025-07-16,847160725,3,HV,Turf,B,1200,GF,H396,1056,...,He's A Decoy,1.09.37,H/XB,0.000000,0.074377,0.407710,0.365932,0.128824,0.194644,0.279518
56454,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J537,1176,...,Choisir,1.09.88,CP,0.000000,0.074377,0.407710,0.244626,0.128824,0.174437,0.305783
63166,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J401,1120,...,Bernardini,1.09.53,CP,0.000000,0.455960,0.658137,0.000000,0.000000,0.084676,0.000000
58494,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J393,1035,...,Nayef,1.09.22,TT,0.367879,0.535510,0.202177,0.000000,0.000000,0.112289,0.000000
15631,2025-07-16,847160725,3,HV,Turf,B,1200,GF,G016,1081,...,Keeper,1.09.67,TT,0.000000,0.000000,0.000000,0.044626,0.000000,0.170994,0.339009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42204,2021-09-05,1050921,5,ST,Turf,A,1400,G,A217,1091,...,Anabaa,1.23.22,B,0.000000,0.000000,0.000000,0.273576,0.000000,0.099228,0.000000
13160,2021-09-05,1050921,5,ST,Turf,A,1400,G,B288,1048,...,High Chaparral,1.22.66,--,1.000000,0.122626,0.000000,0.118202,0.212395,0.123311,0.116306
8400,2021-09-05,1050921,5,ST,Turf,A,1400,G,D190,1054,...,Green Desert,1.24.43,--,0.000000,0.000000,0.333333,0.000000,0.000000,0.061043,0.000000
7835,2021-09-05,1050921,5,ST,Turf,A,1400,G,D307,984,...,Cadeaux Genereux,1.22.76,H/TT,0.606531,0.000000,0.407710,0.394882,0.503769,0.270866,0.000000


In [28]:
current_time = datetime.now().strftime('%Y%m%d')

output_dir = '../data/'
output_path = os.path.join(output_dir, f'cleaned_data_{current_time}.csv')

os.makedirs(output_dir, exist_ok = True)

df.to_csv(output_path, index = False)