In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
df = pd.read_csv('../data/race_data/race_data.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65523 entries, 0 to 65522
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   RaceIndex         65523 non-null  int64 
 1   Pla.              65523 non-null  object
 2   Date              65523 non-null  object
 3   RC/Track/Course   65523 non-null  object
 4   Dist.             65523 non-null  int64 
 5   G                 65523 non-null  object
 6   RaceClass         65523 non-null  object
 7   Dr.               65523 non-null  object
 8   Rtg.              65523 non-null  object
 9   Trainer           65523 non-null  object
 10  Jockey            65474 non-null  object
 11  LBW               65523 non-null  object
 12  Win Odds          65523 non-null  object
 13  Act.Wt.           65523 non-null  int64 
 14  RunningPosition   65523 non-null  object
 15  Finish Time       65523 non-null  object
 16  Declar.Horse Wt.  65523 non-null  object
 17  Gear        

In [8]:
df

Unnamed: 0,RaceIndex,Pla.,Date,RC/Track/Course,Dist.,G,RaceClass,Dr.,Rtg.,Trainer,...,Finish Time,Declar.Horse Wt.,Gear,Horse_id,Origin / Age,Colour / Sex,Import type,Sire,Dam,Dam sire
0,238,12,10/12/2023,"ST / Turf / ""A""",1600,G,G1,10,--,T Yasuda,...,1.35.46,1187,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
1,623,05,30/04/2023,"ST / Turf / ""A""",2000,G,G1,7,--,T Yasuda,...,2.02.71,1179,H,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
2,240,02,11/12/2022,"ST / Turf / ""A""",2000,G,G1,6,--,T Yasuda,...,2.00.44,1150,--,H811,JPN,Bay / Horse,VIS,Just A Way,Epic Love,Dansili
3,402,11,06/02/2021,"ST / Turf / ""C""",1200,G,5,2,18,C H Yip,...,1.10.66,1045,CP-/TT-,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
4,296,05,26/12/2020,"ST / Turf / ""A+3""",1200,G,5,14,18,C H Yip,...,1.10.42,1058,CP/TT,C017,AUS,Brown / Gelding,PPG,Smart Missile,Pyrography,Danzero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65518,278,03,23/12/23,"ST / Turf / ""C""",1400,G,3,11,68,C Fownes,...,1.22.88,1152,--,H459,AUS / 6,Bay / Gelding,PP,Impending,Isola Blu,Blackfriars
65519,240,02,10/12/23,"ST / Turf / ""A""",1200,G,3,10,68,C Fownes,...,1.09.52,1153,--,H459,AUS / 6,Bay / Gelding,PP,Impending,Isola Blu,Blackfriars
65520,185,02,19/11/23,"ST / Turf / ""B+2""",1200,GF,3,6,68,C Fownes,...,1.08.94,1155,--,H459,AUS / 6,Bay / Gelding,PP,Impending,Isola Blu,Blackfriars
65521,801,06,01/07/25,"ST / Turf / ""C""",1200,G,4,11,53,M Newnham,...,1.10.15,1127,B1,K334,AUS / 4,Grey / Gelding,PPG,Street Boss,Varanasi,Encosta de Lago


# Basic Cleaning

## utility function

In [9]:
# combine race index with date to form new index
def combine_index_date(df, col1, col2, new_col):

    df['race_index'] = df[col1].astype(str) + df[col2].dt.strftime('%d%m%y')

    df = df.drop(columns = ['RaceIndex'])

    return df


In [10]:
# convert to new race index format
def convert_year(date_str):
    day, month, year = date_str.split('/')

    if len(year) == 4:
        year = year[2:]

    date = f'{day}/{month}/{year}'
    
    return pd.to_datetime(date, format = '%d/%m/%y')

In [11]:
# clean RC, Track, Course
def clean_rc_track_course(text):
    parts = str(text).split('/')

    if len(parts) < 3:
        rc = parts[0].strip()
        track = parts[1].strip()
        course = None
    
    else:
        rc, track, course = parts[0].strip(), parts[1].strip(), parts[2].strip().replace('"', '')

    return rc, track, course

In [12]:
# clean origin, age
def clean_origin_age(text):
    parts = str(text).split('/')

    if len(parts) < 2:
        origin = parts[0].strip()
        age = None
    
    else:
        origin = parts[0].strip()
        age = parts[1].strip()
    
    return origin, age

In [13]:
# clean colour, sex
def clean_colour_sex(text):
    parts = str(text).split('/')

    colour = parts[0].strip()
    sex = parts[-1].strip()

    return colour, sex

## Workflow

In [14]:
exclude = ['WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
df = df[~df['Pla.'].isin(exclude)].copy()

In [15]:
df['Date'] = df['Date'].apply(convert_year)

In [16]:
df = combine_index_date(df, 'RaceIndex', 'Date', 'race_index')

In [18]:
# exponential decay for top finishers
df['target'] = df.groupby('race_index')['Pla.'].transform(
    lambda x: x.apply(lambda place: np.exp(-(int(place)-1)/2) if str(place).isdigit() and int(place) <= 4 else 0)
)

In [19]:
df[['rc', 'track', 'course']] = df['RC/Track/Course'].apply(clean_rc_track_course).apply(pd.Series)
df = df.drop(columns=['RC/Track/Course'])

In [20]:
df[['origin', 'age']] = df['Origin / Age'].apply(clean_origin_age).apply(pd.Series)
df = df.drop(columns = ['Origin / Age'])

In [21]:
# clean colour, sex
df[['colour', 'sex']] = df['Colour / Sex'].apply(clean_colour_sex).apply(pd.Series)
df = df.drop(columns = ['Colour / Sex'])

In [22]:
# clean rating
df['Rtg.'] = pd.to_numeric(df['Rtg.'], errors = 'coerce')

In [23]:
# rename columns
df = df.rename(columns = {'Dr.' : 'gate_position', 'G' : 'track_condition'})

In [24]:
df = df[['Date', 'race_index', 'RaceClass', 'rc', 'track', 'course', 'Dist.', 'track_condition', 'Horse_id', 'Declar.Horse Wt.', 'Act.Wt.', 'gate_position', 'Rtg.', 'age', 'colour', 'sex', 'origin', 'Import type', 'Trainer', 'Jockey', 'Sire', 'Dam', 'Dam sire', 'Finish Time', 'Gear', 'target']]

In [25]:
for col in ['race_index', 'Declar.Horse Wt.', 'age']:
    df[col] = pd.to_numeric(df[col], errors = 'coerce')

In [26]:
df = df[df['Date'] > pd.to_datetime('04/09/20', format = '%d/%m/%y')]

In [27]:
df

Unnamed: 0,Date,race_index,RaceClass,rc,track,course,Dist.,track_condition,Horse_id,Declar.Horse Wt.,...,origin,Import type,Trainer,Jockey,Sire,Dam,Dam sire,Finish Time,Gear,target
0,2023-12-10,238101223,G1,ST,Turf,A,1600,G,H811,1187,...,JPN,VIS,T Yasuda,Y Kitamura,Just A Way,Epic Love,Dansili,1.35.46,H,0.000000
1,2023-04-30,623300423,G1,ST,Turf,A,2000,G,H811,1179,...,JPN,VIS,T Yasuda,C Y Ho,Just A Way,Epic Love,Dansili,2.02.71,H,0.000000
2,2022-12-11,240111222,G1,ST,Turf,A,2000,G,H811,1150,...,JPN,VIS,T Yasuda,Y Kitamura,Just A Way,Epic Love,Dansili,2.00.44,--,0.606531
3,2021-02-06,402060221,5,ST,Turf,C,1200,G,C017,1045,...,AUS,PPG,C H Yip,C Wong,Smart Missile,Pyrography,Danzero,1.10.66,CP-/TT-,0.000000
4,2020-12-26,296261220,5,ST,Turf,A+3,1200,G,C017,1058,...,AUS,PPG,C H Yip,M F Poon,Smart Missile,Pyrography,Danzero,1.10.42,CP/TT,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65518,2023-12-23,278231223,3,ST,Turf,C,1400,G,H459,1152,...,AUS,PP,C Fownes,C Y Ho,Impending,Isola Blu,Blackfriars,1.22.88,--,0.367879
65519,2023-12-10,240101223,3,ST,Turf,A,1200,G,H459,1153,...,AUS,PP,C Fownes,C Y Ho,Impending,Isola Blu,Blackfriars,1.09.52,--,0.606531
65520,2023-11-19,185191123,3,ST,Turf,B+2,1200,GF,H459,1155,...,AUS,PP,C Fownes,C Y Ho,Impending,Isola Blu,Blackfriars,1.08.94,--,0.606531
65521,2025-07-01,801010725,4,ST,Turf,C,1200,G,K334,1127,...,AUS,PPG,M Newnham,Z Purton,Street Boss,Varanasi,Encosta de Lago,1.10.15,B1,0.000000


# Feature engineering

In [28]:
df = df.sort_values(['Horse_id', 'Date'])

n = 3

df['recent_3_win_rate_horse'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=n, min_periods = 3).apply(lambda y: np.mean(y == 4),raw = True))
)

In [29]:
df = df.sort_values(['Jockey', 'Date'])

df['recent_3_win_rate_jockey'] = (
    df.groupby('Jockey')['target']
    .transform(lambda x: x.shift().rolling(window=n, min_periods = 3).apply(lambda y: np.mean(y == 4),raw = True))
)

In [30]:
# Expand your rolling statistics beyond just win rate
df['recent_5_avg_finish_pos'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=5, min_periods=2).mean())
)

df['recent_3_consistency'] = (
    df.groupby('Horse_id')['target']
    .transform(lambda x: x.shift().rolling(window=3, min_periods=2).std())
)

# Jockey-Trainer combination performance
df['jockey_trainer_combo_rate'] = (
    df.groupby(['Jockey', 'Trainer'])['target']
    .transform(lambda x: x.shift().expanding(min_periods=5).mean())
)

# Horse performance on specific track/distance combinations
df['horse_track_distance_rate'] = (
    df.groupby(['Horse_id', 'track', 'Dist.'])['target']
    .transform(lambda x: x.shift().expanding(min_periods=2).mean())
)

In [31]:
overall_mean_win_rate = np.mean(df['target'] == 4)
df['recent_3_win_rate_horse'] = df['recent_3_win_rate_horse'].fillna(overall_mean_win_rate)
df['recent_3_win_rate_jockey'] = df['recent_3_win_rate_jockey'].fillna(overall_mean_win_rate)
df['recent_5_avg_finish_pos'] = df['recent_5_avg_finish_pos'].fillna(overall_mean_win_rate)
df['recent_3_consistency'] = df['recent_3_consistency'].fillna(overall_mean_win_rate)
df['jockey_trainer_combo_rate'] = df['jockey_trainer_combo_rate'].fillna(overall_mean_win_rate)
df['horse_track_distance_rate'] = df['horse_track_distance_rate'].fillna(overall_mean_win_rate)

In [32]:
df.sort_values('race_index', ascending=False)

Unnamed: 0,Date,race_index,RaceClass,rc,track,course,Dist.,track_condition,Horse_id,Declar.Horse Wt.,...,Dam sire,Finish Time,Gear,target,recent_3_win_rate_horse,recent_3_win_rate_jockey,recent_5_avg_finish_pos,recent_3_consistency,jockey_trainer_combo_rate,horse_track_distance_rate
61481,2025-07-16,847160725,3,HV,Turf,B,1200,GF,H396,1056,...,He's A Decoy,1.09.37,H/XB,0.00000,0.0,0.0,0.365932,0.128824,0.194644,0.279518
58594,2025-07-16,847160725,3,HV,Turf,B,1200,GF,K258,1101,...,Kingman,1.09.13,--,1.00000,0.0,0.0,0.000000,0.000000,0.189579,0.000000
58655,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J394,1104,...,Diktat,1.10.26,TT,0.00000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
59032,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J317,1178,...,Foreplay,1.09.48,B,0.00000,0.0,0.0,0.473576,0.364955,0.379739,0.597441
62772,2025-07-16,847160725,3,HV,Turf,B,1200,GF,J396,1084,...,Duke Of Marmalade,1.09.26,TT,0.22313,0.0,0.0,0.273576,0.577350,0.124638,0.341970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8400,2021-09-05,1050921,5,ST,Turf,A,1400,G,D190,1054,...,Green Desert,1.24.43,--,0.00000,0.0,0.0,0.000000,0.000000,0.061043,0.000000
43529,2021-09-05,1050921,5,ST,Turf,A,1400,G,E104,1063,...,Danehill,1.22.97,XB,0.00000,0.0,0.0,0.444626,0.577350,0.087333,0.000000
16206,2021-09-05,1050921,5,ST,Turf,A,1400,G,D502,1191,...,Kitten's Joy,1.24.13,TT,0.00000,0.0,0.0,0.091970,0.212395,0.389762,0.183940
1071,2021-09-05,1050921,5,ST,Turf,A,1400,G,E214,1063,...,Marju,1.25.05,B/TT,0.00000,0.0,0.0,0.111565,0.157777,0.219613,0.000000


In [33]:
output_dir = '../data/'
output_path = os.path.join(output_dir, 'cleaned_data_new.csv')

os.makedirs(output_dir, exist_ok = True)

df.to_csv(output_path, index = False)