In [1]:
import pandas as pd
import os 

In [16]:
# Show every column when a DataFrame is displayed (None disables the column limit).
pd.options.display.max_columns = None

In [17]:
# Load the raw race records; the path is resolved against the current working directory.
RAW_DATA_PATH = '../data/race_data/race_17-25.csv'
df = pd.read_csv(RAW_DATA_PATH)

# Utility functions

In [18]:
# shorten a 4-digit year to 2 digits (date format used by the new race index)
def convert_year(date_str):
    """Return ``date_str`` (``day/month/year``) with a two-digit year.

    A year written with exactly four characters is cut down to its last
    two; any other year is passed through unchanged. The day and month
    parts are never altered.

    Raises
    ------
    ValueError
        If ``date_str`` does not consist of exactly three ``/``-separated
        parts.
    """
    dd, mm, yy = date_str.split('/')
    short_year = yy[2:] if len(yy) == 4 else yy
    return '/'.join((dd, mm, short_year))

In [19]:
# combine race index with date to form new index
def combine_index_date(df, col1, col2, new_col):
    """Build a combined key column from an index column and a date column.

    The key is ``col1`` rendered as text followed by ``col2`` with every
    ``/`` removed. The two source columns are dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col1`` and ``col2``. It is not modified.
    col1 : str
        Column supplying the leading part of the key (any dtype; it is
        cast to ``str``).
    col2 : str
        String column supplying the date part of the key.
    new_col : str
        Name of the column that receives the combined key.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``new_col`` appended and ``col1`` / ``col2``
        removed.
    """
    date_part = df[col2].str.replace('/', '', regex=False).astype(str)
    key = df[col1].astype(str) + date_part

    # Work on a copy so the caller's frame does not silently gain a column.
    out = df.copy()
    out[new_col] = key

    # Drop the columns that were actually passed in (not hard-coded names),
    # but keep new_col if it reuses one of the source names.
    to_drop = [col for col in (col1, col2) if col != new_col]
    return out.drop(columns=to_drop)


In [20]:
# clean RC, Track, Course
def clean_rc_track_course(text):
    """Split a ``rc / track / course`` value into its three components.

    Parameters
    ----------
    text : scalar
        Cell value to parse. Non-missing values are converted with
        ``str`` and split on ``/``.

    Returns
    -------
    tuple
        ``(rc, track, course)``. Each component is whitespace-stripped,
        and double quotes are removed from ``course``. A component that is
        absent from ``text`` is returned as ``None``; a missing ``text``
        (``None`` / NaN) yields ``(None, None, None)``. Parts beyond the
        third are ignored.
    """
    # A missing cell would otherwise be parsed as the literal string 'nan'.
    if pd.isna(text):
        return None, None, None

    parts = [part.strip() for part in str(text).split('/')]

    rc = parts[0]
    # Guard each positional lookup: a value may carry fewer than three parts.
    track = parts[1] if len(parts) > 1 else None
    course = parts[2].replace('"', '') if len(parts) > 2 else None

    return rc, track, course

In [21]:
# clean origin, age
def clean_origin_age(text):
    """Split an ``origin / age`` value into ``(origin, age)``.

    Parameters
    ----------
    text : scalar
        Cell value to parse. Non-missing values are converted with
        ``str`` and split on ``/``.

    Returns
    -------
    tuple
        ``(origin, age)`` as whitespace-stripped strings. ``age`` is
        ``None`` when ``text`` has no ``/``. A missing ``text``
        (``None`` / NaN) yields ``(None, None)`` instead of the literal
        string ``'nan'``. Parts beyond the second are ignored.
    """
    if pd.isna(text):
        return None, None

    parts = [part.strip() for part in str(text).split('/')]

    origin = parts[0]
    age = parts[1] if len(parts) > 1 else None

    return origin, age

In [22]:
# clean colour, sex
def clean_colour_sex(text):
    """Split a ``colour / sex`` value into ``(colour, sex)``.

    Parameters
    ----------
    text : scalar
        Cell value to parse. Non-missing values are converted with
        ``str`` and split on ``/``.

    Returns
    -------
    tuple
        ``(colour, sex)`` as whitespace-stripped strings. ``colour`` is
        the first part and ``sex`` the last part, so any parts in between
        are ignored. When ``text`` has no ``/`` there is no sex component
        and ``sex`` is ``None`` (rather than a repeat of the colour). A
        missing ``text`` (``None`` / NaN) yields ``(None, None)``.
    """
    if pd.isna(text):
        return None, None

    parts = [part.strip() for part in str(text).split('/')]

    colour = parts[0]
    # With a single part, parts[-1] is the colour itself, not a sex.
    sex = parts[-1] if len(parts) > 1 else None

    return colour, sex

In [23]:
def encode_gears(gear_str, gear_symbols):
    """Return the set of known gear symbols found in ``gear_str``.

    ``gear_str`` is split on ``/`` and each piece is whitespace-stripped,
    then:

    * a piece ending in ``-`` is ignored (presumably gear that was taken
      off; confirm against the data source);
    * a piece longer than one character that ends in ``1`` or ``2`` has
      that final character removed;
    * the piece is kept only if it is contained in ``gear_symbols``.

    Anything that is not a ``str`` (e.g. NaN or ``None``) gives an empty
    set.
    """
    if not isinstance(gear_str, str):
        return set()

    found = set()
    for token in map(str.strip, gear_str.split('/')):
        if token.endswith('-'):
            continue

        # Drop the trailing status digit so 'B1' / 'B2' both match 'B'.
        if len(token) > 1 and token[-1] in ('1', '2'):
            token = token[:-1]

        if token in gear_symbols:
            found.add(token)

    return found

# Main function

In [24]:
def _split_column(df, source_col, cleaner, new_cols):
    """Replace ``source_col`` with ``new_cols`` parsed from it by ``cleaner``.

    ``cleaner`` is called once per cell and must return a tuple with one
    item per name in ``new_cols``. The new columns are written into ``df``
    and a frame without ``source_col`` is returned.
    """
    # One DataFrame built from the list of tuples replaces a per-row
    # ``pd.Series`` construction and also works when ``df`` has no rows.
    parsed = pd.DataFrame(df[source_col].apply(cleaner).tolist(), columns=new_cols)
    for col in new_cols:
        # Positional assignment: ``parsed`` rows are in ``df`` row order.
        df[col] = parsed[col].to_numpy()

    return df.drop(columns=[source_col])


def engineering(df):
    """Clean the raw race records and derive the feature columns.

    Steps, in order:

    1. Drop rows whose ``Pla.`` value is in the exclusion list.
    2. Rewrite ``Date`` with a two-digit year and keep only rows dated
       strictly after 01/09/2017.
    3. Replace ``RaceIndex`` and ``Date`` with a combined ``race_index``.
    4. Split ``RC/Track/Course``, ``Origin / Age`` and ``Colour / Sex``
       into ``rc``/``track``/``course``, ``origin``/``age`` and
       ``colour``/``sex``.
    5. Convert ``Rtg.`` to numeric; unparseable values become NaN.
    6. Add one boolean column per gear symbol and drop ``Gear``.
    7. Drop ``LBW``, ``RunningPosition`` and ``Finish Time``; rename
       ``Dr.`` to ``gate_position`` and ``G`` to ``track_condition``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw race data containing the columns named above. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    exclude = ['WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
    gear_symbols = ['B', 'BO', 'CC', 'CP', 'CO', 'E', 'H', 'P', 'PC', 'PS', 'SB', 'SR', 'TT', 'V', 'VO', 'XB']

    df = df[~df['Pla.'].isin(exclude)].copy()

    # clean format of date
    df['Date'] = df['Date'].apply(convert_year)

    # keep only dates strictly after 01/09/2017 (the cutoff day itself is excluded)
    race_dates = pd.to_datetime(df['Date'], format='%d/%m/%y')
    cutoff = pd.to_datetime('01/09/17', format='%d/%m/%y')
    # .copy() so the later column assignments never write into a view
    df = df[race_dates > cutoff].copy()

    # combine the columns to new index
    df = combine_index_date(df, 'RaceIndex', 'Date', 'race_index')

    # split the compound text columns
    df = _split_column(df, 'RC/Track/Course', clean_rc_track_course, ['rc', 'track', 'course'])
    df = _split_column(df, 'Origin / Age', clean_origin_age, ['origin', 'age'])
    df = _split_column(df, 'Colour / Sex', clean_colour_sex, ['colour', 'sex'])

    # clean rating
    df['Rtg.'] = pd.to_numeric(df['Rtg.'], errors='coerce')

    # encode gears: parse each row once, then fill one boolean column per
    # symbol by position. This avoids a row-by-row ``df.at`` write, which is
    # slow and would flag every row sharing an index label if the index
    # were not unique.
    gear_sets = [encode_gears(raw, gear_symbols) for raw in df['Gear']]
    for gear in gear_symbols:
        flags = pd.Series([gear in worn for worn in gear_sets], dtype=bool)
        df[gear] = flags.to_numpy()
    # drop gear column
    df = df.drop(columns=['Gear'])

    # drop columns that are only known once a race has been run
    df = df.drop(columns=['LBW', 'RunningPosition', 'Finish Time'])

    # rename columns
    df = df.rename(columns={'Dr.': 'gate_position', 'G': 'track_condition'})

    return df

In [25]:
# Run the full cleaning pipeline on the raw frame.
# NOTE: `df` is rebound to the cleaned result, so this cell cannot be re-run
# without first re-running the cell that loads the raw CSV.
df = df.pipe(engineering)

In [26]:
# Preview the cleaned frame (the output below shows its first and last rows).
df

Unnamed: 0.1,Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB
0,0,12,1600,G,G1,10,,T Yasuda,Y Kitamura,64,126,1187,H811,VIS,Just A Way,Epic Love,Dansili,238101223,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,1,05,2000,G,G1,7,,T Yasuda,C Y Ho,11,126,1179,H811,VIS,Just A Way,Epic Love,Dansili,623300423,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2,2,02,2000,G,G1,6,,T Yasuda,Y Kitamura,15,126,1150,H811,VIS,Just A Way,Epic Love,Dansili,240111222,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,3,11,1200,G,5,2,18.0,C H Yip,C Wong,7.1,108,1045,C017,PPG,Smart Missile,Pyrography,Danzero,402060221,ST,Turf,C,AUS,,Brown,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,05,1200,G,5,14,18.0,C H Yip,M F Poon,42,113,1058,C017,PPG,Smart Missile,Pyrography,Danzero,296261220,ST,Turf,A+3,AUS,,Brown,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95405,29882,02,1200,G,4,5,51.0,L Ho,M F Poon,9,118,1066,B027,PPG,Helmet,Umoya,Nashwan,580180418,HV,Turf,B,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
95406,29883,03,1200,GF,4,4,51.0,L Ho,M F Poon,15,119,1056,B027,PPG,Helmet,Umoya,Nashwan,532280318,HV,Turf,C+3,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
95407,29884,09,1000,G,4,3,52.0,L Ho,C Schofield,12,125,1061,B027,PPG,Helmet,Umoya,Nashwan,496140318,HV,Turf,B,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
95408,29885,06,1200,G,4,6,52.0,L Ho,H T Mo,200,119,1072,B027,PPG,Helmet,Umoya,Nashwan,431180218,ST,Turf,A,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [27]:
# Persist the cleaned frame as CSV under ../data/, creating the folder if needed.
output_dir = '../data/'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'cleaned_data_17-25.csv')
# index=False: the row index is not written as an extra column.
df.to_csv(output_path, index=False)