In [1]:
import pandas as pd
import os 

In [2]:
# show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [3]:
# load the raw race records; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer = '../data/race_data/race_data.csv')

# Utility functions

In [4]:
# shorten a day/month/year date so the year has two digits
def convert_year(date_str):
    """Return ``date_str`` with a four-digit year cut down to its last two digits.

    Parameters
    ----------
    date_str : str
        Date written as ``day/month/year`` with exactly two ``/`` separators.

    Returns
    -------
    str
        ``day/month/yy`` when the year has four characters; otherwise the
        three fields re-joined unchanged.

    Raises
    ------
    ValueError
        If ``date_str`` does not split into exactly three fields.
    """
    day, month, year = date_str.split('/')

    # only a four-character year is shortened; any other length passes through
    short_year = year[2:] if len(year) == 4 else year

    return '/'.join((day, month, short_year))

In [5]:
# combine race index with date to form new index
def combine_index_date(df, col1, col2, new_col):
    """Build a combined key column from two columns and drop the sources.

    The key is ``str(col1 value)`` followed by the ``col2`` value with every
    ``/`` removed (for example ``7`` and ``'01/01/21'`` give ``'7010121'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col1`` and ``col2``. It is not modified.
    col1 : label
        Column whose values form the first part of the key.
    col2 : label
        Column whose values, with ``/`` stripped, form the second part.
    new_col : label
        Name of the column that receives the combined key.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col1`` and ``col2`` and with ``new_col`` added.
    """
    # regex=False: '/' is a literal separator, not a pattern
    date_digits = df[col2].astype(str).str.replace('/', '', regex = False)
    combined = df[col1].astype(str) + date_digits

    # do not drop a source column if the caller reuses its name for the key
    consumed = [col for col in (col1, col2) if col != new_col]

    # drop() returns a new frame, so the caller's frame is left untouched
    result = df.drop(columns = consumed)
    result[new_col] = combined

    return result


In [6]:
# clean RC, Track, Course
def clean_rc_track_course(text):
    """Split an ``RC / Track / Course`` value into its three fields.

    ``text`` is converted with ``str()`` and split on ``/``; each field is
    stripped of surrounding whitespace and double quotes are removed from
    the course. Fields beyond the third are ignored.

    Parameters
    ----------
    text : object
        Raw cell value, e.g. ``'ST / Turf / "A"'``.

    Returns
    -------
    tuple
        ``(rc, track, course)``. Any field missing from ``text`` is ``None``,
        so a value with no ``/`` yields ``(rc, None, None)``.
    """
    parts = [part.strip() for part in str(text).split('/')]

    # pad with None so one- and two-field values unpack without an IndexError
    rc, track, course = (parts + [None, None])[:3]

    if course is not None:
        course = course.replace('"', '')

    return rc, track, course

In [7]:
# clean origin, age
def clean_origin_age(text):
    """Split an ``Origin / Age`` value into ``(origin, age)``.

    ``text`` is converted with ``str()`` and split on ``/``. The first field
    is the origin and the second, when present, is the age; both are stripped
    of surrounding whitespace and returned as strings. Further fields are
    ignored. ``age`` is ``None`` when ``text`` contains no ``/``.
    """
    pieces = [piece.strip() for piece in str(text).split('/')]

    origin = pieces[0]
    age = pieces[1] if len(pieces) >= 2 else None

    return origin, age

In [8]:
# clean colour, sex
def clean_colour_sex(text):
    """Split a ``Colour / Sex`` value into ``(colour, sex)``.

    ``text`` is converted with ``str()`` and split on ``/``. The colour is the
    first field and the sex is the last field, each stripped of surrounding
    whitespace; any fields in between are ignored.

    Returns
    -------
    tuple
        ``(colour, sex)``. ``sex`` is ``None`` when ``text`` has no ``/``, so
        a lone field is not reported as both the colour and the sex.
    """
    parts = [part.strip() for part in str(text).split('/')]

    colour = parts[0]
    # with a single field parts[-1] is parts[0]; report the sex as missing instead
    sex = parts[-1] if len(parts) > 1 else None

    return colour, sex

In [9]:
def encode_gears(gear_str, gear_symbols):
    """Return the set of known gear codes found in a ``/``-separated gear string.

    Each ``/``-separated token is stripped of whitespace, then:

    * a token ending in ``-`` is ignored;
    * a trailing ``1`` or ``2`` is removed from tokens longer than one character;
    * the result is kept only if it is a member of ``gear_symbols``.

    Parameters
    ----------
    gear_str : object
        Raw gear value. Anything that is not a ``str`` yields an empty set.
    gear_symbols : container of str
        Gear codes that are recognised.

    Returns
    -------
    set of str
    """
    if not isinstance(gear_str, str):
        return set()

    def base_code(token):
        # remove the 1/2 suffix; single-character tokens are left as they are
        if len(token) > 1 and token[-1] in ('1', '2'):
            return token[:-1]
        return token

    tokens = (raw.strip() for raw in gear_str.split('/'))
    candidates = (base_code(token) for token in tokens if not token.endswith('-'))

    return {code for code in candidates if code in gear_symbols}

# Main function

In [10]:
def engineering(df):
    """Clean the raw race records and derive the columns used downstream.

    Steps, in order:

    1. Drop rows whose ``'Pla.'`` value is one of the excluded codes.
    2. Rewrite ``'Date'`` with a two-digit year and keep only rows dated
       strictly after 04/09/2020.
    3. Replace ``'RaceIndex'`` and ``'Date'`` with a combined ``'race_index'``.
    4. Split ``'RC/Track/Course'``, ``'Origin / Age'`` and ``'Colour / Sex'``
       into separate columns and drop the originals.
    5. Replace ``'Gear'`` with one boolean column per gear symbol.
    6. Drop ``'LBW'``, ``'RunningPosition'`` and ``'Finish Time'`` and rename
       ``'Dr.'`` to ``'gate_position'`` and ``'G'`` to ``'track_condition'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing every column named above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If any of the expected raw columns is missing.
    """
    # NOTE(review): presumably withdrawal / non-runner placing codes — confirm against the data source
    exclude = ['WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
    df = df[~df['Pla.'].isin(exclude)].copy()

    # clean format of date
    df['Date'] = df['Date'].apply(convert_year)

    # filter dates only after 04/09/2020 (strictly later than the cutoff)
    race_dates = pd.to_datetime(df['Date'], format = '%d/%m/%y')
    cutoff = pd.to_datetime('04/09/20', format = '%d/%m/%y')
    # copy so the column assignments below act on an independent frame
    df = df[race_dates > cutoff].copy()

    # combine the columns to new index
    df = combine_index_date(df, 'RaceIndex', 'Date', 'race_index')

    # split each combined text column into its parts, then drop the original
    split_specs = [
        ('RC/Track/Course', ['rc', 'track', 'course'], clean_rc_track_course),
        ('Origin / Age', ['origin', 'age'], clean_origin_age),
        ('Colour / Sex', ['colour', 'sex'], clean_colour_sex),
    ]
    for source_col, target_cols, cleaner in split_specs:
        # building the frame from the list of tuples in one go avoids creating
        # a Series per row and also works when no rows are left
        parts = pd.DataFrame(
            df[source_col].map(cleaner).tolist(),
            index = df.index,
            columns = target_cols,
        )
        df[target_cols] = parts
        df = df.drop(columns = [source_col])

    # encode gears: one boolean column per symbol
    gear_symbols = ['B', 'BO', 'CC', 'CP', 'CO', 'E', 'H', 'P', 'PC', 'PS', 'SB', 'SR', 'TT', 'V', 'VO', 'XB']
    gear_sets = df['Gear'].map(lambda gear_str: encode_gears(gear_str, gear_symbols))

    # flags are built row by row in position order, so rows that share an
    # index label cannot overwrite each other
    gear_flags = pd.DataFrame(
        [[symbol in present for symbol in gear_symbols] for present in gear_sets],
        index = df.index,
        columns = gear_symbols,
    ).astype(bool)
    df[gear_symbols] = gear_flags

    # drop gear column
    df = df.drop(columns = ['Gear'])

    # drop columns that cannot be known before a race is run
    df = df.drop(columns = ['LBW', 'RunningPosition', 'Finish Time'])

    # rename columns
    df = df.rename(columns = {'Dr.' : 'gate_position', 'G' : 'track_condition'})

    return df

In [11]:
# engineering() expects the raw columns; because df is overwritten here,
# re-running this cell without reloading the CSV first will fail
df = engineering(df = df)

In [12]:
# make sure the output folder exists before writing into it
output_dir = '../data/'
os.makedirs(output_dir, exist_ok = True)

# save the cleaned frame without the DataFrame index column
output_path = os.path.join(output_dir, 'cleaned_data.csv')
df.to_csv(output_path, index = False)