In [28]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import matplotlib.pyplot as plt

# Create DataFrame from data obtained from Kaggle
races = pd.read_csv(r"races.csv", delimiter=",", header=0, index_col='race_id')
races = races[['venue', 'config', 'surface', 'distance', 'going', 'race_class']]

# Remove any NaN results in the data
print(races[races.isnull().any(axis=1)])
races = races.dropna()

Empty DataFrame
Columns: [venue, config, surface, distance, going, race_class]
Index: []


In [29]:
# For the columns that are strings, change them to numbers, so the model can appropriately be trained

de = preprocessing.OrdinalEncoder()
races['config'] = de.fit_transform(races['config'].values.reshape(-1, 1))
ge = preprocessing.OrdinalEncoder()
races['going'] = ge.fit_transform(races['going'].values.reshape(-1, 1))
ve = preprocessing.LabelEncoder()
races['venue'] = ve.fit_transform(races['venue'])

In [30]:
# Essentially do the same as above, but with the runs.csv

# Create DataFrame from data obtained from Kaggle
runs = pd.read_csv(r"runs.csv", delimiter=",", header=0)
runs = runs[['race_id', 'draw', 'horse_age', 'horse_country', 'horse_type', 'horse_rating', 'declared_weight', 'actual_weight', 'win_odds', 'result']] 

# Remove any NaN results in the data
print(runs[runs.isnull().any(axis=1)])
runs = runs.dropna()

# Clean odd data in draw column
badDraw = runs[runs['draw'] > 14].index
runs = runs.drop(badDraw)

# For the columns that are strings, change them to numbers, so the model can appropriately be trained
hce = preprocessing.LabelEncoder()
runs['horse_country'] = hce.fit_transform(runs['horse_country'])
hte = preprocessing.LabelEncoder()
runs['horse_type'] = hte.fit_transform(runs['horse_type'])

     race_id  draw  horse_age horse_country horse_type  horse_rating  \
182       13    12          3           NaN        NaN            60   
846       69     1          3           NaN        NaN            60   

     declared_weight  actual_weight  win_odds  result  
182           1107.0            120      28.0       5  
846           1105.0            119      14.0      11  


In [31]:
# Function to sort columns
def group_horse_and_result(element):
    if element[0] == 'result':
        return 100 + element[1]
    else:
        return element[1]   
    
# Adjust columns to group by horse and result
runs = runs.pivot(index='race_id', columns='draw', values=runs.columns[2:])
rearranged_columns = sorted(list(runs.columns.values), key=group_horse_and_result)
runs = runs[rearranged_columns]

# Fill NaNs with 0
runs = runs.fillna(0)

In [34]:
# Join the dataframes
data = races.join(runs, on='race_id', how='right')
X = data[data.columns[:-14]] 

# Flatten column names
X.columns = [f'{col[0]}, {col[1]}' for col in X.columns]

# Standardize the data
ss = preprocessing.StandardScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

# Find the winner of each race
y_won = data[data.columns[-14:]].applymap(lambda x: 1.0 if 0.5 < x < 1.5 else 0.0) 

# Train sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_won, train_size=0.8, test_size=0.2, random_state=1)

  data = races.join(runs, on='race_id', how='right')
