In [1]:
import pandas as pd
import numpy as np
#import tensorflow as tf
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import matplotlib.pyplot as plt

In [2]:
# Load the race-level table, indexed by race_id, keeping only the columns
# that describe the conditions of each race.
race_feature_columns = ['venue', 'config', 'surface', 'distance', 'going', 'race_class']
races_df = pd.read_csv(
    r"C:\Users\chase\Desktop\HongKongHorseModel\races.csv",
    delimiter=",",
    header=0,
    index_col='race_id',
)[race_feature_columns]

# Show every row that contains at least one NaN, then discard those rows.
race_rows_with_nan = races_df.isnull().any(axis=1)
print(races_df[race_rows_with_nan])
races_df = races_df.dropna()

# Ordinal columns (config, going): each one gets its own OrdinalEncoder,
# which expects a 2-D (n_samples, 1) input.
config_encoder = preprocessing.OrdinalEncoder()
going_encoder = preprocessing.OrdinalEncoder()
for column, encoder in (('config', config_encoder), ('going', going_encoder)):
    column_as_2d = races_df[column].values.reshape(-1, 1)
    races_df[column] = encoder.fit_transform(column_as_2d)

# Nominal column (venue): mapped to integer codes with a LabelEncoder.
venue_encoder = preprocessing.LabelEncoder()
races_df['venue'] = venue_encoder.fit_transform(races_df['venue'])


Empty DataFrame
Columns: [venue, config, surface, distance, going, race_class]
Index: []


In [3]:
races_df

Unnamed: 0_level_0,venue,config,surface,distance,going,race_class
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0.0,0,1400,2.0,5
1,1,0.0,0,1200,2.0,5
2,1,0.0,0,1400,2.0,4
3,1,0.0,0,1200,2.0,1
4,1,0.0,0,1600,2.0,4
...,...,...,...,...,...,...
6344,1,0.0,0,1400,1.0,1
6345,1,0.0,0,1600,1.0,11
6346,1,0.0,0,2000,1.0,11
6347,1,0.0,0,1200,1.0,2


In [4]:
# Load the per-runner table and keep the race key, the draw, the horse
# attributes and the finishing result.
run_columns = [
    'race_id', 'draw',
    'horse_age', 'horse_country', 'horse_type', 'horse_rating',
    'declared_weight', 'actual_weight', 'win_odds',
    'result',
]
runs_df = pd.read_csv(
    r"C:\Users\chase\Desktop\HongKongHorseModel\runs.csv",
    delimiter=",",
    header=0,
)[run_columns]

# Show every row that contains at least one NaN, then discard those rows.
run_rows_with_nan = runs_df.isnull().any(axis=1)
print(runs_df[run_rows_with_nan])
runs_df = runs_df.dropna()

# The dataset contains a few unexplained draws above 14 (the expected
# maximum); look up their row labels and remove those rows.
strange_draw_index = runs_df.loc[runs_df['draw'] > 14].index
runs_df = runs_df.drop(strange_draw_index)

# Nominal columns (horse_country, horse_type): one LabelEncoder each.
horse_country_encoder = preprocessing.LabelEncoder()
horse_type_encoder = preprocessing.LabelEncoder()
for column, encoder in (
    ('horse_country', horse_country_encoder),
    ('horse_type', horse_type_encoder),
):
    runs_df[column] = encoder.fit_transform(runs_df[column])

     race_id  draw  horse_age horse_country horse_type  horse_rating  \
182       13    12          3           NaN        NaN            60   
846       69     1          3           NaN        NaN            60   

     declared_weight  actual_weight  win_odds  result  
182           1107.0            120      28.0       5  
846           1105.0            119      14.0      11  


In [5]:
def group_horse_and_result(element):
    """Sort key for a two-item column label.

    ``element[0]`` is compared against the string 'result' and
    ``element[1]`` is used as the numeric sort position (the notebook passes
    the (field, draw) labels produced by the pivot). Labels whose first item
    is 'result' are shifted by 100 so they sort after every other label;
    all other labels sort by ``element[1]`` alone.
    """
    field, position = element[0], element[1]
    # The +100 offset pushes the result columns to the end of the ordering.
    return 100 + position if field == 'result' else position

# Reshape to one row per race: every column after the first two
# (race_id, draw) is spread across the draw numbers.
per_horse_fields = runs_df.columns[2:]
runs_df = runs_df.pivot(index='race_id', columns='draw', values=per_horse_fields)

# Reorder the (field, draw) columns with the sort key above: ordered by draw,
# with all 'result' columns moved to the end.
rearranged_columns = list(runs_df.columns.values)
rearranged_columns.sort(key=group_horse_and_result)
runs_df = runs_df[rearranged_columns]
print(runs_df.head())

# Many NaNs appear in the pivoted frame because some races did not have a
# full field of 14 horses; fill those empty slots with 0.
runs_df = runs_df.fillna(0)

        horse_age horse_country horse_type horse_rating declared_weight  \
draw           1             1          1            1               1    
race_id                                                                   
0             3.0          14.0        3.0         60.0          1089.0   
1             3.0           1.0        3.0         60.0          1059.0   
2             3.0           1.0        3.0         60.0          1028.0   
3             3.0          14.0        5.0         60.0          1074.0   
4             3.0          11.0        3.0         60.0           988.0   

        actual_weight win_odds horse_age horse_country horse_type  ... result  \
draw               1        1         2             2          2   ...     5    
race_id                                                            ...          
0               120.0      5.4       3.0           1.0        3.0  ...    3.0   
1               121.0     10.0       3.0          11.0        3.0  ...    8

In [7]:
# Number of runner slots per race. Draws above 14 are dropped when the runs
# table is loaded, and the pivoted runs table keeps one 'result' column per
# slot as its last columns.
N_DRAWS = 14

# Flatten the pivoted (field, draw) column MultiIndex into plain tuples on a
# copy before joining: recent pandas versions refuse to merge a frame with
# single-level columns against one with multi-level columns
# ("Not allowed to merge between different levels").
runs_wide = runs_df.copy()
runs_wide.columns = runs_wide.columns.to_flat_index()

# One row per race: race conditions followed by every runner's attributes,
# with the N_DRAWS result columns last.
data = races_df.join(runs_wide, on='race_id', how='right')

feature_columns = data.columns[:-N_DRAWS]
result_columns = data.columns[-N_DRAWS:]

X = data[feature_columns]

# One-hot winner label per draw slot: 1.0 where the finishing position is 1
# (0.5 < result < 1.5), otherwise 0.0. Empty slots were filled with 0 and
# therefore map to 0.0. Vectorised replacement for the element-wise applymap.
results = data[result_columns]
y = ((results > 0.5) & (results < 1.5)).astype(float)

print(X.shape)
print(y.shape)

# split data into train and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

# Fit the scaler on the training rows only so that no statistics of the test
# rows leak into training, then apply the same transform everywhere.
# Plain arrays are passed to the scaler and the frames are rebuilt with their
# original columns AND index, so X / X_train / X_test stay aligned with
# y / y_train / y_test.
ss = preprocessing.StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train.to_numpy()), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(ss.transform(X_test.to_numpy()), columns=X_test.columns, index=X_test.index)
# X stays a scaled frame (as before), now using the train-fitted scaler.
X = pd.DataFrame(ss.transform(X.to_numpy()), columns=X.columns, index=X.index)

(6348, 104)
(6348, 14)


In [None]:
# NOTE(review): this cell needs `tf` in scope, but the `import tensorflow as tf`
# line in the imports cell is commented out -- re-enable it before running.

# Derive the layer sizes from the training data instead of hard-coding them
# (104 feature columns and 14 result columns in the shapes printed above).
n_features = X_train.shape[1]
n_outputs = y_train.shape[1]

# Single hidden layer; the softmax output has one unit per draw slot so the
# predictions form a distribution over which slot wins the race.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(96, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dense(n_outputs, activation='softmax')
])
model.compile(optimizer=tf.keras.optimizers.Adam(5e-04),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.Precision(name='precision')])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()