In [54]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score

import time

import warnings
warnings.filterwarnings("ignore")

 # Classification Modelling
 1. Classify if horse can win 1st position
 2. Classify if horse can win top 3 positions
 3. Classify if horse can be ranked in the top half

In [55]:
# Read the train and test files
df_train = pd.read_csv('df_train.csv', index_col=0)
df_test = pd.read_csv('df_test.csv', index_col=0)

In [56]:
# View the shape of the train and test files
print(df_train.shape)
print(df_test.shape)

(24155, 27)
(5209, 27)


In [57]:
# View the first 2 rows of the train file
df_train.head(2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,1.0,DOUBLE DRAGON,K019,B Prebble,D Cruz,133,1032,1,-,...,,2014-001,1,1.0,1400,1,1,1,6.081266,7.393481
1,2,2.0,PLAIN BLUE BANNER,S070,D Whyte,D E Ferraris,133,1075,13,2,...,,2014-001,2,2.0,1400,0,1,1,5.848828,6.61134


In [58]:
# View the first 2 rows of the test files
df_test.head(2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
24155,11,3.0,GLORIOUS PARTNERS,A089,T H So,D J Hall,125,1112,9,9-1/4,...,,2016-377,11/10,10.5,1400,0,0,0,8.331781,6.638824
24156,12,7.0,APPLAUSE,A023,O Doleuze,K L Man,124,1067,7,10-1/2,...,,2016-377,12/13/12,12.333333,1400,0,0,0,6.819133,7.088702


In [59]:

# Keep the features we want to train our model on
X_train = df_train[['actual_weight', 'declared_horse_weight',
                    'draw', 'win_odds', 'jockey_ave_rank',
                    'trainer_ave_rank', 'recent_ave_rank', 'race_distance']]

y_train = df_train[['HorseWin', 'HorseRankTop3', 'HorseRankTop50Percent']]

# Keep the features we want to train our model on
X_test = df_test[['actual_weight', 'declared_horse_weight',
                   'draw', 'win_odds','jockey_ave_rank',
                    'trainer_ave_rank', 'recent_ave_rank', 'race_distance']]
                    
y_test = df_test[['HorseWin', 'HorseRankTop3', 'HorseRankTop50Percent']]

In [60]:
# View the shape of the train and test files
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(24155, 8)
(24155, 3)
(5209, 8)
(5209, 3)


In [61]:
# Check the mean of the target variable
y_train.mean()

HorseWin                 0.080108
HorseRankTop3            0.239743
HorseRankTop50Percent    0.499690
dtype: float64

We can see that there is data imbalance for the HorseWin and HorseRankTop3 variables, so we need to account for these later when modelling.

In [62]:
# Specify kfold cross validation
kfold = KFold(n_splits=5)

### Model 1: Logistic Regression

In [63]:
# Initialize the model
lr = LogisticRegression()

In [64]:
# Calculate the cross validation score
# Round the score to 3 decimal places

score_lr_win = cross_val_score(lr, X_train, y_train['HorseWin'],
                              cv = kfold, scoring = 'f1').mean()
score_lr_win = round(score_lr_win, 3)

score_lr_top3 = cross_val_score(lr, X_train,y_train['HorseRankTop3'],
                              cv = kfold,scoring = 'f1').mean()
score_lr_top3 = round(score_lr_top3, 3)

score_lr_top50 = cross_val_score(lr,X_train, y_train['HorseRankTop50Percent'],
                              cv = kfold,scoring = 'f1').mean()
score_lr_top50 = round(score_lr_top50, 3)

print("Cross Validation mean score for Logistic Regression:",'\n',
      "Horse win:", score_lr_win,'\n',\
      "Horse in Top 3:", score_lr_top3,'\n',\
      "Horse in Top 50%:", score_lr_top50)

# Create table to store the results
cross_val_score_table = pd.DataFrame(columns = ['Model', 'HorseWin', 'HorseRankTop3', 'HorseRankTop50Percent'])

# Add scores to the table
cross_val_score_table = cross_val_score_table.append({'Model': 'Logistic Regression',
                                                         'HorseWin': score_lr_win,
                                                            'HorseRankTop3': score_lr_top3,
                                                            'HorseRankTop50Percent': score_lr_top50},
                                                            ignore_index = True)



Cross Validation mean score for Logistic Regression: 
 Horse win: 0.046 
 Horse in Top 3: 0.439 
 Horse in Top 50%: 0.721


In [12]:
# Get the classification predictions (1 or 0)
# start time
start_time = time.time()

lr.fit(X_train, y_train['HorseWin'])
lr_win = lr.predict(X_test)

lr.fit(X_train, y_train['HorseRankTop3'])
lr_top3 = lr.predict(X_test)

lr.fit(X_train,y_train['HorseRankTop50Percent'])
lr_top50 = lr.predict(X_test)

print('Running time for logistic regression is:', round(time.time() - start_time, 3), 'seconds')    

Running time for logistic regression is: 0.35 seconds


In [13]:
# Create new dataframe for predictions
lr_pred = pd.DataFrame()
lr_pred['RaceID'] = df_test['race_id']
lr_pred['HorseID'] = df_test['horse_id']

lr_pred['HorseWin'] = lr_win
lr_pred['HorseRankTop3'] = lr_top3
lr_pred['HorseRankTop50Percent'] = lr_top50

# Write predictions into csv file.
lr_pred.to_csv('lr_pred.csv')

For imbalanced data, 1 is more important than 0. The model may try to increase accuracy by predicting all 0. F1 score will be close to 0 while accuracy is close to 1. So for imbalanced data, F1 score (similarly TNR, NPV) is good choice.

F1 = 2 * (precision * recall) / (precision + recall)

Precision P = TP / (TP + FP), probability that one classified positive instance is classified correctly.

Recall R = TP / (TP +FN) , percentage of truly positive instances correctly classified. 


In [14]:
# F1 score for logistic regression
f1_win = round(f1_score(y_test['HorseWin'], lr_win), 3)

f1_top3 = round(f1_score(y_test['HorseRankTop3'], lr_top3), 3)

f1_top50 = round(f1_score(y_test['HorseRankTop50Percent'], lr_top50), 3)

# Print the F1 score
print("F1 score for Logistic Regression:",'\n',
      "Horse win:", f1_win,'\n',\
      "Horse in Top 3:", f1_top3,'\n',\
      "Horse in Top 50%:", f1_top50)

F1 score for Logistic Regression: 
 Horse win: 0.068 
 Horse in Top 3: 0.349 
 Horse in Top 50%: 0.696


In [15]:
# Create a table to compare the F1 score
f1_score_table = pd.DataFrame({'Logistic Regression':[f1_win, f1_top3, f1_top50]},
                                index = ['Horse win', 'Horse in Top 3', 'Horse in Top 50%'])

print(f1_score_table)

                  Logistic Regression
Horse win                       0.068
Horse in Top 3                  0.349
Horse in Top 50%                0.696


### Model 2: Naïve Bayes

In [16]:
# Initialize the model
gnb = GaussianNB()

In [17]:
# Calculate the cross validation score
# Round the score to 3 decimal places

score_gnb_win = cross_val_score(gnb, X_train, y_train['HorseWin'],
                              cv = kfold, scoring = 'f1').mean()
score_gnb_win = round(score_gnb_win, 3)

score_gnb_top3 = cross_val_score(gnb, X_train,y_train['HorseRankTop3'],
                              cv = kfold, scoring = 'f1').mean() 
score_gnb_top3 = round(score_gnb_top3, 3)

score_gnb_top50 = cross_val_score(gnb,X_train, y_train['HorseRankTop50Percent'],
                              cv = kfold, scoring = 'f1').mean()
score_gnb_top50 = round(score_gnb_top50, 3)

# Print the cross validation score
print("Cross Validation mean score for Gaussian Naive Bayes:",'\n',
      "Horse win:", score_gnb_win,'\n',\
      "Horse in Top 3:", score_gnb_top3,'\n',\
      "Horse in Top 50%:", score_gnb_top50)

# Append the results to the table
cross_val_score_table = cross_val_score_table.append({'Model': 'Gaussian Naive Bayes',
                                                            'HorseWin': score_gnb_win,    
                                                            'HorseRankTop3': score_gnb_top3,    
                                                            'HorseRankTop50Percent': score_gnb_top50},
                                                            ignore_index = True)      

Cross Validation mean score for Gaussian Naive Bayes: 
 Horse win: 0.315 
 Horse in Top 3: 0.541 
 Horse in Top 50%: 0.729


In [18]:
# Get the classification predictions (1 or 0)
start_time2 = time.time()

gnb.fit(X_train, y_train['HorseWin'])
gnb_win = gnb.predict(X_test)

gnb.fit(X_train, y_train['HorseRankTop3'])
gnb_top3 = gnb.predict(X_test)

gnb.fit(X_train,y_train['HorseRankTop50Percent'])
gnb_top50 = gnb.predict(X_test)

print('Running time for Gaussian Naive Bayes is:', round(time.time() - start_time2, 3), 'seconds')

Running time for Gaussian Naive Bayes is: 0.017 seconds


In [19]:
# Create new dataframe for predictions

gnb_pred = pd.DataFrame()
gnb_pred['RaceID'] = df_test['race_id']
gnb_pred['HorseID'] = df_test['horse_id']

gnb_pred['HorseWin'] = gnb_win
gnb_pred['HorseRankTop3'] = gnb_top3
gnb_pred['HorseRankTop50Percent'] = gnb_top50

# Write predictions into csv file.
gnb_pred.to_csv('gnb_pred.csv')

In [20]:
# F1 score for Gaussian Naive Bayes
f1_win = round(f1_score(y_test['HorseWin'], gnb_win), 3)

f1_top3 = round(f1_score(y_test['HorseRankTop3'], gnb_top3), 3)

f1_top50 = round(f1_score(y_test['HorseRankTop50Percent'], gnb_top50), 3)

# Print the F1 score
print("F1 score for Gaussian Naive Bayes:",'\n',
        "Horse win:", f1_win,'\n',\
        "Horse in Top 3:", f1_top3,'\n',\
        "Horse in Top 50%:", f1_top50)

F1 score for Gaussian Naive Bayes: 
 Horse win: 0.278 
 Horse in Top 3: 0.504 
 Horse in Top 50%: 0.719


In [21]:
# Append the F1 score to the table
f1_score_table['Gaussian Naive Bayes'] = [f1_win, f1_top3, f1_top50]

### Model 3: Random Forest Classifier

In [22]:
# Initialize the model
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)

In [23]:
# Calculate the cross validation score
# Round the score to 3 decimal places

score_rfc_win = cross_val_score(rfc, X_train, y_train['HorseWin'],
                                cv = kfold, scoring = 'f1').mean()
score_rfc_win = round(score_rfc_win, 3)

score_rfc_top3 = cross_val_score(rfc, X_train, y_train['HorseRankTop3'],
                                    cv = kfold, scoring = 'f1').mean()
score_rfc_top3 = round(score_rfc_top3, 3)

score_rfc_top50 = cross_val_score(rfc, X_train, y_train['HorseRankTop50Percent'],
                                    cv = kfold, scoring = 'f1').mean()
score_rfc_top50 = round(score_rfc_top50, 3)

# Print the cross validation score
print("Cross Validation mean score for Random Forest Classifier:",'\n',
        "Horse win:", score_rfc_win,'\n',\
        "Horse in Top 3:", score_rfc_top3,'\n',\
        "Horse in Top 50%:", score_rfc_top50)

# Add the cross validation score to the table
cross_val_score_table = cross_val_score_table.append({'Model': 'Random Forest Classifier',
                                                        'HorseWin': score_rfc_win,
                                                        'HorseRankTop3': score_rfc_top3,
                                                        'HorseRankTop50Percent': score_rfc_top50},
                                                        ignore_index = True)

Cross Validation mean score for Random Forest Classifier: 
 Horse win: 0.183 
 Horse in Top 3: 0.456 
 Horse in Top 50%: 0.709


In [24]:
# Get the classification predictions (1 or 0)
start_time3 = time.time()

rfc.fit(X_train, y_train['HorseWin'])
rfc_win = rfc.predict(X_test)

rfc.fit(X_train, y_train['HorseRankTop3'])
rfc_top3 = rfc.predict(X_test)

rfc.fit(X_train, y_train['HorseRankTop50Percent'])
rfc_top50 = rfc.predict(X_test)

print('Running time for Random Forest Classifier is:', round(time.time() - start_time3, 3), 'seconds')


Running time for Random Forest Classifier is: 6.751 seconds


In [25]:
# Create new dataframe for predictions
rfc_pred = pd.DataFrame()
rfc_pred['RaceID'] = df_test['race_id']
rfc_pred['HorseID'] = df_test['horse_id']

rfc_pred['HorseWin'] = rfc_win
rfc_pred['HorseRankTop3'] = rfc_top3
rfc_pred['HorseRankTop50Percent'] = rfc_top50

# Write predictions into csv file.
rfc_pred.to_csv('rfc_pred.csv')

In [26]:
# F1 score for Random Forest Classifier
f1_win = round(f1_score(y_test['HorseWin'], rfc_win), 3)
f1_top3 = round(f1_score(y_test['HorseRankTop3'], rfc_top3), 3)
f1_top50 = round(f1_score(y_test['HorseRankTop50Percent'], rfc_top50), 3)

# Print the F1 score
print("F1 score for Random Forest Classifier:",'\n',
        "Horse win:", f1_win,'\n',\
        "Horse in Top 3:", f1_top3,'\n',\
        "Horse in Top 50%:", f1_top50)

# Append the F1 score to the table
f1_score_table['Random Forest Classifier'] = [f1_win, f1_top3, f1_top50]

F1 score for Random Forest Classifier: 
 Horse win: 0.168 
 Horse in Top 3: 0.377 
 Horse in Top 50%: 0.692


In [27]:
# View the cross validation score table
cross_val_score_table

Unnamed: 0,Model,HorseWin,HorseRankTop3,HorseRankTop50Percent
0,Logistic Regression,0.046,0.439,0.721
1,Gaussian Naive Bayes,0.315,0.541,0.729
2,Random Forest Classifier,0.183,0.456,0.709


###  Model 4: Smote + Random Forest Classifier

In [28]:
# Import the library
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [29]:
# Time the model
start_time4 = time.time()

# Smote the training data
sm = SMOTE(random_state = 42)
rfc = RandomForestClassifier(random_state = 42)

# Steps for the pipeline
steps = [('smote', sm), ('rfc', rfc)]

# Create the pipeline
pipeline = Pipeline(steps = steps)

# Create the parameter grid

param_grid = {'rfc__max_depth': [5, 20, 30],
            'rfc__min_samples_leaf': [5, 10, 20]}
            
# Create the grid search object
grid = GridSearchCV(pipeline, param_grid = param_grid, cv = kfold, scoring = 'f1')

# Fit the grid search
grid.fit(X_train, y_train['HorseWin'])

# Print the running time
print('Running time is:', round(time.time() - start_time4, 3), 'seconds')


Running time is: 127.59 seconds


In [30]:
# Print the best parameters
print("Tuned Random Forest Classifier Parameters: {}".format(grid.best_params_))

# Print the best score
print("Best score is {}".format(grid.best_score_))

Tuned Random Forest Classifier Parameters: {'rfc__max_depth': 20, 'rfc__min_samples_leaf': 10}
Best score is 0.31964430969538316


In [53]:
# Calculate the cross validation score
# Round the score to 3 decimal places

score_grid_win = cross_val_score(grid, X_train, y_train['HorseWin'],
                                cv = kfold, scoring = 'f1').mean()
score_grid_win = round(score_grid_win, 3)

score_grid_top3 = cross_val_score(grid, X_train, y_train['HorseRankTop3'],
                                    cv = kfold, scoring = 'f1').mean()
score_grid_top3 = round(score_grid_top3, 3)

score_grid_top50 = cross_val_score(grid, X_train, y_train['HorseRankTop50Percent'],
                                    cv = kfold, scoring = 'f1').mean()
score_grid_top50 = round(score_grid_top50, 3)

# Print the cross validation score
print("Cross Validation mean score for RFC + Smote:",'\n',
        "Horse win:", score_grid_win,'\n',\
        "Horse in Top 3:", score_grid_top3,'\n',\
        "Horse in Top 50%:", score_grid_top50)

# Add the cross validation score to the table
cross_val_score_table = cross_val_score_table.append({'Model': 'RFC + Smote',
                                                        'HorseWin': score_grid_win,
                                                        'HorseRankTop3': score_grid_top3,
                                                        'HorseRankTop50Percent': score_grid_top50},
                                                        ignore_index = True)

KeyboardInterrupt: 

In [31]:
# Get the classification predictions (1 or 0)
start_time5 = time.time()

grid.fit(X_train, y_train['HorseWin'])
grid_win = grid.predict(X_test)

grid.fit(X_train, y_train['HorseRankTop3'])
grid_top3 = grid.predict(X_test)

grid.fit(X_train, y_train['HorseRankTop50Percent'])
grid_top50 = grid.predict(X_test)

print('Running time for RFC + Smote is:', round(time.time() - start_time5, 3), 'seconds')

Running time for RFC + Smote is: 285.386 seconds


In [32]:
# Create new dataframe for predictions
grid_pred = pd.DataFrame()
grid_pred['RaceID'] = df_test['race_id']
grid_pred['HorseID'] = df_test['horse_id']

grid_pred['HorseWin'] = grid_win
grid_pred['HorseRankTop3'] = grid_top3
grid_pred['HorseRankTop50Percent'] = grid_top50

# Write predictions into csv file.
grid_pred.to_csv('grid_pred.csv')

In [33]:
# f1 score for RFC + Smote
f1_win = round(f1_score(y_test['HorseWin'], grid_win), 3)
f1_top3 = round(f1_score(y_test['HorseRankTop3'], grid_top3), 3)
f1_top50 = round(f1_score(y_test['HorseRankTop50Percent'], grid_top50), 3)

# Print the F1 score
print("F1 score for RFC + Smote:",'\n',
        "Horse win:", f1_win,'\n',\
        "Horse in Top 3:", f1_top3,'\n',\
        "Horse in Top 50%:", f1_top50)

# Append the F1 score to the table
f1_score_table['RFC_Smote'] = [f1_win, f1_top3, f1_top50]

F1 score for RFC + Smote: 
 Horse win: 0.303 
 Horse in Top 3: 0.537 
 Horse in Top 50%: 0.717


In [34]:
# View the cross validation score table
cross_val_score_table

Unnamed: 0,Model,HorseWin,HorseRankTop3,HorseRankTop50Percent
0,Logistic Regression,0.046,0.439,0.721
1,Gaussian Naive Bayes,0.315,0.541,0.729
2,Random Forest Classifier,0.183,0.456,0.709


In [43]:
f1_score_table

Unnamed: 0,Logistic Regression,Gaussian Naive Bayes,Random Forest Classifier,RFC_Smote
Horse win,0.068,0.278,0.168,0.303
Horse in Top 3,0.349,0.504,0.377,0.537
Horse in Top 50%,0.696,0.719,0.692,0.717


## Regression Modelling

In [35]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import math
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

### Pre-processing of Train and Test Data

In [36]:
# Read in the data
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

X_train = df_train[['actual_weight', 'declared_horse_weight',
                    'draw','win_odds','jockey_ave_rank','trainer_ave_rank',
                    'recent_ave_rank','race_distance']]

# Define the target
y_train = df_train['finish_time']

# Convert the target to seconds
y_train = y_train.apply(lambda x: x.split('.'))
y_train = y_train.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

In [37]:
y_train.head()

0    82.33
1    82.65
2    82.66
3    82.66
4    83.02
Name: finish_time, dtype: float64

In [38]:
# Define the testing set
X_test = df_test[['actual_weight', 'declared_horse_weight',
                    'draw', 'win_odds', 'jockey_ave_rank', 'trainer_ave_rank',
                    'recent_ave_rank', 'race_distance']]

In [39]:
# Define the target
y_test = df_test['finish_time']

# Convert the target to seconds
y_test = y_test.apply(lambda x: x.split('.'))
y_test = y_test.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

#### Evaluation:
1. MSE
2. Top_1: the percentage/probality when your prediction of top_1 horse(horse with shortest finish_time) for each race is actually the true top_1 horse.

3. Top_3: percentage/probability when your prediction of top_1 horse for each race is actually within true top_3 horses for each race. 

4. Average_rank: the average true rank of top_1 horse based on your prediction over all races.

For example, when you predict for 3 races and your predicted top_1 horse is actually ranking 1, 3, 5 in these races. Top_1 is 1/3, Top_3 is 2/3 and Average_Rank is 3.0.


In [40]:
# Create a list of index of Top 1 Position from the testing set
top1_index = df_test.index[df_test['finishing_position']==1].tolist()
print(top1_index[:5])

[4, 13, 25, 37, 49]


In [41]:
# Define a function to evaluate the model
# Takes in the prediction and evaluates with the RMSE, top1 & top3 probability and avarage rank

def evaluation(y_pred):
    top1_predict_index = []

    for i in range(len(top1_index)-1):
        
        # Find the min value in the prediction
        temp = np.argmin(y_pred[top1_index[i]:top1_index[i + 1]])
        
        top1_predict_index.append(top1_index[i] + temp)

    temp0 = np.argmin(y_pred[top1_index[len(top1_index) - 1]:])
    top1_predict_index.append(top1_index[len(top1_index) - 1] + temp0)

    rmse = math.sqrt(sum((np.array(y_pred) - np.array(y_test)) ** 2)) / len(y_test)
    rmse = round(rmse, 3)

    top_1 = float(len(set(top1_predict_index) & set(top1_index))) / len(top1_predict_index)
    top_1  = round(top_1, 3)

    top_3 = (df_test['finishing_position'][top1_predict_index].tolist().count(1) + df_test['finishing_position'][top1_predict_index].tolist().count(2)\
          + df_test['finishing_position'][top1_predict_index].tolist().count(3)) / float(len(top1_predict_index))
    top_3 = round(top_3, 3)

    avg_rank = sum(df_test['finishing_position'][top1_predict_index]) / float(len(top1_predict_index))
    avg_rank = round(avg_rank, 3)

    return (rmse, top_1, top_3, avg_rank)


## Model 1: Ridge Regression

In [42]:
# Ridge Regression

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the model
ridge = Ridge(alpha = 0.1)
ridge.fit(X_train_scaled, y_train)

# Standardize the testing set
X_test_scaled = scaler.transform(X_test)

# Make prediction
ridge_pred = ridge.predict(X_test_scaled)

# Evaluate the model
evaluation(ridge_pred)

# 



NameError: name 'Ridge' is not defined

## Model 2: Gradient Boosting Regression Tree Model (GBRT)

Gradient Boosting Regression Tree Model is a generalization of boosting technique to arbitrary differentiable loss functions.
It is used here becauese of its natural handling of data of mixed type, great predictive power and robustness to outliers in output space (via robust loss functions).

#### Tuning Parameters for GBRT:
Loss function: It has various loss functions including ls, lad, huber, quantile. Choose loss=’qunatile’, since for default values of other parameters, this loss function performs best according to TOP_1 and TOP_3 evaluation statistics.

learning_rate: controls the contribution of each weak classifier (tree).

n_estimators: represents the number of weak learners (tree). Since boosting combines the output of many weak classifiers, the larger n_estimators, the more robust the model is and the better results are.

max_depth: maximum nodes of the tree

I chose learning_rate = 0.01, n_estimators = 10, max_depth = 2, since I found by assigning these three values to parameters, TOP_1 = 0.99, TOP_3 = 1, which performs the best in predicting winner of horse races.

In [None]:
# Instantiate the model
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=10, learning_rate=0.01,
                                 random_state=42, loss='quantile')

# Fit the model
gbrt.fit(X_train, y_train)

# Make predictions
gbrt_pred = gbrt.predict(X_test)

# Evaluate the model
gbrt_eval = evaluation(gbrt_pred)

# Print the evaluation result
print("RMSE:", gbrt_eval[0],'\n',\
        "Top 1 Probability:", gbrt_eval[1],'\n',\
        "Top 3 Probability:", gbrt_eval[2],'\n',\
        "Average Rank:", gbrt_eval[3])

# Original model
#gbrt_model = GradientBoostingRegressor(loss = 'quantile',learning_rate = 0.01, n_estimators = 10, max_depth = 2)

In [None]:
y_train

In [None]:
# Find the mean-squared error
from sklearn.metrics import mean_squared_error

# Instantiate the model
rmse_train = mean_squared_error(y_train, gbrt.predict(X_train))
rmse_test = mean_squared_error(y_test, gbrt.predict(X_test))

# Print the RMSE
print("RMSE for training set:", rmse_train,'\n',\
        "RMSE for testing set:", rmse_test)

# Generalization error
print("Generalization Error:", (rmse_test - rmse_train)/rmse_train)

In [None]:
# Save predictions to a dataframe
gbrt_pred = pd.DataFrame(gbrt_pred)

# Save the predictions to a csv file
gbrt_pred.to_csv('gbrt_pred.csv')

# V. Betting Strategy

Betting strategy is to bet all $1 for the predicted winning horse for each race. 

Concretely, if our prediction is correct for the winning horse, we will receive $1 × odds money. 

Otherwise, we will lose $1. 

The final result is positive if we win some money and negative if we lose.

For 4 classification models, if there are more than 1 HorseWin in a race in predictions, I will choose the one with smallest odds, since as odds increase, winning probability decreases.

In [None]:
import pandas as pd
import numpy as np

In [None]:
testing = pd.read_csv('testing.csv')
champion_index = testing[testing['HorseWin'] == 1].index.tolist()
champion_odds = testing[testing['HorseWin'] == 1]['win_odds'].tolist()

In [None]:
def count_range_in_list(li, min, max):
    ctr = 0
    for x in li:
        if min <= x <= max:
            ctr += 1
    return ctr

In [None]:
def ele_in_list(li, min, max):
    ele = []
    for x in li:
        if min <= x <= max:
            ele.append(x)
    return ele

In [None]:
def betting_result(champion_odds,champion_index,prediction):
    money=0
    for i in range(len(champion_index)-1):
        ctr= count_range_in_list(prediction,champion_index[i],champion_index[i+1]-1)
        if ctr==0:
            money=money-1
        elif ctr==1:
            money=money-1+champion_odds[i]
        else:
            ele_list=ele_in_list(prediction,champion_index[i],champion_index[i+1]-1)
            if min(ele_list)==champion_index[i]:
                money=money-1+champion_odds[i]
            else:
                money=money-1
    ctr = count_range_in_list(prediction, champion_index[len(champion_index)-1],len(testing['HorseWin'])-1)
    if ctr == 0:
        money = money - 1
    elif ctr == 1:
        money = money - 1 + champion_odds[len(champion_index)-1]
    else:
        ele_list = ele_in_list(prediction, champion_index[len(champion_index)-1], len(testing['HorseWin'])-1)
        if min(ele_list)==champion_index[len(champion_index)-1]:
            money = money - 1 + champion_odds[len(champion_index)-1]
        else:
            money = money - 1
    return money

In [None]:
lr = pd.read_csv('lr_predictions.csv')
lr_index = lr[lr['HorseWin'] == 1].index.tolist()
print('Betting result for Logistic Regression model:',betting_result(champion_odds,champion_index,lr_index))

In [None]:
nb = pd.read_csv('nb_predictions.csv')
nb_index = nb[nb['HorseWin'] == 1].index.tolist()
print('Betting result for Naive Bayes model:', betting_result(champion_odds,champion_index,nb_index))


In [None]:
rf = pd.read_csv('rf_predictions.csv')
rf_index = rf[rf['HorseWin'] == 1].index.tolist()
print('Betting result for Random Forest model:',betting_result(champion_odds,champion_index,rf_index))

In [None]:
svm = pd.read_csv('svm_predictions.csv')
svm_index = svm[svm['HorseWin'] == 1].index.tolist()
print('Betting result for SVM model:',betting_result(champion_odds,champion_index,svm_index))


For 4 regression models, I choose the horse with shortest predicted finish_time as the unique winning horse. 

In [None]:
def prediction(predict):
    top1_predict_index = []
    for i in range(len(champion_index)-1):
        temp = np.argmin(predict[champion_index[i]:champion_index[i + 1]])
        top1_predict_index.append(champion_index[i]+temp)
    temp0 = np.argmin(predict[champion_index[len(champion_index) - 1]:])
    top1_predict_index.append(champion_index[len(champion_index) - 1] + temp0)
    return top1_predict_index


In [None]:
reg_prediction = pd.read_csv('reg_prediction.csv')
reg_svr = reg_prediction['svr_predict']
reg_svr_norm = reg_prediction['svr_predict_norm']
reg_gbrt = reg_prediction['gbrt_predict']
reg_gbrt_norm = reg_prediction['gbrt_predict_norm']

In [None]:
svr_index = prediction(reg_svr)
svr_norm_index = prediction(reg_svr_norm)
gbrt_index = prediction(reg_gbrt)
gbrt_norm_index = prediction(reg_gbrt_norm)

In [None]:
print('Betting result for SVR model:',betting_result(champion_odds,champion_index,svr_index))
print('Betting result for SVR (Normalized) model:',betting_result(champion_odds,champion_index,svr_norm_index))
print('Betting result for GBRT model:',betting_result(champion_odds,champion_index,gbrt_index))
print('Betting result for GBRT (Normalized) model:',betting_result(champion_odds,champion_index,gbrt_norm_index))

#### It seems 2 regression algorithms perform well and normalization improves performance of SVR.

#### Improvement:

I set threshold for the average rank and odds. For example, we only bet the horse whose odd is in the smallest 5, and recent_ave_rank is also in smallest 5. This means we decreases the risk of betting in horses with bad recent performance. If the horse cannot satisfy the criteria, we do not bet.

In [None]:
def imp_betting(champion_odds,champion_index,prediction):
    money = 0
    for i in range(len(champion_index) - 1):
        ctr = count_range_in_list(prediction, champion_index[i], champion_index[i + 1] - 1)
        if ctr >= 1:
            temp_odds = testing['win_odds'].tolist()[champion_index[i]:champion_index[i + 1]]
            temp_ave_rank = testing['recent_ave_rank'].tolist()[champion_index[i]:champion_index[i + 1]]
            seq_odds = sorted(temp_odds)
            seq_ave_rank = sorted(temp_ave_rank)
            ele_list = ele_in_list(prediction,champion_index[i],champion_index[i+1]-1)
            if (seq_odds.index(testing['win_odds'][ele_list[0]]) <= 5) and (seq_ave_rank.index(testing['recent_ave_rank'][ele_list[0]]) <= 5):
                money = money - 1
                if ele_list[0] == champion_index[i]:
                    money = money + champion_odds[i]
    return money


In [None]:
print('Improved betting result for Logistic Regression model:',imp_betting(champion_odds,champion_index,lr_index))
print('Improved betting result for Naive Bayes model:',imp_betting(champion_odds,champion_index,nb_index))
print('Improved betting result for Random Forest model:',imp_betting(champion_odds,champion_index,rf_index))
print('Improved betting result for SVR model:',imp_betting(champion_odds,champion_index,svr_index))
print('Improved betting result fo GBRT model:',imp_betting(champion_odds,champion_index,gbrt_index))


However, it seems that setting threshold cannot improve the results for those method whose results are already positive. It only decreases losses by decreasing risk.

# VI. Visualization

### 6.1 Line Chart of Recent Racing Result

Visualize the history racing result of some specific horse.

Interactive: takes a horse ID as input, and outputs a line chart that shows the finishing positions of 6 recent races that the horse attended.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def linechart(horse_id):
    recent_6_runs = training[training.horse_id == horse_id]['recent_6_runs'][-1:].tolist()[0]

    recent_6_runs = list(map(int,recent_6_runs.split('/')))[::-1]
    print(recent_6_runs)
    game_id = training[training.horse_id == horse_id][['race_id']][-6:]
    print(game_id)
    plt.plot(game_id.iloc[:,0], recent_6_runs, marker = '+')
    plt.xlabel('Game_id')
    plt.ylabel('Ranks of recent 6 runs')
    plt.title('Line Chart of recent 6 runs'+'- Horse ' + horse_id)
    plt.ylim((0, 14))
    plt.show()


In [None]:
training = pd.read_csv('training.csv')
horse_id = 'S047'
linechart(horse_id)

### 6.2 Scatter Plot of Win Rate and Number of Wins 

The x-axis is the win rate, and the y-axis is the number of wins. 

Set a threshold and label the name of the horses (or jockeys) who reach the threshold. E.g., if a horse’s win rate is larger than 0.5, and wins more than 4 games, then you should annotate the point of this horse with its name. 

Goal: to find the “best” horse and the “best” jockey. Intuitively, the “best” one should have a high win rate and have won a large number of games.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
training = pd.read_csv('training.csv')
jockey = training.jockey.unique()
a = pd.DataFrame()
a['jockey'] = jockey
a['no_win'] = 0
a['win_rate'] = 0.0
for i in range(len(jockey)):
    ranks = training[training.jockey == jockey[i]]['finishing_position'].tolist()
    a['no_win'][i] = ranks.count(1)
    a['win_rate'][i] = ranks.count(1) / float(len(ranks))

horse = training.horse_name.unique()
b = pd.DataFrame()
b['horse'] = horse
b['no_win'] = 0
b['win_rate'] = 0.0
for i in range(len(jockey)):
    ranks=training[training.horse_name == horse[i]]['finishing_position'].tolist()
    b['no_win'][i] = ranks.count(1)
    b['win_rate'][i] = ranks.count(1) / float(len(ranks))

figure(num = None, figsize = (12, 12), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.subplot(2,1,1)
plt.scatter(a['win_rate'],a['no_win'],alpha = 0.3)
plt.title('Scatter plot for jockeys')
plt.xlabel('Win Rate')
plt.ylabel('Number of Wins')
for i in range(len(jockey)):
    if a['no_win'][i] >= 10 and a['win_rate'][i] >= 0.06:
        plt.annotate(a['jockey'][i],(a['win_rate'][i],a['no_win'][i]),size = 7)


plt.subplot(2,1,2)
plt.scatter(b['win_rate'],b['no_win'],alpha=0.3)
plt.title('Scatter plot for horses')
plt.xlabel('Win Rate')
plt.ylabel('Number of Wins')
for i in range(len(horse)):
    if b['no_win'][i] >= 2 and b['win_rate'][i] >= 0.15:
        plt.annotate(b['horse'][i],(b['win_rate'][i],b['no_win'][i]),size = 7)
        

plt.show()

The best jockey is J Moreira. Since he has the highest number of wins and very high win rate.


The best horse is Romantic Cash, since it has the highest win rate and its ranks are very stable.

### 6.3 Pie Chart of the Draw Bias Effect

Pie chart is a way to visualize the distribution of categorical data

#### Goal: explore the effect of draw bias in horse racing. 

The draw refers to the stall a horse will start the race from. The draw is normally chosen at random on the day the horses are declared to run. Obviously, the inside lane would hold an edge over the field as they have a shorter distance to the bend, in comparison to the other lanes.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
training = pd.read_csv('training.csv')

win_prob = []
for i in range(1,16,1):
    win_prob.append(training[training.draw == i]['finishing_position'].tolist().count(1) / float(len(training[training.draw == i])))

print(win_prob)


In [None]:
labels = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15']
figure(num = None, figsize = (8, 8), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.pie(win_prob,labels = labels,autopct = '%1.1f%%', colors = sns.color_palette("cubehelix"))
plt.title('Pie Chart of the Draw Bias Effect (Number represents No. of lane the horse will run)')
plt.show()

#### Low draws indeed have a considerable advantage, as we can see that as draw increases, the winning probability decreases.

### 6.4 Bar Chart of the Feature Importances

Use random forest classifier to evaluate the importance of the features, which measures how much each feature decreases the weighted impurity in a tree. 

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
training = pd.read_csv('training.csv')
rf_model = RandomForestClassifier()
X_train = training[['actual_weight','declared_horse_weight','draw','win_odds','jockey_ave_rank','trainer_ave_rank',
'recent_ave_rank','race_distance']]
y_train = training[['HorseWin','HorseRankTop3','HorseRankTop50Percent']]
rf_model.fit(X_train,y_train['HorseWin'])
features = 'actual_weight','declared_horse_weight','draw','win_odds','jockey_ave_rank','trainer_ave_rank','recent_ave_rank','race_distance'
importance = rf_model.feature_importances_
indices = np.argsort(importance)[::-1]
print(importance[indices])
print(indices)

In [None]:
figure(num = None, figsize = (8, 6), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.bar(range(len(features)),importance[indices],color = sns.color_palette("RdBu_r", 8))
plt.xticks(range(len(features)),features)
plt.xlabel('Feature names')
plt.ylabel('Importance')
plt.title('Bar Chart of the Feature Importance')
plt.show()

#### We find that actual_weight, declared_horse_weight and draw affect the most, while race_distance has the least effect

### 6.5 Visualize SVM

Since it is hard to visualize high-dimensional data, for the input data X, we only consider these two features: recent_rank and jockey_ave_rank. Also, for the target y, we only care about whether the finishing position is in top 50%. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import matplotlib.patches as mpatches

In [None]:
training = pd.read_csv('training.csv')
X = training[['recent_ave_rank','jockey_ave_rank']]
y = training['HorseRankTop50Percent']
svm_model = SVC(kernel = 'linear')
svm_model.fit(X,y)


In [None]:
def make_meshgrid(x, y, h = .02):

    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy

In [None]:
def plot_contours(clf, xx, yy, **params):

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = plt.contourf(xx, yy, Z, **params)
    return out


In [None]:
X0, X1 = X['recent_ave_rank'], X['jockey_ave_rank']
xx, yy = make_meshgrid(X0, X1)

figure(num = None, figsize = (8, 6), dpi = 90, facecolor = 'w', edgecolor = 'k')
plot_contours(svm_model,xx, yy, alpha = 0.8)
plt.scatter(X0, X1, c = y,  s = 20, edgecolors = 'k')
plt.title('Visualized SVM')
plt.xlabel('Recent average rank')
plt.ylabel('Jockey average rank')
patch = mpatches.Patch(color = 'purple',label = 'SVC(kernel=\'linear\')')
plt.legend(handles = [patch])
plt.show()

Linear kernel seems not bad in two-feature SVM classification. But there are still plenty of points cross the margin which cannot be classified correctly.