In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

import time

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Load the pre-split training and testing tables from the local data folder
df_train = pd.read_csv(filepath_or_buffer='./data/df_train.csv')
df_test = pd.read_csv(filepath_or_buffer='./data/df_test.csv')

In [3]:
# Report (rows, columns) for the train table, then the test table
for frame in (df_train, df_test):
    print(frame.shape)

(23500, 27)
(5864, 27)


In [4]:
# Preview the top two rows of the training table
df_train.head(n=2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,1.0,DOUBLE DRAGON,K019,B Prebble,D Cruz,133,1032,1,-,...,,2014-001,1,1.0,1400,1,1,1,6.05291,7.381862
1,2,2.0,PLAIN BLUE BANNER,S070,D Whyte,D E Ferraris,133,1075,13,2,...,,2014-001,2,2.0,1400,0,1,1,5.825153,6.611465


In [5]:
# Preview the top two rows of the testing table
df_test.head(n=2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,5.0,POWERMAX,A009,N Callan,R Gibson,126,1124,9,-,...,,2016-328,1/4/3/3,2.75,1200,1,1,1,6.438751,6.71542
1,2,2.0,BUDDY BUNDY,T157,K K Chiong,D Cruz,127,1193,8,SH,...,,2016-328,2/11/5/2/6/9,6.8125,1200,0,1,1,6.499033,7.381862


## Regression Modelling

### Pre-processing of Train and Test Data

In [6]:
# Feature matrix: the eight predictor columns used by every regression model
X_train = df_train.loc[:, ['actual_weight', 'declared_horse_weight',
                           'draw', 'win_odds', 'jockey_ave_rank', 'trainer_ave_rank',
                           'recent_ave_rank', 'race_distance']]

# Target: finish time. Each raw value is a dot-separated string whose three
# fields are combined below as field0 * 60 + field1 + field2 / 100,
# i.e. minutes.seconds.hundredths converted to seconds.
y_train = (
    df_train['finish_time']
    .map(lambda stamp: stamp.split('.'))
    .map(lambda parts: int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 100)
)

In [7]:
# Sanity-check the converted target: the first five finish times, in seconds
y_train.head(n=5)

0    82.33
1    82.65
2    82.66
3    82.66
4    83.02
Name: finish_time, dtype: float64

In [8]:
# Testing features: same eight predictor columns, in the same order, as X_train
X_test = df_test.loc[:, ['actual_weight', 'declared_horse_weight',
                         'draw', 'win_odds', 'jockey_ave_rank', 'trainer_ave_rank',
                         'recent_ave_rank', 'race_distance']]

In [9]:
# Testing target: finish time. Each raw value is a dot-separated string whose
# three fields are combined as field0 * 60 + field1 + field2 / 100,
# i.e. minutes.seconds.hundredths converted to seconds.
y_test = (
    df_test['finish_time']
    .map(lambda stamp: stamp.split('.'))
    .map(lambda parts: int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 100)
)

#### Evaluation:
1. MSE
2. Top_1: the percentage/probability that your predicted top_1 horse (the horse with the shortest predicted finish_time) in each race is actually the true top_1 horse.

3. Top_3: percentage/probability when your prediction of top_1 horse for each race is actually within true top_3 horses for each race. 

4. Average_rank: the average true rank of top_1 horse based on your prediction over all races.

For example, when you predict for 3 races and your predicted top_1 horse is actually ranking 1, 3, 5 in these races. Top_1 is 1/3, Top_3 is 2/3 and Average_Rank is 3.0.


In [10]:
# Collect the index labels of every test row whose finishing position is 1.
# The evaluation function uses these as the first row of each race.
top1_index = df_test.loc[df_test['finishing_position'].eq(1)].index.tolist()
print(top1_index[0:5])

[0, 12, 24, 36, 48]


In [11]:
# Define a function to evaluate the model
# Takes in the prediction and evaluates top1 & top3 probability and average rank

def evaluation(y_pred, finishing_positions=None, race_starts=None):
    """Score predicted finish times by how well they pick each race's winner.

    For every race the row with the smallest predicted time is taken as the
    predicted winner; the true finishing positions of those picks are then
    summarised.

    Parameters
    ----------
    y_pred : array-like
        Predicted finish times, one per test row, in test-row order. Any
        array-like is accepted (list, ndarray, Series, single-column
        DataFrame); it is flattened to 1-D.
    finishing_positions : array-like, optional
        True finishing position of every test row. Defaults to the notebook
        global ``df_test['finishing_position']``.
    race_starts : sequence of int, optional
        Positional offset of the first row of every race, in ascending order.
        Race ``i`` spans ``race_starts[i]`` up to (not including)
        ``race_starts[i + 1]``; the last race runs to the end of ``y_pred``.
        Defaults to the notebook global ``top1_index``.

    Returns
    -------
    tuple of float
        ``(top_1, top_3, avg_rank)``, each rounded to 3 decimals: the share of
        picks that truly finished 1st, the share that finished 1st-3rd, and
        the mean true finishing position of the picks.

    Raises
    ------
    ValueError
        If ``race_starts`` is empty or ``y_pred`` and ``finishing_positions``
        differ in length.

    Notes
    -----
    ``np.argmin`` returns the first minimum, so when predictions tie inside a
    race the earliest row of that race is picked. If the rows of each race are
    ordered by finishing position (the default ``top1_index`` treats every
    position-1 row as a race start, which presumes this), ties resolve to the
    true winner and a near-constant predictor can score an artificially high
    top-1 rate.
    """
    if finishing_positions is None:
        finishing_positions = df_test['finishing_position']
    if race_starts is None:
        race_starts = top1_index

    # Work on plain 1-D arrays so slicing below is purely positional,
    # regardless of the container type the caller passes in.
    predicted_times = np.asarray(y_pred, dtype=float).ravel()
    true_ranks = np.asarray(finishing_positions, dtype=float).ravel()
    starts = [int(start) for start in race_starts]

    if not starts:
        raise ValueError('race_starts is empty: no races to evaluate')
    if predicted_times.size != true_ranks.size:
        raise ValueError(
            'y_pred has %d rows but finishing_positions has %d'
            % (predicted_times.size, true_ranks.size)
        )

    # Appending the total length as a sentinel lets the last race be handled
    # by the same slice logic as every other race.
    bounds = starts + [predicted_times.size]
    picked_rows = [
        lo + int(np.argmin(predicted_times[lo:hi]))
        for lo, hi in zip(bounds[:-1], bounds[1:])
    ]

    picked_ranks = true_ranks[picked_rows]
    n_races = len(picked_rows)

    top_1 = round(np.count_nonzero(picked_ranks == 1) / n_races, 3)
    top_3 = round(np.count_nonzero(np.isin(picked_ranks, (1, 2, 3))) / n_races, 3)
    avg_rank = round(float(picked_ranks.sum()) / n_races, 3)

    return (top_1, top_3, avg_rank)


## Model 1: Ridge Regression

In [12]:
# Standardize the data: the scaler is fitted on the training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the model
ridge = Ridge(alpha = 4000)
ridge.fit(X_train_scaled, y_train)

# Standardize the testing set with the statistics learned from the training set
X_test_scaled = scaler.transform(X_test)

# Make prediction
ridge_pred = ridge.predict(X_test_scaled)

# Root mean squared error of the training set.
# np.sqrt(MSE) is used rather than `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
rmse_train = np.sqrt(mean_squared_error(y_train, ridge.predict(X_train_scaled)))
print('Training set RMSE: ', round(rmse_train, 3))

# Root mean squared error of the testing set
rmse_test = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Testing set RMSE: ', round(rmse_test, 3))

# Generalization error: relative increase of test RMSE over train RMSE, in percent
gen_error = (rmse_test - rmse_train) / rmse_train * 100
print('Generalization error: ', round(gen_error, 3), '%')

# Run the race-level evaluation once and unpack (top-1, top-3, average rank)
ridge_top1, ridge_top3, ridge_avg_rank = evaluation(ridge_pred)
print('Top 1 probability: ', ridge_top1)
print('Top 3 probability: ', ridge_top3)
print('Average rank: ', ridge_avg_rank)


Training set RMSE:  3.011
Testing set RMSE:  3.124
Generalization error:  3.757 %
Top 1 probability:  0.178
Top 3 probability:  0.404
Average rank:  5.429


In [13]:
# Create table to store the results (one row per evaluated model)
results = pd.DataFrame(columns=['Model', 'Training RMSE', 'Testing RMSE', 'Generalization Error', 'Top 1 Probability', 'Top 3 Probability', 'Average Rank'])

# Create a function to store the results in a dataframe
def store_results(model, rmse_train, rmse_test, gen_error, top1, top3, avg_rank):
    """Append one model's metrics as a new row of the global ``results`` table.

    Parameters
    ----------
    model : str
        Display name of the model (stored in the 'Model' column).
    rmse_train, rmse_test : float
        Training and testing RMSE.
    gen_error : float
        Generalization error (stored as given).
    top1, top3, avg_rank : float
        The three values returned by ``evaluation``.

    Notes
    -----
    Rebinds the module-level ``results`` to a new DataFrame with a fresh
    0..n-1 index. Every call adds a row, so re-running a cell that calls this
    function stores the same model again.
    """
    global results
    new_row = pd.DataFrame([{'Model': model, 'Training RMSE': rmse_train, 'Testing RMSE': rmse_test, 'Generalization Error': gen_error, 'Top 1 Probability': top1, 'Top 3 Probability': top3, 'Average Rank': avg_rank}])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # The empty table is handled separately so concat never sees an empty
    # frame (which newer pandas warns about).
    if results.empty:
        results = new_row.reindex(columns=results.columns)
    else:
        results = pd.concat([results, new_row], ignore_index=True)


In [14]:
# Store the results. evaluation() returns (top-1, top-3, average rank) in the
# order store_results expects, so it is run once and unpacked in place.
store_results('Ridge Regression', round(rmse_train, 3), round(rmse_test, 3), round(gen_error, 3),
              *evaluation(ridge_pred))

# Print out the results
results


Unnamed: 0,Model,Training RMSE,Testing RMSE,Generalization Error,Top 1 Probability,Top 3 Probability,Average Rank
0,Ridge Regression,3.011,3.124,3.757,0.178,0.404,5.429


In [30]:
# Persist the ridge predictions to disk.
# NOTE(review): this rebinds `ridge_pred` from the model's raw prediction
# output to a DataFrame, so cells above see a different type if re-run later.
ridge_pred = pd.DataFrame(data=ridge_pred)
ridge_pred.to_csv('ridge_pred.csv', index=False)

## Model 2: Support Vector Regressor

In [16]:
# Support Vector Regression with an RBF kernel
svr = SVR(C=100, epsilon=0.1, gamma=0.1, kernel='rbf')

# Train on the standardized training features
svr.fit(X_train_scaled, y_train)


In [17]:

# Make prediction
svr_pred = svr.predict(X_test_scaled)

# Root mean squared error of the training set.
# np.sqrt(MSE) is used rather than `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
rmse_train = np.sqrt(mean_squared_error(y_train, svr.predict(X_train_scaled)))
print('Training set RMSE: ', round(rmse_train, 3))

# Root mean squared error of the testing set
rmse_test = np.sqrt(mean_squared_error(y_test, svr_pred))
print('Testing set RMSE: ', round(rmse_test, 3))

# Generalization error: relative increase of test RMSE over train RMSE, in percent
gen_error = (rmse_test - rmse_train) / rmse_train * 100
print('Generalization error: ', round(gen_error, 3), '%')

# Run the race-level evaluation once and unpack (top-1, top-3, average rank)
svr_top1, svr_top3, svr_avg_rank = evaluation(svr_pred)
print('Top 1 probability: ', svr_top1)
print('Top 3 probability: ', svr_top3)
print('Average rank: ', svr_avg_rank)

Training set RMSE:  1.179
Testing set RMSE:  1.624
Generalization error:  37.77 %
Top 1 probability:  0.167
Top 3 probability:  0.437
Average rank:  4.889


In [18]:
# Store the results. evaluation() returns (top-1, top-3, average rank) in the
# order store_results expects, so it is run once and unpacked in place.
store_results('Support Vector Regression', round(rmse_train, 3), round(rmse_test, 3), round(gen_error, 3),
              *evaluation(svr_pred))

In [19]:
# Print out the results: display the comparison table accumulated so far
# (one row per store_results call)
results

Unnamed: 0,Model,Training RMSE,Testing RMSE,Generalization Error,Top 1 Probability,Top 3 Probability,Average Rank
0,Ridge Regression,3.011,3.124,3.757,0.178,0.404,5.429
1,Support Vector Regression,1.179,1.624,37.77,0.167,0.437,4.889


In [29]:
# Persist the SVR predictions to disk.
# NOTE(review): this rebinds `svr_pred` from the model's raw prediction
# output to a DataFrame, so cells above see a different type if re-run later.
svr_pred = pd.DataFrame(data=svr_pred)
svr_pred.to_csv('svr_pred.csv', index=False)

## Model 3: Random Forest Regressor

In [21]:
# Model 3: Random Forest (fixed random_state so the fit is repeatable)
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)

# Fit the model
rf.fit(X_train_scaled, y_train)

# Make prediction
rf_pred = rf.predict(X_test_scaled)

# Root mean squared error of the training set.
# np.sqrt(MSE) is used rather than `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
rmse_train = np.sqrt(mean_squared_error(y_train, rf.predict(X_train_scaled)))
print('Training set RMSE: ', round(rmse_train, 3))

# Root mean squared error of the testing set
rmse_test = np.sqrt(mean_squared_error(y_test, rf_pred))
print('Testing set RMSE: ', round(rmse_test, 3))

# Generalization error: relative increase of test RMSE over train RMSE, in percent
gen_error = (rmse_test - rmse_train) / rmse_train * 100
print('Generalization error: ', round(gen_error, 3), '%')

# Run the race-level evaluation once and unpack (top-1, top-3, average rank)
rf_top1, rf_top3, rf_avg_rank = evaluation(rf_pred)
print('Top 1 probability: ', rf_top1)
print('Top 3 probability: ', rf_top3)
print('Average rank: ', rf_avg_rank)

Training set RMSE:  0.973
Testing set RMSE:  1.546
Generalization error:  58.771 %
Top 1 probability:  0.255
Top 3 probability:  0.582
Average rank:  3.805


In [22]:
# Store the results. evaluation() returns (top-1, top-3, average rank) in the
# order store_results expects, so it is run once and unpacked in place.
store_results('Random Forest', round(rmse_train, 3), round(rmse_test, 3), round(gen_error, 3),
              *evaluation(rf_pred))

# Print out the results
results

Unnamed: 0,Model,Training RMSE,Testing RMSE,Generalization Error,Top 1 Probability,Top 3 Probability,Average Rank
0,Ridge Regression,3.011,3.124,3.757,0.178,0.404,5.429
1,Support Vector Regression,1.179,1.624,37.77,0.167,0.437,4.889
2,Random Forest,0.973,1.546,58.771,0.255,0.582,3.805


In [28]:
# Persist the random forest predictions to disk.
# NOTE(review): this rebinds `rf_pred` from the model's raw prediction
# output to a DataFrame, so cells above see a different type if re-run later.
rf_pred = pd.DataFrame(data=rf_pred)
rf_pred.to_csv('rf_pred.csv', index=False)

## Model 4: Gradient Boosting Regression Tree Model (GBRT)

Gradient Boosting Regression Tree Model is a generalization of boosting technique to arbitrary differentiable loss functions.
It is used here because of its natural handling of mixed-type data, its strong predictive power, and its robustness to outliers in the output space (via robust loss functions).

#### Tuning Parameters for GBRT:
Loss function: several loss functions are available, including ls, lad, huber and quantile. I chose loss='quantile' because, with the other parameters left at their default values, this loss function performs best according to the TOP_1 and TOP_3 evaluation statistics.

learning_rate: controls the contribution of each weak learner (regression tree) to the final prediction.

n_estimators: the number of weak learners (trees), i.e. the number of boosting stages. Since boosting combines the output of many weak learners, a larger n_estimators usually makes the model more robust and gives better results, at the cost of longer training time.

max_depth: the maximum depth of each individual tree, which limits the number of nodes in the tree.

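I originally chose learning_rate = 0.01, n_estimators = 10, max_depth = 2 (with loss = 'quantile'; see the commented-out "Original model" line in the cell below), since with these three values TOP_1 = 0.99 and TOP_3 = 1, which performs the best in predicting the winner of horse races. Note that the cell below currently fits a different configuration (max_depth=10, n_estimators=5, learning_rate=2). The near-perfect TOP_1/TOP_3 scores should also be read with caution: the earlier "Gradient Boosting" rows in the results table that show them also show a testing RMSE of up to about 29.9 seconds, and the evaluation picks the first row of a race when predictions tie, which is presumably the true winner because each race's rows appear to be ordered by finishing position.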

In [34]:
# Instantiate the model
# NOTE(review): these hyperparameters differ from the ones described in the
# markdown above, and learning_rate=2 is unusually large; the recorded output
# of this cell shows a much higher RMSE than the other models. Confirm the
# intended configuration.
gbrt = GradientBoostingRegressor(max_depth=10, n_estimators=5, learning_rate=2, random_state=42)

# Fit the model
gbrt.fit(X_train_scaled, y_train)

# Make prediction
gbrt_pred = gbrt.predict(X_test_scaled)

# Root mean squared error of the training set.
# np.sqrt(MSE) is used rather than `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
rmse_train = np.sqrt(mean_squared_error(y_train, gbrt.predict(X_train_scaled)))
print('Training set RMSE: ', round(rmse_train, 3))

# Root mean squared error of the testing set
rmse_test = np.sqrt(mean_squared_error(y_test, gbrt_pred))
print('Testing set RMSE: ', round(rmse_test, 3))

# Generalization error: relative increase of test RMSE over train RMSE, in percent
gen_error = (rmse_test - rmse_train) / rmse_train * 100
print('Generalization error: ', round(gen_error, 3), '%')

# Run the race-level evaluation once and unpack (top-1, top-3, average rank)
gbrt_top1, gbrt_top3, gbrt_avg_rank = evaluation(gbrt_pred)
print('Top 1 probability: ', gbrt_top1)
print('Top 3 probability: ', gbrt_top3)
print('Average rank: ', gbrt_avg_rank)

# Original model (kept for reference; not executed)
#gbrt_model = GradientBoostingRegressor(loss = 'quantile',learning_rate = 0.01, n_estimators = 10, max_depth = 2)

Training set RMSE:  18.532
Testing set RMSE:  19.65
Generalization error:  6.034 %
Top 1 probability:  0.195
Top 3 probability:  0.441
Average rank:  5.103


In [35]:
# Store the results. evaluation() returns (top-1, top-3, average rank) in the
# order store_results expects, so it is run once and unpacked in place.
# NOTE: every run of this cell adds another 'Gradient Boosting' row to `results`.
store_results('Gradient Boosting', round(rmse_train, 3), round(rmse_test, 3), round(gen_error, 3),
              *evaluation(gbrt_pred))

# Print out the results
results

Unnamed: 0,Model,Training RMSE,Testing RMSE,Generalization Error,Top 1 Probability,Top 3 Probability,Average Rank
0,Ridge Regression,3.011,3.124,3.757,0.178,0.404,5.429
1,Support Vector Regression,1.179,1.624,37.77,0.167,0.437,4.889
2,Random Forest,0.973,1.546,58.771,0.255,0.582,3.805
3,Gradient Boosting,29.42,29.906,1.653,0.998,1.0,1.004
4,Gradient Boosting,2.987,3.004,0.573,0.996,1.0,1.004
5,Gradient Boosting,18.532,19.65,6.034,0.195,0.441,5.103


In [36]:
# Persist the gradient boosting predictions to disk.
# NOTE(review): this rebinds `gbrt_pred` from the model's raw prediction
# output to a DataFrame, so cells above see a different type if re-run later.
gbrt_pred = pd.DataFrame(data=gbrt_pred)
gbrt_pred.to_csv('gbrt_pred.csv', index=False)