In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score

import time

import warnings
warnings.filterwarnings("ignore")

# V. Betting Strategy

Betting strategy is to bet all $1 for the predicted winning horse for each race. 

Concretely, if our prediction is correct for the winning horse, we will receive $1 × odds money. 

Otherwise, we will lose $1. 

The final result is positive if we win some money and negative if we lose.

For 4 classification models, if there are more than 1 HorseWin in a race in predictions, I will choose the one with smallest odds, since as odds increase, winning probability decreases.

In [2]:
testing = pd.read_csv('testing.csv')
champion_index = testing[testing['HorseWin'] == 1].index.tolist()
champion_odds = testing[testing['HorseWin'] == 1]['win_odds'].tolist()

In [3]:
def count_range_in_list(li, min, max):
    ctr = 0
    for x in li:
        if min <= x <= max:
            ctr += 1
    return ctr

In [4]:
def ele_in_list(li, min, max):
    ele = []
    for x in li:
        if min <= x <= max:
            ele.append(x)
    return ele

In [5]:
def betting_result(champion_odds,champion_index,prediction):
    money=0
    for i in range(len(champion_index)-1):
        ctr= count_range_in_list(prediction,champion_index[i],champion_index[i+1]-1)
        if ctr==0:
            money=money-1
        elif ctr==1:
            money=money-1+champion_odds[i]
        else:
            ele_list=ele_in_list(prediction,champion_index[i],champion_index[i+1]-1)
            if min(ele_list)==champion_index[i]:
                money=money-1+champion_odds[i]
            else:
                money=money-1
    ctr = count_range_in_list(prediction, champion_index[len(champion_index)-1],len(testing['HorseWin'])-1)
    if ctr == 0:
        money = money - 1
    elif ctr == 1:
        money = money - 1 + champion_odds[len(champion_index)-1]
    else:
        ele_list = ele_in_list(prediction, champion_index[len(champion_index)-1], len(testing['HorseWin'])-1)
        if min(ele_list)==champion_index[len(champion_index)-1]:
            money = money - 1 + champion_odds[len(champion_index)-1]
        else:
            money = money - 1
    return money

In [6]:
lr = pd.read_csv('lr_predictions.csv')
lr_index = lr[lr['HorseWin'] == 1].index.tolist()
print('Betting result for Logistic Regression model:',betting_result(champion_odds,champion_index,lr_index))

Betting result for Logistic Regression model: -449.9


In [7]:
nb = pd.read_csv('nb_predictions.csv')
nb_index = nb[nb['HorseWin'] == 1].index.tolist()
print('Betting result for Naive Bayes model:', betting_result(champion_odds,champion_index,nb_index))


Betting result for Naive Bayes model: 1554.5999999999992


In [8]:
rf = pd.read_csv('rf_predictions.csv')
rf_index = rf[rf['HorseWin'] == 1].index.tolist()
print('Betting result for Random Forest model:',betting_result(champion_odds,champion_index,rf_index))

Betting result for Random Forest model: 44.59999999999998


For 4 regression models, I choose the horse with shortest predicted finish_time as the unique winning horse. 

In [9]:
def prediction(predict):
    top1_predict_index = []
    for i in range(len(champion_index)-1):
        temp = np.argmin(predict[champion_index[i]:champion_index[i + 1]])
        top1_predict_index.append(champion_index[i]+temp)
    temp0 = np.argmin(predict[champion_index[len(champion_index) - 1]:])
    top1_predict_index.append(champion_index[len(champion_index) - 1] + temp0)
    return top1_predict_index


In [11]:
reg_prediction = pd.read_csv('gnb_pred.csv')
#reg_svr = reg_prediction['svr_predict']
#reg_svr_norm = reg_prediction['svr_predict_norm']
reg_gbrt = reg_prediction['gbrt_predict']
#reg_gbrt_norm = reg_prediction['gbrt_predict_norm']

KeyError: 'gbrt_predict'

In [None]:
svr_index = prediction(reg_svr)
svr_norm_index = prediction(reg_svr_norm)
gbrt_index = prediction(reg_gbrt)
gbrt_norm_index = prediction(reg_gbrt_norm)

In [None]:
print('Betting result for SVR model:',betting_result(champion_odds,champion_index,svr_index))
print('Betting result for SVR (Normalized) model:',betting_result(champion_odds,champion_index,svr_norm_index))
print('Betting result for GBRT model:',betting_result(champion_odds,champion_index,gbrt_index))
print('Betting result for GBRT (Normalized) model:',betting_result(champion_odds,champion_index,gbrt_norm_index))

#### It seems 2 regression algorithms perform well and normalization improves performance of SVR.

#### Improvement:

I set threshold for the average rank and odds. For example, we only bet the horse whose odd is in the smallest 5, and recent_ave_rank is also in smallest 5. This means we decreases the risk of betting in horses with bad recent performance. If the horse cannot satisfy the criteria, we do not bet.

In [None]:
def imp_betting(champion_odds,champion_index,prediction):
    money = 0
    for i in range(len(champion_index) - 1):
        ctr = count_range_in_list(prediction, champion_index[i], champion_index[i + 1] - 1)
        if ctr >= 1:
            temp_odds = testing['win_odds'].tolist()[champion_index[i]:champion_index[i + 1]]
            temp_ave_rank = testing['recent_ave_rank'].tolist()[champion_index[i]:champion_index[i + 1]]
            seq_odds = sorted(temp_odds)
            seq_ave_rank = sorted(temp_ave_rank)
            ele_list = ele_in_list(prediction,champion_index[i],champion_index[i+1]-1)
            if (seq_odds.index(testing['win_odds'][ele_list[0]]) <= 5) and (seq_ave_rank.index(testing['recent_ave_rank'][ele_list[0]]) <= 5):
                money = money - 1
                if ele_list[0] == champion_index[i]:
                    money = money + champion_odds[i]
    return money


In [None]:
print('Improved betting result for Logistic Regression model:',imp_betting(champion_odds,champion_index,lr_index))
print('Improved betting result for Naive Bayes model:',imp_betting(champion_odds,champion_index,nb_index))
print('Improved betting result for Random Forest model:',imp_betting(champion_odds,champion_index,rf_index))
print('Improved betting result for SVR model:',imp_betting(champion_odds,champion_index,svr_index))
print('Improved betting result fo GBRT model:',imp_betting(champion_odds,champion_index,gbrt_index))


However, it seems that setting threshold cannot improve the results for those method whose results are already positive. It only decreases losses by decreasing risk.

# VI. Visualization

### 6.1 Line Chart of Recent Racing Result

Visualize the history racing result of some specific horse.

Interactive: takes a horse ID as input, and outputs a line chart that shows the finishing positions of 6 recent races that the horse attended.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def linechart(horse_id):
    recent_6_runs = training[training.horse_id == horse_id]['recent_6_runs'][-1:].tolist()[0]

    recent_6_runs = list(map(int,recent_6_runs.split('/')))[::-1]
    print(recent_6_runs)
    game_id = training[training.horse_id == horse_id][['race_id']][-6:]
    print(game_id)
    plt.plot(game_id.iloc[:,0], recent_6_runs, marker = '+')
    plt.xlabel('Game_id')
    plt.ylabel('Ranks of recent 6 runs')
    plt.title('Line Chart of recent 6 runs'+'- Horse ' + horse_id)
    plt.ylim((0, 14))
    plt.show()


In [None]:
training = pd.read_csv('training.csv')
horse_id = 'S047'
linechart(horse_id)

### 6.2 Scatter Plot of Win Rate and Number of Wins 

The x-axis is the win rate, and the y-axis is the number of wins. 

Set a threshold and label the name of the horses (or jockeys) who reach the threshold. E.g., if a horse’s win rate is larger than 0.5, and wins more than 4 games, then you should annotate the point of this horse with its name. 

Goal: to find the “best” horse and the “best” jockey. Intuitively, the “best” one should have a high win rate and have won a large number of games.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
training = pd.read_csv('training.csv')
jockey = training.jockey.unique()
a = pd.DataFrame()
a['jockey'] = jockey
a['no_win'] = 0
a['win_rate'] = 0.0
for i in range(len(jockey)):
    ranks = training[training.jockey == jockey[i]]['finishing_position'].tolist()
    a['no_win'][i] = ranks.count(1)
    a['win_rate'][i] = ranks.count(1) / float(len(ranks))

horse = training.horse_name.unique()
b = pd.DataFrame()
b['horse'] = horse
b['no_win'] = 0
b['win_rate'] = 0.0
for i in range(len(jockey)):
    ranks=training[training.horse_name == horse[i]]['finishing_position'].tolist()
    b['no_win'][i] = ranks.count(1)
    b['win_rate'][i] = ranks.count(1) / float(len(ranks))

figure(num = None, figsize = (12, 12), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.subplot(2,1,1)
plt.scatter(a['win_rate'],a['no_win'],alpha = 0.3)
plt.title('Scatter plot for jockeys')
plt.xlabel('Win Rate')
plt.ylabel('Number of Wins')
for i in range(len(jockey)):
    if a['no_win'][i] >= 10 and a['win_rate'][i] >= 0.06:
        plt.annotate(a['jockey'][i],(a['win_rate'][i],a['no_win'][i]),size = 7)


plt.subplot(2,1,2)
plt.scatter(b['win_rate'],b['no_win'],alpha=0.3)
plt.title('Scatter plot for horses')
plt.xlabel('Win Rate')
plt.ylabel('Number of Wins')
for i in range(len(horse)):
    if b['no_win'][i] >= 2 and b['win_rate'][i] >= 0.15:
        plt.annotate(b['horse'][i],(b['win_rate'][i],b['no_win'][i]),size = 7)
        

plt.show()

The best jockey is J Moreira. Since he has the highest number of wins and very high win rate.


The best horse is Romantic Cash, since it has the highest win rate and its ranks are very stable.

### 6.3 Pie Chart of the Draw Bias Effect

Pie chart is a way to visualize the distribution of categorical data

#### Goal: explore the effect of draw bias in horse racing. 

The draw refers to the stall a horse will start the race from. The draw is normally chosen at random on the day the horses are declared to run. Obviously, the inside lane would hold an edge over the field as they have a shorter distance to the bend, in comparison to the other lanes.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
training = pd.read_csv('training.csv')

win_prob = []
for i in range(1,16,1):
    win_prob.append(training[training.draw == i]['finishing_position'].tolist().count(1) / float(len(training[training.draw == i])))

print(win_prob)


In [None]:
labels = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15']
figure(num = None, figsize = (8, 8), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.pie(win_prob,labels = labels,autopct = '%1.1f%%', colors = sns.color_palette("cubehelix"))
plt.title('Pie Chart of the Draw Bias Effect (Number represents No. of lane the horse will run)')
plt.show()

#### Low draws indeed have a considerable advantage, as we can see that as draw increases, the winning probability decreases.

### 6.4 Bar Chart of the Feature Importances

Use random forest classifier to evaluate the importance of the features, which measures how much each feature decreases the weighted impurity in a tree. 

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
training = pd.read_csv('training.csv')
rf_model = RandomForestClassifier()
X_train = training[['actual_weight','declared_horse_weight','draw','win_odds','jockey_ave_rank','trainer_ave_rank',
'recent_ave_rank','race_distance']]
y_train = training[['HorseWin','HorseRankTop3','HorseRankTop50Percent']]
rf_model.fit(X_train,y_train['HorseWin'])
features = 'actual_weight','declared_horse_weight','draw','win_odds','jockey_ave_rank','trainer_ave_rank','recent_ave_rank','race_distance'
importance = rf_model.feature_importances_
indices = np.argsort(importance)[::-1]
print(importance[indices])
print(indices)

In [None]:
figure(num = None, figsize = (8, 6), dpi = 90, facecolor = 'w', edgecolor = 'k')
plt.bar(range(len(features)),importance[indices],color = sns.color_palette("RdBu_r", 8))
plt.xticks(range(len(features)),features)
plt.xlabel('Feature names')
plt.ylabel('Importance')
plt.title('Bar Chart of the Feature Importance')
plt.show()

#### We find that actual_weight, declared_horse_weight and draw affect the most, while race_distance has the least effect

### 6.5 Visualize SVM

Since it is hard to visualize high-dimensional data, for the input data X, we only consider these two features: recent_rank and jockey_ave_rank. Also, for the target y, we only care about whether the finishing position is in top 50%. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import matplotlib.patches as mpatches

In [None]:
training = pd.read_csv('training.csv')
X = training[['recent_ave_rank','jockey_ave_rank']]
y = training['HorseRankTop50Percent']
svm_model = SVC(kernel = 'linear')
svm_model.fit(X,y)


In [None]:
def make_meshgrid(x, y, h = .02):

    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy

In [None]:
def plot_contours(clf, xx, yy, **params):

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = plt.contourf(xx, yy, Z, **params)
    return out


In [None]:
X0, X1 = X['recent_ave_rank'], X['jockey_ave_rank']
xx, yy = make_meshgrid(X0, X1)

figure(num = None, figsize = (8, 6), dpi = 90, facecolor = 'w', edgecolor = 'k')
plot_contours(svm_model,xx, yy, alpha = 0.8)
plt.scatter(X0, X1, c = y,  s = 20, edgecolors = 'k')
plt.title('Visualized SVM')
plt.xlabel('Recent average rank')
plt.ylabel('Jockey average rank')
patch = mpatches.Patch(color = 'purple',label = 'SVC(kernel=\'linear\')')
plt.legend(handles = [patch])
plt.show()

Linear kernel seems not bad in two-feature SVM classification. But there are still plenty of points cross the margin which cannot be classified correctly.