In [37]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline

In [38]:
picklefile = 'oct_20.pkl'
df = pd.read_pickle(picklefile)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88977 entries, 0 to 88976
Data columns (total 94 columns):
surface                 88767 non-null object
tourney_level           88977 non-null object
tourney_date            88977 non-null datetime64[ns]
game_index              88977 non-null int64
player1_name            88977 non-null object
player1_seed            26263 non-null float64
player1_ht              76490 non-null float64
player1_age             79023 non-null float64
player1_rank            79974 non-null float64
player1_rank_points     80235 non-null float64
player1_ace             80060 non-null float64
player1_df              80055 non-null float64
player1_svpt            79988 non-null float64
player1_1stIn           79969 non-null float64
player1_1stWon          79887 non-null float64
player1_2ndWon          81253 non-null float64
player1_SvGms           80303 non-null float64
player1_bpSaved         66873 non-null float64
player1_bpFaced         35127 non-null floa

In [40]:
# delta_seed only has ~7000 nonnull observations.
# Drop seed columns so that dropping null values in
# the DF doesn't significantly cut the DF down
# in size.

df = df.drop(['delta_seed', 'player1_seed', 'player2_seed'], axis=1)

In [59]:
# delta_mavg_df = df[['label', 'delta_ace_mavg', 'delta_df_mavg',
#        'delta_svpt_mavg', 'delta_1stIn_mavg', 'delta_1stWon_mavg',
#        'delta_2ndWon_mavg', 'delta_SvGms_mavg', 'delta_bpSaved_mavg',
#        'delta_bpFaced_mavg']].dropna()

# delta_expw_df = df[['label', 'delta_ace_expw', 'delta_df_expw',
#        'delta_svpt_expw', 'delta_1stIn_expw', 'delta_1stWon_expw',
#        'delta_2ndWon_expw', 'delta_SvGms_expw', 'delta_bpSaved_expw',
#        'delta_bpFaced_expw']].dropna()


delta_mavg_df = df[['label', 
        'delta_ht', 'delta_age', 'delta_rank', 'delta_rank_points',
                    'delta_ace_mavg', 'delta_df_mavg',
       'delta_svpt_mavg', 'delta_1stIn_mavg', 'delta_1stWon_mavg',
       'delta_2ndWon_mavg', 'delta_SvGms_mavg', 'delta_bpSaved_mavg',
       'delta_bpFaced_mavg']].dropna()

delta_expw_df = df[['label',
       'delta_ht', 'delta_age', 'delta_rank', 'delta_rank_points',             
                    'delta_ace_expw', 'delta_df_expw',
       'delta_svpt_expw', 'delta_1stIn_expw', 'delta_1stWon_expw',
       'delta_2ndWon_expw', 'delta_SvGms_expw', 'delta_bpSaved_expw',
       'delta_bpFaced_expw']].dropna()

In [60]:
# For expw model.
Xcols = [
     'delta_ht', 'delta_age', 'delta_rank', 'delta_rank_points',
        'delta_ace_expw', 'delta_df_expw',
       'delta_svpt_expw', 'delta_1stIn_expw', 'delta_1stWon_expw',
       'delta_2ndWon_expw', 'delta_SvGms_expw', 'delta_bpSaved_expw',
       'delta_bpFaced_expw']

# For mavg model.
# Xcols = ['delta_ht', 'delta_age', 'delta_rank', 'delta_rank_points',
#                     'delta_ace_mavg', 'delta_df_mavg',
#        'delta_svpt_mavg', 'delta_1stIn_mavg', 'delta_1stWon_mavg',
#        'delta_2ndWon_mavg', 'delta_SvGms_mavg', 'delta_bpSaved_mavg',
#        'delta_bpFaced_mavg']

ycols = 'label'

X = delta_expw_df[Xcols]
y = delta_expw_df[ycols]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [62]:
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

In [63]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Training Accuracy: {0:.3f}".format(logreg.score(X_train, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 0.631
Test Accuracy: 0.634


In [69]:
# SVM

svc = SVC()
svc.fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)
print("Training Accuracy: {0:.3f}".format(svc.score(X_train_scaled, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 0.643
Test Accuracy: 0.638


In [64]:
# Random Forest

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Training Accuracy: {0:.3f}".format(rf.score(X_train, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 0.985
Test Accuracy: 0.602


In [65]:
# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print("Training Accuracy: {0:.3f}".format(nb.score(X_train, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 0.630
Test Accuracy: 0.630


In [68]:
# KNN

knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
print("Training Accuracy: {0:.3f}".format(knn.score(X_train_scaled, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 0.660
Test Accuracy: 0.616


In [66]:
# Decision Tree Classifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("Training Accuracy: {0:.3f}".format(dt.score(X_train, y_train)))
print("Test Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)))

Training Accuracy: 1.000
Test Accuracy: 0.555


In [67]:
X.columns, X.shape

(Index(['delta_ht', 'delta_age', 'delta_rank', 'delta_rank_points',
        'delta_ace_expw', 'delta_df_expw', 'delta_svpt_expw',
        'delta_1stIn_expw', 'delta_1stWon_expw', 'delta_2ndWon_expw',
        'delta_SvGms_expw', 'delta_bpSaved_expw', 'delta_bpFaced_expw'],
       dtype='object'), (71106, 13))

In [None]:
### SVM is giving highest test accuracy: 56.3% with all expw features.

In [None]:
### Things to try:
# Run simulation of betting strategy on 2017 season