In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Import libraries
from IPython.display import Audio, display
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU, BatchNormalization, GaussianNoise
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.regularizers import l1
from tensorflow.keras import initializers
# Random weights kernel_initializer=initializers.RandomNormal(stddev=0.01), 

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer

# Evaluating
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt

# Saving model
from keras.models import load_model

imputer = SimpleImputer()
MMS = MinMaxScaler()
RS = RobustScaler()
SS = StandardScaler()
Norm = Normalizer()

In [2]:
# Get data and merge together

data_current = pd.read_csv('./Scraping/Current Stats and Games.csv', parse_dates=['Date'])
data_prev = pd.read_csv('./Scraping/Previous Stats and Games.csv', parse_dates=['Date'])

print('Previous Length: ',len(data_prev))
print('Current length: ',len(data_current))

print('Current NA values: ',data_current.isna().sum().sum())
print('Prev Year NA values: ',data_prev.isna().sum().sum())

Previous Length:  96588
Current length:  96588
Current NA values:  52828
Prev Year NA values:  42270


In [3]:
# Home win = 49% baseline
# Fav win = 46.6% baseline

y = data_current['Home Win']
x_current = data_current.loc[:,'H  #Bat':]
y = data_prev['Home Win']
x_prev = data_prev.loc[:,'H  #Bat':]
# x_with_odds = x.copy()
# x_with_odds[['Home Odds', 'Vis Odds']] = data_current[['Home Odds', 'Vis Odds']]
# x_dates = x.copy()

In [4]:
# Add a prefix to the prev data
data_prev_prefix = data_prev.add_prefix('prev_')

# Concat vertically
all_stats = pd.concat([data_current, data_prev_prefix], axis=1)

# Make sure they are the same. Use other cols as well
all_stats['Vis Odds'].equals(all_stats['prev_Vis Odds'])


True

In [5]:
# make X and class_y, reg_y
x_total = all_stats.drop(['Date', 'Home Odds', 'Vis Odds', 'Home', 'Visitor', 'Home PTS', 'Vis PTS', 'Home Points Dif', 'Home Win',
                         'prev_Date', 'prev_Home Odds', 'prev_Vis Odds', 'prev_Home', 'prev_Visitor', 'prev_Home PTS', 'prev_Vis PTS',
                          'prev_Home Points Dif', 'prev_Home Win'], axis=1)

y_class = all_stats['Home Win']
y_reg = all_stats['Home Points Dif']

In [6]:
# Split columns with hyphens
# Create function to use on both current and prev stats
def transform(x, y):
    
    x = x.astype(str)
    cols_to_delim = []
    for col in x.columns:
        result = x[col].str.contains(pat='\d-\d')
        if result.any():
            cols_to_delim.append(col)

    for col in cols_to_delim:
            x[[col + '1', col + '2']] = x[col].str.split('-', expand=True)
            del x[col]

    x = x.astype(float)
    
    # Scale and Normalise
    x = imputer.fit_transform(x, y)
    x = MMS.fit_transform(x)
    return x

x_prev = transform(x_prev, y)
x_current = transform(x_current, y)
x_total = transform(x_total, y_class)

In [7]:
# count for nans in all x's
print('Current NaN values: ',np.isnan(x_prev).sum())
print('Prev NaN values: ',np.isnan(x_current).sum())
print('Total NaN values: ',np.isnan(x_total).sum())

Current NaN values:  0
Prev NaN values:  0
Total NaN values:  0


In [8]:
# Split data into test and train
def split_data(x, y, split):
    
    # split train and test
    n_train = int(split * x.shape[0])
    trainX, testX = x[:n_train, :], x[n_train:, :]
    trainy, testy = y[:n_train], y[n_train:]
    return trainX, testX, trainy, testy

trainX_current, testX_current, trainy, testy = split_data(x_current, y, 0.90)
trainX_prev, testX_prev, _, _ = split_data(x_prev, y, 0.90)
trainX_total, testX_total, trainy_total, testy_total = split_data(x_total, y_class, 0.90)

In [9]:
model = load_model('Model.h5')

In [10]:
score = model.evaluate(testX_prev, testy, verbose = 0) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

Test loss: 0.6822580695152283
Test accuracy: 0.566311240196228


In [11]:
# Run model.h5 on the two datasets
# Aggregate the results
# Test accuracy

In [12]:
# Get predictions from one data set and measure it
# Get Predictions
prev_pred = model.predict(testX_prev)
current_pred = model.predict(testX_current)

# Compare predictions with actual
df = pd.DataFrame(testy)
df['Prev Predictions'] = prev_pred
df['Current Predictions'] = current_pred

# Add Ensemble Predictions
df['Ensemble'] = (df['Prev Predictions'] + df['Current Predictions']) / 2

df

Unnamed: 0,Home Win,Prev Predictions,Current Predictions,Ensemble
86929,True,0.538230,0.557973,0.548101
86930,True,0.504199,0.521170,0.512684
86931,True,0.484584,0.503277,0.493930
86932,False,0.552593,0.537937,0.545265
86933,False,0.534033,0.531536,0.532785
...,...,...,...,...
96583,True,0.554920,0.537438,0.546179
96584,True,0.551390,0.548926,0.550158
96585,False,0.539051,0.546131,0.542591
96586,False,0.546749,0.550458,0.548604


In [145]:
# Create dictionary of accuracy of predictions for each prediction threshold

accuracy_dict = {'Threshold': [],'Bet on Win':[], 'Accurate Wins': [], 'Bet on Loss':[], 'Accurate Losses': []}

for x in range(95, 120):
    threshold = x / 100
    # Count how many times Ensemble > 1.06 (Predicted Win) and Home win == True (Actual Win)

    bet_wins = len(df[df['Ensemble'] > threshold])
    bet_losses = len(df[df['Ensemble'] < threshold])
    
    wins = df.apply(lambda x: True if x['Ensemble'] > threshold and x['Home Win'] == True else False, axis=1)
    losses = df.apply(lambda x: True if x['Ensemble'] < threshold and x['Home Win'] == False else False, axis=1)
    win_perc = (len(wins[wins == True]) / len(df[df['Ensemble'] > threshold]) * 100)
    loss_perc = (len(losses[losses == True]) / len(df[df['Ensemble'] < threshold]) * 100)
    
    accuracy_dict['Threshold'].append(threshold)
    accuracy_dict['Bet on Win'].append(bet_wins)
    accuracy_dict['Bet on Loss'].append(bet_losses)
    accuracy_dict['Accurate Wins'].append(win_perc)
    accuracy_dict['Accurate Losses'].append(loss_perc)
    

In [21]:
def create_threshold_df(df, col):
    # Create dictionary of accuracy of predictions for each prediction threshold

    accuracy_dict = {'Threshold': [],'Bet on Win':[], 'Accurate Wins': [], 'Bet on Loss':[], 'Accurate Losses': []}

    for x in range(45, 60):
        threshold = x / 100
        # Count how many times Ensemble > 1.06 (Predicted Win) and Home win == True (Actual Win)

        bet_wins = len(df[df[col] > threshold])
        bet_losses = len(df[df[col] < threshold])

        wins = df.apply(lambda x: True if x[col] > threshold and x['Home Win'] == True else False, axis=1)
        losses = df.apply(lambda x: True if x[col] < threshold and x['Home Win'] == False else False, axis=1)
        win_perc = (len(wins[wins == True]) / len(df[df[col] > threshold]) * 100)
        loss_perc = (len(losses[losses == True]) / len(df[df[col] < threshold]) * 100)

        accuracy_dict['Threshold'].append(threshold)
        accuracy_dict['Bet on Win'].append(bet_wins)
        accuracy_dict['Bet on Loss'].append(bet_losses)
        accuracy_dict['Accurate Wins'].append(win_perc)
        accuracy_dict['Accurate Losses'].append(loss_perc)

    accuracy_df = pd.DataFrame.from_dict(accuracy_dict)
    return accuracy_df
    
ensemble_df = create_threshold_df(df, 'Ensemble')
prev_df = create_threshold_df(df, 'Prev Predictions')
current_df = create_threshold_df(df, 'Current Predictions')

In [22]:
current_df

Unnamed: 0,Threshold,Bet on Win,Accurate Wins,Bet on Loss,Accurate Losses
0,0.45,9590,52.732013,69,59.42029
1,0.46,9411,52.991181,248,60.483871
2,0.47,9111,53.528702,548,62.043796
3,0.48,8791,54.032533,868,61.40553
4,0.49,8291,54.758172,1368,60.160819
5,0.5,7673,55.558452,1986,58.610272
6,0.51,7030,56.216216,2629,56.903766
7,0.52,6378,56.945751,3281,55.714721
8,0.53,5722,57.584761,3937,54.533909
9,0.54,5023,58.471033,4636,53.666954


In [23]:
prev_df

Unnamed: 0,Threshold,Bet on Win,Accurate Wins,Bet on Loss,Accurate Losses
0,0.45,9514,52.943031,145,66.896552
1,0.46,9262,53.400993,397,64.987406
2,0.47,8901,54.072576,758,64.116095
3,0.48,8545,54.745465,1114,63.464991
4,0.49,7889,55.330207,1770,59.322034
5,0.5,7202,56.220494,2457,57.834758
6,0.51,6566,57.021017,3093,56.644035
7,0.52,5989,57.522124,3670,55.313351
8,0.53,5371,58.220071,4288,54.337687
9,0.54,4573,59.064072,5086,53.126229


In [24]:
ensemble_df

Unnamed: 0,Threshold,Bet on Win,Accurate Wins,Bet on Loss,Accurate Losses
0,0.45,9652,52.6523,7,57.142857
1,0.46,9528,52.907221,131,66.412214
2,0.47,9254,53.393127,405,64.444444
3,0.48,8788,54.30132,871,64.064294
4,0.49,8380,55.178998,1279,63.956216
5,0.5,7674,56.085483,1985,60.654912
6,0.51,7006,56.865544,2653,58.499812
7,0.52,6367,57.531019,3292,56.804374
8,0.53,5489,58.66278,4170,55.275779
9,0.54,4559,59.749945,5100,53.705882


In [26]:
# Create a new df for backtesting

backtest_df = data_current[['Home Odds', 'Vis Odds']]
backtest_df = backtest_df.loc[len(df):,:]

# Merge on index and Home Win
new_df = pd.concat([backtest_df, df], axis=1)
new_df.dropna(inplace=True)
new_df

Unnamed: 0,Home Odds,Vis Odds,Home Win,Prev Predictions,Current Predictions,Ensemble
86929,1.48,2.74,True,0.538230,0.557973,0.548101
86930,2.03,1.83,True,0.504199,0.521170,0.512684
86931,1.89,1.95,True,0.484584,0.503277,0.493930
86932,1.70,2.22,False,0.552593,0.537937,0.545265
86933,1.71,2.20,False,0.534033,0.531536,0.532785
...,...,...,...,...,...,...
96583,1.75,2.14,True,0.554920,0.537438,0.546179
96584,1.77,2.11,True,0.551390,0.548926,0.550158
96585,2.85,1.45,False,0.539051,0.546131,0.542591
96586,2.66,1.50,False,0.546749,0.550458,0.548604


In [69]:
## Apply function to pandas dataframe row by row

def calc_result(df, col, threshold):
    conditions = [
     ((df[col] > threshold) & (df['Home Win'] == True)),
        ((df[col] > threshold) & (df['Home Win'] == False)),
        (True)
    ]
    
    values = [df['Home Odds'] - 1, -1, 0]
    
    new_df[col + ' Home Outcome'] = np.select(conditions, values)
    
    conditions2 = [
        ((df[col] > threshold) & (df['Home Win'] == True)),
        ((df[col] > threshold) & (df['Home Win'] == False)),
        (True)
    ]
    
    values2 = [-1, df['Vis Odds'] - 1, 0]
    
    new_df[col + ' Vis Outcome'] = np.select(conditions2, values2)
    
    
    
    # Cum sum values and save last row
    prev_home_final = new_df.cumsum().iloc[-1,-6:][0]
    current_home_final = new_df.cumsum().iloc[-1,-6:][1]
    ensemble_home_final = new_df.cumsum().iloc[-1,-6:][2]
    prev_vis_final = new_df.cumsum().iloc[-1,-6:][3]
    current_vis_final = new_df.cumsum().iloc[-1,-6:][4]
    ensemble_vis_final = new_df.cumsum().iloc[-1,-6:][5]
    
    print('Threshold: ', threshold, '. Final Home Values: ', prev_home_final, current_home_final, ensemble_home_final, '. Final Vis Values: ', prev_vis_final, current_vis_final, ensemble_vis_final)
    
    # Append to outcome df NOT WORKING
#     outcome_df.append({
#         'Threshold': threshold,
#         'Prev Final': prev_final,
#         'Current Final': current_final,
#         'Ensemble Final': ensemble_final
#     }, ignore_index=True)
    
    

col_array = ['Prev Predictions', 'Current Predictions', 'Ensemble']

# for col in col_array:
#     calc_result(new_df, col, 0.56)
    
new_df

Unnamed: 0,Home Odds,Vis Odds,Home Win,Prev Predictions,Current Predictions,Ensemble,Prev Predictions Outcome,Current Predictions Outcome,Ensemble Outcome
86929,1.48,2.74,True,0.538230,0.557973,0.548101,0.0,0.0,0.0
86930,2.03,1.83,True,0.504199,0.521170,0.512684,0.0,0.0,0.0
86931,1.89,1.95,True,0.484584,0.503277,0.493930,0.0,0.0,0.0
86932,1.70,2.22,False,0.552593,0.537937,0.545265,0.0,0.0,0.0
86933,1.71,2.20,False,0.534033,0.531536,0.532785,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
96583,1.75,2.14,True,0.554920,0.537438,0.546179,0.0,0.0,0.0
96584,1.77,2.11,True,0.551390,0.548926,0.550158,0.0,0.0,0.0
96585,2.85,1.45,False,0.539051,0.546131,0.542591,0.0,0.0,0.0
96586,2.66,1.50,False,0.546749,0.550458,0.548604,0.0,0.0,0.0


In [71]:
outcome_df = pd.DataFrame(columns = ['Threshold', 'Prev Final', 'Current Final', 'Ensemble Final'])

for x in range(45, 60):
    threshold = x / 100
    for col in col_array:
        calc_result(new_df, col, threshold)
    

Threshold:  0.45 . Final Home Values:  -521.2800000000005 -262.7700000000001 -74.31999999999995 . Final Vis Values:  30.059999999999988 -9.069999999999999 -48.34000000000001
Threshold:  0.45 . Final Home Values:  -521.2800000000005 -262.7700000000001 -562.1200000000007 . Final Vis Values:  -245.3900000000001 -9.069999999999999 -48.34000000000001
Threshold:  0.45 . Final Home Values:  -521.2800000000005 -262.7700000000001 -562.1200000000007 . Final Vis Values:  -245.3900000000001 -556.0600000000009 -253.5600000000001
Threshold:  0.46 . Final Home Values:  -476.51 -276.62000000000023 -562.1200000000007 . Final Vis Values:  -245.3900000000001 -556.0600000000009 -253.5600000000001
Threshold:  0.46 . Final Home Values:  -476.51 -276.62000000000023 -550.3200000000007 . Final Vis Values:  -238.15999999999988 -556.0600000000009 -253.5600000000001
Threshold:  0.46 . Final Home Values:  -476.51 -276.62000000000023 -550.3200000000007 . Final Vis Values:  -238.15999999999988 -535.3500000000008 -25

In [68]:
new_df

Unnamed: 0,Home Odds,Vis Odds,Home Win,Prev Predictions,Current Predictions,Ensemble,Prev Predictions Outcome,Current Predictions Outcome,Ensemble Outcome
86929,1.48,2.74,True,0.538230,0.557973,0.548101,0.0,0.0,0.0
86930,2.03,1.83,True,0.504199,0.521170,0.512684,0.0,0.0,0.0
86931,1.89,1.95,True,0.484584,0.503277,0.493930,0.0,0.0,0.0
86932,1.70,2.22,False,0.552593,0.537937,0.545265,0.0,0.0,0.0
86933,1.71,2.20,False,0.534033,0.531536,0.532785,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
96583,1.75,2.14,True,0.554920,0.537438,0.546179,0.0,0.0,0.0
96584,1.77,2.11,True,0.551390,0.548926,0.550158,0.0,0.0,0.0
96585,2.85,1.45,False,0.539051,0.546131,0.542591,0.0,0.0,0.0
96586,2.66,1.50,False,0.546749,0.550458,0.548604,0.0,0.0,0.0


In [73]:
# Bet against the model

## Apply function to pandas dataframe row by row

def calc_result(df, col, threshold):
    conditions = [
     ((df[col] > threshold) & (df['Home Win'] == True)),
        ((df[col] > threshold) & (df['Home Win'] == False)),
        (True)
    ]
    
    values = [-1, df['Vis Odds']-1, 0]
    
    new_df[col + ' Home Outcome'] = np.select(conditions, values)
    
    conditions2 = [
        ((df[col] > threshold) & (df['Home Win'] == True)),
        ((df[col] > threshold) & (df['Home Win'] == False)),
        (True)
    ]
    
    values2 = [df['Home Odds']-1, -1, 0]
    
    new_df[col + ' Vis Outcome'] = np.select(conditions2, values2)
    
    
    
    # Cum sum values and save last row
    prev_home_final = new_df.cumsum().iloc[-1,-6:][0]
    current_home_final = new_df.cumsum().iloc[-1,-6:][1]
    ensemble_home_final = new_df.cumsum().iloc[-1,-6:][2]
    prev_vis_final = new_df.cumsum().iloc[-1,-6:][3]
    current_vis_final = new_df.cumsum().iloc[-1,-6:][4]
    ensemble_vis_final = new_df.cumsum().iloc[-1,-6:][5]
    
    print('Threshold: ', threshold, '. Final Home Values: ', prev_home_final, current_home_final, ensemble_home_final, '. Final Vis Values: ', prev_vis_final, current_vis_final, ensemble_vis_final)
    
    # Append to outcome df NOT WORKING
#     outcome_df.append({
#         'Threshold': threshold,
#         'Prev Final': prev_final,
#         'Current Final': current_final,
#         'Ensemble Final': ensemble_final
#     }, ignore_index=True)
    
    

col_array = ['Prev Predictions', 'Current Predictions', 'Ensemble']

# for col in col_array:
#     calc_result(new_df, col, 0.56)


for x in range(45, 60):
    threshold = x / 100
    for col in col_array:
        calc_result(new_df, col, threshold)

Threshold:  0.45 . Final Home Values:  -262.7700000000001 -521.2800000000005 -74.31999999999995 . Final Vis Values:  30.059999999999988 -9.069999999999999 -48.34000000000001
Threshold:  0.45 . Final Home Values:  -262.7700000000001 -521.2800000000005 -245.3900000000001 . Final Vis Values:  -562.1200000000007 -9.069999999999999 -48.34000000000001
Threshold:  0.45 . Final Home Values:  -262.7700000000001 -521.2800000000005 -245.3900000000001 . Final Vis Values:  -562.1200000000007 -253.5600000000001 -556.0600000000009
Threshold:  0.46 . Final Home Values:  -276.62000000000023 -476.51 -245.3900000000001 . Final Vis Values:  -562.1200000000007 -253.5600000000001 -556.0600000000009
Threshold:  0.46 . Final Home Values:  -276.62000000000023 -476.51 -238.15999999999988 . Final Vis Values:  -550.3200000000007 -253.5600000000001 -556.0600000000009
Threshold:  0.46 . Final Home Values:  -276.62000000000023 -476.51 -238.15999999999988 . Final Vis Values:  -550.3200000000007 -254.08000000000018 -5