In [43]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import TensorBoard
import time

In [44]:
#Load Data

def _read_team_dataset(tag):
    """Read TeamDataset_<tag>.csv and return it with non-finite cells set to 0.

    The "Unnamed: 0" column is dropped (presumably the index column written
    when the CSV was saved -- TODO confirm).
    """
    frame = pd.read_csv("TeamDataset_" + tag + ".csv").drop(columns = ["Unnamed: 0"])
    frame = frame.replace([np.inf, -np.inf, np.nan], 0)
    return frame.fillna(0)


# Order matters: later cells address datasets by position.  For every i in 2..10
# the base file comes first, followed by the "<i>_<j>" variants for j = 40, 45, ..., 80.
datasets = []

for i in range(2, 11):
    tags = [str(i)] + [str(i) + "_" + str(j) for j in range(40, 81, 5)]
    for tag in tags:
        datasets.append(_read_team_dataset(tag))
        


In [45]:
# Data cleaning: incorrect Y
# Every dataset's "Y" column is multiplied by -1.
# NOTE: the frames are modified in place, so running this cell a second time
# flips the sign back -- run it exactly once after loading.

for dataset in datasets:
    dataset["Y"] = dataset["Y"] * -1

In [46]:
# Sanity check: show the first three rows of the first loaded dataset.
print(datasets[0].head(3))

       Game      Date Home Away     Y  home_b2b  home_min_short  \
0  21300033  20131102  IND  CLE -15.0       0.0          32.791   
1  21300034  20131102  PHI  CHI  -3.0       0.0          33.790   
2  21300035  20131102  NOP  CHA -21.0       0.0          29.406   

   home_home_away  home_PPG_avail  home_MIN_avail  ...  away_s_def_rat  \
0               1             1.0        0.991667  ...           99.45   
1               1             1.0        0.994167  ...           98.55   
2               1             1.0        0.990833  ...           97.90   

   away_s_pf  away_s_opp_tov  away_s_cfga  away_s_cfg_p  away_s_ppg_against  \
0       24.5            15.0         40.0        0.3965                92.0   
1       21.5            16.5         32.5        0.4260                94.0   
2       20.5            17.5         36.5        0.4105                90.0   

   away_s_against_3ptm  away_s_ppg_against_paint  away_s_ppg_against_ft  \
0                  6.0                    

In [47]:
#Partition Data

# "Date" holds YYYYMMDD integers; these cut-offs fall between seasons.
VAL_SEASON_START = 20180800   # rows after this belong to the 2018-2019 season or later
TEST_SEASON_START = 20190800  # rows after this belong to the 2019-2020 season


def _split_partition(frame):
    """Turn one date-filtered frame into a (features, target, identifiers) tuple.

    Column layout: the first four columns identify the game, column 4 is the
    target Y, and every remaining column is a feature.

    Returns:
        features:    2-D numpy array of the feature columns
        target:      numpy array of shape (n_rows, 1)
        identifiers: DataFrame slice with the four identifier columns
    """
    features = frame.iloc[:, 5:].to_numpy()
    target = frame.iloc[:, 4:5].to_numpy().reshape(frame.shape[0], 1)
    identifiers = frame.iloc[:, :4]
    return (features, target, identifiers)


datasets_train = []
datasets_val = []
datasets_test = []

for dataset in datasets:
    dates = dataset["Date"]

    # Train data: everything before the test season, i.e. 2013-2014 through
    # 2018-2019.  It deliberately INCLUDES the validation season so that keras'
    # validation_split can be used directly on it.
    datasets_train.append(_split_partition(dataset[dates < TEST_SEASON_START]))

    # Validation data: the 2018-2019 season only (a subset of the train rows above).
    in_val_season = (dates > VAL_SEASON_START) & (dates < TEST_SEASON_START)
    datasets_val.append(_split_partition(dataset[in_val_season]))

    # Test data: the 2019-2020 season.
    datasets_test.append(_split_partition(dataset[dates > TEST_SEASON_START]))

In [24]:
# Inspect the validation targets (tuple index 1) of the dataset at index 3.
print(datasets_val[3][1])

[[-18.]
 [ -8.]
 [  1.]
 ...
 [ -4.]
 [ -6.]
 [ -5.]]


In [18]:
# Shape of the test feature matrix (tuple index 0) of the dataset at index 3.
print(datasets_test[3][0].shape)

(971, 188)


## Naive MSE (baseline)

In [7]:
#Simple Model --> Predict mean for every example
# The constant prediction is the mean of the training targets; the squared error
# is measured on the TEST partition and then averaged over all datasets.

mse = []
for idx in range(len(datasets)):
    y_train = datasets_train[idx][1]
    y_test = datasets_test[idx][1]
    prediction_value = np.mean(y_train)
    # a scalar broadcasts against the (n, 1) target column
    mse.append(np.mean((prediction_value - y_test) ** 2))
print(np.mean(mse))

202.23226619973227


## Simple MSE (linear regression)

In [9]:
#Simple Model --> Simple Linear Regression
# Least-squares weights come from the pseudo-inverse of the training features
# (no intercept column is added).  Errors are reported on the test partition
# and on the training partition, each averaged over all datasets.

mse = []
train_mse = []

for idx in range(len(datasets)):
    x_train, y_train = datasets_train[idx][0], datasets_train[idx][1]
    x_test, y_test = datasets_test[idx][0], datasets_test[idx][1]

    weights = np.linalg.pinv(x_train) @ y_train

    test_residual = x_test @ weights - y_test
    train_residual = x_train @ weights - y_train

    train_mse.append(np.mean(train_residual ** 2))
    mse.append(np.mean(test_residual ** 2))

print(np.mean(mse))
print(np.mean(train_mse))

178.4817128839419
148.8256557524309


## Sportsbook MSE

In [10]:
# Squared error of the sportsbook's opening and closing home spreads against the
# real spread (away points minus home points) for every game in OfficialLines.csv.
lines = pd.read_csv("OfficialLines.csv")
games = pd.read_csv("Master_GameList.csv")

mse_open = []
mse_close = []

for row in range(lines.shape[0]):
    game_id = lines["Game"].iloc[row]

    spread_open = float(lines["Open_Spread_Home"].iloc[row])
    spread_close = float(lines["Close_Spread_Home"].iloc[row])

    # The first two rows for the game are used as the two teams; work out which is home.
    team_game = games[games["GAME_ID"] == game_id]
    home_listed_first = team_game["TEAM_ABBREVIATION"].iloc[0] == team_game["Home"].iloc[0]
    home_pos, away_pos = (0, 1) if home_listed_first else (1, 0)
    home_real = team_game["PTS"].iloc[home_pos]
    away_real = team_game["PTS"].iloc[away_pos]

    spread_real = away_real - home_real

    mse_open.append((spread_real - spread_open) ** 2)

    close_val = (spread_real - spread_close) ** 2

    # Keeps only usable closing errors: NaN fails both comparisons, and errors
    # of 10000 or more are discarded.
    if -1 < close_val < 10000:
        mse_close.append(close_val)

print("MSE for open is {}".format(np.mean(mse_open)))
print("MSE for close is {}".format(np.mean(mse_close)))
    

MSE for open is 150.8273656606694
MSE for close is 148.14847305389222


## NN Model (Iterate Over Dataset Selection)

In [16]:
#Tune dataset
# One label per entry of `datasets`, in load order: "<i>", "<i>_40", "<i>-45", ..., "<i>-80"
# for i = 2..10.  The mixed "_" / "-" separators are kept because the labels
# become TensorBoard log directory names.
dataset_sizes = []
for n in range(2, 11):
    dataset_sizes.append(str(n))
    dataset_sizes.append("{}_40".format(n))
    dataset_sizes.extend("{}-{}".format(n, pct) for pct in range(45, 81, 5))


# Train the same small 60-30-10 network on every dataset; curves go to logs/<NAME>.
for i in range(len(datasets_train)):
    train_x, train_y = datasets_train[i][:2]

    NAME = "DatasetTuning-{}-{}".format(dataset_sizes[i], int(time.time()))
    tensorboard = TensorBoard(log_dir = 'logs/{}'.format(NAME))

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(train_x.shape[1],)),
        tf.keras.layers.Dense(60, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.RMSprop(0.00001)
    model.compile(loss='mse', optimizer=optimizer)

    # validation_split holds out the last 20% of the rows passed to fit
    model.fit(train_x, train_y, epochs = 2000, verbose=0, batch_size = 128,
              callbacks = [tensorboard], validation_split = 0.2)

In [None]:
#Further Dataset Tuning (indices 50-80)
# Labels in load order: "<i>", "<i>_40", "<i>-45", ..., "<i>-80" for i = 2..10
# (separators kept as-is because they end up in the log directory names).
dataset_sizes = []
for n in range(2, 11):
    dataset_sizes.append(str(n))
    dataset_sizes.append("{}_40".format(n))
    dataset_sizes.extend("{}-{}".format(n, pct) for pct in range(45, 81, 5))

# Re-run a subset of the datasets (index 50 up to, but excluding, the last ten)
# with Adam, fewer epochs and a smaller batch; curves go to DSTuning/<NAME>.
first_index = 50
stop_index = len(datasets_train) - 10

for i in range(first_index, stop_index):
    train_x, train_y = datasets_train[i][:2]

    NAME = "DatasetTuning-{}-{}".format(dataset_sizes[i], int(time.time()))
    tensorboard = TensorBoard(log_dir = 'DSTuning/{}'.format(NAME))

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(train_x.shape[1],)),
        tf.keras.layers.Dense(60, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.Adam(0.00001)
    model.compile(loss='mse', optimizer=optimizer)
    model.fit(train_x, train_y, epochs = 1000, verbose=0, batch_size = 64,
              callbacks = [tensorboard], validation_split = 0.2)



In [25]:
#Learning rate and optimization algorithm tuning optimal dataset = 7-50 (53)
train_x = datasets_train[53][0]
train_y = datasets_train[53][1]

learning_rates = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001]

# Same network for every run; only the optimizer class and its learning rate
# change.  All RMSprop runs happen first, then all Adam runs.
optimizer_choices = (
    ("rms", tf.keras.optimizers.RMSprop),
    ("adam", tf.keras.optimizers.Adam),
)

for tag, optimizer_cls in optimizer_choices:
    for rate in learning_rates:
        NAME = "LearningRate-{}-{}".format(tag, rate)
        tensorboard = TensorBoard(log_dir = 'learning_rates/{}'.format(NAME))

        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(train_x.shape[1],)),
            tf.keras.layers.Dense(train_x.shape[1], activation='relu'),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.Dense(50, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dense(1),
        ])
        optimizer = optimizer_cls(rate)
        model.compile(loss='mse', optimizer=optimizer)
        model.fit(train_x, train_y, epochs = 1000, verbose=0, batch_size = 32,
                  callbacks = [tensorboard], validation_split = 0.2)

In [9]:
#Tune batch size using learning rate 1e-5
train_x, train_y = datasets_train[53][:2]
learning_rate = 1e-5

batches = [4, 8, 16, 32, 64, 128]

# One 150-epoch RMSprop run per batch size; curves go to batches/BatchTuning-<size>.
for batch in batches:
    NAME = "BatchTuning-{}".format(batch)
    tensorboard = TensorBoard(log_dir = 'batches/{}'.format(NAME))

    n_features = train_x.shape[1]
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(n_features, activation='relu'),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dense(50, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.RMSprop(learning_rate)
    model.compile(loss='mse', optimizer=optimizer)
    model.fit(train_x, train_y, epochs = 150, verbose=0, batch_size = batch,
              callbacks = [tensorboard], validation_split = 0.2)



In [12]:
#Tune Dropout

train_x, train_y = datasets_train[53][:2]
learning_rate = 1e-5
batch = 8

dropout = [0.1, 0.15, 0.2, 0.25, 0.3]

# One run per dropout probability.  After the input-width layer, each hidden
# layer (150, 75, 30, 10 units) is followed by a Dropout layer with that probability.
for prob in dropout:
    NAME = "DropoutTuning-{}".format(prob)
    tensorboard = TensorBoard(log_dir = 'dropout/{}'.format(NAME))

    n_features = train_x.shape[1]
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(n_features,)))
    model.add(tf.keras.layers.Dense(n_features, activation='relu'))
    for units in (150, 75, 30, 10):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(prob))
    model.add(tf.keras.layers.Dense(1))

    optimizer = tf.keras.optimizers.RMSprop(learning_rate)
    model.compile(loss='mse', optimizer=optimizer)
    model.fit(train_x, train_y, epochs = 150, verbose=0, batch_size = batch,
              callbacks = [tensorboard], validation_split = 0.2)

In [14]:
#Preliminary Model --> batch size = 64, learning rate = 0.00001, layers of 60 30 10.
#Dropout is not used in this preliminary model.

train_x, train_y = datasets_train[53][:2]

# Save the full model to checkpoint/ whenever val_loss reaches a new minimum.
checkpoint_filepath = 'checkpoint/'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='min',
    save_freq='epoch',
)

NAME = "Model-{}-{}".format("milestone", "one")
tensorboard = TensorBoard(log_dir = 'Models/{}'.format(NAME))

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(train_x.shape[1],)),
    tf.keras.layers.Dense(60, activation='relu'),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1),
])
optimizer = tf.keras.optimizers.Adam(0.00001)
model.compile(loss='mse', optimizer=optimizer)
model.fit(train_x, train_y, epochs = 2000, verbose=0, batch_size = 64,
          callbacks = [tensorboard, checkpoint], validation_split = 0.2)

INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/

INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/

INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/

INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets
INFO:tensorflow:Assets written to: checkpoint/assets


<tensorflow.python.keras.callbacks.History at 0x1c9899e50>

In [30]:
# Evaluate the current `model` on the train, validation and test partitions of
# dataset 53.  datasets_train also contains the validation-season rows, so the
# train and validation figures are not computed on disjoint data.
loss_train, loss_val, loss_test = [
    model.evaluate(partition[53][0], partition[53][1], verbose = 2)
    for partition in (datasets_train, datasets_val, datasets_test)
]

228/228 - 0s - loss: 152.0892
39/39 - 0s - loss: 174.2079
31/31 - 0s - loss: 176.4834


In [48]:
#Accuracy Performance

# Predict the spread for every test game of dataset 53.  The feature matrix is
# passed directly: wrapping it in a list is unnecessary for a single-input model.
predictions = model.predict(datasets_test[53][0])

# Work on an independent copy of the identifier columns.  Wrapping the stored
# frame in pd.DataFrame(...) can share its data, so the in-place insert below
# could leak into datasets_test and make this cell fail when it is run again
# ("Predicted" would already exist).
identifiers = datasets_test[53][2].copy()

# model.predict returns an (n, 1) array; flatten it to one value per game and
# put it in front of the identifier columns.
identifiers.insert(0, "Predicted", predictions.ravel())

In [54]:
#Evaluate model on all games (conf = 0: every game whose spread is not NaN counts)
# NOTE: evaluateModel and its helpers are defined in cells further down this
# notebook; on a fresh kernel those cells must be run before this one.
evaluateModel(identifiers, 0)

#Evaluate model on games with confidence: only games where the prediction
#differs from the sportsbook spread by at least 6 are scored
evaluateModel(identifiers, 6)

open is 100.0 percent of games
close is 99.89701338825952 percent of games
The model achieved predictive accuracy of 0.48 on opening spreads
The model achieved predictive accuracy of 0.498 on closing spreads
open is 13.079299691040164 percent of games
close is 17.91967044284243 percent of games
The model achieved predictive accuracy of 0.528 on opening spreads
The model achieved predictive accuracy of 0.494 on closing spreads


In [41]:
# Game list used by getHomePoints / getAwayPoints below (the same file is also
# read in the Sportsbook MSE cell).  Presumably one row per team per game, with
# GAME_ID, TEAM_ABBREVIATION, Home and PTS columns -- TODO confirm against the CSV.
games = pd.read_csv("Master_GameList.csv")

def getHomePoints(game_id):
    """Return the points scored by the home team in the game `game_id`.

    Looks the game up in the module-level `games` frame.  Only the first two
    matching rows are considered: if the first row's team is the home team its
    PTS is returned, otherwise the second row's PTS.
    """
    rows = games[games["GAME_ID"] == game_id]
    first = rows.iloc[0]
    if first["TEAM_ABBREVIATION"] == first["Home"]:
        return first["PTS"]
    return rows.iloc[1]["PTS"]

def getAwayPoints(game_id):
    """Return the points scored by the away team in the game `game_id`.

    Looks the game up in the module-level `games` frame.  Only the first two
    matching rows are considered: if the first row's team is the home team the
    second row's PTS is returned, otherwise the first row's PTS.
    """
    rows = games[games["GAME_ID"] == game_id]
    first = rows.iloc[0]
    if first["TEAM_ABBREVIATION"] == first["Home"]:
        return rows.iloc[1]["PTS"]
    return first["PTS"]

def getWinner(away_real, home_real):
    """Return "away" when the away score is strictly higher, otherwise "home".

    Equal scores are reported as "home".
    """
    return "away" if away_real > home_real else "home"

def getAccuracy(real, sportsbook, predicted):
    """Return 1 when `predicted` and `real` fall strictly on the same side of `sportsbook`.

    Returns 0 otherwise, including when either value equals the sportsbook
    number or when a comparison involves NaN.
    """
    if real < sportsbook:
        return 1 if predicted < sportsbook else 0
    if real > sportsbook:
        return 1 if predicted > sportsbook else 0
    return 0

def getWinnerAccuracy(predicted, sportsbook):
    """Return 1 when the predicted winner equals the sportsbook's winner, else 0."""
    return 1 if predicted == sportsbook else 0

def printStats(stats):
    """Print the opening-spread (stats[0]) and closing-spread (stats[1]) accuracy, rounded to 3 decimals."""
    opening_template = "The model achieved predictive accuracy of {} on opening spreads"
    closing_template = "The model achieved predictive accuracy of {} on closing spreads"
    print(opening_template.format(round(stats[0], 3)))
    print(closing_template.format(round(stats[1], 3)))

In [50]:
#Evaluate Model
# Sportsbook lines used by evaluateModel below; it reads the "Game",
# "Open_Spread_Home" and "Close_Spread_Home" columns.

lines = pd.read_csv("OfficialLines.csv")

def evaluateModel(dataset, conf):
    """Score predicted spreads against the sportsbook's opening and closing lines.

    For every row of `dataset` the real spread (away points minus home points)
    is compared with the opening and the closing home spread found in the
    module-level `lines` frame.  A game only counts towards a line when the
    prediction differs from that line by at least `conf`; a NaN spread never
    satisfies that comparison, so such games are skipped for that line.

    Args:
        dataset: DataFrame with a "Game" column (game id) and a "Predicted"
            column (predicted spread).
        conf: minimum absolute gap between prediction and sportsbook spread
            for a game to be scored (0 scores every game with a valid spread).

    Prints the share of games scored for each line followed by the accuracy
    on each line (via printStats).  Returns None.
    """
    open_spreads = []
    close_spreads = []

    n_games = dataset.shape[0]

    for k in range(n_games):

        #Identify game information
        game = dataset["Game"].iloc[k]

        #Get predicted and actual scores
        predicted = dataset["Predicted"].iloc[k]

        home_real = getHomePoints(game)
        away_real = getAwayPoints(game)

        spread_real = away_real - home_real

        #Get sportsbook lines
        line = lines[lines["Game"] == game]

        spread_open = float(line["Open_Spread_Home"].iloc[0])
        spread_close = float(line["Close_Spread_Home"].iloc[0])

        #Compute prediction quality
        if abs(predicted - spread_open) >= conf:
            open_spreads.append(getAccuracy(spread_real, spread_open, predicted))

        if abs(predicted - spread_close) >= conf:
            close_spreads.append(getAccuracy(spread_real, spread_close, predicted))

    # The number of scored games is simply the length of each result list.
    # Guard the division so an empty dataset reports 0 percent instead of raising.
    open_share = 100 * (len(open_spreads) / n_games) if n_games else 0.0
    close_share = 100 * (len(close_spreads) / n_games) if n_games else 0.0
    print("open is {} percent of games".format(open_share))
    print("close is {} percent of games".format(close_share))

    # An empty bucket has no defined accuracy: report NaN without numpy's
    # "mean of empty slice" warning.
    final_stats = [
        np.mean(open_spreads) if open_spreads else float("nan"),
        np.mean(close_spreads) if close_spreads else float("nan"),
    ]
    printStats(final_stats)
    

In [10]:
# Scratch check of round(): 0.007 rounded to two decimals prints 0.01.
# Nothing else in the visible notebook uses `a`.
a = 0.007
print(round(a, 2))

0.01
