In [166]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import TensorBoard
import time

**Load Data**

In [167]:
dataset = pd.read_csv("TeamDataset_7_50.csv").drop(columns = ["Unnamed: 0"])

dataset["Y"] = dataset["Y"].apply(lambda x: x*-1)

**Partition Data**

In [168]:
def weightSeason(train):
    additional_x = pd.DataFrame(columns = train.columns)
    for i in range(train.shape[0]):
        game = train.iloc[[i]]
        year = int(str(train["Date"].iloc[i])[:4])
        multiplier = year - 2013
        for i in range(multiplier):
            additional_x = pd.concat([additional_x, game], axis = 0)

    train = pd.concat([train, additional_x], axis = 0)
    train = train.sort_values(by=['Date'])
    train = train.reset_index(drop=True)
    return train

In [169]:
def weightTeam(train, team):
    multiplier = 5
    additional_x = pd.DataFrame(columns = train.columns)
    for i in range(train.shape[0]):
        game = train.iloc[[i]]
        if game["Home"].iloc[0] == team or game["Away"].iloc[0] == team:
            for i in range(multiplier):
                additional_x = pd.concat([additional_x, game], axis = 0)

    train = pd.concat([train, additional_x], axis = 0)
    train = train.sort_values(by=['Date'])
    train = train.reset_index(drop=True)
    return train

In [170]:
teams = ["ORL", "IND", "MIA", "CHI", "LAL", "LAC", 
         "CLE", "BKN", "PHI", "TOR", "BOS", "WAS", 
         "DET", "NYK", "MIL", "HOU", "CHA", "MIN", 
         "NOP", "DAL", "ATL", "SAS", "MEM", "UTA", 
         "OKC", "POR", "PHX", "SAC", "DEN", "GSW"]

datasets_train = []


train = dataset[dataset["Date"] < 20180800]  
train = weightSeason(train)
for team in teams:
    team_data = weightTeam(train, team)
    team_x = team_data.iloc[:, 5:]
    team_y = team_data.iloc[:, 4:5]
    team_identifiers = team_data.iloc[:, :4]
    datasets_train.append((team_x.to_numpy(), team_y.to_numpy().reshape(team_y.shape[0], 1), team_identifiers))
    print("Done with {}".format(team))
    
#Validation data includes 2018-2019 season
val = dataset[dataset["Date"] > 20180800] 
val = val[val["Date"] < 20190800] 
val_x = val.iloc[:, 5:]
val_y = val.iloc[:, 4:5]
val_identifiers = val.iloc[:, :4]
dataset_val = (val_x.to_numpy(), val_y.to_numpy().reshape(val.shape[0], 1), val_identifiers)

#Test data includes 2019-2020
test = dataset[dataset["Date"] > 20190800]  
test_x = test.iloc[:, 5:]
test_y = test.iloc[:, 4:5]
test_identifiers = test.iloc[:, :4]
dataset_test = (test_x.to_numpy(), test_y.to_numpy().reshape(test.shape[0], 1), test_identifiers)

Done with ORL
Done with IND
Done with MIA
Done with CHI
Done with LAL
Done with LAC
Done with CLE
Done with BKN
Done with PHI
Done with TOR
Done with BOS
Done with WAS
Done with DET
Done with NYK
Done with MIL
Done with HOU
Done with CHA
Done with MIN
Done with NOP
Done with DAL
Done with ATL
Done with SAS
Done with MEM
Done with UTA
Done with OKC
Done with POR
Done with PHX
Done with SAC
Done with DEN
Done with GSW


## Naive Model

In [171]:
#Simple Model --> Predict mean for every example


prediction_value = np.mean(dataset_train[1])
val_mse = np.mean((np.ones((dataset_val[1].shape[0], 1)) * prediction_value - dataset_val[1])**2)
test_mse = np.mean((np.ones((dataset_test[1].shape[0], 1)) * prediction_value - dataset_test[1])**2)

print(val_mse)
print(test_mse)

207.4231565123272
202.17833445659744


## Simple Model (Linear Regression)

In [172]:
weights = np.matmul(np.linalg.pinv(dataset_train[0]), dataset_train[1])
predictions = np.matmul(dataset_test[0], weights)
test_mse = np.mean((predictions - dataset_test[1])**2)

train_pred = np.matmul(dataset_train[0], weights)
train_mse = np.mean((train_pred - dataset_train[1])**2)

val_pred = np.matmul(dataset_val[0], weights)
val_mse = np.mean((val_pred - dataset_val[1])**2)

print(train_mse)
print(val_mse)
print(test_mse)

141.60834667966134
177.34653283255673
162.43834237062939


## Sportbook MSE

In [213]:
lines = pd.read_csv("OfficialLines.csv")
games = pd.read_csv("Master_GameList.csv")

mse_open = []
mse_close = []

for i in range(lines.shape[0]):
    game_id = lines["Game"].iloc[i]
    
    spread_open = float(lines["Open_Spread_Home"].iloc[i])
    spread_close = float(lines["Close_Spread_Home"].iloc[i])
                         
    team_game = games[games["GAME_ID"] == game_id]
    if team_game["TEAM_ABBREVIATION"].iloc[0] == team_game["Home"].iloc[0]:
        home_real = team_game["PTS"].iloc[0]
        away_real = team_game["PTS"].iloc[1]
                         
    else:
        home_real = team_game["PTS"].iloc[1]
        away_real = team_game["PTS"].iloc[0]
        
    spread_real = away_real - home_real
    
    open_val = (spread_real - spread_open)**2
    mse_open.append(open_val)
        
    close_val = (spread_real - spread_close)**2
    
    if close_val > -1 and close_val < 10000:
        mse_close.append(close_val)
    
print("MSE for open is {}".format(np.mean(mse_open)))
print("MSE for close is {}".format(np.mean(mse_close)))    

MSE for open is 150.8273656606694
MSE for close is 148.14847305389222


## Train Model

In [181]:
small = [(19, "DAL"), (23, "UTA")]

#for i, team in enumerate(teams):
for i, team in small:
    checkpoint_filepath = 'checkpoint_bigVal/{}'.format(team)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_filepath, monitor='val_loss', verbose=0, 
                                                    save_best_only=True, save_weights_only=True, mode='min', 
                                                    save_freq='epoch')
    train_x = datasets_train[i][0]
    train_x = np.concatenate((train_x, dataset_val[0]), axis = 0)
    
    train_y = datasets_train[i][1]
    train_y = np.concatenate((train_y, dataset_val[1]), axis = 0)

    model = tf.keras.Sequential()
    NAME = "Model-{}-{}-{}".format("Weighted", team, "Expanded")
    tensorboard = TensorBoard(log_dir = 'Models/{}'.format(NAME))
    model.add(tf.keras.layers.Input(shape=(train_x.shape[1],)))
    model.add(tf.keras.layers.Dense(60, activation='relu'))
    model.add(tf.keras.layers.Dense(30, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(1))
    optimizer = tf.keras.optimizers.Adam(0.00001)
    model.compile(loss='mse', optimizer=optimizer)
    
    split = (dataset_val[0].shape[0] / train_x.shape[0])
    model.fit(train_x, train_y, epochs = 400, verbose=0, batch_size = 64, callbacks = [tensorboard, checkpoint], 
              validation_split = split)

In [182]:
for team in teams:
    checkpoint_filepath = 'checkpoint_bigVal/{}'.format(team)
    model.load_weights(checkpoint_filepath)
    print("\n{}".format(team))
    loss_train = model.evaluate(dataset_train[0], dataset_train[1], verbose = 2)
    loss_val = model.evaluate(dataset_val[0], dataset_val[1], verbose = 2)
    loss_test = model.evaluate(datasets_test[0],  datasets_test[1], verbose=2)


ORL
1254/1254 - 1s - loss: 151.6248
39/39 - 0s - loss: 175.3871
31/31 - 0s - loss: 176.6796

IND
1254/1254 - 2s - loss: 150.4563
39/39 - 0s - loss: 174.9926
31/31 - 0s - loss: 182.7012

MIA
1254/1254 - 1s - loss: 151.4589
39/39 - 0s - loss: 174.5072
31/31 - 0s - loss: 177.5104

CHI
1254/1254 - 1s - loss: 151.3304
39/39 - 0s - loss: 173.3978
31/31 - 0s - loss: 175.6337

LAL
1254/1254 - 1s - loss: 151.9726
39/39 - 0s - loss: 175.6748
31/31 - 0s - loss: 174.7802

LAC
1254/1254 - 2s - loss: 153.2101
39/39 - 0s - loss: 177.8969
31/31 - 0s - loss: 179.3354

CLE
1254/1254 - 1s - loss: 153.0576
39/39 - 0s - loss: 177.7879
31/31 - 0s - loss: 188.5524

BKN
1254/1254 - 1s - loss: 151.2906
39/39 - 0s - loss: 175.6936
31/31 - 0s - loss: 176.8442

PHI
1254/1254 - 1s - loss: 151.8486
39/39 - 0s - loss: 173.0894
31/31 - 0s - loss: 175.0081

TOR
1254/1254 - 1s - loss: 149.4034
39/39 - 0s - loss: 174.7312
31/31 - 0s - loss: 175.7889

BOS
1254/1254 - 1s - loss: 149.8415
39/39 - 0s - loss: 173.1047
31/31

## Evaluate Models

**Model Agreement**

In [208]:
def getAccuracy(predict_home, predict_away, line, outcome, conf):
    if predict_home > line and predict_away < line:
        return None
    elif predict_home < line and predict_away > line:
        return None
    elif (predict_home - line) <= conf or (predict_away - line) <= conf:
        return None
    elif predict_home > line and predict_away > line:
        if outcome > line:
            return 1
        else: return 0
    else:
        if outcome < line:
            return 1
        else: return 0 
        
def printStats(stats):
    print("The model achieved predictive accuracy of {} on opening spreads".format(round(stats[0], 3)))
    print("The model achieved predictive accuracy of {} on closing spreads".format(round(stats[1], 3)))

In [None]:
lines = pd.read_csv("OfficialLines.csv")

counter_open = 0
counter_close = 0
open_spreads = []
close_spreads = []
conf = 4

for i in range(datasets_test[2].shape[0]):
    game = datasets_test[2]["Game"].iloc[i]
    home = datasets_test[2]["Home"].iloc[i]
    away = datasets_test[2]["Away"].iloc[i]
    
    #Actual Outcome
    spread_real = datasets_test[1][i, :][0]
    
    
    game_features = datasets_test[0][i, :].reshape(1, datasets_test[0].shape[1])
    
    #Home Model
    checkpoint_filepath = 'checkpoint/{}'.format(home)
    model.load_weights(checkpoint_filepath)
    predict_home = model.predict(game_features)[0, 0]

    #Away Model
    checkpoint_filepath = 'checkpoint/{}'.format(away)
    model.load_weights(checkpoint_filepath)
    predict_away = model.predict(game_features)[0, 0]

    #Sportsbook Prediction
    line = lines[lines["Game"] == game]
    spread_open = float(line["Open_Spread_Home"].iloc[0])
    spread_close = float(line["Close_Spread_Home"].iloc[0])
    
    #Assess open lines
    acc_open = getAccuracy(predict_home, predict_away, spread_open, spread_real, conf)
    if acc_open != None:
        counter_open += 1
        open_spreads.append(acc_open)
        
    #Assess close lines
    acc_close = getAccuracy(predict_home, predict_away, spread_close, spread_real, conf)
    if acc_close != None:
        counter_close += 1
        close_spreads.append(acc_close)

print("Open is {} percent of games".format(100 * (counter_open / datasets_test[2].shape[0])))
print("Close is {} percent of games".format(100 * (counter_close / datasets_test[2].shape[0])))
final_stats = [np.mean(open_spreads), np.mean(close_spreads)]
printStats(final_stats)

**Compute MSE**

In [205]:
train_mse = 0
val_mse = 0
for i, team in enumerate(teams):
    checkpoint_filepath = 'checkpoint/{}'.format(team)
    model.load_weights(checkpoint_filepath)
    train_mse += model.evaluate(datasets_train[i][0], datasets_train[i][1], verbose = 2)
    val_mse += model.evaluate(dataset_val[0], dataset_val[1], verbose = 2)
print("Train MSE is {}".format(train_mse / 30))
print("Validation MSE is {}".format(val_mse / 30))

917/917 - 1s - loss: 150.0759
39/39 - 0s - loss: 172.4235
917/917 - 1s - loss: 148.6202
39/39 - 0s - loss: 173.0741
917/917 - 1s - loss: 145.0933
39/39 - 0s - loss: 174.9758
918/918 - 1s - loss: 153.0775
39/39 - 0s - loss: 174.4477
917/917 - 1s - loss: 147.3410
39/39 - 0s - loss: 173.9798
916/916 - 1s - loss: 149.2510
39/39 - 0s - loss: 175.4232
918/918 - 1s - loss: 154.7843
39/39 - 0s - loss: 178.0298
918/918 - 1s - loss: 149.9467
39/39 - 0s - loss: 176.1398
918/918 - 1s - loss: 144.8386
39/39 - 0s - loss: 172.7151
918/918 - 1s - loss: 141.4887
39/39 - 0s - loss: 171.3109
917/917 - 1s - loss: 140.3386
39/39 - 0s - loss: 170.2478
919/919 - 1s - loss: 146.8316
39/39 - 0s - loss: 170.7331
917/917 - 1s - loss: 157.4682
39/39 - 0s - loss: 175.1130
917/917 - 1s - loss: 143.3578
39/39 - 0s - loss: 172.5170
918/918 - 1s - loss: 140.7502
39/39 - 0s - loss: 173.0145
917/917 - 1s - loss: 148.0063
39/39 - 0s - loss: 173.5927
917/917 - 1s - loss: 152.0322
39/39 - 0s - loss: 173.9947
918/918 - 1s -

In [211]:
lines = pd.read_csv("OfficialLines.csv")

mse_test_all = []
mse_test_subset = []
book_all = []
book_subset = []
game_counter = 0

for i in range(dataset_test[2].shape[0]):
    game = dataset_test[2]["Game"].iloc[i]
    home = dataset_test[2]["Home"].iloc[i]
    away = dataset_test[2]["Away"].iloc[i]
    
    #Actual Outcome
    spread_real = dataset_test[1][i, :][0]
    
    game_features = dataset_test[0][i, :].reshape(1, dataset_test[0].shape[1])
    
    #Home Model
    checkpoint_filepath = 'checkpoint/{}'.format(home)
    model.load_weights(checkpoint_filepath)
    predict_home = model.predict(game_features)[0, 0]

    #Away Model
    checkpoint_filepath = 'checkpoint/{}'.format(away)
    model.load_weights(checkpoint_filepath)
    predict_away = model.predict(game_features)[0, 0]

    #Sportsbook Prediction
    line = lines[lines["Game"] == game]
    spread_open = float(line["Open_Spread_Home"].iloc[0])
    spread_close = float(line["Close_Spread_Home"].iloc[0])
    
    mse_test_all.append((spread_real - np.mean([predict_away, predict_home]))**2)
    book_all.append((spread_real - spread_open)**2)
    
    #Assess open lines
    acc_open = getAccuracy(predict_home, predict_away, spread_open, spread_real, conf)
    if acc_open != None:
        game_counter += 1
        book_subset.append((spread_real - spread_open)**2)
        mse_test_subset.append((spread_real - np.mean([predict_away, predict_home]))**2)
        
        
print("Test all MSE is {}".format(np.mean(mse_test_all)))
print("Test subset is {}".format(np.mean(mse_test_subset)))
print("Book all is {}".format(np.mean(book_all)))
print("Book subset is {}".format(np.mean(book_subset)))
print("Number of subset games is {}".format(game_counter))


Test all MSE is 174.91278164068402
Test subset is 212.434755129812
Book all is 163.8480947476828
Book subset is 177.294
Number of subset games is 125
