In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import sys
sys.path.append('./src/misc')
import tools
import sql_tools

In [4]:
path_to_data = "./src/sql/"
data = sql_tools.read_database(path_to_data+"trailing_database.db", "5_game_trailing")
data = data[data['O/U_line']!='']
data['O/U_result'] = data.apply(lambda row: tools.OU(row['O/U_line'],float(row['total'])),axis=1)

In [223]:
data

Unnamed: 0,season,game_date,teamAbbr,opptAbbr,total,O/U_line,spread,teamMIN,teamFGM,teamFGA,...,opptAST_RATIO_RANK,opptOREB_PCT_RANK,opptDREB_PCT_RANK,opptREB_PCT_RANK,opptTM_TOV_PCT_RANK,opptEFG_PCT_RANK,opptTS_PCT_RANK,opptPACE_RANK,opptPIE_RANK,O/U_result
0,2022-23,2023-05-17 00:00:00,BOS,MIA,239,212.0,8.5,48.0,42.8,89.6,...,489.8,1559.2,1420.8,1529.8,1289.2,610.0,548.6,1028.6,1161.2,1
1,2022-23,2023-05-16 00:00:00,DEN,LAL,258,222.5,7.0,48.0,41.6,91.0,...,1573.4,885.8,1252.8,1194.2,687.6,1663.2,1790.6,992.2,1516.4,1
2,2022-23,2023-05-14 00:00:00,BOS,PHI,200,201.5,6.0,48.0,42.8,89.6,...,592.0,1285.4,844.0,947.6,1230.6,797.0,859.0,1375.0,929.2,0
3,2022-23,2023-05-12 00:00:00,LAL,GSW,223,218.0,3.5,49.0,47.0,92.0,...,867.8,791.4,1384.0,1118.2,922.6,1462.8,1541.6,1536.2,1045.0,1
4,2022-23,2023-05-12 00:00:00,MIA,NYK,188,204.5,6.5,48.0,42.6,86.2,...,857.8,1357.8,1687.6,1528.6,1367.8,1141.2,1191.4,1211.0,1496.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7507,2017-18,2017-10-20 00:00:00,PHX,LAL,262,220.5,3.5,48.0,28.0,89.0,...,953.0,187.0,1588.0,646.0,2279.0,1603.0,1654.0,857.0,120.0,1
7508,2017-18,2017-10-20 00:00:00,BKN,ORL,247,226.5,2.0,48.0,45.0,94.0,...,2007.0,597.0,1588.0,1114.0,1678.0,1939.0,1368.0,857.0,911.0,1
7509,2017-18,2017-10-20 00:00:00,MIN,UTA,197,197.0,4.5,48.0,37.0,85.0,...,1017.0,1535.0,2147.0,2154.0,1054.0,1792.0,1764.0,1675.0,1837.0,2
7510,2017-18,2017-10-20 00:00:00,DAL,SAC,181,202.0,6.0,48.0,38.0,86.0,...,2005.0,597.0,2272.0,1710.0,1933.0,905.0,523.0,3.0,1856.0,0


In [5]:
X,y = tools.model_preprocessing(data,("2016-01-10","2023-12-12"))

## Random Forest

In [4]:
n = 15
n_estimators = 50
n_trials = 10
OU_results = list(y[-n:])

acc_vals = []

print(f"Backtesting for {n} games with {n_estimators} estimators with {n_trials} trials.\n")
for trial in tqdm(range(0,n_trials)):
    pred = []
    for i in range(0,n):
        rf_model = RandomForestClassifier(n_estimators=n_estimators)
        
        X_train = X[n+1-i:]
        y_train = y[n+1-i:]
        
        X_test = X[n-i]
        y_test = y[n-i]
        
        rf_model.fit(X_train,y_train)
        yhat = rf_model.predict(X_test.reshape(1, -1))
        
        pred.append(yhat[0])

    acc = tools.score_results(OU_results,pred)
    acc_vals.append(acc)

  0%|          | 0/20 [00:00<?, ?it/s]

Trial 0


  5%|▌         | 1/20 [00:16<05:22, 16.98s/it]

predicted:  [1, 1, 1, 1, 1, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 1


 10%|█         | 2/20 [00:33<05:05, 16.98s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 2


 15%|█▌        | 3/20 [00:50<04:46, 16.86s/it]

predicted:  [0, 0, 0, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   70.0
Trial 3


 20%|██        | 4/20 [01:07<04:30, 16.88s/it]

predicted:  [0, 1, 0, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   60.0
Trial 4


 25%|██▌       | 5/20 [01:24<04:12, 16.85s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 5


 30%|███       | 6/20 [01:41<03:55, 16.83s/it]

predicted:  [0, 1, 0, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   60.0
Trial 6


 35%|███▌      | 7/20 [01:58<03:39, 16.86s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 7


 40%|████      | 8/20 [02:15<03:23, 16.92s/it]

predicted:  [0, 1, 0, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   60.0
Trial 8


 45%|████▌     | 9/20 [02:32<03:05, 16.91s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 9


 50%|█████     | 10/20 [02:49<02:49, 16.95s/it]

predicted:  [0, 1, 0, 1, 1, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 10


 55%|█████▌    | 11/20 [03:06<02:32, 16.95s/it]

predicted:  [0, 0, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   60.0
Trial 11


 60%|██████    | 12/20 [03:22<02:15, 16.91s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 12


 65%|██████▌   | 13/20 [03:40<01:58, 16.99s/it]

predicted:  [0, 0, 0, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   70.0
Trial 13


 70%|███████   | 14/20 [03:57<01:42, 17.08s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 14


 75%|███████▌  | 15/20 [04:14<01:25, 17.04s/it]

predicted:  [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   40.0
Trial 15


 80%|████████  | 16/20 [04:31<01:07, 16.97s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 16


 85%|████████▌ | 17/20 [04:47<00:50, 16.84s/it]

predicted:  [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   40.0
Trial 17


 90%|█████████ | 18/20 [05:04<00:33, 16.72s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0
Trial 18


 95%|█████████▌| 19/20 [05:21<00:16, 16.86s/it]

predicted:  [1, 0, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   70.0
Trial 19


100%|██████████| 20/20 [05:38<00:00, 16.90s/it]

predicted:  [0, 1, 1, 1, 0, 1, 0, 0, 0, 1]
true:       [1, 0, 0, 1, 0, 1, 1, 2, 0, 1]
accuracy:   50.0





In [11]:
overall_acc = sum(acc_vals)/len(acc_vals)
print(f"Random Forest Classifier achieved {overall_acc*100}% overall accruacy for the past {n} games. Accuracy is calculated from {20} trials.")

0.5399999999999999

## NN Model

In [218]:
import tensorflow as tf
from tensorflow import keras

#earling stopping
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=10)

model = tf.keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape = X[0].shape))
model.add(tf.keras.layers.Dense(128, activation="relu6"))
model.add(keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(3, activation="softmax"))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X[301:], 
          y[301:], 
          epochs=120, 
          validation_split=0.1, 
          batch_size=50,
          verbose=1,
          callbacks=[callback])

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

<keras.callbacks.History at 0x3e1ebbf70>

In [222]:
print(f"Backtesting for {n} games with {n_trials} trials.\n")

n = 300
OU_results = list(y[-n:])
pred = []
acc_vals = []

for i in tqdm(range(0,n)):
  #X_train = X[n+1-i:]
  #y_train = y[n+1-i:]
  
  """if i%10:
      model.fit(X[n+1-i:], 
          y[n+1-i:], 
          epochs=100, 
          validation_split=0.1, 
          batch_size=100,
          callbacks=[callback])"""

  X_test = X[n-i]
  y_test = y[n-i]
  
  prediction_output = model.predict(X_test.reshape(1,-1),verbose=0)
  pred.append(tools.prediction_classifier(prediction_output)[0])

acc = tools.score_results(OU_results,pred)
acc_vals.append(acc)
  
overall_acc = sum(acc_vals)/len(acc_vals)
print(f"\nNeural network achieved {overall_acc*100}% overall accruacy for the past {n} games. Accuracy is calculated from {n_trials} trials.")

Backtesting for 300 games with 1 trials.



100%|██████████| 300/300 [00:08<00:00, 34.84it/s]


Neural network achieved 51.33333333333333% overall accruacy for the past 300 games. Accuracy is calculated from 1 trials.





## CNN

In [272]:
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Flatten
from keras.utils import to_categorical

X,y = tools.model_preprocessing(data,("2016-01-10","2023-12-12"))

In [274]:
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

# Define the neural network structure
model = Sequential()

# Recurrent layer (LSTM)
model.add(LSTM(64, activation='relu', input_shape=(166, 1)))

# Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))  # Output layer with softmax activation for three-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])





In [275]:
pred = model.predict(X_test)
pred_labels = np.argmax(pred, axis=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_categorical)

ValueError: in user code:

    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/engine/training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/engine/training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/engine/training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/engine/training.py", line 2111, in predict_step
        return self(x, training=False)
    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/logange/miniforge3/envs/tensorflow_silicon/lib/python3.9/site-packages/keras/engine/input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_50' (type Sequential).
    
    Input 0 of layer "lstm_6" is incompatible with the layer: expected ndim=3, found ndim=1. Full shape received: (None,)
    
    Call arguments received by layer 'sequential_50' (type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=False
      • mask=None


## Logistic Regression

In [277]:
#remove tied games (which account for only 1% of games) for binary logistic regression model
X,y = tools.model_preprocessing(data[data['O/U_result']!=2],("2017-01-10","2023-12-12"))

In [278]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()
logr.fit( X[30:], y[30:])

In [280]:
#logr.predict(X[300].reshape(1,-1))
logr.score(X[:29],y[:29])

0.41379310344827586

In [281]:
print(f"Backtesting for {n} games with {n_trials} trials.\n")

n = 300
OU_results = list(y[-n:])
pred = []
acc_vals = []

for i in tqdm(range(0,n)):
  #X_train = X[n+1-i:]
  #y_train = y[n+1-i:]
  
  """if i%10:
      model.fit(X[n+1-i:], 
          y[n+1-i:], 
          epochs=100, 
          validation_split=0.1, 
          batch_size=100,
          callbacks=[callback])"""

  X_test = X[n-i]
  y_test = y[n-i]
  
  prediction_output = model.predict(X_test.reshape(1,-1),verbose=0)
  pred.append(tools.prediction_classifier(prediction_output)[0])

acc = tools.score_results(OU_results,pred)
acc_vals.append(acc)
  
overall_acc = sum(acc_vals)/len(acc_vals)
print(f"\nNeural network achieved {overall_acc*100}% overall accruacy for the past {n} games. Accuracy is calculated from {n_trials} trials.")

Backtesting for 20 games with 1 trials.



  4%|▍         | 13/300 [00:14<05:16,  1.10s/it]


KeyboardInterrupt: 