In [26]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tabulate

# REGRESSION
# Return X = testable features, Y = point spread
def spread_df_rgn(normalize=False, get_custom_date_range=False, substituted_spreads=None):
  data_df = pd.read_csv('combined_out.csv')

  real_cols = data_df.select_dtypes(include=['number']).columns

  if not get_custom_date_range:
    data_df = data_df.dropna(axis=0)

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  if get_custom_date_range:
    data_df = data_df[(data_df['Date'] >= '2024-04-01') & (data_df['Date'] <= '2024-04-07')]
    data_df['HomeSpread'] = substituted_spreads
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadActual'] > data_df['HomeSpread']
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadCorrectDirection'].astype(int)
  
  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  
  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, :1]

# CLASSIFICATION
# Return X = testable features, Y = correct point spread direction
def spread_df_cls(normalize=False, get_custom_date_range=False, substituted_spreads=None):
  data_df = pd.read_csv('combined_out.csv')

  real_cols = data_df.select_dtypes(include=['number']).columns

  if not get_custom_date_range:
    data_df = data_df.dropna(axis=0)

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  if get_custom_date_range:
    data_df = data_df[(data_df['Date'] >= '2024-04-01') & (data_df['Date'] <= '2024-04-07')]
    data_df['HomeSpread'] = substituted_spreads
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadActual'] > data_df['HomeSpread']
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadCorrectDirection'].astype(int)

  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  
  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, -1:]

MLP Classifier

Early Stopping Tests

In [28]:
x, y = spread_df_cls()
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

converged_early = []
converged = []
train_accuracy_early = []
train_accuracy = []
test_accuracy_early = []
test_accuracy = []

for i in range(5):
  clf = MLPClassifier( alpha=0.001, hidden_layer_sizes=[128], learning_rate_init=0.01)
  clf.fit(x_train, y_train)
  
  converged.append(clf.n_iter_)
  train_accuracy.append(clf.score(x_train, y_train))
  test_accuracy.append(clf.score(x_test, y_test))
  
print("Average results without Early Stopping")
print("Converged after", mean(converged), "iterations")
print("Train Accuracy:", mean(train_accuracy))
print("Test Accuracy:", mean(test_accuracy))
  
for i in range(5):
  clf = MLPClassifier(early_stopping=True, alpha=0.001, hidden_layer_sizes=[128], learning_rate_init=0.01)
  clf.fit(x_train, y_train)
  
  converged_early.append(clf.n_iter_)
  train_accuracy_early.append(clf.score(x_train, y_train))
  test_accuracy_early.append(clf.score(x_test, y_test))
  
print("Average results with Early Stopping")
print("Converged after", mean(converged_early), "iterations")
print("Train Accuracy:", mean(train_accuracy_early))
print("Test Accuracy:", mean(test_accuracy_early))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average results with Early Stopping
Converged after 73.2 iterations
Train Accuracy: 0.546418428526349
Test Accuracy: 0.5373043917213529


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average results with Early Stopping
Converged after 17.6 iterations
Train Accuracy: 0.5377847901546229
Test Accuracy: 0.5311458859162039
