In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Warnings
import warnings
warnings.simplefilter("ignore", UserWarning)

from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Models
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

# These models are voting models based off the above models
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingRegressor

# Data prep
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Model evaluations
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold,StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix



In [11]:
# Classification

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

# Models

svc = SVC(kernel='rbf', gamma=0.1, C=10) # 5% increase with these hyperparamters
KNC = KNeighborsClassifier(weights='distance', p=2, n_neighbors=10, metric='euclidean', leaf_size=40) # 2.7% increase with these hp
ADBC = AdaBoostClassifier(n_estimators=155, learning_rate=0.8) # 2% increase with these hp
RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=5) # 1% better with these hyperparameters

GBC = GradientBoostingClassifier(n_estimators=500, learning_rate=0.15) # 2% better
HGBC = HistGradientBoostingClassifier(min_samples_leaf=25, max_leaf_nodes=80, max_iter=100, max_depth=None, learning_rate=0.1, l2_regularization=1.5) # 2% better
XGB = XGBClassifier(n_estimators=150, learning_rate=0.1) # 1.7% better with hp
# QDA = QuadraticDiscriminantAnalysis() # Same with default hp

# Imputer
imputer = SimpleImputer()
MMScaler = MinMaxScaler()

In [3]:
# Models

svr = SVR()
KNC = KNeighborsRegressor()
ADBC = AdaBoostRegressor()
RFC = RandomForestRegressor()

GBC = GradientBoostingRegressor()
HGBC = HistGradientBoostingRegressor()
XGB = XGBRegressor()

# Imputer
imputer = SimpleImputer()
MMScaler = MinMaxScaler()

In [4]:
data = pd.read_csv('./Scraping/MLB 2020-21.csv', parse_dates=['Date'])
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.sort_values(by=['Date'], inplace=True)

# Create y
data['Home Points Dif'] = data['Home PTS'] - data['Vis PTS']
data['Classification'] = data['Home PTS'] > data['Vis PTS']
y_class = data['Classification']

# Add dates and time

# Get Day, Month and Year from date column
dates = pd.DataFrame()
dates['Year'] = data['Date'].dt.strftime('%Y')
dates['Month'] = data['Date'].dt.strftime('%m')
dates['Day'] = data['Date'].dt.strftime('%d')
data = pd.concat([data, dates], axis=1)


# Result of the teams last game

data["HomeLastDif"] = 0
data["VisitorLastDif"] = 0

from collections import defaultdict
won_last = defaultdict(int) # Create dictionary won last

for index, row in data.iterrows(): # for each row
    home_team = row['Home'] # Take the home team in the row
    visitor_team = row['Visitor'] # Take the vis team in each row
    row['HomeLastDif'] = won_last[home_team] # If HomeLastWin is true set that team to won in the won_last dict
    row['VisitorLastDif'] = won_last[visitor_team] # If VisitorLastWin is true set that team to won in won_last dict
    data.loc[index] = row # Set the index for the next row?
    # Set current win
    won_last[home_team] = row['Home Points Dif'] # If home won set that in the won_last dict
    won_last[visitor_team] = - row['Home Points Dif'] # if home did not win set that in the won_last dict
    
# Add WinStreaks

data['HomeWinStreak'] = 0
data['VisitorWinStreak'] = 0

win_streak = defaultdict(int) #  Create a dictionary for teams winning streaks

for index, row in data.iterrows():
    home_team = row['Home'] # Home team = home team for that row
    visitor_team = row['Visitor'] # Vis team = vis team for that row
    row['HomeWinStreak'] = win_streak[home_team] # HomeWinStreak for that row is looked up in the dictionary win_streak
    row['VisitorWinStreak'] = win_streak[visitor_team] # Set VisitorWinStreak in the row to dict value for that team
    data.loc[index] = row # Set row to next row
    # Set current win streak number
    if row['Home Points Dif']:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1
        

# Which team won in their last match?

last_match_winner = defaultdict(int)

def home_team_won_last(row):
    # Variables equal the team names
    home_team = row['Home']
    visitor_team = row['Visitor']

    teams = tuple(sorted([home_team, visitor_team])) # Tuple of the home and visitor team to search for
    result = 1 if last_match_winner[teams] == row['Home'] else 0 # Look in last_match_winner dict for if these teams have played before
    winner = row['Home'] if  row['Home Points Dif'] else row['Visitor'] # Winner variable is home team if the homewin column says it is

    last_match_winner[teams] = winner # Feed the winner into the last_match_winner dict

    return result

data['HomeTeamWonLast'] = data.apply(home_team_won_last, axis=1) # Apply the function on each row (axis=1)

In [5]:
X = data.loc[:,'Year':]
y = data['Home Points Dif']
y_class = data['Classification']

In [167]:
# Train model and accuracy functions

def train_model(X_train_and_test, y_train_and_test, model):
    ''' Scale, Split, Impute and Train one model '''
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_and_test, y_train_and_test, test_size=0.2, shuffle=False)

    pipe = make_pipeline(SimpleImputer(),StandardScaler(), model)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    preds_df = pd.DataFrame(preds, columns=['Predictions'])
    predictions_array.append(preds_df)
    
def accuracy():    
    past_predictions = ''
    _, _, _, y_test = train_test_split(X, y_class, test_size=0.2, shuffle=False)
    past_predictions = pd.DataFrame(y_test)
    past_predictions = past_predictions.reset_index(drop=True)

    for i in predictions_array:
        df = pd.DataFrame(i)
        df.reset_index(drop=True)
        past_predictions = pd.concat([past_predictions, df], axis=1, ignore_index=True)

    past_predictions.columns = ['Actual', 'SVC', 'KNC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB']
    
    for i in past_predictions.columns:
        print(i)
        true_pos = len(past_predictions[(past_predictions[i] == True) & (past_predictions['Actual'] == True)])
        false_pos = len(past_predictions[(past_predictions[i] == True) & (past_predictions['Actual'] == False)])
        true_neg = len(past_predictions[(past_predictions[i] == False) & (past_predictions['Actual'] == False)])
        false_neg = len(past_predictions[(past_predictions[i] == False) & (past_predictions['Actual'] == True)])

        print('True Pos: ', true_pos, ' / ', (past_predictions[i]==True).sum())
        print('Win Acc: ', true_pos/(true_pos+false_pos))
        print('True Neg: ', true_neg, ' / ', (past_predictions[i]==False).sum())
        print('Lose Acc: ', true_neg/(true_neg+false_neg))
        print('Total Acc: ', (true_pos + true_neg)/len(past_predictions))
        print()

In [171]:
X_no_year = X.loc[:,'HomeLastDif':]

In [168]:

models_array = [svc, KNC, ADBC, RFC, GBC, HGBC, XGB]


predictions_array = []

for model in models_array:
    train_model(X, y_class, model)
accuracy()


Actual
True Pos:  175  /  175
Win Acc:  1.0
True Neg:  174  /  174
Lose Acc:  1.0
Total Acc:  1.0

SVC
True Pos:  51  /  87
Win Acc:  0.5862068965517241
True Neg:  138  /  262
Lose Acc:  0.5267175572519084
Total Acc:  0.5415472779369628

KNC
True Pos:  68  /  125
Win Acc:  0.544
True Neg:  117  /  224
Lose Acc:  0.5223214285714286
Total Acc:  0.5300859598853869

ADBC
True Pos:  82  /  158
Win Acc:  0.5189873417721519
True Neg:  98  /  191
Lose Acc:  0.5130890052356021
Total Acc:  0.5157593123209169

RFC
True Pos:  45  /  94
Win Acc:  0.4787234042553192
True Neg:  125  /  255
Lose Acc:  0.49019607843137253
Total Acc:  0.4871060171919771

GBC
True Pos:  26  /  58
Win Acc:  0.4482758620689655
True Neg:  142  /  291
Lose Acc:  0.4879725085910653
Total Acc:  0.4813753581661891

HGBC
True Pos:  55  /  117
Win Acc:  0.4700854700854701
True Neg:  112  /  232
Lose Acc:  0.4827586206896552
Total Acc:  0.4785100286532951

XGB
True Pos:  28  /  72
Win Acc:  0.3888888888888889
True Neg:  130  /  27

In [None]:
### Find extra features

In [102]:
print('Bet on home baseline: ', len(data[data['Classification'] == True]) / len(data))

Bet on home baseline:  0.527283170591614
