In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot

# Warnings
import warnings
warnings.simplefilter("ignore", UserWarning)

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# These models are voting models based off the above models
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingRegressor

# Data prep
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Model evaluations
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold,StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score



In [2]:
# Models NEW HPS

ADBC = AdaBoostClassifier(base_estimator=None, learning_rate=0.1, n_estimators=200)
XGB = XGBClassifier(n_estimators=100, learning_rate=0.1, colsample_bytree=0.4, max_depth=15, reg_alpha=1.2, reg_lambda=1.3, subsample=0.8)
svc = SVC(C=1, decision_function_shape='ovo', kernel='linear', probability=True, shrinking=True)
KNC = KNeighborsClassifier()
RFC = RandomForestClassifier(criterion='gini', max_depth=9, max_leaf_nodes=10, min_samples_split=9, n_estimators=200, oob_score=True, warm_start=True)

GBC = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.01, loss='deviance', n_estimators=200, subsample=1)
HGBC = HistGradientBoostingClassifier(learning_rate=0.01, loss='auto', max_iter=100, max_leaf_nodes=20, min_samples_leaf=15)
QDA = QuadraticDiscriminantAnalysis()


# Imputer
imputer = SimpleImputer()
MMScaler = MinMaxScaler()

In [3]:
ADBC = AdaBoostClassifier()
XGB = XGBClassifier()
svc = SVC()
KNC = KNeighborsClassifier()
RFC = RandomForestClassifier()
GBC = GradientBoostingClassifier()
HGBC = HistGradientBoostingClassifier()
QDA = QuadraticDiscriminantAnalysis()

In [12]:
###### READING DATA #####

use_current_year = True
year = 2020
prev_year = year - 1
next_year = year + 1

season_dict = {2020: ['2020-12-1', '2021-06-30'],
            2019: ['2019-10-1', '2020-10-30'],
             2018: ['2018-10-1', '2019-06-30'],
             2017: ['2017-10-1', '2018-06-30'],
             2016: ['2016-10-1', '2017-06-30']}


data = pd.read_excel('./content/NBA_COMBINED.xlsx', sheet_name='Games', parse_dates=['Date'])

if use_current_year:
    rankings = pd.read_csv('./Web Scraping/Power_rankings.csv', usecols=['Name',f'{prev_year} WRank', f'{prev_year} GARank', f'{prev_year} GFRank', f'{prev_year} MRank',
                                                                     f'{year} WRank', f'{year} GARank', f'{year} GFRank', f'{year} MRank'])
else:
    rankings = pd.read_csv('./Web Scraping/Power_rankings.csv', usecols=['Name','2016 WRank', '2016 GARank', '2016 GFRank', '2016 MRank'])


# Drop unnecessary columns
data = data[['Date', 'Start (ET)', 'Visitor', 'Vis PTS', 'Home', 'Home PTS']]

# Drop unnecessary rows
data = data[(data['Date'] > season_dict[prev_year][0]) & (data['Date'] < season_dict[year][1])]

# Create prediction column
data['Home Win'] = data['Home PTS'] > data['Vis PTS']

# Create home and vis rankings DF
home_rank = rankings.copy()
vis_rank = rankings.copy()

if use_current_year:
    home_rank.columns = ['Home', f'{prev_year} HW', f'{prev_year} HGF', f'{prev_year} HGA', f'{prev_year} HMP', f'{year} HW', f'{year} HGF', f'{year} HGA', f'{year} HMP']
    vis_rank.columns = ['Visitor', f'{prev_year} VW', f'{prev_year} VGF', f'{prev_year} VGA', f'{prev_year} VMP', f'{year} VW', f'{year} VGF', f'{year} VGA', f'{year} VMP']
else:
    home_rank.columns = ['Home', f'{prev_year} HW', f'{prev_year} HGF', f'{prev_year} HGA', f'{prev_year} HMP']
    vis_rank.columns = ['Visitor', f'{prev_year} VW', f'{prev_year} VGF', f'{prev_year} VGA', f'{prev_year} VMP']


# Merge rankings and df columns
data = data.merge(home_rank, on='Home', how='left')
data = data.merge(vis_rank, on='Visitor', how='left')

# Set current year columns for last year games to nan
# index_to_nan = data[(data['Date'] > season_dict[year][0])].index[0]
# data.loc[:index_to_nan,f'{year} HW':] = np.NaN

# Set X/y_train_and_test
rows_with_results = len(data) - len(data[data['Vis PTS'].isna()])

# Training and testing
X_train_and_test = data.loc[:rows_with_results-1,f'{prev_year} HW':]
y_train_and_test = data['Home Win'][:rows_with_results-1]
y_train_and_test = y_train_and_test.astype(bool)

# Make future predictions dataframe and teams and dates as well

data

Unnamed: 0,Date,Start (ET),Visitor,Vis PTS,Home,Home PTS,Home Win,2019 HW,2019 HGF,2019 HGA,...,2020 HGA,2020 HMP,2019 VW,2019 VGF,2019 VGA,2019 VMP,2020 VW,2020 VGF,2020 VGA,2020 VMP
0,2019-10-22,8:00p,New Orleans Pelicans,122.0,Toronto Raptors,130.0,True,2.0,13.0,30.0,...,17.5,14.0,21.0,5.0,4.0,20.0,20.0,8.0,4.0,17.0
1,2019-10-22,10:30p,Los Angeles Lakers,102.0,Los Angeles Clippers,112.0,True,4.0,4.0,17.5,...,25.0,3.0,3.0,11.0,27.0,5.0,7.5,23.0,29.0,7.0
2,2019-10-23,7:00p,Chicago Bulls,125.0,Charlotte Hornets,126.0,True,23.0,30.0,19.0,...,16.0,19.0,24.0,27.0,17.5,22.0,21.5,15.0,10.0,20.0
3,2019-10-23,7:00p,Detroit Pistons,119.0,Indiana Pacers,110.0,False,7.0,23.0,28.0,...,11.0,15.5,26.5,25.0,16.0,23.0,28.0,24.0,17.5,23.0
4,2019-10-23,7:00p,Cleveland Cavaliers,85.0,Orlando Magic,94.0,True,18.0,24.0,26.0,...,14.5,29.0,28.5,26.0,8.5,28.0,25.5,30.0,19.0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974,2021-04-16,8:00p,Memphis Grizzlies,,Chicago Bulls,,False,24.0,27.0,17.5,...,10.0,20.0,16.5,14.0,10.0,18.5,16.0,14.0,14.5,12.0
1975,2021-04-16,8:00p,Denver Nuggets,,Houston Rockets,,False,9.5,2.0,8.5,...,7.0,26.0,6.0,18.5,20.0,11.0,6.0,4.5,22.0,6.0
1976,2021-04-16,8:00p,Miami Heat,,Minnesota Timberwolves,,False,28.5,12.0,3.0,...,3.0,27.0,9.5,15.0,21.0,8.0,12.5,26.0,28.0,18.0
1977,2021-04-16,8:30p,Portland Trail Blazers,,San Antonio Spurs,,False,19.0,8.0,6.0,...,13.0,21.0,14.5,6.0,5.0,17.0,9.0,7.0,5.0,15.5


In [13]:
# Create X and y
X = data.loc[:,f'{prev_year} HW':]
y = data['Home Win']

In [14]:
# Train model function

def train_model(X_train_and_test, y_train_and_test, model):
    ''' Scale, Split, Impute and Train one model '''
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_and_test, y_train_and_test, test_size=test_size, shuffle=False)
    pipe = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), model)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    
    # Combine predictions with actuals
    preds_df = pd.DataFrame(preds, columns=['Predictions'])
    preds_df.index = pd.RangeIndex(start=y_train.last_valid_index()+1, stop=y_train.last_valid_index()+1 + len(y_test))
    predictions_array.append(preds_df)
    preds_and_true = pd.concat([y_test, preds_df], axis=1, ignore_index=True)

    
    # Accuracy
    wins = preds_and_true.apply(lambda x: True if x[0] == True and x[1] == True else False, axis=1)
    losses = preds_and_true.apply(lambda x: True if x[0] == False and x[1] == False else False, axis=1)
    print('Model: ',str(model))
    print('Total test games: ', len(y_test))
    print('Wins predicted correctly: ',len(wins[wins == True].index))
    print('Losses predicted correctly: ',len(losses[losses == True].index))
    print('Percentage predicted correctly: ', (len(wins[wins == True].index) + len(losses[losses == True].index)) / len(preds_and_true))

In [16]:
test_size

0.4224355735219808

In [36]:
# Train and test models

test_size = len(data[(data['Date'] > season_dict[year][0])]) / len(data)
test_size = 0.08

predictions_array = []

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

for model in models_array:
    train_model(X, y, model)

Model:  SVC()
Total test games:  159
Wins predicted correctly:  30
Losses predicted correctly:  54
Percentage predicted correctly:  0.5283018867924528
Model:  AdaBoostClassifier()
Total test games:  159
Wins predicted correctly:  28
Losses predicted correctly:  49
Percentage predicted correctly:  0.48427672955974843
Model:  RandomForestClassifier()
Total test games:  159
Wins predicted correctly:  22
Losses predicted correctly:  49
Percentage predicted correctly:  0.44654088050314467
Model:  GradientBoostingClassifier()
Total test games:  159
Wins predicted correctly:  26
Losses predicted correctly:  51
Percentage predicted correctly:  0.48427672955974843
Model:  HistGradientBoostingClassifier()
Total test games:  159
Wins predicted correctly:  25
Losses predicted correctly:  54
Percentage predicted correctly:  0.4968553459119497
Model:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       

In [37]:
# Print out past predictions to back test in excel
predictions_array

excel_backtest = pd.DataFrame()
for i in predictions_array:
    excel_backtest = pd.concat([excel_backtest, i], axis=1)

excel_backtest.columns =['SVC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB', 'QDA', 'KNC']

# Merge Excel Backtest with data dates and teams
excel_backtest = pd.concat([data, excel_backtest], axis=1)
excel_backtest = excel_backtest[['Date', 'Visitor', 'Home', 'Home Win', 'SVC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB', 'QDA', 'KNC']]

excel_backtest.dropna(inplace=True)

excel_backtest.to_excel('Excel Backtest.xlsx')

In [None]:
# Make future predictions

future_models = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

def make_preds(X_train, y_train, X_predict, model):
    # Train
    pipe = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), model)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_predict)
    return preds

# Append the predictions onto the entire data and keep only date, teams and prediction columns
future_predictions_array = []

for model in future_models:
    preds = make_preds(X_train, y_train, X_predict, model)
    future_predictions_array.append(preds)

len(future_predictions_array)

In [None]:
# Print out predictions for Feb, Mar and April to backtest, both imputed and data leakage predictions
# If profit > with data leakage then okay

# Backtest in py file

In [400]:
## HYPER PARAM TUNING

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

pipe_SVC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), svc)

pipe_ADBC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), ADBC)

pipe_RFC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), RFC)

pipe_GBC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), GBC)

pipe_HGBC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), HGBC)

pipe_XGB = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), XGB)

pipe_QDA = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), QDA)

pipe_KNC = make_pipeline(SimpleImputer(),StandardScaler(),SelectKBest(f_regression, k='all'), KNC)

param_range = [1,3,6,9,10]
param_range_fl = [1.0, 0.5]

grid_params_svc = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 
                    'svc__C': param_range, 'svc__shrinking': [True, False],
                  'svc__probability': [True, False], 'svc__decision_function_shape': ['ovo', 'ovr']}]

grid_params_adbc = [{'adaboostclassifier__base_estimator': [None, 'svc', 'randomforestclassifier', 'knearestneighbors'],
                    'adaboostclassifier__n_estimators': [50, 100, 200], 'adaboostclassifier__learning_rate': [0.01, 0.1, 1, 3]}]

grid_params_rf = [{'randomforestclassifier__n_estimators': [50, 100, 150, 200], 'randomforestclassifier__criterion': ['gini', 'entropy'],
                'randomforestclassifier__max_depth': param_range,
                'randomforestclassifier__min_samples_split': param_range[1:], 'randomforestclassifier__max_leaf_nodes':[5, 10, 30, None],
                  'randomforestclassifier__oob_score': [True, False], 'randomforestclassifier__warm_start': [True, False]
                  }]

grid_params_gbc = [{'gradientboostingclassifier__loss': ['deviance', 'exponential'], 'gradientboostingclassifier__learning_rate': [0.01, 0.1, 1, 3],
                   'gradientboostingclassifier__n_estimators': [200, 250, 300], 'gradientboostingclassifier__subsample': [1,2,3,4],
                   'gradientboostingclassifier__criterion': ['friedman_mse', 'mse', 'mae']}]

grid_params_hgbc = [{'histgradientboostingclassifier__loss': ['auto', 'binary_crossentropy', 'categorical_crossentropy'],
                    'histgradientboostingclassifier__learning_rate': [0.01, 0.1, 1], 'histgradientboostingclassifier__max_iter': [80,100,150,200],
                    'histgradientboostingclassifier__max_leaf_nodes': [20, 31, 40, None], 'histgradientboostingclassifier__min_samples_leaf': [2, 5, 12, 15]}]

grid_params_xgb = [{'xgbclassifier__n_estimators': [100, 200, 400, 600], 'xgbclassifier__colsample_bytree':[0.4, 0.6, 0.8,1],
                   'xgbclassifier__max_depth': [15, 20, 25], 'xgbclassifier__reg_alpha': [1.1, 1.2, 1.3],
                   'xgbclassifier__reg_lambda':[1.1, 1.2, 1.3], 'xgbclassifier__subsample':[0.7, 0.8, 0.9]}]

grid_params_knc = [{'kneighborsclassifier__n_neighbors':[2,5,8,10], 'kneighborsclassifier__weights': ['uniform', 'distance'],
                   'kneighborsclassifier__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 'kneighborsclassifier__leaf_size':[20,30,40],
                   'kneighborsclassifier__p':[1,2]}]

svc = GridSearchCV(estimator=pipe_SVC, param_grid=grid_params_svc,
                  scoring='accuracy', cv=10, n_jobs=-1)

RFC = GridSearchCV(estimator=pipe_RFC,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=-1)

ADBC = GridSearchCV(estimator=pipe_ADBC,
                   param_grid = grid_params_adbc,
                   scoring='accuracy', cv=10, n_jobs=-1)

GBC = GridSearchCV(estimator=pipe_GBC,
                  param_grid = grid_params_gbc,
                  scoring='accuracy', cv=10, n_jobs=-1)

HGBC = GridSearchCV(estimator=pipe_HGBC,
                   param_grid=grid_params_hgbc,
                   scoring='accuracy', cv=10, n_jobs=-1)

XGB = GridSearchCV(estimator=pipe_XGB,
                  param_grid = grid_params_xgb,
                  scoring='accuracy', cv=10, n_jobs=-1)

KNC = GridSearchCV(estimator=pipe_KNC,
                  param_grid = grid_params_knc,
                  scoring='accuracy', cv=10, n_jobs=-1)

grids = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

grid_dict = { 
        0: 'svc', 1: 'ADBC', 2: 'RFC', 3: 'GBC', 4: 'HGBC', 5:'XGB', 6: 'KNC'}

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)

# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

Performing model optimizations...

Estimator: svc
Best params are : {'svc__C': 1, 'svc__decision_function_shape': 'ovo', 'svc__kernel': 'linear', 'svc__probability': True, 'svc__shrinking': True}
Best training accuracy: 0.659
Test set accuracy score for best params: 0.605 

Estimator: RFC
Best params are : {'adaboostclassifier__base_estimator': None, 'adaboostclassifier__learning_rate': 0.1, 'adaboostclassifier__n_estimators': 200}
Best training accuracy: 0.668
Test set accuracy score for best params: 0.617 

Estimator: ADBC
Best params are : {'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_depth': 9, 'randomforestclassifier__max_leaf_nodes': 10, 'randomforestclassifier__min_samples_split': 9, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__oob_score': True, 'randomforestclassifier__warm_start': True}
Best training accuracy: 0.668
Test set accuracy score for best params: 0.617 

Estimator: GBC
Best params are : {'gradientboostingclassifie

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').