In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot

# Warnings
import warnings
warnings.simplefilter("ignore", UserWarning)

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# These models are voting models based off the above models
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingRegressor

# Data prep
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Model evaluations
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold,StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance



In [2]:
# Models

ADBC = AdaBoostClassifier(n_estimators=155, learning_rate=0.8) # 2% increase with these hp
XGB = XGBClassifier(n_estimators=150, learning_rate=0.1) # 1.7% better with hp
svc = SVC()
KNC = KNeighborsClassifier()
RFC = RandomForestClassifier()
GBC = GradientBoostingClassifier()
HGBC = HistGradientBoostingClassifier()
QDA = QuadraticDiscriminantAnalysis()

# Imputer
imputer = SimpleImputer()
MMScaler = MinMaxScaler()

In [4]:
data = pd.read_excel('./content/NBA_COMBINED.xlsx', sheet_name='Games', parse_dates=['Date'])
rankings = pd.read_excel('./content/NBA_COMBINED.xlsx', sheet_name='Power Rankings', usecols='Q:AG', skiprows=3)

In [5]:
# Drop unnecessary columns
data = data[['Date', 'Start (ET)', 'Visitor', 'Vis PTS', 'Home', 'Home PTS']]
rankings = rankings[['Team.3', 'W_Power', 'GF_Power', 'GA_Power', 'Margin_Power']]
rankings = rankings.iloc[:30,:]

In [6]:
# Create home and vis rankings DF
home_rank = rankings.copy()
vis_rank = rankings.copy()

home_rank.columns = ['Home', 'HW', 'HGF', 'HGA', 'HMP']
vis_rank.columns = ['Visitor', 'VW', 'VGF', 'VGA', 'VMP']

In [7]:
# Keep only 2017-18 data
index_to_delete_from = data[data['Date'] == 'Tue, Oct 16, 2018'].index
data = data.iloc[:index_to_delete_from[0],:]

In [8]:
# Create prediction column
data['Home Win'] = data['Home PTS'] > data['Vis PTS']

In [9]:
# Merge rankings and df columns
merged = data.merge(home_rank, on='Home', how='left')
merged = merged.merge(vis_rank, on='Visitor', how='left')
merged

Unnamed: 0,Date,Start (ET),Visitor,Vis PTS,Home,Home PTS,Home Win,HW,HGF,HGA,HMP,VW,VGF,VGA,VMP
0,2017-10-17,8:01p,Boston Celtics,99.0,Cleveland Cavaliers,102.0,True,0.834437,0.870410,0.644397,0.772234,0.867550,0.773218,0.482759,0.739696
1,2017-10-17,10:30p,Houston Rockets,122.0,Golden State Warriors,121.0,False,0.966887,0.967603,0.353448,0.967462,0.900662,0.935205,0.838362,0.902386
2,2017-10-18,7:00p,Charlotte Hornets,90.0,Detroit Pistons,102.0,True,0.370861,0.157667,0.224138,0.316703,0.337748,0.481641,0.418103,0.511931
3,2017-10-18,7:00p,Brooklyn Nets,131.0,Indiana Pacers,140.0,True,0.569536,0.514039,0.450431,0.479393,0.006623,0.611231,0.935345,0.056399
4,2017-10-18,7:00p,Miami Heat,109.0,Orlando Magic,116.0,True,0.139073,0.125270,0.709052,0.088937,0.503311,0.319654,0.159483,0.674620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,2018-05-28,9:00p,Golden State Warriors,101.0,Houston Rockets,92.0,False,0.900662,0.935205,0.838362,0.902386,0.966887,0.967603,0.353448,0.967462
1308,2018-05-31,9:00p,Cleveland Cavaliers,114.0,Golden State Warriors,124.0,True,0.966887,0.967603,0.353448,0.967462,0.834437,0.870410,0.644397,0.772234
1309,2018-06-03,8:00p,Cleveland Cavaliers,103.0,Golden State Warriors,122.0,True,0.966887,0.967603,0.353448,0.967462,0.834437,0.870410,0.644397,0.772234
1310,2018-06-06,9:00p,Golden State Warriors,110.0,Cleveland Cavaliers,102.0,False,0.834437,0.870410,0.644397,0.772234,0.966887,0.967603,0.353448,0.967462


In [10]:
# Create X and y
X = merged.loc[:,'HW':]
y = merged['Home Win']

In [13]:
# Train model function

def train_model(X_train_and_test, y_train_and_test, model):
    ''' Scale, Split, Impute and Train one model '''
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_and_test, y_train_and_test, test_size=0.2, shuffle=False)


    # Impute
    colsT = X_train.columns
    colsV = X_test.columns
    X_train = pd.DataFrame(imputer.fit_transform(X_train))
    X_test = pd.DataFrame(imputer.transform(X_test))
    X_train.columns = colsT
    X_test.columns = colsV
    
    # Scale and replace column names
    X_scaled_train = MMScaler.fit_transform(X_train)
    X_scaled_test = MMScaler.transform(X_test)
    X_scaled_train = pd.DataFrame(X_scaled_train, columns=colsT)
    X_scaled_test = pd.DataFrame(X_scaled_test, columns=colsV)
    
    # Train
    model.fit(X_scaled_train, y_train)
    preds = model.predict(X_scaled_test)

    
    # Combine predictions with actuals
    preds_df = pd.DataFrame(preds, columns=['Predictions'])
    preds_df.index = pd.RangeIndex(start=y_train.last_valid_index()+1, stop=y_train.last_valid_index()+1 + len(y_test))
    predictions_array.append(preds_df)
    preds_and_true = pd.concat([y_test, preds_df], axis=1, ignore_index=True)

    
    # Accuracy
    wins = preds_and_true.apply(lambda x: True if x[0] == True and x[1] == True else False, axis=1)
    losses = preds_and_true.apply(lambda x: True if x[0] == False and x[1] == False else False, axis=1)
    print('Model: ',str(model))
    print('Total test games: ', len(y_test))
    print('Wins predicted correctly: ',len(wins[wins == True].index))
    print('Losses predicted correctly: ',len(losses[losses == True].index))
    print('Percentage predicted correctly: ', (len(wins[wins == True].index) + len(losses[losses == True].index)) / len(preds_and_true))

In [14]:
# Train and test models

predictions_array = []

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

for model in models_array:
    train_model(X, y, model)

Model:  SVC()
Total test games:  263
Wins predicted correctly:  135
Losses predicted correctly:  28
Percentage predicted correctly:  0.6197718631178707
Model:  AdaBoostClassifier(learning_rate=0.8, n_estimators=155)
Total test games:  263
Wins predicted correctly:  124
Losses predicted correctly:  48
Percentage predicted correctly:  0.6539923954372624
Model:  RandomForestClassifier()
Total test games:  263
Wins predicted correctly:  113
Losses predicted correctly:  51
Percentage predicted correctly:  0.623574144486692
Model:  GradientBoostingClassifier()
Total test games:  263
Wins predicted correctly:  121
Losses predicted correctly:  43
Percentage predicted correctly:  0.623574144486692
Model:  HistGradientBoostingClassifier()
Total test games:  263
Wins predicted correctly:  110
Losses predicted correctly:  51
Percentage predicted correctly:  0.6121673003802282
Model:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_by

In [None]:
# Use 2019-2020 rankings to predict 2020-2021
# Use Current 2020-21 Data to predict the rest of the season

# All dates made it worse. Add date columns one by one


In [79]:
# Add dates and time

# Get Day, Month and Year from date column
dates = pd.DataFrame()
dates['Year'] = data['Date'].dt.strftime('%Y')
dates['Month'] = data['Date'].dt.strftime('%m')
dates['Day'] = data['Date'].dt.strftime('%d')

# Add dates
X = pd.concat([X, dates], axis=1)

# Get start time
start_time = data['Start (ET)'].str[:-1]
start_time = start_time.str.replace(':', '.')
start_time = start_time.astype(float)
start_time.columns = ['Start Time']

# Add start time
X = pd.concat([X, start_time], axis=1)
X

Unnamed: 0,HW,HGF,HGA,HMP,VW,VGF,VGA,VMP,Year,Month,Day,Start (ET)
0,0.834437,0.870410,0.644397,0.772234,0.867550,0.773218,0.482759,0.739696,2017,10,17,8.01
1,0.966887,0.967603,0.353448,0.967462,0.900662,0.935205,0.838362,0.902386,2017,10,17,10.30
2,0.370861,0.157667,0.224138,0.316703,0.337748,0.481641,0.418103,0.511931,2017,10,18,7.00
3,0.569536,0.514039,0.450431,0.479393,0.006623,0.611231,0.935345,0.056399,2017,10,18,7.00
4,0.139073,0.125270,0.709052,0.088937,0.503311,0.319654,0.159483,0.674620,2017,10,18,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...
1307,0.900662,0.935205,0.838362,0.902386,0.966887,0.967603,0.353448,0.967462,2018,05,28,9.00
1308,0.966887,0.967603,0.353448,0.967462,0.834437,0.870410,0.644397,0.772234,2018,05,31,9.00
1309,0.966887,0.967603,0.353448,0.967462,0.834437,0.870410,0.644397,0.772234,2018,06,03,8.00
1310,0.834437,0.870410,0.644397,0.772234,0.966887,0.967603,0.353448,0.967462,2018,06,06,9.00


In [85]:
# Train and test models

predictions_array = []

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

for model in models_array:
    train_model(X, y, model)

Model:  SVC()
Total test games:  263
Wins predicted correctly:  162
Losses predicted correctly:  0
Percentage predicted correctly:  0.6159695817490495
Model:  AdaBoostClassifier(learning_rate=0.8, n_estimators=155)
Total test games:  263
Wins predicted correctly:  117
Losses predicted correctly:  51
Percentage predicted correctly:  0.6387832699619772
Model:  RandomForestClassifier()
Total test games:  263
Wins predicted correctly:  115
Losses predicted correctly:  45
Percentage predicted correctly:  0.6083650190114068
Model:  GradientBoostingClassifier()
Total test games:  263
Wins predicted correctly:  116
Losses predicted correctly:  48
Percentage predicted correctly:  0.623574144486692
Model:  HistGradientBoostingClassifier()
Total test games:  263
Wins predicted correctly:  112
Losses predicted correctly:  53
Percentage predicted correctly:  0.6273764258555133
Model:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_by