In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot

# Warnings
import warnings
warnings.simplefilter("ignore", UserWarning)

from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# These models are voting models based off the above models
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingRegressor

# Data prep
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Model evaluations
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold,StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance



In [2]:
# Models

svc = SVC(kernel='rbf', gamma=0.1, C=10) # 5% increase with these hyperparamters
KNC = KNeighborsClassifier(weights='distance', p=2, n_neighbors=10, metric='euclidean', leaf_size=40) # 2.7% increase with these hp
ADBC = AdaBoostClassifier(n_estimators=155, learning_rate=0.01) # 2% increase with these hp
RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=5) # 1% better with these hyperparameters

GBC = GradientBoostingClassifier(n_estimators=500, learning_rate=0.15) # 2% better
HGBC = HistGradientBoostingClassifier(min_samples_leaf=25, max_leaf_nodes=80, max_iter=100, max_depth=None, learning_rate=0.1, l2_regularization=1.5) # 2% better
XGB = XGBClassifier(n_estimators=150, learning_rate=0.1) # 1.7% better with hp
QDA = QuadraticDiscriminantAnalysis() # Same with default hp

# Imputer
imputer = SimpleImputer()
MMScaler = MinMaxScaler()

In [3]:
# Read data
data = pd.read_excel('./content/NBA_COMBINED.xlsx', parse_dates=['Date'])

# Drop unneccesary columns
data = data.drop(columns=['PTS22', 'PTS3', 'Attend.'], axis=1)

# Add point diff column to predict. True or False
data['Home Points Differ'] = data['Home PTS'] > data['Vis PTS']

y_all = data['Home Points Differ']

data.loc[len(data.loc[data['Vis PTS'] > 0]):,'Home Points Differ'] = 0

data

Unnamed: 0,Date,Start (ET),Visitor,Vis PTS,Home,Home PTS,2016-17 Vis Rank,2016-17 Home Rank,2017-18 Vis Rank,2017-18 Home Rank,2018-19 Vis Rank,2018-19 Home Rank,2019-20 Vis Rank,2019-20 Home Rank,Home Points Differ
0,2017-10-17,8:01p,Boston Celtics,99.0,Cleveland Cavaliers,102.0,4,5,,,,,,,True
1,2017-10-17,10:30p,Houston Rockets,122.0,Golden State Warriors,121.0,3,1,,,,,,,False
2,2017-10-18,7:00p,Charlotte Hornets,90.0,Detroit Pistons,102.0,20,19,,,,,,,True
3,2017-10-18,7:00p,Brooklyn Nets,131.0,Indiana Pacers,140.0,30,13,,,,,,,True
4,2017-10-18,7:00p,Miami Heat,109.0,Orlando Magic,116.0,17,26,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4842,2021-05-16,9:00p,Dallas Mavericks,,Minnesota Timberwolves,,22,24,28.0,12.0,22.0,21.0,12.0,29.0,0
4843,2021-05-16,9:00p,Los Angeles Lakers,,New Orleans Pelicans,,28,21,21.0,9.0,20.0,24.0,3.0,21.0,0
4844,2021-05-16,9:00p,Los Angeles Clippers,,Oklahoma City Thunder,,6,10,18.0,10.0,12.0,10.0,4.0,10.0,0
4845,2021-05-16,9:00p,Denver Nuggets,,Portland Trail Blazers,,18,16,14.0,7.0,4.0,6.0,6.0,15.0,0


In [4]:
# Add dates and time

# Get Day, Month and Year from date column
dates = pd.DataFrame()
dates['Year'] = data['Date'].dt.strftime('%Y')
dates['Month'] = data['Date'].dt.strftime('%m')
dates['Day'] = data['Date'].dt.strftime('%d')

# Add dates
data = pd.concat([data, dates], axis=1)

# Get start time
start_time = data['Start (ET)'].str[:-1]
start_time = start_time.str.replace(':', '.')
start_time = start_time.astype(float)
start_time.columns = ['Start Time']

# Add start time
data = pd.concat([data, start_time], axis=1)
data

Unnamed: 0,Date,Start (ET),Visitor,Vis PTS,Home,Home PTS,2016-17 Vis Rank,2016-17 Home Rank,2017-18 Vis Rank,2017-18 Home Rank,2018-19 Vis Rank,2018-19 Home Rank,2019-20 Vis Rank,2019-20 Home Rank,Home Points Differ,Year,Month,Day,Start (ET).1
0,2017-10-17,8:01p,Boston Celtics,99.0,Cleveland Cavaliers,102.0,4,5,,,,,,,True,2017,10,17,8.01
1,2017-10-17,10:30p,Houston Rockets,122.0,Golden State Warriors,121.0,3,1,,,,,,,False,2017,10,17,10.30
2,2017-10-18,7:00p,Charlotte Hornets,90.0,Detroit Pistons,102.0,20,19,,,,,,,True,2017,10,18,7.00
3,2017-10-18,7:00p,Brooklyn Nets,131.0,Indiana Pacers,140.0,30,13,,,,,,,True,2017,10,18,7.00
4,2017-10-18,7:00p,Miami Heat,109.0,Orlando Magic,116.0,17,26,,,,,,,True,2017,10,18,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4842,2021-05-16,9:00p,Dallas Mavericks,,Minnesota Timberwolves,,22,24,28.0,12.0,22.0,21.0,12.0,29.0,0,2021,05,16,9.00
4843,2021-05-16,9:00p,Los Angeles Lakers,,New Orleans Pelicans,,28,21,21.0,9.0,20.0,24.0,3.0,21.0,0,2021,05,16,9.00
4844,2021-05-16,9:00p,Los Angeles Clippers,,Oklahoma City Thunder,,6,10,18.0,10.0,12.0,10.0,4.0,10.0,0,2021,05,16,9.00
4845,2021-05-16,9:00p,Denver Nuggets,,Portland Trail Blazers,,18,16,14.0,7.0,4.0,6.0,6.0,15.0,0,2021,05,16,9.00


In [5]:
# Drop Ranking columns from prev years

data.drop(['2016-17 Vis Rank', '2016-17 Home Rank', '2017-18 Vis Rank', '2017-18 Home Rank'], inplace=True, axis =1)
# data.columns

# Remove games before 22-10-19
first_games = data.loc[data['Date'] == '2019-10-22'].index
first_games[0]
data = data.loc[first_games[0]:,:]
data

Unnamed: 0,Date,Start (ET),Visitor,Vis PTS,Home,Home PTS,2018-19 Vis Rank,2018-19 Home Rank,2019-20 Vis Rank,2019-20 Home Rank,Home Points Differ,Year,Month,Day,Start (ET).1
2624,2019-10-22,8:00p,New Orleans Pelicans,122.0,Toronto Raptors,130.0,24.0,2.0,,,True,2019,10,22,8.0
2625,2019-10-22,10:30p,Los Angeles Lakers,102.0,Los Angeles Clippers,112.0,20.0,12.0,,,True,2019,10,22,10.3
2626,2019-10-23,7:00p,Chicago Bulls,125.0,Charlotte Hornets,126.0,27.0,17.0,,,True,2019,10,23,7.0
2627,2019-10-23,7:00p,Detroit Pistons,119.0,Indiana Pacers,110.0,16.0,11.0,,,False,2019,10,23,7.0
2628,2019-10-23,7:00p,Cleveland Cavaliers,85.0,Orlando Magic,94.0,28.0,15.0,,,True,2019,10,23,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4842,2021-05-16,9:00p,Dallas Mavericks,,Minnesota Timberwolves,,22.0,21.0,12.0,29.0,0,2021,05,16,9.0
4843,2021-05-16,9:00p,Los Angeles Lakers,,New Orleans Pelicans,,20.0,24.0,3.0,21.0,0,2021,05,16,9.0
4844,2021-05-16,9:00p,Los Angeles Clippers,,Oklahoma City Thunder,,12.0,10.0,4.0,10.0,0,2021,05,16,9.0
4845,2021-05-16,9:00p,Denver Nuggets,,Portland Trail Blazers,,4.0,6.0,6.0,15.0,0,2021,05,16,9.0


In [6]:
# Fill in actual values for home and visitor last win
data["HomeLastWin"] = False
data["VisitorLastWin"] = False

from collections import defaultdict
won_last = defaultdict(int) # Create dictionary won last

for index, row in data.iterrows(): # for each row
  home_team = row['Home'] # Take the home team in the row
  visitor_team = row['Visitor'] # Take the vis team in each row
  row['HomeLastWin'] = won_last[home_team] # If HomeLastWin is true set that team to won in the won_last dict
  row['VisitorLastWin'] = won_last[visitor_team] # If VisitorLastWin is true set that team to won in won_last dict
  data.loc[index] = row # Set the index for the next row?
  # Set current win
  won_last[home_team] = row['Home Points Differ'] # If home won set that in the won_last dict
  won_last[visitor_team] = not row['Home Points Differ'] # if home did not win set that in the won_last dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [7]:
# Add WinStreaks

data['HomeWinStreak'] = 0
data['VisitorWinStreak'] = 0

win_streak = defaultdict(int) #  Create a dictionary for teams winning streaks

for index, row in data.iterrows():
  home_team = row['Home'] # Home team = home team for that row
  visitor_team = row['Visitor'] # Vis team = vis team for that row
  row['HomeWinStreak'] = win_streak[home_team] # HomeWinStreak for that row is looked up in the dictionary win_streak
  row['VisitorWinStreak'] = win_streak[visitor_team] # Set VisitorWinStreak in the row to dict value for that team
  data.loc[index] = row # Set row to next row
  # Set current win streak number
  if row['Home Points Differ']:
    win_streak[home_team] += 1
    win_streak[visitor_team] = 0
  else:
    win_streak[home_team] = 0
    win_streak[visitor_team] += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
# Which team won in their last match?

last_match_winner = defaultdict(int)

def home_team_won_last(row):
  # Variables equal the team names
  home_team = row['Home']
  visitor_team = row['Visitor']

  teams = tuple(sorted([home_team, visitor_team])) # Tuple of the home and visitor team to search for
  result = 1 if last_match_winner[teams] == row['Home'] else 0 # Look in last_match_winner dict for if these teams have played before
  winner = row['Home'] if  row['Home Points Differ'] else row['Visitor'] # Winner variable is home team if the homewin column says it is

  last_match_winner[teams] = winner # Feed the winner into the last_match_winner dict

  return result

data['HomeTeamWonLast'] = data.apply(home_team_won_last, axis=1) # Apply the function on each row (axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Individual Player Rankings

# Team dictionary to change abbr. team names to full length ==== MADE IT WORSE
team_dict = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHI': 'Chicago Bulls',
    'CHO': 'Charlotte Hornets',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

# Function to add in player ranks

def add_player_ranks_from_excel(year):
    # Add in player ranks for home and vis teams
    player_ranks = pd.read_excel('./content/NBA_COMBINED.xlsx', sheet_name='Player Rank '+str(year),)

    # Map Team dictionary
    player_ranks['Team'] = player_ranks['Tm'].map(team_dict)

    # Drop multiple rank entries
    player_ranks.drop_duplicates(subset='Rk', inplace=True, keep='last')

    # Join the team and position columns so we only have 5 players per team
    player_ranks['Joined'] = player_ranks['Team'] + player_ranks['Pos']

    # drop duplicate team members
    player_ranks.drop_duplicates(subset='Joined', inplace=True)

    # drop other columns
    player_ranks = player_ranks[['Rk', 'Pos', 'Team']]

    # Make positions columns
    player_ranks = player_ranks.pivot_table(values='Rk', index='Team', columns='Pos', aggfunc='first')

    # Reset the index to numbers again
    player_ranks.reset_index(inplace=True)

    # Create Home and Vis columns
    columns = ['C', 'PF', 'PG', 'SF', 'SG']
    for col in columns:
      player_ranks['H'+col] = player_ranks[col]
      player_ranks['V'+col] = player_ranks[col]

    Home_ranks_df = player_ranks[['Team', 'HC', 'HPF', 'HPG', 'HSF', 'HSG']]
    Vis_ranks_df = player_ranks[['Team', 'VC', 'VPF', 'VPG', 'VSF', 'VSG']]
    Home_ranks_df.columns = ['Home', 'HC '+year, 'HPF '+year, 'HPG '+year, 'HSF '+year, 'HSG '+year]
    Vis_ranks_df.columns = ['Visitor', 'VC '+year, 'VPF '+year, 'VPG '+year, 'VSF '+year, 'VSG '+year]
    return Home_ranks_df, Vis_ranks_df

In [10]:
# Find individual player rankings from excel
# homeranks16, visranks16 = add_player_ranks_from_excel('2016-17')
# homeranks17, visranks17 = add_player_ranks_from_excel('2017-18')
homeranks18, visranks18 = add_player_ranks_from_excel('2018-19')

# Merge player rankings and data
X = data.copy()
# X = X.merge(homeranks16, on='Home', how='left')
# X = X.merge(visranks16, on='Visitor', how='left')
# X = X.merge(homeranks17, on='Home', how='left')
# X = X.merge(visranks17, on='Visitor', how='left')
X = X.merge(homeranks18, on='Home', how='left')
X = X.merge(visranks18, on='Visitor', how='left')
X = X.drop('Home Points Differ', axis=1)
X.columns


Index(['Date', 'Start (ET)', 'Visitor', 'Vis PTS', 'Home', 'Home PTS',
       '2018-19 Vis Rank', '2018-19 Home Rank', '2019-20 Vis Rank',
       '2019-20 Home Rank', 'Year', 'Month', 'Day', 'Start (ET)',
       'HomeLastWin', 'VisitorLastWin', 'HomeWinStreak', 'VisitorWinStreak',
       'HomeTeamWonLast', 'HC 2018-19', 'HPF 2018-19', 'HPG 2018-19',
       'HSF 2018-19', 'HSG 2018-19', 'VC 2018-19', 'VPF 2018-19',
       'VPG 2018-19', 'VSF 2018-19', 'VSG 2018-19'],
      dtype='object')

In [11]:
# Drop non-feature columns
X = X.loc[:,'2018-19 Vis Rank':]
X.columns

Index(['2018-19 Vis Rank', '2018-19 Home Rank', '2019-20 Vis Rank',
       '2019-20 Home Rank', 'Year', 'Month', 'Day', 'Start (ET)',
       'HomeLastWin', 'VisitorLastWin', 'HomeWinStreak', 'VisitorWinStreak',
       'HomeTeamWonLast', 'HC 2018-19', 'HPF 2018-19', 'HPG 2018-19',
       'HSF 2018-19', 'HSG 2018-19', 'VC 2018-19', 'VPF 2018-19',
       'VPG 2018-19', 'VSF 2018-19', 'VSG 2018-19'],
      dtype='object')

In [12]:
X = X.astype(float)
X.dtypes

2018-19 Vis Rank     float64
2018-19 Home Rank    float64
2019-20 Vis Rank     float64
2019-20 Home Rank    float64
Year                 float64
Month                float64
Day                  float64
Start (ET)           float64
HomeLastWin          float64
VisitorLastWin       float64
HomeWinStreak        float64
VisitorWinStreak     float64
HomeTeamWonLast      float64
HC 2018-19           float64
HPF 2018-19          float64
HPG 2018-19          float64
HSF 2018-19          float64
HSG 2018-19          float64
VC 2018-19           float64
VPF 2018-19          float64
VPG 2018-19          float64
VSF 2018-19          float64
VSG 2018-19          float64
dtype: object

In [13]:
# Drop games that havent been played
rows_with_results = len(data) - len(data[data['Vis PTS'].isna()])

# Training and testing
X_train_and_test = X.iloc[:rows_with_results-1,:]
y_train_and_test = y_all.iloc[:rows_with_results-1]
y_train_and_test = y_train_and_test.astype(bool)
# X = X.drop(['2019-20 Vis Rank'], axis=1)
# X = X.drop(['2019-20 Home Rank'], axis=1)

# Future Games
X_valid = X.iloc[rows_with_results:,:]

# Get future teams and dates
future_teams_and_dates = data.iloc[rows_with_results:,:]
future_teams_and_dates = future_teams_and_dates[['Date', 'Visitor','Home']]

In [14]:
# Train model function

def train_model(X_train_and_test, y_train_and_test, model):
    ''' Scale, Split, Impute and Train one model '''
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_and_test, y_train_and_test, test_size=0.2, shuffle=False)

    pipe = make_pipeline(SimpleImputer(),StandardScaler(), model)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    
    # Combine predictions with actuals
    preds_df = pd.DataFrame(preds, columns=['Predictions'])
    preds_df.index = pd.RangeIndex(start=y_train.last_valid_index()+1, stop=y_train.last_valid_index()+1 + len(y_test))
    predictions_array.append(preds_df)
    preds_and_true = pd.concat([y_test, preds_df], axis=1, ignore_index=True)

    
    # Accuracy
    wins = preds_and_true.apply(lambda x: True if x[0] == True and x[1] == True else False, axis=1)
    losses = preds_and_true.apply(lambda x: True if x[0] == False and x[1] == False else False, axis=1)
    print('Model: ',str(model))
    print('Total test games: ', len(y_test))
    print('Wins predicted correctly: ',len(wins[wins == True].index))
    print('Losses predicted correctly: ',len(losses[losses == True].index))
    print('Percentage predicted correctly: ', (len(wins[wins == True].index) + len(losses[losses == True].index)) / len(preds_and_true))
    
    return preds_df

In [15]:
# Without individual player rankings ( 1.4% worse all up )

X_without_players = X_train_and_test.loc[:,:'HomeTeamWonLast']

# Train and test models

predictions_array = []

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

for model in models_array:
    train_model(X_train_and_test, y_train_and_test, model)

Model:  SVC(C=10, gamma=0.1)
Total test games:  439
Wins predicted correctly:  162
Losses predicted correctly:  56
Percentage predicted correctly:  0.49658314350797267
Model:  AdaBoostClassifier(learning_rate=0.01, n_estimators=155)
Total test games:  439
Wins predicted correctly:  259
Losses predicted correctly:  0
Percentage predicted correctly:  0.5899772209567198
Model:  RandomForestClassifier(min_samples_split=5, n_estimators=1000)
Total test games:  439
Wins predicted correctly:  222
Losses predicted correctly:  30
Percentage predicted correctly:  0.5740318906605922
Model:  GradientBoostingClassifier(learning_rate=0.15, n_estimators=500)
Total test games:  439
Wins predicted correctly:  167
Losses predicted correctly:  72
Percentage predicted correctly:  0.5444191343963554
Model:  HistGradientBoostingClassifier(l2_regularization=1.5, max_leaf_nodes=80,
                               min_samples_leaf=25)
Total test games:  439
Wins predicted correctly:  183
Losses predicted correc

In [16]:
# Without individual player rankings ( 1.4% worse all up )

X_without_players = X_train_and_test.loc[:,:'HomeTeamWonLast']

# Train and test models

predictions_array = []

models_array = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

for model in models_array:
    train_model(X_without_players, y_train_and_test, model)

Model:  SVC(C=10, gamma=0.1)
Total test games:  439
Wins predicted correctly:  178
Losses predicted correctly:  65
Percentage predicted correctly:  0.5535307517084282
Model:  AdaBoostClassifier(learning_rate=0.01, n_estimators=155)
Total test games:  439
Wins predicted correctly:  259
Losses predicted correctly:  0
Percentage predicted correctly:  0.5899772209567198
Model:  RandomForestClassifier(min_samples_split=5, n_estimators=1000)
Total test games:  439
Wins predicted correctly:  211
Losses predicted correctly:  25
Percentage predicted correctly:  0.5375854214123007
Model:  GradientBoostingClassifier(learning_rate=0.15, n_estimators=500)
Total test games:  439
Wins predicted correctly:  168
Losses predicted correctly:  70
Percentage predicted correctly:  0.5421412300683371
Model:  HistGradientBoostingClassifier(l2_regularization=1.5, max_leaf_nodes=80,
                               min_samples_leaf=25)
Total test games:  439
Wins predicted correctly:  172
Losses predicted correct

In [17]:
# Make future predictions Without player rankings


X_train, X_test, y_train, y_test = train_test_split(X_without_players, y_train_and_test, test_size=0.2, shuffle=False)

all_predictions = pd.DataFrame(y_test)
for i in predictions_array:
    all_predictions = pd.concat([all_predictions, i], axis=1)
    
all_predictions.columns =['Home Points Differ', 'SVC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB', 'QDA', 'KNC']

In [18]:
# Make future predictions

future_models = [svc, ADBC, RFC, GBC, HGBC, XGB, QDA, KNC]

def make_preds(X_train, y_train, X_predict, model):
    # Impute
    colsT = X_train.columns
    colsV = X_predict.columns
    X_train = pd.DataFrame(imputer.fit_transform(X_train))
    X_predict = pd.DataFrame(imputer.transform(X_predict))
    X_train.columns = colsT
    X_predict.columns = colsV


    # Train
    model.fit(X_train, y_train)
    preds = model.predict(X_predict)
    return preds

In [20]:
print(X_valid.columns)
X_without_players_valid = X_valid.loc[:,:'HomeTeamWonLast']
print(X_without_players_valid.columns)
print(X_without_players.columns)

Index(['2018-19 Vis Rank', '2018-19 Home Rank', '2019-20 Vis Rank',
       '2019-20 Home Rank', 'Year', 'Month', 'Day', 'Start (ET)',
       'HomeLastWin', 'VisitorLastWin', 'HomeWinStreak', 'VisitorWinStreak',
       'HomeTeamWonLast', 'HC 2018-19', 'HPF 2018-19', 'HPG 2018-19',
       'HSF 2018-19', 'HSG 2018-19', 'VC 2018-19', 'VPF 2018-19',
       'VPG 2018-19', 'VSF 2018-19', 'VSG 2018-19'],
      dtype='object')
Index(['2018-19 Vis Rank', '2018-19 Home Rank', '2019-20 Vis Rank',
       '2019-20 Home Rank', 'Year', 'Month', 'Day', 'Start (ET)',
       'HomeLastWin', 'VisitorLastWin', 'HomeWinStreak', 'VisitorWinStreak',
       'HomeTeamWonLast'],
      dtype='object')
Index(['2018-19 Vis Rank', '2018-19 Home Rank', '2019-20 Vis Rank',
       '2019-20 Home Rank', 'Year', 'Month', 'Day', 'Start (ET)',
       'HomeLastWin', 'VisitorLastWin', 'HomeWinStreak', 'VisitorWinStreak',
       'HomeTeamWonLast'],
      dtype='object')


In [21]:
# Append the predictions onto the entire data and keep only date, teams and prediction columns
future_predictions_array = []

for model in future_models:
    preds = make_preds(X_without_players, y_train_and_test, X_without_players_valid, model)
    future_predictions_array.append(preds)

len(future_predictions_array)



8

In [22]:
# Concat

future_predictions = pd.DataFrame(future_teams_and_dates)
future_predictions = future_predictions.reset_index(drop=True)

for i in future_predictions_array:
    df = pd.DataFrame(i)
    future_predictions = pd.concat([future_predictions, df], axis=1)

future_predictions.columns = ['Date', 'Visitor', 'Home' , 'SVC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB', 'QDA', 'KNC']
future_predictions

Unnamed: 0,Date,Visitor,Home,SVC,ADBC,RFC,GBC,HGBC,XGB,QDA,KNC
0,2021-05-14,Cleveland Cavaliers,Washington Wizards,True,True,True,True,True,True,True,True
1,2021-05-14,Denver Nuggets,Detroit Pistons,True,True,True,True,True,True,True,True
2,2021-05-14,Utah Jazz,Oklahoma City Thunder,False,True,True,True,True,True,True,True
3,2021-05-14,Orlando Magic,Philadelphia 76ers,True,True,True,True,True,True,True,True
4,2021-05-14,Toronto Raptors,Dallas Mavericks,True,True,False,False,False,False,False,True
5,2021-05-14,Los Angeles Clippers,Houston Rockets,True,True,True,True,True,True,True,True
6,2021-05-14,Sacramento Kings,Memphis Grizzlies,False,True,True,True,False,False,True,False
7,2021-05-14,New Orleans Pelicans,Golden State Warriors,True,True,True,True,False,True,True,False
8,2021-05-15,Chicago Bulls,Brooklyn Nets,True,True,True,True,True,False,True,True
9,2021-05-15,Los Angeles Lakers,Indiana Pacers,True,True,True,True,True,True,True,True


In [23]:
future_predictions.to_excel('future_predictions_one_year.xlsx', index=False)

In [48]:
past_predictions = pd.DataFrame()
past_predictions = past_predictions.reset_index(drop=True)

for i in predictions_array:
    df = pd.DataFrame(i)
    past_predictions = pd.concat([past_predictions, df], axis=1)

past_predictions.columns = ['SVC', 'ADBC', 'RFC', 'GBC', 'HGBC', 'XGB', 'QDA', 'KNC']
past_predictions

Unnamed: 0,SVC,ADBC,RFC,GBC,HGBC,XGB,QDA,KNC
1394,True,True,True,True,True,False,True,False
1395,True,True,True,True,True,True,True,False
1396,True,True,True,True,False,True,False,True
1397,True,True,True,True,True,True,True,True
1398,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...
1738,True,True,True,True,True,True,True,True
1739,True,True,False,False,False,False,False,True
1740,True,True,True,True,True,True,True,False
1741,True,True,True,False,True,False,True,True


In [49]:
past_predictions.to_excel('Past_predictions.xlsx')