In [442]:
# Importing Libraries & Functions


#Normal Stuff
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import warnings
warnings.filterwarnings('ignore')

#StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import Logit

#SciKit Learn 
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import r2_score,mean_squared_error, confusion_matrix, mean_absolute_error, accuracy_score, auc, roc_curve, roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier

import xgboost as xgb
from xgboost import XGBClassifier

def outputs(cm):
    acc = np.round((cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel()),4)
    tpr = np.round(cm.ravel()[3] / (cm.ravel()[3] + cm.ravel()[2]),4)
    fpr = np.round(cm.ravel()[1] / (cm.ravel()[1] + cm.ravel()[0]),4)
    outputs = [acc, tpr, fpr]
    return outputs

def cross_val_plot(x, y): 
    sns.set_style('darkgrid')
    fig, ax = plt.subplots(figsize = (12,6))
    
    plt.plot(x, y)

    plt.title('Accuracy v.s. Parameter', fontsize = 20, fontweight = 'bold', pad = 15)
    plt.xlabel('Parameter Value', fontsize = 14, labelpad = 15), plt.ylabel('Accuracy', fontsize = 14, labelpad = 15)
    plt.xticks(fontsize = 12), plt.yticks(fontsize = 12);
    plt.show();
    return

## Data Processing

In [2]:
# Loading Data + Only Newer seasons for simplicity + Filtering Unnecessary Columns 
games = pd.read_csv('games.csv')
timestamp = pd.to_datetime(games['GAME_DATE_EST'])
games.insert(0, 'TIMESTAMP', timestamp)
games = games[games['SEASON'].isin([2010,2011,2012,2013,2014,2015,2016,2017,2018, 2019])].sort_values('TIMESTAMP')

teams = pd.read_csv('teams.csv')
teams = teams[['TEAM_ID','NICKNAME']]
team_dict = teams.set_index('TEAM_ID').T.to_dict('list')
games['HOME_TEAM_NAME'] = games['HOME_TEAM_ID'].map(team_dict)
games['AWAY_TEAM_NAME'] = games['VISITOR_TEAM_ID'].map(team_dict)


drop_cols = ['GAME_DATE_EST', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID', 
             'VISITOR_TEAM_ID', 'TEAM_ID_home', 'TEAM_ID_away', 'SEASON']
games = games.drop(drop_cols, axis = 1).reset_index(drop = True)
front_cols = ['TIMESTAMP', 'GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']
temp_games = games[front_cols]
games = pd.concat([temp_games, games.drop(front_cols, axis = 1)], axis = 1)
games['HOME_TEAM_NAME'] = games['HOME_TEAM_NAME'].apply(lambda x: x[0])
games['AWAY_TEAM_NAME'] = games['AWAY_TEAM_NAME'].apply(lambda x: x[0])

#Sanity Check: Making sure each teams plays 82 regular szn games
# reg_start = '2018-10-16'
# playoff_start = '2019-04-13'
# reg = games[(games["TIMESTAMP"] >= reg_start) & (games["TIMESTAMP"] < playoff_start)].reset_index(drop = True)
# game_counts = reg['HOME_TEAM_NAME'].value_counts().sort_index() + reg['AWAY_TEAM_NAME'].value_counts().sort_index()
# print('Number of teams: ', len(reg['HOME_TEAM_NAME'].value_counts()))
# print('Number of teams playing 82 games: ', np.count_nonzero(game_counts == 82))
reg = games.copy()



## F2: Cumulative Player Stats

In [3]:
def pos_stat(stat_list):
    
    
    ### Takes in a list of statistics to consider from the DETAILS data frame and calculates
    ### the stats by position in each home for the home and away team.
    ### NOTE: THESE ARE NOT CUMULATIVE STATS!!!!!
    

    # Removing non-starters, getting rid of unnecessary columns
    details = pd.read_csv('games_details.csv')
    details = details[~details['START_POSITION'].isna()]
    details['TEAM_NAME'] = details['TEAM_ID'].map(team_dict).apply(lambda x: x[0])
    details['MINS'] = details['MIN'].str.split(':').apply(lambda x: float(x[0]) + (float(x[1])/60))
    details = details.drop(['TEAM_ABBREVIATION', 'TEAM_CITY', 'COMMENT', 'MIN'], axis = 1)
    temp_cols = ['GAME_ID','START_POSITION', 'TEAM_NAME']
    final_cols = np.append(temp_cols, stat_list).flatten()
    details = details[final_cols]
    #details = details.iloc[:300, :]
    
    output_df = pd.DataFrame()
    for stat in stat_list: 
        
        
        # Groupby the GAME, POSITION and TEAM and add up all stats by each position 
        df = details.copy()
        df = df.groupby(['GAME_ID', 'START_POSITION', 'TEAM_NAME']).sum().reset_index()
        df = pd.merge(df, games[['GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']], on = 'GAME_ID')

        # Need to Re-Order the DF s.t. home and away are known 
        home_condition = (df['TEAM_NAME'] == df['HOME_TEAM_NAME'])
        away_condition = (df['TEAM_NAME'] != df['HOME_TEAM_NAME'])

        # Adding HOME Columns Manually 
        df['C_' + str(stat) + '_home'] = df[ home_condition & (df['START_POSITION'] == 'C')][stat]
        df['F_' + str(stat) + '_home'] = df[ home_condition & (df['START_POSITION'] == 'F')][stat]
        df['G_' + str(stat) + '_home'] = df[ home_condition & (df['START_POSITION'] == 'G')][stat]

        # Adding AWAY Columns Manually 
        df['C_' + str(stat) + '_away'] = df[ away_condition & (df['START_POSITION'] == 'C')][stat]
        df['F_' + str(stat) + '_away'] = df[ away_condition & (df['START_POSITION'] == 'F')][stat]
        df['G_' + str(stat) + '_away'] = df[ away_condition & (df['START_POSITION'] == 'G')][stat]

        # Grouping Again to Get Rid of NaN 
        df = df.groupby('GAME_ID', as_index = False).sum()
        if stat_list.index(stat) > 0: 
            df = df.drop('GAME_ID', axis = 1)
        # Append to final data frame 
        output_df = pd.concat([output_df, df], axis = 1)

    output_df = output_df.drop(stat_list, axis = 1)
    return output_df

In [4]:
stat_list = ['FGM', 'FGA','FG_PCT', 'FG3M', 'FG3A', 
             'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
             'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 
             'PF', 'PTS', 'PLUS_MINUS']
pos_df = pos_stat(stat_list)

In [5]:
# Calculing Player Efficiency Rating
# Formula: (PTS + REB + AST + STL + BLK − Missed FG − Missed FT - TO) / GP
p1 = pos_df['C_PTS_home'] + pos_df['C_REB_home'] + pos_df['C_AST_home'] + pos_df['C_STL_home']
missed_fg = (pos_df['C_FGA_home'] - pos_df['C_FGM_home'])
missed_ft = pos_df['C_FTA_home'] - pos_df['C_FTM_home']
p2 = (missed_fg + missed_ft + pos_df['C_TO_home'])
per = (p1 - p2)/82

for pos in ['C', 'F', 'G']: 
    p1 = pos_df[pos + '_PTS_home'] + pos_df[pos + '_REB_home'] + pos_df[pos + '_AST_home'] + pos_df[pos + '_STL_home']
    missed_fg = (pos_df[pos + '_FGA_home'] - pos_df[pos + '_FGM_home'])
    missed_ft = pos_df[pos + '_FTA_home'] - pos_df[pos + '_FTM_home']
    p2 = (missed_fg + missed_ft + pos_df[pos + '_TO_home'])
    per = (p1-p2)/82
    pos_df[pos + '_PER_home'] = per
    
for pos in ['C', 'F', 'G']: 
    p1 = pos_df[pos + '_PTS_away'] + pos_df[pos + '_REB_away'] + pos_df[pos + '_AST_away'] + pos_df[pos + '_STL_away']
    missed_fg = (pos_df[pos + '_FGA_away'] - pos_df[pos + '_FGM_away'])
    missed_ft = pos_df[pos + '_FTA_away'] - pos_df[pos + '_FTM_away']
    p2 = (missed_fg + missed_ft + pos_df[pos + '_TO_away'])
    per = (p1-p2)/82
    pos_df[pos + '_PER_away'] = per

## F3: Momentum & Recency


In [458]:
def cum_avg(arr): 
    temp_list = [np.mean(arr[:i]) for i in np.arange(len(arr)+1)][1:]
    return np.append(np.nan, temp_list)[:-1]

def cum_5_avg(arr): 
    means = []
    for i in np.arange(len(arr)): 
        if i < 5: 
            means = np.append(means, np.nan)
        else: 
            means = np.append(means, np.mean(arr[i-5: i+1]))
    return means

def cum_10_avg(arr): 
    means = []
    for i in np.arange(len(arr)): 
        if i < 10: 
            means = np.append(means, np.nan)
        else: 
            means = np.append(means, np.mean(arr[i-10: i+1]))
    return means

def cum_wins(arr): 
    temp_list = [np.sum(arr[:i]) for i in np.arange(len(arr)+1)][1:]
    return np.append(np.nan, temp_list)[:-1]

def cum_5_wins(arr): 
    wins = []
    for i in np.arange(len(arr)): 
        if i < 5: 
            wins = np.append(wins, np.nan)
        else: 
            wins = np.append(wins, np.sum(arr[i-5: i+1]))
    return wins

def cum_10_wins(arr): 
    wins = []
    for i in np.arange(len(arr)): 
        if i < 10: 
            wins = np.append(wins, np.nan)
        else: 
            wins = np.append(wins, np.sum(arr[i-10: i+1]))
    return wins

def cum_losses(arr): 
    temp_list = [np.count_nonzero(arr[:i]==0) for i in np.arange(len(arr)+1)][1:]
    return np.append(np.nan, temp_list)[:-1]

def cum_5_losses(arr): 
    losses = []
    for i in np.arange(len(arr)): 
        if i < 5: 
            losses = np.append(losses, np.nan)
        else: 
            losses = np.append(losses, np.count_nonzero(arr[i-5: i+1]==0))
    return losses

def cum_10_losses(arr): 
    losses = []
    for i in np.arange(len(arr)): 
        if i < 10: 
            losses = np.append(losses, np.nan)
        else: 
            losses = np.append(losses, np.count_nonzero(arr[i-10: i+1]==0))
    return losses

In [16]:
# Cum Stats Home
df = reg.copy()
df = df.merge(pos_df, on = 'GAME_ID')
df['TEAM_LIST'] = df['HOME_TEAM_NAME'] + ', ' + df['AWAY_TEAM_NAME']
df_home = df.drop('AWAY_TEAM_NAME', axis = 1)
df_home = df.sort_values('TIMESTAMP', ascending = True)
column_list = df_home.columns[4:]
home_stat_list = [stat for stat in column_list if 'home' in stat]

t = time.time()
for stat in home_stat_list: 
    
    keep_cols = np.append(['TIMESTAMP', 'TEAM_LIST','GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME'], stat)
    df_home_temp = df_home[keep_cols]
    
    # Two Grouped DFs
    df_1 = df_home_temp.groupby(['HOME_TEAM_NAME', 'GAME_ID']).mean()
    df_2 = df_home_temp.groupby(['HOME_TEAM_NAME']).agg(list)

    # Using Functions above to calulate the cumulative values as lists
    cum_lists = df_2.apply(lambda x: cum_avg(x[stat]), axis = 1)
    cum_5_lists = df_2.apply(lambda x: cum_5_avg(x[stat]), axis = 1)
    cum_10_lists = df_2.apply(lambda x: cum_10_avg(x[stat]), axis = 1)

    # Converting these nested lists to single lists
    cum_vals = cum_lists.to_frame().explode(0)[0].to_numpy()
    cum_5_vals = cum_5_lists.to_frame().explode(0)[0].to_numpy()
    cum_10_vals = cum_10_lists.to_frame().explode(0)[0].to_numpy()

    df_1 = df_1.reset_index()
    df_1['cum_' + str(stat)] = cum_vals
    df_1['cum_5_' + str(stat)] = cum_5_vals
    df_1['cum_10_' + str(stat)] = cum_10_vals
    
    # Converting to Dictionaries and Adding to Large DF
    cum_dict = df_1.set_index('GAME_ID').iloc[:, -3].T.to_dict()
    cum_dict_5 = df_1.set_index('GAME_ID').iloc[:, -2].T.to_dict()
    cum_dict_10 = df_1.set_index('GAME_ID').iloc[:, -1].T.to_dict()

    df_home['cum_' + str(stat)] = df['GAME_ID'].map(cum_dict)
    df_home['cum_5_' + str(stat)] = df['GAME_ID'].map(cum_dict_5)
    df_home['cum_10_' + str(stat)] = df['GAME_ID'].map(cum_dict_10)
    
    
    # HEAD TO HEAD STATS ==> SAME THING BASICALLY 
    
    # Two Grouped DFs
    df_1_H2H = df_home.groupby(['TEAM_LIST', 'GAME_ID']).mean()
    df_2_H2H = df_home.groupby('TEAM_LIST').agg(list)
    
    # Using Functions above to calulate the cumulative values as lists
    cum_lists_H2H = df_2_H2H.apply(lambda x: cum_avg(x[stat]), axis = 1)
    cum_5_lists_H2H = df_2_H2H.apply(lambda x: cum_5_avg(x[stat]), axis = 1)
    cum_10_lists_H2H = df_2_H2H.apply(lambda x: cum_10_avg(x[stat]), axis = 1)

    # Converting these nested lists to single lists
    cum_vals_H2H = cum_lists_H2H.to_frame().explode(0)[0].to_numpy()
    cum_5_vals_H2H = cum_5_lists_H2H.to_frame().explode(0)[0].to_numpy()
    cum_10_vals_H2H = cum_5_lists_H2H.to_frame().explode(0)[0].to_numpy()

    #df_1_H2H = df_1_H2H.reset_index()
    df_1_H2H['cum_' + str(stat) + '_H2H'] = cum_vals_H2H
    df_1_H2H['cum_5_' + str(stat) + '_H2H'] = cum_5_vals_H2H
    df_1_H2H['cum_10_' + str(stat) + '_H2H'] = cum_10_vals_H2H
    
   
    # Converting to Dictionaries and Adding to Large DF
    cum_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -3].T.to_dict()
    cum_5_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -2].T.to_dict()
    cum_10_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -1].T.to_dict()
    
    df_home['cum_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_dict_H2H)
    df_home['cum_5_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_5_dict_H2H)
    df_home['cum_10_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_10_dict_H2H)
  
    
    

    
    print('# ' + str(home_stat_list.index(stat)) + ' of ' + str(len(home_stat_list)) + ' COMPLETE: ', str(stat), '-- TIME: ', time.time()-t)


# 0 of 66 COMPLETE:  PTS_home -- TIME:  19.914891958236694
# 1 of 66 COMPLETE:  FG_PCT_home -- TIME:  38.72488498687744
# 2 of 66 COMPLETE:  FT_PCT_home -- TIME:  59.661585092544556
# 3 of 66 COMPLETE:  FG3_PCT_home -- TIME:  79.63998293876648
# 4 of 66 COMPLETE:  AST_home -- TIME:  101.40286898612976
# 5 of 66 COMPLETE:  REB_home -- TIME:  126.62532591819763
# 6 of 66 COMPLETE:  C_FGM_home -- TIME:  156.09960889816284
# 7 of 66 COMPLETE:  F_FGM_home -- TIME:  187.67229008674622
# 8 of 66 COMPLETE:  G_FGM_home -- TIME:  211.35696506500244
# 9 of 66 COMPLETE:  C_FGA_home -- TIME:  236.4054229259491
# 10 of 66 COMPLETE:  F_FGA_home -- TIME:  262.60438084602356
# 11 of 66 COMPLETE:  G_FGA_home -- TIME:  292.63565492630005
# 12 of 66 COMPLETE:  C_FG_PCT_home -- TIME:  318.6850309371948
# 13 of 66 COMPLETE:  F_FG_PCT_home -- TIME:  349.52812099456787
# 14 of 66 COMPLETE:  G_FG_PCT_home -- TIME:  380.8778989315033
# 15 of 66 COMPLETE:  C_FG3M_home -- TIME:  415.42972683906555
# 16 of 66 COMP

In [21]:
# Cum Stats Away
df_away = df.drop('HOME_TEAM_NAME', axis = 1)
df_away = df.sort_values('TIMESTAMP', ascending = True)
column_list = df_away.columns[4:]
away_stat_list = [stat for stat in column_list if 'away' in stat]

t = time.time()
for stat in away_stat_list: 
    
    keep_cols = np.append(['TIMESTAMP', 'TEAM_LIST','GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME'], stat)
    df_away_temp = df_away[keep_cols]
    
    # Two Grouped DFs
    df_1 = df_away_temp.groupby(['AWAY_TEAM_NAME', 'GAME_ID']).mean()
    df_2 = df_away_temp.groupby(['AWAY_TEAM_NAME']).agg(list)

    # Using Functions above to calulate the cumulative values as lists
    cum_lists = df_2.apply(lambda x: cum_avg(x[stat]), axis = 1)
    cum_5_lists = df_2.apply(lambda x: cum_5_avg(x[stat]), axis = 1)
    cum_10_lists = df_2.apply(lambda x: cum_10_avg(x[stat]), axis = 1)

    # Converting these nested lists to single lists
    cum_vals = cum_lists.to_frame().explode(0)[0].to_numpy()
    cum_5_vals = cum_5_lists.to_frame().explode(0)[0].to_numpy()
    cum_10_vals = cum_10_lists.to_frame().explode(0)[0].to_numpy()

    df_1 = df_1.reset_index()
    df_1['cum_' + str(stat)] = cum_vals
    df_1['cum_5_' + str(stat)] = cum_5_vals
    df_1['cum_10_' + str(stat)] = cum_10_vals
    
    # Converting to Dictionaries and Adding to Large DF
    cum_dict = df_1.set_index('GAME_ID').iloc[:, -3].T.to_dict()
    cum_dict_5 = df_1.set_index('GAME_ID').iloc[:, -2].T.to_dict()
    cum_dict_10 = df_1.set_index('GAME_ID').iloc[:, -1].T.to_dict()

    df_away['cum_' + str(stat)] = df['GAME_ID'].map(cum_dict)
    df_away['cum_5_' + str(stat)] = df['GAME_ID'].map(cum_dict_5)
    df_away['cum_10_' + str(stat)] = df['GAME_ID'].map(cum_dict_10)

    
    
    
    # HEAD TO HEAD STATS ==> SAME THING BASICALLY 
    
    
    
    # Two Grouped DFs
    df_1_H2H = df_away.groupby(['TEAM_LIST', 'GAME_ID']).mean()
    df_2_H2H = df_away.groupby('TEAM_LIST').agg(list)
    
    # Using Functions above to calulate the cumulative values as lists
    cum_lists_H2H = df_2_H2H.apply(lambda x: cum_avg(x[stat]), axis = 1)
    cum_5_lists_H2H = df_2_H2H.apply(lambda x: cum_5_avg(x[stat]), axis = 1)
    cum_10_lists_H2H = df_2_H2H.apply(lambda x: cum_10_avg(x[stat]), axis = 1)

    # Converting these nested lists to single lists
    cum_vals_H2H = cum_lists_H2H.to_frame().explode(0)[0].to_numpy()
    cum_5_vals_H2H = cum_5_lists_H2H.to_frame().explode(0)[0].to_numpy()
    cum_10_vals_H2H = cum_5_lists_H2H.to_frame().explode(0)[0].to_numpy()

    #df_1_H2H = df_1_H2H.reset_index()
    df_1_H2H['cum_' + str(stat) + '_H2H'] = cum_vals_H2H
    df_1_H2H['cum_5_' + str(stat) + '_H2H'] = cum_5_vals_H2H
    df_1_H2H['cum_10_' + str(stat) + '_H2H'] = cum_10_vals_H2H
    
   
    # Converting to Dictionaries and Adding to Large DF
    cum_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -3].T.to_dict()
    cum_5_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -2].T.to_dict()
    cum_10_dict_H2H = df_1_H2H.droplevel(0).iloc[:, -1].T.to_dict()
    
    df_away['cum_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_dict_H2H)
    df_away['cum_5_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_5_dict_H2H)
    df_away['cum_10_' + str(stat) + '_H2H'] = df['GAME_ID'].map(cum_10_dict_H2H)
    
    
    print('# ' + str(away_stat_list.index(stat)+1) + ' of ' + str(len(away_stat_list)-1) + ' COMPLETE: ', str(stat), '-- TIME: ', time.time()-t)


# 1 of 65 COMPLETE:  PTS_away -- TIME:  16.15410804748535
# 2 of 65 COMPLETE:  FG_PCT_away -- TIME:  32.72529602050781
# 3 of 65 COMPLETE:  FT_PCT_away -- TIME:  49.19173812866211
# 4 of 65 COMPLETE:  FG3_PCT_away -- TIME:  66.24934792518616
# 5 of 65 COMPLETE:  AST_away -- TIME:  83.94797992706299
# 6 of 65 COMPLETE:  REB_away -- TIME:  102.28905582427979
# 7 of 65 COMPLETE:  C_FGM_away -- TIME:  121.6736581325531
# 8 of 65 COMPLETE:  F_FGM_away -- TIME:  140.9667067527771
# 9 of 65 COMPLETE:  G_FGM_away -- TIME:  160.62837076187134
# 10 of 65 COMPLETE:  C_FGA_away -- TIME:  180.8727867603302
# 11 of 65 COMPLETE:  F_FGA_away -- TIME:  201.69956493377686
# 12 of 65 COMPLETE:  G_FGA_away -- TIME:  223.4031798839569
# 13 of 65 COMPLETE:  C_FG_PCT_away -- TIME:  245.4473750591278
# 14 of 65 COMPLETE:  F_FG_PCT_away -- TIME:  267.7765049934387
# 15 of 65 COMPLETE:  G_FG_PCT_away -- TIME:  290.7130858898163
# 16 of 65 COMPLETE:  C_FG3M_away -- TIME:  315.6560769081116
# 17 of 65 COMPLETE:  

In [814]:
# Combining the Home and Away DataFrames 
df_home_2 = df_home.copy()
df_away_2 = df_away.copy()
df_home_2 = df_home_2.drop(np.append(home_stat_list, away_stat_list,), axis = 1)
# df_home_2 = df_home_2.drop(['C_PER_away', 'F_PER_away', 'G_PER_away'], axis = 1)
# df_away_2 = df_away_2.drop(['C_PER_away', 'F_PER_away', 'G_PER'], axis = 1)
df_away_2 = df_away_2.drop(np.append(home_stat_list, away_stat_list), axis = 1)

df_clean = pd.merge(df_home_2, df_away_2, on = 'GAME_ID')
xcol = [col for col in df_clean.columns if '_x' in col]
ycol = [col for col in df_clean.columns if '_y' in col]
df_clean = df_clean.drop(ycol, axis = 1)
df_clean.columns = df_clean.columns.str.replace(r'_x$', '')

In [815]:
# Home Win/Loss Counter
df_clean2 = df_clean.copy()

game_szn_dict = pd.read_csv('games.csv')[['GAME_ID','SEASON']].set_index('GAME_ID').iloc[:, 0].T.to_dict()
df_clean2['SEASON'] = df_clean2['GAME_ID'].map(game_szn_dict)

keep_cols = ['TIMESTAMP', 'SEASON', 'HOME_TEAM_WINS','GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']
df_clean2_temp = df_clean2[keep_cols]
df_clean2_temp['AWAY_TEAM_WINS'] = df_clean2_temp['HOME_TEAM_WINS'].map({0:1, 1:0})

# Two Grouped DFs
df_g1 = df_clean2_temp.groupby(['HOME_TEAM_NAME', 'SEASON','GAME_ID']).mean()
df_g2 = df_clean2_temp.groupby(['HOME_TEAM_NAME']).agg(list)

# Using Functions above to calulate the cumulative values as lists

    # HOME WINS!
cum_wins_home = df_g2['HOME_TEAM_WINS'].apply(cum_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_5_wins_home = df_g2['HOME_TEAM_WINS'].apply(cum_5_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_10_wins_home = df_g2['HOME_TEAM_WINS'].apply(cum_10_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()

    # HOME WINS!
cum_losses_home = df_g2['HOME_TEAM_WINS'].apply(cum_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_5_losses_home = df_g2['HOME_TEAM_WINS'].apply(cum_5_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_10_losses_home = df_g2['HOME_TEAM_WINS'].apply(cum_10_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()


# #df_clean = df_clean.reset_index()
df_clean2['cum_WINS_home'] = cum_wins_home
df_clean2['cum_LOSSES_home'] = cum_losses_home
df_clean2['cum_5_WINS_home'] = cum_5_wins_home
df_clean2['cum_10_WINS_home'] = cum_10_wins_home
df_clean2['cum_5_LOSSES_home'] = cum_5_losses_home
df_clean2['cum_10_LOSSES_home'] = cum_10_losses_home


In [816]:
# Away Win/Loss Counter


df_clean2_temp['AWAY_TEAM_WINS'] = df_clean2_temp['HOME_TEAM_WINS'].map({0:1, 1:0})

# Two Grouped DFs
df_g1 = df_clean2_temp.groupby(['AWAY_TEAM_NAME', 'SEASON','GAME_ID']).mean()
df_g2 = df_clean2_temp.groupby(['AWAY_TEAM_NAME']).agg(list)

# Using Functions above to calulate the cumulative values as lists

    # AWAY WINS!
cum_wins_away = df_g2['AWAY_TEAM_WINS'].apply(cum_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_5_wins_away = df_g2['AWAY_TEAM_WINS'].apply(cum_5_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_10_wins_away = df_g2['AWAY_TEAM_WINS'].apply(cum_10_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()

    # AWAY LOSSES
cum_losses_away = df_g2['AWAY_TEAM_WINS'].apply(cum_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_5_losses_away = df_g2['AWAY_TEAM_WINS'].apply(cum_5_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_10_losses_away = df_g2['AWAY_TEAM_WINS'].apply(cum_10_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()


# #df_clean = df_clean.reset_index()
df_clean2['cum_WINS_away'] = cum_wins_away
df_clean2['cum_LOSSES_away'] = cum_losses_away
df_clean2['cum_5_WINS_away'] = cum_5_wins_away
df_clean2['cum_10_WINS_away'] = cum_10_wins_away
df_clean2['cum_5_LOSSES_away'] = cum_5_losses_away
df_clean2['cum_10_LOSSES_away'] = cum_10_losses_away


In [817]:
# HEAD TO HEAD WIN/LOSS COUNTER 

#HOME HEAD TO HEAD Win/Loss Counter

df_clean2_temp['AWAY_TEAM_WINS'] = df_clean2_temp['HOME_TEAM_WINS'].map({0:1, 1:0})
df_clean2_temp['TEAM_LIST'] = df_clean2_temp['HOME_TEAM_NAME'] + ', ' + df_clean2_temp['AWAY_TEAM_NAME']


# Two Grouped DFs
df_g1 = df_clean2_temp.groupby(['TEAM_LIST', 'SEASON','GAME_ID']).mean()
df_g2 = df_clean2_temp.groupby(['TEAM_LIST']).agg(list)

# Using Functions above to calulate the cumulative values as lists

    # HOME WINS!
cum_wins_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_5_wins_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_5_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_10_wins_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_10_wins).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()

    # HOME Losses!
cum_losses_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_5_losses_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_5_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()
cum_10_losses_home_H2H = df_g2['HOME_TEAM_WINS'].apply(cum_10_losses).to_frame().explode('HOME_TEAM_WINS')['HOME_TEAM_WINS'].to_numpy()


# #df_clean = df_clean.reset_index()
df_clean2['cum_WINS_home_H2H'] = cum_wins_home_H2H
df_clean2['cum_LOSSES_home_H2H'] = cum_losses_home_H2H
df_clean2['cum_5_WINS_home_H2H'] = cum_5_wins_home_H2H
df_clean2['cum_10_WINS_home_H2H'] = cum_10_wins_home_H2H
df_clean2['cum_5_LOSSES_home_H2H'] = cum_5_losses_home_H2H
df_clean2['cum_10_LOSSES_home_H2H'] = cum_10_losses_home_H2H




#____________________

# AWAY HEAD TO HEAD Win/Loss Counter

# Two Grouped DFs
df_g1 = df_clean2_temp.groupby(['TEAM_LIST', 'SEASON','GAME_ID']).mean()
df_g2 = df_clean2_temp.groupby(['TEAM_LIST']).agg(list)

# Using Functions above to calulate the cumulative values as lists

    # HOME WINS!
cum_wins_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_5_wins_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_5_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_10_wins_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_10_wins).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()

    # HOME Losses!
cum_losses_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_5_losses_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_5_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()
cum_10_losses_away_H2H = df_g2['AWAY_TEAM_WINS'].apply(cum_10_losses).to_frame().explode('AWAY_TEAM_WINS')['AWAY_TEAM_WINS'].to_numpy()


# #df_clean = df_clean.reset_index()
df_clean2['cum_WINS_away_H2H'] = cum_wins_away_H2H
df_clean2['cum_LOSSES_away_H2H'] = cum_losses_away_H2H
df_clean2['cum_5_WINS_away_H2H'] = cum_5_wins_away_H2H
df_clean2['cum_10_WINS_away_H2H'] = cum_10_wins_away_H2H
df_clean2['cum_5_LOSSES_away_H2H'] = cum_5_losses_away_H2H
df_clean2['cum_10_LOSSES_away_H2H'] = cum_10_losses_away_H2H



## F5: X-Factor

In [818]:
# # Creating Roster Data Frame
# details = pd.read_csv('games_details.csv')
# details = details[~details['START_POSITION'].isna()]
# details['TEAM_NAME'] = details['TEAM_ID'].map(team_dict).apply(lambda x: x[0])

# details = details[['GAME_ID', 'TEAM_NAME', 'PLAYER_NAME']]
# details_grouped = details.groupby(['GAME_ID', 'TEAM_NAME'], as_index = False).agg(list)

# roster = pd.merge(details_grouped, games[['TIMESTAMP','GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']], on = 'GAME_ID')
# home_condition = (roster['TEAM_NAME'] == roster['HOME_TEAM_NAME'])
# away_condition = (roster['TEAM_NAME'] != roster['HOME_TEAM_NAME'])

# roster_home = roster[home_condition]
# roster_home['ROSTER_home'] = roster_home['PLAYER_NAME']
# roster_home = roster_home[['TIMESTAMP','GAME_ID', 'ROSTER_home']]

# roster_away = roster[away_condition]
# roster_away['ROSTER_away'] = roster_away['PLAYER_NAME']
# roster_away = roster_away[['TIMESTAMP','GAME_ID', 'ROSTER_away']]

# roster = pd.merge(roster_home, roster_away, on = 'GAME_ID')
# roster['SEASON'] = roster['TIMESTAMP_x'].dt.year
# roster = roster.drop(['TIMESTAMP_x', 'TIMESTAMP_y'], axis = 1)

In [819]:
# # Creating Features for the # of Players on home/away team that were MVP, DPOY, ROY, 6MOY in previous 3 Seasons

# def prev_3(award, awards): 
#     award_list = np.array(list(awards[award]))
#     prev_3_list = []
#     for i in np.arange(12): 
#         prev_3_list.append(award_list[np.arange(i+1, i+4)])
#     return prev_3_list

# awards = pd.read_csv('awards.csv').iloc[:15, :5]
# awards['SEASON'] = awards['SEASON'].astype(int)
# full_list = []
# award_names = ['MVP', 'DPOY', 'ROY', '6MOY']
# for award in award_names: 
#     temp_3_list = prev_3(award, awards)
#     full_list.append(temp_3_list)
# awards = awards.iloc[:12]
# for i in np.arange(len(full_list)): 
#     awards['PREV_3_' + str(award_names[i])] = full_list[i]
#     awards = awards.drop(award_names[i], axis = 1)

# df_X = pd.merge(roster, awards, on = 'SEASON')
# def count_award_home(row, award_name): 
#     return sum(item in row['ROSTER_home'] for item in row['PREV_3_' + str(award_name)])
# def count_award_away(row, award_name): 
#     return sum(item in row['ROSTER_away'] for item in row['PREV_3_' + str(award_name)])


# award_name = '6MOY'
# for award_name in award_names: 
#     df_X['COUNT_PREV_3_' + award_name + '_home'] = df_X.apply(lambda row: count_award_home(row, award_name), axis = 1)
#     df_X['COUNT_PREV_3_' + award_name + '_away'] = df_X.apply(lambda row: count_award_away(row, award_name), axis = 1)
#     df_X = df_X.drop('PREV_3_' + str(award_name), axis = 1)
# award_df = df_X.drop(['ROSTER_home', 'ROSTER_away', 'SEASON'], axis = 1)

### ODDS DATA


In [820]:
num = len(df_clean2) - len(df_clean2.dropna())
print('Number of NaN Rows: ', num)
cols = [col for col in df_clean2.columns if np.any(['5' in col, '10' in col])]
for col in cols: 
    new_col = re.sub(r'_\d{1,2}_', '_', col)
    df_clean2[col] = df_clean2[col].fillna(df_clean2[new_col])
print('Number of NaN Rows Saved: ', len(df_clean2) - num)

Number of NaN Rows:  10152
Number of NaN Rows Saved:  3106


In [792]:
# Odds 
odds = pd.DataFrame()
season_list = np.arange(2010, 2020)
for season in season_list: 
    df = pd.read_excel('ODDS_DATA/odds_' + str(season) + '.xlsx', usecols = ['Date', 'Team', 'ML'])
    df['Date'] = df['Date'].apply(lambda x: '{0:0>4}'.format(x))

    new_year_index = df[df['Date'].str.contains(r'01\d{2}')].index[0]
    year1 = np.repeat(str(season), new_year_index-1)
    year2 = np.repeat(str(season+1), len(df) - len(year1))
    df['Year'] = np.append(year1, year2)
    df['TIMESTAMP'] = df['Year'] + df['Date']
    df = df.drop('Date', axis = 1)
    odds = odds.append(df)
home_teams, home_odds = odds.iloc[1::2]['Team'], odds.iloc[1::2]['ML']
away_teams, away_odds = odds.iloc[::2]['Team'], odds.iloc[::2]['ML']
dates = odds.iloc[1::2]['TIMESTAMP']
odds = pd.DataFrame({'TIMESTAMP' : list(dates), 
                   'HOME_TEAM_NAME' : list(home_teams),
                   'AWAY_TEAM_NAME' : list(away_teams), 
                   'HOME_TEAM_ODDS' : list(home_odds), 
                   'AWAY_TEAM_ODDS' : list(away_odds)})
odds['HOME_TEAM_NAME'] = odds['HOME_TEAM_NAME'].str.replace(r'([a-z])([A-Z])', r'\1 \2')
odds['AWAY_TEAM_NAME'] = odds['AWAY_TEAM_NAME'].str.replace(r'([a-z])([A-Z])', r'\1 \2')

odds['TIMESTAMP'] = pd.to_datetime(odds['TIMESTAMP'], format = '%Y%m%d', errors = 'coerce')
teams = pd.read_csv('teams.csv', usecols = ['CITY', 'NICKNAME'])
teams['CITY'][7] = 'LAClippers'
teams['CITY'][8] = 'LALakers'

odds['HOME_TEAM_NAME'] = odds['HOME_TEAM_NAME'].str.replace('New Jersey', 'Brooklyn')
odds['AWAY_TEAM_NAME'] = odds['AWAY_TEAM_NAME'].str.replace('New Jersey', 'Brooklyn')

team_city_dict = teams.set_index('CITY')['NICKNAME'].T.to_dict()
odds['HOME_TEAM_NAME'] = odds['HOME_TEAM_NAME'].map(team_city_dict)
odds['AWAY_TEAM_NAME'] = odds['AWAY_TEAM_NAME'].map(team_city_dict)

In [827]:
new_df = pd.merge(df_clean2, odds, on = ['TIMESTAMP', 'HOME_TEAM_NAME'])

cols = [col for col in new_df.columns if np.any(['5' in col, '10' in col])]
for col in cols: 
    new_col = re.sub(r'_\d{1,2}_', '_', col)
    new_df[col] = new_df[col].fillna(new_df[new_col])

new_df = new_df.fillna(0)
new_df = new_df.fillna(0)
szn_dict = pd.read_csv('games.csv', usecols = ['GAME_ID', 'SEASON']).set_index('GAME_ID')['SEASON'].T.to_dict()
new_df['SEASON'] = new_df['GAME_ID'].map(szn_dict)

cum_col_list = np.append([col for col in list(df_fin.columns) if 'cum' in col], ['HOME_TEAM_WINS', 'SEASON', 'GAME_ID'])
new_df = new_df[cum_col_list]


In [833]:
xfactor = pd.read_csv('XFactor_features.csv')
df = pd.merge(new_df, xfactor)

In [910]:
# Splitting Data

train_df = df[df['SEASON'].between(2010,2016)]
test_df = df[df['SEASON'].between(2017,2018)]

train_df, test_df = train_df.drop(['SEASON', 'GAME_ID'], axis = 1), test_df.drop(['SEASON', 'GAME_ID'], axis = 1)
X_train, y_train = train_df.drop('HOME_TEAM_WINS', axis = 1), train_df['HOME_TEAM_WINS']
X_test, y_test = test_df.drop('HOME_TEAM_WINS', axis = 1), test_df['HOME_TEAM_WINS']

# X, y = df.drop('HOME_TEAM_WINS', axis = 1), df['HOME_TEAM_WINS']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

train_prop = np.round(len(X_train) / (len(X_train) + len(X_test)), 3)
test_prop = np.round(len(X_test) / (len(X_train) + len(X_test)), 3)
print('Training Data' + ' -- ' + str(len(X_train)) + ' rows -- ' + str(train_prop*100) + '%')
print('Testing Data' + ' -- ' + str(len(X_test)) + ' rows -- ' + str(test_prop*100) + '%')

Training Data -- 8945 rows -- 77.3%
Testing Data -- 2624 rows -- 22.7%


In [845]:
# Baseline Model 
model_0 = DummyClassifier(strategy = "most_frequent")
model_0.fit(X_train, y_train)
y_pred_0 = model_0.predict(X_test)

# Confusion Matrix & Outputs 
cm_0 = confusion_matrix(y_test, y_pred_0)
outputs_0 = outputs(cm_0)
print ("\nConfusion Matrix : \n", cm_0) 
print ("\nAccuracy : ", outputs_0[0])


Confusion Matrix : 
 [[   0 1079]
 [   0 1545]]

Accuracy :  0.5888


In [846]:
# # Logistic Regression

# model_1 = LogisticRegression(random_state = 69, max_iter = 5000, verbose = 3, n_jobs = -1)
# model_1.fit(X_train, y_train)
# y_prob_1 = model_1.predict_proba(X_test)
# y_pred_1 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_1[:,1]], index = y_test.index)

# # Confusion Matrix & Outputs 
# cm_1 = confusion_matrix(y_test, y_pred_1)
# outputs_1 = outputs(cm_1)
# print ("\nConfusion Matrix : \n", cm_1) 
# print ("\nAccuracy : ", outputs_1[0]) 

In [901]:
# # Random Forest

# model_2 = RandomForestClassifier(random_state = 69)
# model_2.fit(X_train, y_train)
# y_prob_2 = model_2.predict_proba(X_test)
# y_pred_2 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_2[:,1]], index = y_test.index)

# # Confusion Matrix & Outputs 
# cm_2 = confusion_matrix(y_test, y_pred_2)
# outputs_2 = outputs(cm_2)
# print ("\nConfusion Matrix : \n", cm_2) 
# print ("\nAccuracy : ", outputs_2[0]) 

In [None]:
# XgBoost

model_2 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=108,
 max_depth=4,
 min_child_weight=2,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=0.01,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
model_2.fit(X_train, y_train)
y_prob_2 = model_2.predict_proba(X_test)
y_pred_2 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_2[:,1]], index = y_test.index)

# Confusion Matrix & Outputs 
cm_2 = confusion_matrix(y_test, y_pred_2)
outputs_2 = outputs(cm_2)
print ("\nConfusion Matrix : \n", cm_2) 
print ("\nAccuracy : ", outputs_2[0]) 

In [941]:
layer_list = []
for i in np.arange(25,475,25): 
    layer_list.append((i,))
    layer_list.append((i,i))
    layer_list.append((i,i,i))
    layer_list.append((i,np.round(i/2)))

In [None]:
# Neural Network 
from sklearn.neural_network import MLPClassifier
# model_3 = MLPClassifier(hidden_layer_sizes =(150,100,50), max_iter = 300,
#                         activation = 'relu', solver = 'adam', random_state = 69)
# model_3.fit(X_train, y_train)
# y_prob_3 = model_3.predict_proba(X_test)
# y_pred_3 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_3[:,1]], index = y_test.index)

grid = {
    'hidden_layer_sizes': layer_list
}
model_3 = MLPClassifier(max_iter = 200, alpha = 0.0001, activation = 'relu',
                        solver = 'adam', random_state = 69)
model_3 = GridSearchCV(model_3, grid, n_jobs = -1, cv = 5, verbose = 3)
model_3.fit(X_train, y_train)

y_prob_3 = model_3.best_estimator_.predict(X_test)
print(model_3.best_params_)
y_pred_3 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_3])

# Confusion Matrix & Outputs 
cm_3 = confusion_matrix(y_test, y_pred_3)
outputs_3 = outputs(cm_3)
print ("\nConfusion Matrix : \n", cm_3) 
print ("\nAccuracy : ", outputs_3[0]) 



Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 78.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 280.1min


### trying PCA

In [362]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 


scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Make an instance of the Model ... .95 means 95 % of explained variance
pca = PCA(.99) 
pca.fit(X_train)
print(pca.n_components_)

X_train = pca.transform(X_train)
X_test = pca.transform(X_test)


201


In [363]:
# XgBoost

model_2 = XGBClassifier(random_state = 69)
model_2.fit(X_train, y_train)
y_prob_2 = model_2.predict_proba(X_test)
y_pred_2 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_2[:,1]], index = y_test.index)

# Confusion Matrix & Outputs 
cm_2 = confusion_matrix(y_test, y_pred_2)
outputs_2 = outputs(cm_2)
print ("\nConfusion Matrix : \n", cm_2) 
print ("\nAccuracy : ", outputs_2[0]) 


Confusion Matrix : 
 [[ 278  763]
 [ 170 1306]]

Accuracy :  0.6293


In [339]:
# y_prob_2 = model_2.predict_proba(X_train)
# y_pred_2 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_2[:,1]], index = y_train.index)

# # Confusion Matrix & Outputs 
# cm_2 = confusion_matrix(y_train, y_pred_2)
# outputs_2 = outputs(cm_2)
# print ("\nConfusion Matrix : \n", cm_2) 
# print ("\nAccuracy : ", outputs_2[0]) 

we should test our model on the training set to see the overfiting potential 

In [340]:
def modelfit(alg, train, predictors, useTrainCV = True, cv_folds = 5, early_stopping_rounds = 50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train[predictors].values, label=train[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold = cv_folds,
                          metrics = 'error', early_stopping_rounds = early_stopping_rounds, verbose_eval = True)
        alg.set_params(n_estimators = cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(train[predictors], train['HOME_TEAM_WINS'], eval_metric = 'error')
        
    #Predict training set:
    train_predictions = alg.predict(train[predictors])
    train_predprob = alg.predict_proba(train[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(train['HOME_TEAM_WINS'].values, train_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(train['HOME_TEAM_WINS'], train_predprob))
                    
#     feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')

In [343]:
target = 'HOME_TEAM_WINS'
train = pd.concat([X_train, y_train], axis = 1)
predictors = [x for x in train.columns if x not in [target]]

model_3 = XGBClassifier(learning_rate = 0.1, n_estimators = 1000, max_depth = 5, min_child_weight = 1,
                     gamma = 0, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic',
                     nthread = 4, scale_pos_weight = 1, seed = 69)
modelfit(model_3, train, X_train.columns, useTrainCV = True)

[0]	train-error:0.317888+0.00535608	test-error:0.357569+0.00622042
[1]	train-error:0.301997+0.00184675	test-error:0.351511+0.00527198
[2]	train-error:0.295019+0.0026074	test-error:0.348829+0.00748783
[3]	train-error:0.2905+0.00127641	test-error:0.343763+0.00615271
[4]	train-error:0.28531+0.00066403	test-error:0.340087+0.00368869
[5]	train-error:0.280567+0.00206109	test-error:0.338895+0.00642837
[6]	train-error:0.278059+0.00233751	test-error:0.337803+0.00892485
[7]	train-error:0.275253+0.00302776	test-error:0.337207+0.00457361
[8]	train-error:0.273664+0.00357941	test-error:0.337107+0.00522299
[9]	train-error:0.27051+0.00415267	test-error:0.338994+0.00650702
[10]	train-error:0.268276+0.00346004	test-error:0.339292+0.0086499
[11]	train-error:0.266413+0.00321433	test-error:0.338895+0.00767615
[12]	train-error:0.264154+0.00357408	test-error:0.338597+0.00732043
[13]	train-error:0.26177+0.00394797	test-error:0.337902+0.0076211
[14]	train-error:0.259411+0.00351274	test-error:0.338696+0.0083658

In [344]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=68, max_depth=5, 
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective= 'binary:logistic', 
                                                  nthread=4, scale_pos_weight=1, seed=27), 
                        param_grid = param_test1, 
                        scoring = 'accuracy', n_jobs = -1, iid=False , cv=5, verbose = 3)
gsearch1.fit(train[predictors],train[target])
gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 21.9min finished


({'max_depth': 3, 'min_child_weight': 1}, 0.6637865295637936)

In [346]:
# worked in intervals of 2 we look at +-1 from the optimal parameter value from above
param_test2 = {
 'max_depth':[2,3,4],
 'min_child_weight':[1,2]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=68, max_depth=5,
                                                  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test2, scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose = 3)
gsearch2.fit(train[predictors],train[target])
gsearch2.best_params_, gsearch2.best_score_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.1min finished


({'max_depth': 4, 'min_child_weight': 2}, 0.6677600068275179)

In [347]:
# tuning gamma
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=68, max_depth=4, 
                                                  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test3, scoring='accuracy',n_jobs=-1,iid=False, cv=5, verbose = 3)
gsearch3.fit(train[predictors],train[target])
gsearch3.best_params_, gsearch3.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  6.1min finished


({'gamma': 0.3}, 0.6693490326778619)

In [348]:
# retuning estimators WITH NEW ESTIMATORS
xgb2 = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=4, min_child_weight=2,
                     gamma=0.3, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
                     nthread=4, scale_pos_weight=1,seed=27)
modelfit(xgb2, train, predictors, useTrainCV = True)

[0]	train-error:0.336735+0.00157148	test-error:0.36025+0.00400966
[1]	train-error:0.325536+0.00536517	test-error:0.353199+0.0073676
[2]	train-error:0.318857+0.00388242	test-error:0.347339+0.00692479
[3]	train-error:0.315529+0.00365669	test-error:0.344557+0.00703243
[4]	train-error:0.313096+0.00292453	test-error:0.342471+0.00639698
[5]	train-error:0.309222+0.00241401	test-error:0.346146+0.00894755
[6]	train-error:0.307335+0.00322426	test-error:0.342869+0.00618682
[7]	train-error:0.306367+0.00224368	test-error:0.341081+0.00847652
[8]	train-error:0.304405+0.00439689	test-error:0.341777+0.00857069
[9]	train-error:0.301847+0.00369603	test-error:0.339988+0.0106365
[10]	train-error:0.300333+0.00318606	test-error:0.340684+0.0100528
[11]	train-error:0.299613+0.00377988	test-error:0.340187+0.0092239
[12]	train-error:0.297676+0.00431628	test-error:0.340683+0.00787209
[13]	train-error:0.297279+0.00364048	test-error:0.341577+0.00921771
[14]	train-error:0.296335+0.00444925	test-error:0.339392+0.0088

In [350]:
# tuning col and subsample 
param_test4 = {
 'subsample':[i/10.0 for i in range(7,9)],
 'colsample_bytree':[i/10.0 for i in range(7,9)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=83, max_depth=4, 
                                                  min_child_weight=2, gamma=0.3, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='accuracy',n_jobs=-1,iid=False, cv=5, verbose = 3)
gsearch4.fit(train[predictors],train[target])
gsearch4.best_params_, gsearch4.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.4min finished


({'colsample_bytree': 0.8, 'subsample': 0.8}, 0.668652961312541)

In [351]:
# param_test6 = {
#  'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
# }
# gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=75, max_depth=3,
#  min_child_weight=3, gamma=0.4, subsample=0.8, colsample_bytree=0.7,
#  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
#  param_grid = param_test6, scoring='accuracy',n_jobs=-1,iid=False, cv=5, verbose = 3)
# gsearch6.fit(train[predictors],train[target])
# gsearch6.best_params_, gsearch6.best_score_

In [352]:
xgb3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=2,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=0.01,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb3, train, predictors, useTrainCV = True)

[0]	train-error:0.336735+0.00157148	test-error:0.36025+0.00400966
[1]	train-error:0.325536+0.00536517	test-error:0.353199+0.0073676
[2]	train-error:0.318857+0.00388242	test-error:0.347339+0.00692479
[3]	train-error:0.315529+0.00365669	test-error:0.344756+0.00685535
[4]	train-error:0.313096+0.00292453	test-error:0.342471+0.00639698
[5]	train-error:0.309197+0.00246807	test-error:0.346146+0.00894755
[6]	train-error:0.30736+0.00318077	test-error:0.342968+0.00623605
[7]	train-error:0.306441+0.00224127	test-error:0.341081+0.00837338
[8]	train-error:0.304455+0.00440916	test-error:0.341876+0.0086247
[9]	train-error:0.30222+0.00427024	test-error:0.341279+0.00951443
[10]	train-error:0.30068+0.00418933	test-error:0.34128+0.0093351
[11]	train-error:0.299563+0.00388487	test-error:0.340286+0.00892278
[12]	train-error:0.297999+0.00435535	test-error:0.342272+0.00647406
[13]	train-error:0.29775+0.00415948	test-error:0.343266+0.00777052
[14]	train-error:0.295118+0.00458088	test-error:0.338796+0.00891153

In [2192]:
# xgb4 = XGBClassifier(
#  learning_rate =0.01,
#  n_estimators=5000,
#  max_depth=3,
#  min_child_weight=3,
#  gamma=0.4,
#  subsample=0.8,
#  colsample_bytree=0.7,
#  reg_alpha=0.01,
#  objective= 'binary:logistic',
#  nthread=4,
#  scale_pos_weight=1,
#  seed=27)
# modelfit(xgb4, train, predictors, useTrainCV = True)

In [353]:
xgb5 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=108,
 max_depth=4,
 min_child_weight=2,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=0.01,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
#modelfit(xgb5, train, predictors, useTrainCV = False)


# XgBoost

model_3 = xgb5
model_3.fit(X_train, y_train)
y_prob_3 = model_3.predict_proba(X_test)
y_pred_3 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_3[:,1]], index = y_test.index)

# Confusion Matrix & Outputs 
cm_3 = confusion_matrix(y_test, y_pred_3)
outputs_3 = outputs(cm_3)
print ("\nConfusion Matrix : \n", cm_3) 
print ("\nAccuracy : ", outputs_3[0]) 


Confusion Matrix : 
 [[ 467  574]
 [ 254 1222]]

Accuracy :  0.671
