In [1]:
from sklearn.linear_model import Lasso
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

In [9]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
def _load_pickle(path):
    '''Return the object stored in the pickle file at ``path``.'''
    # NOTE: pickle.load can execute arbitrary code while deserializing;
    # only point this at pickle files you created or otherwise trust.
    with open(path, 'rb') as readfile:
        return pickle.load(readfile)


# One pickled object per file; the 17-21 suffix is carried over from the
# file name (presumably the season -- TODO confirm).
model_df21 = _load_pickle('data/pickles/model_df21.p')
model_df20 = _load_pickle('data/pickles/model_df20.p')
model_df19 = _load_pickle('data/pickles/model_df19.p')
model_df18 = _load_pickle('data/pickles/model_df18.p')
model_df17 = _load_pickle('data/pickles/model_df17.p')

In [4]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        The frame is merged with itself on GAME_ID and GAME_DATE, so every
        output row pairs one team's columns (suffix ``_Home``) with its
        opponent's columns (suffix ``_Away``). The suffixes are only literally
        true for ``keep_method='home'``; for every other method ``_Home``
        simply marks the team the row was kept for.

        Parameters
        ----------
        df : Input DataFrame. Must contain GAME_ID, GAME_DATE and TEAM_ID, plus
            MATCHUP (for 'home' / 'away') or WL (for 'winner' / 'loser').
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            Case-insensitive.
            - 'home' : Keep rows where the ``_Home`` team's MATCHUP contains ' vs. '.
            - 'away' : Keep rows where the ``_Home`` team's MATCHUP contains ' @ '.
            - 'winner' : Keep rows where the ``_Home`` team's WL is 'W'.
            - 'loser' : Keep rows where the ``_Home`` team's WL is 'L'.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.

        Returns
        -------
        result : DataFrame

        Raises
        ------
        ValueError
            If keep_method is a string other than the ones listed above.
    '''
    # Normalise and validate the flag up front so a typo fails before the
    # (slow) self-merge instead of after it.
    method = None if keep_method is None else keep_method.lower()
    if method not in (None, 'home', 'away', 'winner', 'loser'):
        raise ValueError(f'Invalid keep_method: {keep_method}')

    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=['_Home', '_Away'],
                      on=['GAME_ID', 'GAME_DATE'])
    # Filter out any row that is joined to itself.
    result = joined[joined.TEAM_ID_Home != joined.TEAM_ID_Away]

    # The merge suffixes are `_Home` / `_Away`, so the left-hand team's
    # columns are MATCHUP_Home / WL_Home (there are no `_A` columns).
    if method == 'home':
        # regex=False: ' vs. ' is matched literally ('.' is not a wildcard).
        result = result[result.MATCHUP_Home.str.contains(' vs. ', regex=False)]
    elif method == 'away':
        result = result[result.MATCHUP_Home.str.contains(' @ ', regex=False)]
    elif method == 'winner':
        result = result[result.WL_Home == 'W']
    elif method == 'loser':
        result = result[result.WL_Home == 'L']
    # method is None: keep every paired row.
    return result

In [5]:
# Collapse each loaded frame to one row per game using the default
# keep_method='home'; frames are processed in the same 21 -> 17 order.
testdf21, testdf20, testdf19, testdf18, testdf17 = (
    combine_team_games(season_df)
    for season_df in (model_df21, model_df20, model_df19, model_df18, model_df17)
)

In [6]:
# Columns removed from every combined frame: the same 21 stats, once with
# the ' Away_Home' ending and once with the ' Home_Away' ending.
drop_columns = [
    'PTS Away_Home', 'PLUS_MINUS Away_Home', 'FG_PCT Away_Home',
    'FG3_PCT Away_Home', 'FT_PCT Away_Home', 'EFG_PCT_y Away_Home',
    'FTA_RATE Away_Home', 'TM_TOV_PCT_y Away_Home', 'OREB_PCT_y Away_Home',
    'OPP_EFG_PCT Away_Home', 'OPP_FTA_RATE Away_Home', 'OPP_TOV_PCT Away_Home',
    'OPP_OREB_PCT Away_Home', 'PIE Away_Home', 'OFF_RATING Away_Home',
    'DEF_RATING Away_Home', 'NET_RATING Away_Home', 'TS_PCT Away_Home',
    'PACE Away_Home', 'AST_TOV Away_Home', 'PTS_ALLOWED Away_Home',
] + [
    'PTS Home_Away', 'PLUS_MINUS Home_Away', 'FG_PCT Home_Away',
    'FG3_PCT Home_Away', 'FT_PCT Home_Away', 'EFG_PCT_y Home_Away',
    'FTA_RATE Home_Away', 'TM_TOV_PCT_y Home_Away', 'OREB_PCT_y Home_Away',
    'OPP_EFG_PCT Home_Away', 'OPP_FTA_RATE Home_Away', 'OPP_TOV_PCT Home_Away',
    'OPP_OREB_PCT Home_Away', 'PIE Home_Away', 'OFF_RATING Home_Away',
    'DEF_RATING Home_Away', 'NET_RATING Home_Away', 'TS_PCT Home_Away',
    'PACE Home_Away', 'AST_TOV Home_Away', 'PTS_ALLOWED Home_Away',
]

In [7]:
# Strip the drop_columns entries from every combined frame, mutating each
# frame in place (so this cell raises KeyError if it is run a second time).
for season_frame in (testdf21, testdf20, testdf19, testdf18, testdf17):
    season_frame.drop(labels=drop_columns, axis=1, inplace=True)

In [11]:
# Full candidate feature list: 174 column names, 87 ending in `_Home` followed
# by 87 ending in `_Away`. Each half lists 21 stats for 'LAST 5', 'LAST 10' and
# 'Season', a 21-stat 'Home_Home' / 'Away_Away' block, plus HOME_TEAM, WIN_%
# and back_to_back.
# No cell visible in this notebook excerpt reads this list.
# NOTE(review): in the 'LAST 5_Away' group PLUS_MINUS comes after FT_PCT,
# whereas every other group places it right after PTS -- relevant only if
# column order matters to a consumer.
test_columns = ['PTS LAST 5_Home',
 'PLUS_MINUS LAST 5_Home',
'FG_PCT LAST 5_Home',
 'FG3_PCT LAST 5_Home',
 'FT_PCT LAST 5_Home',
 'EFG_PCT_y LAST 5_Home',
 'FTA_RATE LAST 5_Home',
 'TM_TOV_PCT_y LAST 5_Home',
 'OREB_PCT_y LAST 5_Home',
 'OPP_EFG_PCT LAST 5_Home',
 'OPP_FTA_RATE LAST 5_Home',
 'OPP_TOV_PCT LAST 5_Home',
 'OPP_OREB_PCT LAST 5_Home',
 'PIE LAST 5_Home',
 'OFF_RATING LAST 5_Home',
 'DEF_RATING LAST 5_Home',
 'NET_RATING LAST 5_Home',
 'TS_PCT LAST 5_Home',
 'PACE LAST 5_Home',
 'AST_TOV LAST 5_Home',
 'PTS_ALLOWED LAST 5_Home',
 'PTS LAST 10_Home',
 'PLUS_MINUS LAST 10_Home',
'FG_PCT LAST 10_Home',
 'FG3_PCT LAST 10_Home',
 'FT_PCT LAST 10_Home',
 'EFG_PCT_y LAST 10_Home',
 'FTA_RATE LAST 10_Home',
 'TM_TOV_PCT_y LAST 10_Home',
 'OREB_PCT_y LAST 10_Home',
 'OPP_EFG_PCT LAST 10_Home',
 'OPP_FTA_RATE LAST 10_Home',
 'OPP_TOV_PCT LAST 10_Home',
 'OPP_OREB_PCT LAST 10_Home',
 'PIE LAST 10_Home',
 'OFF_RATING LAST 10_Home',
 'DEF_RATING LAST 10_Home',
 'NET_RATING LAST 10_Home',
 'TS_PCT LAST 10_Home',
 'PACE LAST 10_Home',
 'AST_TOV LAST 10_Home',
 'PTS_ALLOWED LAST 10_Home',
 'HOME_TEAM_Home',
 'PTS Season_Home',
 'PLUS_MINUS Season_Home',
'FG_PCT Season_Home',
 'FG3_PCT Season_Home',
 'FT_PCT Season_Home',
 'EFG_PCT_y Season_Home',
 'FTA_RATE Season_Home',
 'TM_TOV_PCT_y Season_Home',
 'OREB_PCT_y Season_Home',
 'OPP_EFG_PCT Season_Home',
 'OPP_FTA_RATE Season_Home',
 'OPP_TOV_PCT Season_Home',
 'OPP_OREB_PCT Season_Home',
 'PIE Season_Home',
 'OFF_RATING Season_Home',
 'DEF_RATING Season_Home',
 'NET_RATING Season_Home',
 'TS_PCT Season_Home',
 'PACE Season_Home',
 'AST_TOV Season_Home',
 'PTS_ALLOWED Season_Home',
 'WIN_%_Home',
 'back_to_back_Home', 
 'PTS Home_Home',
 'PLUS_MINUS Home_Home',
 'FG_PCT Home_Home',
 'FG3_PCT Home_Home',
 'FT_PCT Home_Home',
 'EFG_PCT_y Home_Home',
 'FTA_RATE Home_Home',
 'TM_TOV_PCT_y Home_Home',
 'OREB_PCT_y Home_Home',
 'OPP_EFG_PCT Home_Home',
 'OPP_FTA_RATE Home_Home',
 'OPP_TOV_PCT Home_Home',
 'OPP_OREB_PCT Home_Home',
 'PIE Home_Home',
 'OFF_RATING Home_Home',
 'DEF_RATING Home_Home',
 'NET_RATING Home_Home',
 'TS_PCT Home_Home',
 'PACE Home_Home',
 'AST_TOV Home_Home',
 'PTS_ALLOWED Home_Home',
  # `_Away` half starts here.
  'PTS LAST 5_Away',
'FG_PCT LAST 5_Away',
 'FG3_PCT LAST 5_Away',
 'FT_PCT LAST 5_Away',
 'PLUS_MINUS LAST 5_Away',
 'EFG_PCT_y LAST 5_Away',
 'FTA_RATE LAST 5_Away',
 'TM_TOV_PCT_y LAST 5_Away',
 'OREB_PCT_y LAST 5_Away',
 'OPP_EFG_PCT LAST 5_Away',
 'OPP_FTA_RATE LAST 5_Away',
 'OPP_TOV_PCT LAST 5_Away',
 'OPP_OREB_PCT LAST 5_Away',
 'PIE LAST 5_Away',
 'OFF_RATING LAST 5_Away',
 'DEF_RATING LAST 5_Away',
 'NET_RATING LAST 5_Away',
 'TS_PCT LAST 5_Away',
 'PACE LAST 5_Away',
 'AST_TOV LAST 5_Away',
 'PTS_ALLOWED LAST 5_Away',
 'PTS LAST 10_Away',
 'PLUS_MINUS LAST 10_Away',
'FG_PCT LAST 10_Away',
 'FG3_PCT LAST 10_Away',
 'FT_PCT LAST 10_Away',
 'EFG_PCT_y LAST 10_Away',
 'FTA_RATE LAST 10_Away',
 'TM_TOV_PCT_y LAST 10_Away',
 'OREB_PCT_y LAST 10_Away',
 'OPP_EFG_PCT LAST 10_Away',
 'OPP_FTA_RATE LAST 10_Away',
 'OPP_TOV_PCT LAST 10_Away',
 'OPP_OREB_PCT LAST 10_Away',
 'PIE LAST 10_Away',
 'OFF_RATING LAST 10_Away',
 'DEF_RATING LAST 10_Away',
 'NET_RATING LAST 10_Away',
 'TS_PCT LAST 10_Away',
 'PACE LAST 10_Away',
 'AST_TOV LAST 10_Away',
 'PTS_ALLOWED LAST 10_Away',
 'HOME_TEAM_Away',
 'PTS Season_Away',
 'PLUS_MINUS Season_Away',
'FG_PCT Season_Away',
 'FG3_PCT Season_Away',
 'FT_PCT Season_Away',
 'EFG_PCT_y Season_Away',
 'FTA_RATE Season_Away',
 'TM_TOV_PCT_y Season_Away',
 'OREB_PCT_y Season_Away',
 'OPP_EFG_PCT Season_Away',
 'OPP_FTA_RATE Season_Away',
 'OPP_TOV_PCT Season_Away',
 'OPP_OREB_PCT Season_Away',
 'PIE Season_Away',
 'OFF_RATING Season_Away',
 'DEF_RATING Season_Away',
 'NET_RATING Season_Away',
 'TS_PCT Season_Away',
 'PACE Season_Away',
 'AST_TOV Season_Away',
 'PTS_ALLOWED Season_Away',
  'WIN_%_Away',
  'back_to_back_Away',
  'PTS Away_Away',
 'PLUS_MINUS Away_Away',
 'FG_PCT Away_Away',
 'FG3_PCT Away_Away',
 'FT_PCT Away_Away',
 'EFG_PCT_y Away_Away',
 'FTA_RATE Away_Away',
 'TM_TOV_PCT_y Away_Away',
 'OREB_PCT_y Away_Away',
 'OPP_EFG_PCT Away_Away',
 'OPP_FTA_RATE Away_Away',
 'OPP_TOV_PCT Away_Away',
 'OPP_OREB_PCT Away_Away',
 'PIE Away_Away',
 'OFF_RATING Away_Away',
 'DEF_RATING Away_Away',
 'NET_RATING Away_Away',
 'TS_PCT Away_Away',
 'PACE Away_Away',
 'AST_TOV Away_Away',
 'PTS_ALLOWED Away_Away']

In [24]:
# Eight EFG / FTA-rate / TOV / OREB columns (own and OPP_) repeated for four
# endings, in this order: LAST 10_Home, LAST 10_Away, Season_Home, Season_Away.
ff_columns = [
    'EFG_PCT_y LAST 10_Home', 'FTA_RATE LAST 10_Home',
    'TM_TOV_PCT_y LAST 10_Home', 'OREB_PCT_y LAST 10_Home',
    'OPP_EFG_PCT LAST 10_Home', 'OPP_FTA_RATE LAST 10_Home',
    'OPP_TOV_PCT LAST 10_Home', 'OPP_OREB_PCT LAST 10_Home',
] + [
    'EFG_PCT_y LAST 10_Away', 'FTA_RATE LAST 10_Away',
    'TM_TOV_PCT_y LAST 10_Away', 'OREB_PCT_y LAST 10_Away',
    'OPP_EFG_PCT LAST 10_Away', 'OPP_FTA_RATE LAST 10_Away',
    'OPP_TOV_PCT LAST 10_Away', 'OPP_OREB_PCT LAST 10_Away',
] + [
    'EFG_PCT_y Season_Home', 'FTA_RATE Season_Home',
    'TM_TOV_PCT_y Season_Home', 'OREB_PCT_y Season_Home',
    'OPP_EFG_PCT Season_Home', 'OPP_FTA_RATE Season_Home',
    'OPP_TOV_PCT Season_Home', 'OPP_OREB_PCT Season_Home',
] + [
    'EFG_PCT_y Season_Away', 'FTA_RATE Season_Away',
    'TM_TOV_PCT_y Season_Away', 'OREB_PCT_y Season_Away',
    'OPP_EFG_PCT Season_Away', 'OPP_FTA_RATE Season_Away',
    'OPP_TOV_PCT Season_Away', 'OPP_OREB_PCT Season_Away',
]

In [69]:
# PIE / OFF / DEF / NET rating / TS% for four endings, in this order:
# LAST 5_Home, Home_Home, LAST 5_Away, Away_Away.
cols = [
    'PIE LAST 5_Home', 'OFF_RATING LAST 5_Home', 'DEF_RATING LAST 5_Home',
    'NET_RATING LAST 5_Home', 'TS_PCT LAST 5_Home',
] + [
    'PIE Home_Home', 'OFF_RATING Home_Home', 'DEF_RATING Home_Home',
    'NET_RATING Home_Home', 'TS_PCT Home_Home',
] + [
    'PIE LAST 5_Away', 'OFF_RATING LAST 5_Away', 'DEF_RATING LAST 5_Away',
    'NET_RATING LAST 5_Away', 'TS_PCT LAST 5_Away',
] + [
    'PIE Away_Away', 'OFF_RATING Away_Away', 'DEF_RATING Away_Away',
    'NET_RATING Away_Away', 'TS_PCT Away_Away',
]

In [101]:
# Smaller feature set: five rating columns for the Home_Home and Away_Away
# endings, followed by the two HOME_TEAM columns.
simp = [
    'PIE Home_Home', 'OFF_RATING Home_Home', 'DEF_RATING Home_Home',
    'NET_RATING Home_Home', 'TS_PCT Home_Home',
] + [
    'PIE Away_Away', 'OFF_RATING Away_Away', 'DEF_RATING Away_Away',
    'NET_RATING Away_Away', 'TS_PCT Away_Away',
] + [
    'HOME_TEAM_Home', 'HOME_TEAM_Away',
]

In [102]:
# Feature set used for X below: three 'LAST 5' columns per side, then the
# two HOME_TEAM columns.
last5 = [
    'PIE LAST 5_Home',
    'NET_RATING LAST 5_Home',
    'TS_PCT LAST 5_Home',
] + [
    'PIE LAST 5_Away',
    'NET_RATING LAST 5_Away',
    'TS_PCT LAST 5_Away',
] + [
    'HOME_TEAM_Home',
    'HOME_TEAM_Away',
]

In [44]:
# Row-wise sum of the PTS_Home and PTS_Away columns (NaN if either is missing).
testdf21['PTS_Total'] = testdf21['PTS_Home'].add(testdf21['PTS_Away'])

In [16]:
# Discard, in place, every row of testdf21 that has at least one missing value.
testdf21.dropna(axis=0, how='any', inplace=True)

In [103]:
# Design matrix: the `last5` feature columns; target: the WIN_Home column.
X = testdf21.loc[:, last5]
y = testdf21['WIN_Home']

In [104]:
# Shuffled hold-out split; test_size=0.25 is scikit-learn's default written
# out, and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix

In [105]:
# Fit a 100-tree random forest on the training split (fit returns the
# estimator itself), then predict the held-out rows.
rand = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_r = rand.predict(X_test)

In [106]:
# Accuracy of the random-forest predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred_r)

0.5670995670995671

In [57]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [107]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [108]:
# Default-parameter SVC trained on the scaled training split, then used to
# predict the scaled test split.
svc = SVC().fit(X_train_sc, y_train)
y_pred_svc = svc.predict(X_test_sc)

In [109]:
# Accuracy of the SVC predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

0.6190476190476191

In [66]:
import statsmodels.api as sm

In [67]:
# Copy of X with statsmodels' constant column added in front (prepend=True is
# the default, written out here).
sm_X = sm.add_constant(X, prepend=True)

In [68]:
# Logistic regression of y on sm_X (the features plus the added constant),
# fitted on all rows rather than on the train split only.
mod = sm.Logit(endog=y, exog=sm_X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.610534
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               WIN_Home   No. Observations:                  924
Model:                          Logit   Df Residuals:                      891
Method:                           MLE   Df Model:                           32
Date:                Tue, 19 Oct 2021   Pseudo R-squ.:                  0.1117
Time:                        16:24:28   Log-Likelihood:                -564.13
converged:                       True   LL-Null:                       -635.05
Covariance Type:            nonrobust   LLR p-value:                 8.884e-16
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                         3.3954      5.826      0.583      0.560      -8.

In [77]:
# Same logistic regression but on X directly, i.e. without the constant added
# by sm.add_constant; rebinds `mod` and `res` from the previous fit.
mod = sm.Logit(endog=y, exog=X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.619412
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               WIN_Home   No. Observations:                  924
Model:                          Logit   Df Residuals:                      904
Method:                           MLE   Df Model:                           19
Date:                Tue, 19 Oct 2021   Pseudo R-squ.:                 0.09875
Time:                        16:39:42   Log-Likelihood:                -572.34
converged:                       True   LL-Null:                       -635.05
Covariance Type:            nonrobust   LLR p-value:                 1.067e-17
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
PIE LAST 5_Home           -5.3369      5.817     -0.917      0.359     -16.739      