# Overview

The purpose of this notebook is to help with feature selection. Specifically, I have calculated team stats using 1, 3, 5, 10, 20, 30, 40, and 50 game rolling windows, both from the raw totals and with scoring and venue adjustments. I want to determine which rolling window, or mix of windows, results in the best model. To do so, I will first train a simple logistic regression model using each rolling window and several mixes of windows. Then I will try scikit-learn's `SelectKBest` to see what comes out of that. 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import hockey_scraper
import pickle
import time
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import normalize, FunctionTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier
from collections import Counter
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import make_pipeline, Pipeline
# Show every column when a wide DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [74]:
#define function to model and evaluate on a simple logistic regression
def model_and_evaluate(df, feature_columns, scoring = 'neg_log_loss',
                       categorical_features = ('B2B_Status',)):
    """Return the mean 5-fold CV score of a logistic regression on one feature set.

    Parameters
    ----------
    df : pandas.DataFrame
        Game-level frame. Must contain 'Season', 'Home_Team_Won' and every
        name in ``feature_columns``.
    feature_columns : list of str
        Candidate model inputs. The list is not modified.
    scoring : str, default 'neg_log_loss'
        Scorer name handed to ``cross_val_score``.
    categorical_features : iterable of str, default ('B2B_Status',)
        Columns to one-hot encode. Only the ones that also appear in
        ``feature_columns`` are used; every other feature is standard-scaled.

    Returns
    -------
    float
        Mean of the five fold scores.

    Raises
    ------
    KeyError
        If ``df`` lacks the 'Season' or 'Home_Team_Won' column.
    """
    missing = [col for col in ('Season', 'Home_Team_Won') if col not in df.columns]
    if missing:
        raise KeyError(f'df is missing required column(s): {missing}')

    # For final modeling, train on 2017 to 2020 and test on the 2021 season, so
    # only the earlier seasons are cross-validated here to avoid leaking the
    # hold-out into feature selection. dropna() runs on the whole frame (not
    # just feature_columns), so a row with a gap in any column is excluded.
    train = df[df['Season'] != '2020-2021'].dropna()
    X_train = train
    y_train = train['Home_Team_Won']

    # Split by membership instead of list.remove(), which raised ValueError
    # for any feature set that did not contain the categorical column.
    categorical = [col for col in categorical_features if col in feature_columns]
    numeric = [col for col in feature_columns if col not in categorical]

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    transformers = [('num', numeric_transformer, numeric)]

    if categorical:
        # handle_unknown='ignore': a level missing from a training fold must
        # not make the validation fold (and therefore the mean) fail.
        categorical_transformer = Pipeline(steps=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'))])
        transformers.append(('cat', categorical_transformer, categorical))

    # ColumnTransformer selects the feature columns by name; the remaining
    # columns of X_train are dropped (default remainder).
    preprocessor = ColumnTransformer(transformers=transformers)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('logisticregression', LogisticRegression(random_state =2020, max_iter=10000))])

    return cross_val_score(pipeline, X_train, y_train, cv=5, scoring = scoring).mean()


# Evaluate Features With No Scoring and Venue Adjustments

In [7]:
# Game-level table of multi-window rolling stats, no score/venue adjustment.
no_sva_csv = 'data/all_games_multirolling_noSVA.csv'
df = pd.read_csv(no_sva_csv)

In [8]:
# Peek at the first five games to sanity-check the load.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_3_FF%_5v5,home_last_3_GF%_5v5,home_last_3_xGF%_5v5,home_last_3_SH%,home_last3_pp_TOI_per_game,home_last3_xGF_per_min_pp,home_last3_pk_TOI_per_game,home_last3_xGA_per_min_pk,home_B2B,home_last_5_FF%_5v5,home_last_5_GF%_5v5,home_last_5_xGF%_5v5,home_last_5_SH%,home_last5_pp_TOI_per_game,home_last5_xGF_per_min_pp,home_last5_pk_TOI_per_game,home_last5_xGA_per_min_pk,home_last_10_FF%_5v5,home_last_10_GF%_5v5,home_last_10_xGF%_5v5,home_last_10_SH%,home_last10_pp_TOI_per_game,home_last10_xGF_per_min_pp,home_last10_pk_TOI_per_game,home_last10_xGA_per_min_pk,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_last_30_FF%_5v5,home_last_30_GF%_5v5,home_last_30_xGF%_5v5,home_last_30_SH%,home_last30_pp_TOI_per_game,home_last30_xGF_per_min_pp,home_last30_pk_TOI_per_game,home_last30_xGA_per_min_pk,away_Team_Key,away_last_3_FF%_5v5,away_last_3_GF%_5v5,away_last_3_xGF%_5v5,away_last_3_SH%,away_last3_pp_TOI_per_game,away_last3_xGF_per_min_pp,away_last3_pk_TOI_per_game,away_last3_xGA_per_min_pk,away_B2B,away_last_5_FF%_5v5,away_last_5_GF%_5v5,away_last_5_xGF%_5v5,away_last_5_SH%,away_last5_pp_TOI_per_game,away_last5_xGF_per_min_pp,away_last5_pk_TOI_per_game,away_last5_xGA_per_min_pk,away_last_10_FF%_5v5,away_last_10_GF%_5v5,away_last_10_xGF%_5v5,away_last_10_SH%,away_last10_pp_TOI_per_game,away_last10_xGF_per_min_pp,away_last10_pk_TOI_per_game,away_last10_xGA_per_min_pk,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_
game,away_last20_xGA_per_min_pk,away_last_30_FF%_5v5,away_last_30_GF%_5v5,away_last_30_xGF%_5v5,away_last_30_SH%,away_last30_pp_TOI_per_game,away_last30_xGF_per_min_pp,away_last30_pk_TOI_per_game,away_last30_xGA_per_min_pk
0,0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,0.946716,0.259293,0.917127,Frederik Andersen,0.947245,0.043154,0.874317,WPG_2017-10-04,50.222222,60.0,46.666667,10.11236,3.444444,0.063871,4.005556,0.090707,0.0,52.560647,56.521739,51.729107,9.285714,4.19,0.079714,3.693333,0.098556,51.240876,50.0,51.89374,7.228916,5.128333,0.091453,4.315,0.128158,49.213483,51.190476,50.903164,8.977035,5.860833,0.102801,5.44,0.105423,49.276086,50.0,48.848563,8.720113,5.171111,0.119961,4.815,0.115057,TOR_2017-10-04,53.777778,50.0,51.673469,8.823529,6.45,0.10646,3.394444,0.111948,0.0,53.457447,47.368421,51.673469,7.086614,5.893333,0.06991,3.07,0.074267,50.712251,48.648649,50.580307,7.860262,5.073333,0.08042,4.235,0.114522,49.103448,48.611111,48.707092,7.526882,4.525833,0.118431,4.195,0.123361,50.022655,50.925926,50.175861,7.462687,4.538889,0.128519,4.363889,0.133978
1,1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,0.944706,0.45795,0.889831,Jake Allen,0.952147,0.199829,0.89881,PIT_2017-10-04,42.180095,50.0,46.91358,13.559322,2.761111,0.182294,3.683333,0.171041,0.0,40.960452,57.142857,45.309026,12.121212,3.336667,0.143856,3.546667,0.153383,43.557423,45.238095,43.04456,8.558559,3.633333,0.176422,3.358333,0.113151,47.997139,52.439024,48.360562,8.811475,4.7925,0.134168,4.569167,0.119825,49.062049,52.991453,50.927977,8.310992,4.693889,0.123707,4.812778,0.120859,STL_2017-10-04,43.661972,47.058824,47.54386,10.666667,7.283333,0.113043,5.25,0.067937,0.0,46.290801,50.0,50.637826,11.570248,6.0,0.096,4.966667,0.109128,50.678733,60.465116,55.805893,10.116732,5.313333,0.101255,4.9,0.088571,48.849295,64.705882,51.396973,8.835341,4.81,0.102079,5.041667,0.07914,48.991935,61.85567,52.271593,8.196721,4.613889,0.096376,4.832222,0.087882
2,2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cam Talbot,0.937824,0.1235,0.887892,Mike Smith,0.941538,0.119962,0.911877,EDM_2017-10-04,60.106383,66.666667,64.403829,9.090909,6.222222,0.109286,5.166667,0.078065,0.0,59.223301,57.142857,58.893871,8.571429,6.283333,0.113316,4.54,0.131278,53.588517,56.521739,53.596939,10.31746,6.091667,0.11409,4.815,0.132087,51.551855,60.0,53.204661,9.6,5.31,0.132015,4.020833,0.128953,50.875438,57.017544,51.825667,8.724832,4.756667,0.12684,3.927222,0.125874,CGY_2017-10-04,42.307692,45.454545,40.900563,8.928571,5.316667,0.136677,5.644444,0.08622,0.0,42.5,45.0,39.044289,9.375,4.816667,0.153218,5.853333,0.112415,46.463023,40.540541,41.550191,7.462687,5.438333,0.111983,5.585,0.10188,48.07396,54.794521,45.88685,9.070295,4.541667,0.118899,4.8525,0.099227,49.484004,54.385965,47.297977,9.15805,4.732778,0.12755,4.905,0.116276
3,3,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,0.937824,0.1235,0.887892,Mike Smith,0.941538,0.119962,0.911877,EDM_2017-10-04,60.106383,66.666667,64.403829,9.090909,6.222222,0.109286,5.166667,0.078065,0.0,59.223301,57.142857,58.893871,8.571429,6.283333,0.113316,4.54,0.131278,53.588517,56.521739,53.596939,10.31746,6.091667,0.11409,4.815,0.132087,51.551855,60.0,53.204661,9.6,5.31,0.132015,4.020833,0.128953,50.875438,57.017544,51.825667,8.724832,4.756667,0.12684,3.927222,0.125874,CGY_2017-10-04,42.307692,45.454545,40.900563,8.928571,5.316667,0.136677,5.644444,0.08622,0.0,42.5,45.0,39.044289,9.375,4.816667,0.153218,5.853333,0.112415,46.463023,40.540541,41.550191,7.462687,5.438333,0.111983,5.585,0.10188,48.07396,54.794521,45.88685,9.070295,4.541667,0.118899,4.8525,0.099227,49.484004,54.385965,47.297977,9.15805,4.732778,0.12755,4.905,0.116276
4,4,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,0.925575,-0.456992,0.825688,Brian Elliott,0.945069,0.172382,0.865,S.J_2017-10-04,52.694611,63.636364,49.312169,12.280702,4.45,0.098876,6.3,0.115344,0.0,52.861953,57.894737,51.612903,10.091743,4.62,0.118615,4.763333,0.137299,53.015873,45.238095,52.869664,8.296943,3.926667,0.129117,4.883333,0.13802,53.213166,49.295775,53.15662,7.692308,4.8,0.1075,4.711667,0.123629,52.961672,52.727273,52.613531,7.988981,4.742778,0.1095,4.49,0.131552,PHI_2017-10-04,55.617978,66.666667,57.032008,7.792208,4.672222,0.11415,5.777778,0.110192,0.0,51.351351,57.142857,49.740035,6.779661,5.173333,0.137242,5.963333,0.086864,51.294498,60.606061,52.518819,8.196721,4.32,0.172222,4.793333,0.117872,50.595711,57.575758,52.199883,8.0,5.54,0.146841,4.888333,0.096966,51.322751,48.913043,51.151585,6.293706,5.412222,0.141778,4.803333,0.093963


In [9]:
# Full column inventory, used to build the feature sets below.
df.columns.tolist()

['Unnamed: 0',
 'game_id',
 'date',
 'venue',
 'home_team',
 'away_team',
 'start_time',
 'home_score',
 'away_score',
 'status',
 'Home_Team_Won',
 'Home_Team_Key',
 'Away_Team_Key',
 'home_goalie',
 'home_Last_20_FenwickSV%',
 'home_Last_20_GSAx/60',
 'home_Last_20_HDCSV%',
 'away_goalie',
 'away_Last_20_FenwickSV%',
 'away_Last_20_GSAx/60',
 'away_Last_20_HDCSV%',
 'home_Team_Key',
 'home_last_3_FF%_5v5',
 'home_last_3_GF%_5v5',
 'home_last_3_xGF%_5v5',
 'home_last_3_SH%',
 'home_last3_pp_TOI_per_game',
 'home_last3_xGF_per_min_pp',
 'home_last3_pk_TOI_per_game',
 'home_last3_xGA_per_min_pk',
 'home_B2B',
 'home_last_5_FF%_5v5',
 'home_last_5_GF%_5v5',
 'home_last_5_xGF%_5v5',
 'home_last_5_SH%',
 'home_last5_pp_TOI_per_game',
 'home_last5_xGF_per_min_pp',
 'home_last5_pk_TOI_per_game',
 'home_last5_xGA_per_min_pk',
 'home_last_10_FF%_5v5',
 'home_last_10_GF%_5v5',
 'home_last_10_xGF%_5v5',
 'home_last_10_SH%',
 'home_last10_pp_TOI_per_game',
 'home_last10_xGF_per_min_pp',
 'home_la

In [10]:
#define different feature rolling game sets
def rolling_feature_set(window):
    """Return the model columns for one rolling-window size.

    Every set shares the 20-game goalie stats and the two back-to-back flags,
    followed by the home and then the away team stats for ``window`` games.

    Parameters
    ----------
    window : int
        Rolling-window length in games (3, 5, 10, 20 or 30 in this notebook).

    Returns
    -------
    list of str
        24 column names, in a fixed order.
    """
    sides = ('home', 'away')
    goalie = [f'{side}_Last_20_{stat}'
              for side in sides
              for stat in ('FenwickSV%', 'GSAx/60', 'HDCSV%')]
    b2b = ['home_B2B', 'away_B2B']

    team = []
    for side in sides:
        # The data uses two spellings: 'last_<n>_' for the 5v5 / shooting
        # stats and 'last<n>_' (no underscore) for the special-teams stats.
        team += [f'{side}_last_{window}_{stat}'
                 for stat in ('FF%_5v5', 'GF%_5v5', 'xGF%_5v5', 'SH%')]
        team += [f'{side}_last{window}_{stat}'
                 for stat in ('pp_TOI_per_game', 'xGF_per_min_pp',
                              'pk_TOI_per_game', 'xGA_per_min_pk')]
    return goalie + b2b + team


def combine_feature_sets(*feature_sets):
    """Return the union of several feature sets without duplicates.

    Columns keep first-seen order. list(set(...)) would instead give an order
    that changes between kernel runs (string hashing is randomised), which
    makes the model's column order non-reproducible.
    """
    return list(dict.fromkeys(col for features in feature_sets for col in features))


r3 = rolling_feature_set(3)
r5 = rolling_feature_set(5)
r10 = rolling_feature_set(10)
r20 = rolling_feature_set(20)
r30 = rolling_feature_set(30)

all_r = combine_feature_sets(r3, r5, r10, r20, r30)

r3_30 = combine_feature_sets(r3, r30)
r5_30 = combine_feature_sets(r5, r30)
r10_30 = combine_feature_sets(r10, r30)
r_3_5_30 = combine_feature_sets(r3, r5, r30)
r_5_20 = combine_feature_sets(r5, r20)

In [11]:
# Candidate feature sets, keyed by the label shown in the printed CV results.
windows = {
    '3': r3,
    '5': r5,
    '10': r10,
    '20': r20,
    '30': r30,
    'all': all_r,
    '3+30': r3_30,
    '5+30': r5_30,
    '10+30': r10_30,
    '3+5+30': r_3_5_30,
    '5+20': r_5_20,
}

From the results below, longer rolling-game windows score better (the values are negative log loss, so closer to zero is better). Combining the 5-game window with a longer window shows some potential.

In [12]:
# Score each candidate feature set with the function's default scorer
# ('neg_log_loss'), so the printed values are negative: higher is better.
for label, feature_set in windows.items():
    cv_score = model_and_evaluate(df, feature_set)
    print(f'{label} rolling cv log loss = {cv_score}')

3 rolling cv log loss = -0.6861562495735097
5 rolling cv log loss = -0.6847321956176653
10 rolling cv log loss = -0.6809962171610652
20 rolling cv log loss = -0.6771632184554629
30 rolling cv log loss = -0.674385744275857
all rolling cv log loss = -0.6862683128672741
3+30 rolling cv log loss = -0.6776354571333818
5+30 rolling cv log loss = -0.6771719023613585
10+30 rolling cv log loss = -0.6788676522172316
3+5+30 rolling cv log loss = -0.6811070794335059
5+20 rolling cv log loss = -0.6807867901085245


In [14]:
# Same comparison as above, scored by classification accuracy instead.
for label, feature_set in windows.items():
    cv_score = model_and_evaluate(df, feature_set, "accuracy")
    print(f'{label} rolling cv accuracy = {cv_score}')

3 rolling cv accuracy = 0.5573552779464389
5 rolling cv accuracy = 0.5579314532277442
10 rolling cv accuracy = 0.5687883524745823
20 rolling cv accuracy = 0.575533651134049
30 rolling cv accuracy = 0.5802321926096262
all rolling cv accuracy = 0.5664423114188638
3+30 rolling cv accuracy = 0.5849285809638232
5+30 rolling cv accuracy = 0.581990000904311
10+30 rolling cv accuracy = 0.5778865821781837
3+5+30 rolling cv accuracy = 0.5737792878335723
5+20 rolling cv accuracy = 0.5687909362202385


## Using Select K Best

In [28]:
# Feature matrix (every candidate column) and target for SelectKBest.
# NOTE(review): unlike model_and_evaluate, no hold-out-season filter or
# dropna() is applied here - confirm that is intended.
X, y = df[all_r], df['Home_Team_Won']

In [15]:
# Treat the back-to-back flags as categorical rather than numeric.
for b2b_col in ('home_B2B', 'away_B2B'):
    df[b2b_col] = df[b2b_col].astype('category')

In [29]:
# Every candidate feature except the two back-to-back flags, which are cast to
# 'category' in this notebook and so are kept out of the scaling step.
# Derived from all_r instead of a pasted copy of its contents so the two cannot
# drift apart; sorted so the column order is identical on every kernel run.
numeric_features = sorted(set(all_r) - {'home_B2B', 'away_B2B'})

In [44]:

# Preprocessing objects plus the feature-selection pipeline.
# NOTE(review): the original author marked this cell as only tentatively
# verified, and `preprocessor` is not referenced by `skb_pipeline` in this cell.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# One-hot encoding of the 'home_B2B' / 'away_B2B' flags was drafted here but
# left disabled, so only the numeric transformer is registered.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
])

# Standardise every column, then keep the 20 best-scoring features.
skb_pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('skb', SelectKBest(k=20)),
])

In [34]:
# Preview the first rows of the feature matrix X.
# NOTE(review): this cell's execution count (34) is lower than the neighbouring
# cells (44-49), so the displayed preview may come from an earlier definition
# of X -- confirm with Restart & Run All.
X.head()

Unnamed: 0,away_last_20_xGF%_5v5,home_last_5_SH%,away_last_5_xGF%_5v5,away_last5_xGA_per_min_pk,home_last_3_FF%_5v5,away_last_5_FF%_5v5,home_last5_pk_TOI_per_game,away_last_3_FF%_5v5,home_last10_xGA_per_min_pk,home_last_20_xGF%_5v5,home_last_10_SH%,away_last3_pp_TOI_per_game,away_Last_20_FenwickSV%,home_last10_xGF_per_min_pp,home_last10_pk_TOI_per_game,away_last30_pk_TOI_per_game,away_last10_xGA_per_min_pk,away_last_30_GF%_5v5,away_last_30_SH%,home_last_5_GF%_5v5,away_last3_pk_TOI_per_game,home_last_5_xGF%_5v5,away_last10_pk_TOI_per_game,home_last20_xGA_per_min_pk,away_last_30_FF%_5v5,home_Last_20_HDCSV%,home_last5_pp_TOI_per_game,away_last_3_GF%_5v5,away_last20_xGA_per_min_pk,home_last30_xGF_per_min_pp,away_last_5_SH%,away_last_30_xGF%_5v5,home_Last_20_FenwickSV%,away_B2B,away_last10_xGF_per_min_pp,home_last3_pk_TOI_per_game,away_last_10_GF%_5v5,away_last20_xGF_per_min_pp,home_last_3_GF%_5v5,home_last_30_SH%,away_last30_xGF_per_min_pp,home_last_20_FF%_5v5,away_last5_pp_TOI_per_game,home_Last_20_GSAx/60,away_last_10_FF%_5v5,home_last_10_GF%_5v5,home_last20_pk_TOI_per_game,away_last_3_xGF%_5v5,away_last_3_SH%,away_last_10_xGF%_5v5,away_last30_pp_TOI_per_game,away_last20_pk_TOI_per_game,home_last30_xGA_per_min_pk,home_last_5_FF%_5v5,away_last_20_GF%_5v5,away_last_5_GF%_5v5,home_last_3_SH%,home_last_10_FF%_5v5,home_last_30_xGF%_5v5,away_last3_xGF_per_min_pp,home_last_20_GF%_5v5,home_last30_pk_TOI_per_game,home_last5_xGF_per_min_pp,home_last_30_FF%_5v5,home_last20_xGF_per_min_pp,home_B2B,away_last20_pp_TOI_per_game,away_last10_pp_TOI_per_game,away_last5_pk_TOI_per_game,away_Last_20_HDCSV%,home_last3_xGA_per_min_pk,home_last30_pp_TOI_per_game,away_last_20_FF%_5v5,home_last10_pp_TOI_per_game,away_last30_xGA_per_min_pk,home_last_3_xGF%_5v5,home_last20_pp_TOI_per_game,home_last3_pp_TOI_per_game,home_last_10_xGF%_5v5,away_Last_20_GSAx/60,home_last3_xGF_per_min_pp,away_last_10_SH%,away_last5_xGF_per_min_pp,home_last_20_SH%,home_last5_xGA_per_min_pk,away_last_20_SH%,home_last_
30_GF%_5v5,away_last3_xGA_per_min_pk
0,48.707092,9.285714,51.673469,0.074267,50.222222,53.457447,3.693333,53.777778,0.128158,50.903164,7.228916,6.45,0.947245,0.091453,4.315,4.363889,0.114522,50.925926,7.462687,56.521739,3.394444,51.729107,4.235,0.105423,50.022655,0.917127,4.19,50.0,0.123361,0.119961,7.086614,50.175861,0.946716,0.0,0.08042,4.005556,48.648649,0.118431,60.0,8.720113,0.128519,49.213483,5.893333,0.259293,50.712251,50.0,5.44,51.673469,8.823529,50.580307,4.538889,4.195,0.115057,52.560647,48.611111,47.368421,10.11236,51.240876,48.848563,0.10646,51.190476,4.815,0.079714,49.276086,0.102801,0.0,4.525833,5.073333,3.07,0.874317,0.090707,5.171111,49.103448,5.128333,0.133978,46.666667,5.860833,3.444444,51.89374,0.043154,0.063871,7.860262,0.06991,8.977035,0.098556,7.526882,50.0,0.111948
1,51.396973,12.121212,50.637826,0.109128,42.180095,46.290801,3.546667,43.661972,0.113151,48.360562,8.558559,7.283333,0.952147,0.176422,3.358333,4.832222,0.088571,61.85567,8.196721,57.142857,5.25,45.309026,4.9,0.119825,48.991935,0.889831,3.336667,47.058824,0.07914,0.123707,11.570248,52.271593,0.944706,0.0,0.101255,3.683333,60.465116,0.102079,50.0,8.310992,0.096376,47.997139,6.0,0.45795,50.678733,45.238095,4.569167,47.54386,10.666667,55.805893,4.613889,5.041667,0.120859,40.960452,64.705882,50.0,13.559322,43.557423,50.927977,0.113043,52.439024,4.812778,0.143856,49.062049,0.134168,0.0,4.81,5.313333,4.966667,0.89881,0.171041,4.693889,48.849295,3.633333,0.087882,46.91358,4.7925,2.761111,43.04456,0.199829,0.182294,10.116732,0.096,8.811475,0.153383,8.835341,52.991453,0.067937
2,45.88685,8.571429,39.044289,0.112415,60.106383,42.5,4.54,42.307692,0.132087,53.204661,10.31746,5.316667,0.941538,0.11409,4.815,4.905,0.10188,54.385965,9.15805,57.142857,5.644444,58.893871,5.585,0.128953,49.484004,0.887892,6.283333,45.454545,0.099227,0.12684,9.375,47.297977,0.937824,0.0,0.111983,5.166667,40.540541,0.118899,66.666667,8.724832,0.12755,51.551855,4.816667,0.1235,46.463023,56.521739,4.020833,40.900563,8.928571,41.550191,4.732778,4.8525,0.125874,59.223301,54.794521,45.0,9.090909,53.588517,51.825667,0.136677,60.0,3.927222,0.113316,50.875438,0.132015,0.0,4.541667,5.438333,5.853333,0.911877,0.078065,4.756667,48.07396,6.091667,0.116276,64.403829,5.31,6.222222,53.596939,0.119962,0.109286,7.462687,0.153218,9.6,0.131278,9.070295,57.017544,0.08622
3,45.88685,8.571429,39.044289,0.112415,60.106383,42.5,4.54,42.307692,0.132087,53.204661,10.31746,5.316667,0.941538,0.11409,4.815,4.905,0.10188,54.385965,9.15805,57.142857,5.644444,58.893871,5.585,0.128953,49.484004,0.887892,6.283333,45.454545,0.099227,0.12684,9.375,47.297977,0.937824,0.0,0.111983,5.166667,40.540541,0.118899,66.666667,8.724832,0.12755,51.551855,4.816667,0.1235,46.463023,56.521739,4.020833,40.900563,8.928571,41.550191,4.732778,4.8525,0.125874,59.223301,54.794521,45.0,9.090909,53.588517,51.825667,0.136677,60.0,3.927222,0.113316,50.875438,0.132015,0.0,4.541667,5.438333,5.853333,0.911877,0.078065,4.756667,48.07396,6.091667,0.116276,64.403829,5.31,6.222222,53.596939,0.119962,0.109286,7.462687,0.153218,9.6,0.131278,9.070295,57.017544,0.08622
4,52.199883,10.091743,49.740035,0.086864,52.694611,51.351351,4.763333,55.617978,0.13802,53.15662,8.296943,4.672222,0.945069,0.129117,4.883333,4.803333,0.117872,48.913043,6.293706,57.894737,5.777778,51.612903,4.793333,0.123629,51.322751,0.825688,4.62,66.666667,0.096966,0.1095,6.779661,51.151585,0.925575,0.0,0.172222,6.3,60.606061,0.146841,63.636364,7.988981,0.141778,53.213166,5.173333,-0.456992,51.294498,45.238095,4.711667,57.032008,7.792208,52.518819,5.412222,4.888333,0.131552,52.861953,57.575758,57.142857,12.280702,53.015873,52.613531,0.11415,49.295775,4.49,0.118615,52.961672,0.1075,0.0,5.54,4.32,5.963333,0.865,0.115344,4.742778,50.595711,3.926667,0.093963,49.312169,4.8,4.45,52.869664,0.172382,0.098876,8.196721,0.137242,7.692308,0.137299,8.0,52.727273,0.110192


In [45]:

# Feature matrix = every column named in all_r; target = the Home_Team_Won column.
target_col = 'Home_Team_Won'
X, y = k_best_df[all_r], k_best_df[target_col]

In [46]:
# Fit the scaling + SelectKBest pipeline on X and y.
skb_pipeline.fit(X,y)

Pipeline(steps=[('ss', StandardScaler()), ('skb', SelectKBest(k=20))])

In [47]:
# Pull the fitted selector out of the pipeline and ask which columns it kept.
skb_step = skb_pipeline.named_steps['skb']
cols = skb_step.get_support()

# Use that mask positionally to keep only the selected columns of X.
k_best_features = X.iloc[:, cols]

In [48]:
# Strip the leading 'home_last' / 'away_last' (any capitalisation, e.g.
# 'home_Last_20_GSAx/60') so the home and away versions of a statistic share
# one label. Columns without that prefix (e.g. 'away_B2B') are kept whole
# instead of being truncated by a fixed 9-character slice.
_side_prefixes = ('home_last', 'away_last')
k = [
    n[len(_side_prefixes[0]):] if n.lower().startswith(_side_prefixes) else n
    for n in k_best_features.columns
]

The SelectKBest output shows that GF%, xGF%, FF%, and GSAx/60 are the most important features. Because it selects a mix of 10-, 20-, and 30-game rolling windows, it is not helpful in narrowing the features down from that perspective.

In [49]:
# Show the selected feature labels in sorted order so entries with the same
# window/statistic appear next to each other.
sorted(k)

['_10_FF%_5v5',
 '_10_FF%_5v5',
 '_10_GF%_5v5',
 '_10_xGF%_5v5',
 '_10_xGF%_5v5',
 '_20_FF%_5v5',
 '_20_FF%_5v5',
 '_20_FenwickSV%',
 '_20_GF%_5v5',
 '_20_GF%_5v5',
 '_20_GSAx/60',
 '_20_xGF%_5v5',
 '_20_xGF%_5v5',
 '_30_FF%_5v5',
 '_30_FF%_5v5',
 '_30_GF%_5v5',
 '_30_GF%_5v5',
 '_30_xGF%_5v5',
 '_30_xGF%_5v5',
 '_5_FF%_5v5']

# Evaluate With Venue and Scoring Adjustments

In [84]:
# Load the scoring-and-venue-adjusted (SVA) multi-window game data.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# before running elsewhere.
sva_csv_path = '/Users/gschwaeber/Documents/Data_Science/Capstone/data/all_games_multirolling_SVA_3.csv'
sva = pd.read_csv(sva_csv_path)

In [9]:
# Preview the first rows of the SVA data.
sva.head()

Unnamed: 0.1,Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Goalie_FenwickSV%,home_Goalie_GSAx/60,home_Goalie_HDCSV%,away_goalie,away_Goalie_FenwickSV%,away_Goalie_GSAx/60,away_Goalie_HDCSV%,home_Game_Number,home_Team_Key,home_last_1_FF%_5v5,home_last_1_GF%_5v5,home_last_1_xGF%_5v5,home_last_1_SH%,home_last1_pp_TOI_per_game,home_last1_xGF_per_min_pp,home_last1_GF_per_min_pp,home_last1_pk_TOI_per_game,home_last1_xGA_per_min_pk,home_last1_GA_per_min_pk,home_last_3_FF%_5v5,home_last_3_GF%_5v5,home_last_3_xGF%_5v5,home_last_3_SH%,home_last3_pp_TOI_per_game,home_last3_xGF_per_min_pp,home_last3_GF_per_min_pp,home_last3_pk_TOI_per_game,home_last3_xGA_per_min_pk,home_last3_GA_per_min_pk,home_Last_Game_Date,home_Days_Since_Last_Game,home_B2B,home_last_5_FF%_5v5,home_last_5_GF%_5v5,home_last_5_xGF%_5v5,home_last_5_SH%,home_last5_pp_TOI_per_game,home_last5_xGF_per_min_pp,home_last5_GF_per_min_pp,home_last5_pk_TOI_per_game,home_last5_xGA_per_min_pk,home_last5_GA_per_min_pk,home_last_10_FF%_5v5,home_last_10_GF%_5v5,home_last_10_xGF%_5v5,home_last_10_SH%,home_last10_pp_TOI_per_game,home_last10_xGF_per_min_pp,home_last10_GF_per_min_pp,home_last10_pk_TOI_per_game,home_last10_xGA_per_min_pk,home_last10_GA_per_min_pk,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_GF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_last20_GA_per_min_pk,home_last_30_FF%_5v5,home_last_30_GF%_5v5,home_last_30_xGF%_5v5,home_last_30_SH%,home_last30_pp_TOI_per_game,home_last30_xGF_per_min_pp,home_last30_GF_per_min_pp,home_last30_pk_TOI_per_game,home_last30_xGA_per_min_pk,home_last30_GA_per_min_pk,home_last_40_FF%_5v5,home_last_40_GF%_5v5,home_last_40_xGF%_5v5,home_last_40_SH%,home_last40_pp_TOI_per_game,home_last40_xGF_per_min_pp,home_last40_GF_per_min_pp,home_last40_pk_TOI_per_game,hom
e_last40_xGA_per_min_pk,home_last40_GA_per_min_pk,home_last40_pp_TOI_per_game.1,home_last40_xGF_per_min_pp.1,home_last40_GF_per_min_pp.1,home_last40_pk_TOI_per_game.1,home_last40_xGA_per_min_pk.1,home_last40_GA_per_min_pk.1,home_last_50_FF%_5v5,home_last_50_GF%_5v5,home_last_50_xGF%_5v5,home_last_50_SH%,home_last50_pp_TOI_per_game,home_last50_xGF_per_min_pp,home_last50_GF_per_min_pp,home_last50_pk_TOI_per_game,home_last50_xGA_per_min_pk,home_last50_GA_per_min_pk,away_Game_Number,away_Team_Key,away_last_1_FF%_5v5,away_last_1_GF%_5v5,away_last_1_xGF%_5v5,away_last_1_SH%,away_last1_pp_TOI_per_game,away_last1_xGF_per_min_pp,away_last1_GF_per_min_pp,away_last1_pk_TOI_per_game,away_last1_xGA_per_min_pk,away_last1_GA_per_min_pk,away_last_3_FF%_5v5,away_last_3_GF%_5v5,away_last_3_xGF%_5v5,away_last_3_SH%,away_last3_pp_TOI_per_game,away_last3_xGF_per_min_pp,away_last3_GF_per_min_pp,away_last3_pk_TOI_per_game,away_last3_xGA_per_min_pk,away_last3_GA_per_min_pk,away_Last_Game_Date,away_Days_Since_Last_Game,away_B2B,away_last_5_FF%_5v5,away_last_5_GF%_5v5,away_last_5_xGF%_5v5,away_last_5_SH%,away_last5_pp_TOI_per_game,away_last5_xGF_per_min_pp,away_last5_GF_per_min_pp,away_last5_pk_TOI_per_game,away_last5_xGA_per_min_pk,away_last5_GA_per_min_pk,away_last_10_FF%_5v5,away_last_10_GF%_5v5,away_last_10_xGF%_5v5,away_last_10_SH%,away_last10_pp_TOI_per_game,away_last10_xGF_per_min_pp,away_last10_GF_per_min_pp,away_last10_pk_TOI_per_game,away_last10_xGA_per_min_pk,away_last10_GA_per_min_pk,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_GF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_last20_GA_per_min_pk,away_last_30_FF%_5v5,away_last_30_GF%_5v5,away_last_30_xGF%_5v5,away_last_30_SH%,away_last30_pp_TOI_per_game,away_last30_xGF_per_min_pp,away_last30_GF_per_min_pp,away_last30_pk_TOI_per_game,away_last30_xGA_per_min_pk,away_last30_GA_per_min_pk,away_last_40_FF%_5v
5,away_last_40_GF%_5v5,away_last_40_xGF%_5v5,away_last_40_SH%,away_last40_pp_TOI_per_game,away_last40_xGF_per_min_pp,away_last40_GF_per_min_pp,away_last40_pk_TOI_per_game,away_last40_xGA_per_min_pk,away_last40_GA_per_min_pk,away_last40_pp_TOI_per_game.1,away_last40_xGF_per_min_pp.1,away_last40_GF_per_min_pp.1,away_last40_pk_TOI_per_game.1,away_last40_xGA_per_min_pk.1,away_last40_GA_per_min_pk.1,away_last_50_FF%_5v5,away_last_50_GF%_5v5,away_last_50_xGF%_5v5,away_last_50_SH%,away_last50_pp_TOI_per_game,away_last50_xGF_per_min_pp,away_last50_GF_per_min_pp,away_last50_pk_TOI_per_game,away_last50_xGA_per_min_pk,away_last50_GA_per_min_pk,home_Rating.A.Pre,away_Rating.A.Pre,B2B_Status,Season
0,0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,0.937294,-0.202922,0.858462,Frederik Andersen,0.942516,0.082345,0.873171,1.0,WPG_2017-10-04,54.224396,100.0,59.846547,2.921562,4.0,0.0225,0.0,10.016667,0.10183,0.0,52.073795,62.416999,48.839009,10.258649,3.444444,0.063871,0.193548,4.005556,0.090707,0.0,2017-04-08 00:00:00,179 days 00:00:00.000000000,0.0,52.399869,57.080799,51.663405,9.426112,4.19,0.079714,0.095465,3.693333,0.098556,0.054152,50.977189,50.738779,51.924105,7.380972,5.128333,0.091453,0.136497,4.315,0.128158,0.0927,49.296838,51.281437,51.260619,8.999958,5.860833,0.102801,0.119437,5.44,0.105423,0.128676,49.171944,50.109126,49.007605,8.809652,5.171111,0.119961,0.116029,4.815,0.115057,0.131533,48.803377,50.127801,48.992719,9.025236,5.328333,0.112699,0.117297,4.923333,0.104858,0.137102,5.328333,0.112699,0.117297,4.923333,0.104858,0.137102,49.534654,50.287715,50.318327,8.75361,5.188667,0.111744,0.123346,5.087333,0.109448,0.141528,1.0,TOR_2017-10-04,51.969337,48.73737,47.757256,8.725136,8.0,0.11125,0.0,4.0,0.11,0.0,51.594385,47.355164,48.770492,8.692972,6.45,0.10646,0.05168,3.394444,0.111948,0.196399,2017-04-09 00:00:00,178 days 00:00:00.000000000,0.0,52.562502,45.9375,48.770492,6.967375,5.893333,0.06991,0.10181,3.07,0.074267,0.19544,50.792085,48.572198,49.886878,7.837427,5.073333,0.08042,0.137976,4.235,0.114522,0.165289,49.687136,49.188289,49.131362,7.552033,4.525833,0.118431,0.165715,4.195,0.123361,0.131108,50.085902,51.013795,50.0,7.493438,4.538889,0.128519,0.154223,4.363889,0.133978,0.122215,49.991679,51.399425,49.339386,8.124451,4.646667,0.1224,0.139885,4.54,0.133976,0.121145,4.646667,0.1224,0.139885,4.54,0.133976,0.121145,49.548771,51.598889,48.943911,8.502526,4.534,0.132422,0.172034,4.888,0.124182,0.106383,1495.03,1500.66,Neither,2017-2018
1,1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,0.941904,0.169541,0.877358,Jake Allen,0.941294,-0.239655,0.864516,1.0,PIT_2017-10-04,48.81846,52.538071,53.104925,8.585649,0.0,0.078125,-0.0,0.233333,0.214286,4.285714,44.169509,52.837327,49.560117,13.82154,2.761111,0.182294,0.362173,3.683333,0.171041,0.271493,2017-04-09 00:00:00,178 days 00:00:00.000000000,0.0,42.564205,59.064609,46.860987,12.093988,3.336667,0.143856,0.2997,3.546667,0.153383,0.225564,43.807042,45.254958,43.294064,8.673423,3.633333,0.176422,0.192661,3.358333,0.113151,0.17866,48.750126,52.677702,49.122002,8.77659,4.7925,0.134168,0.166927,4.569167,0.119825,0.131315,50.060225,53.269034,51.932644,8.229601,4.693889,0.123707,0.142029,4.812778,0.120859,0.110816,50.828439,56.868932,51.954595,9.060588,4.705417,0.124909,0.138139,4.774167,0.129028,0.10473,4.705417,0.124909,0.138139,4.774167,0.129028,0.10473,50.876293,54.822144,51.887863,8.966081,4.849333,0.127894,0.148474,4.915667,0.121448,0.113921,1.0,STL_2017-10-04,39.115696,57.58197,50.0,13.51611,8.0,0.1,0.0,4.0,0.04,0.0,45.437316,48.43562,49.468085,10.395145,7.283333,0.113043,0.1373,5.25,0.067937,0.0,2017-04-09 00:00:00,178 days 00:00:00.000000000,0.0,46.882217,49.927641,51.204482,11.358025,6.0,0.096,0.1,4.966667,0.109128,0.040268,51.762995,60.235018,56.574746,9.833864,5.313333,0.101255,0.075282,4.9,0.088571,0.081633,50.637795,64.851704,52.974711,8.595064,4.81,0.102079,0.10395,5.041667,0.07914,0.069421,50.657828,62.391842,53.831533,8.068254,4.613889,0.096376,0.108368,4.832222,0.087882,0.068981,50.633643,58.184556,52.486645,8.420932,4.315417,0.102018,0.115864,4.92875,0.097844,0.086229,4.315417,0.102018,0.115864,4.92875,0.097844,0.086229,49.594194,55.39352,51.243102,8.567742,4.491,0.092095,0.12024,5.094333,0.09485,0.106,1577.1,1535.17,Neither,2017-2018
2,2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cam Talbot,0.942492,0.302087,0.897778,Mike Smith,0.938246,-0.097423,0.878613,1.0,EDM_2017-10-04,66.714754,65.935919,70.066519,9.403559,8.166667,0.036735,0.122449,2.0,0.115,0.0,61.871942,67.470882,66.234888,8.840201,6.222222,0.109286,0.214286,5.166667,0.078065,0.064516,2017-04-09 00:00:00,178 days 00:00:00.000000000,0.0,60.511924,58.385392,60.180542,8.478124,6.283333,0.113316,0.190981,4.54,0.131278,0.132159,54.257609,57.390731,54.170931,10.202647,6.091667,0.11409,0.147743,4.815,0.132087,0.062305,52.167745,59.967217,53.263517,9.399953,5.31,0.132015,0.178908,4.020833,0.128953,0.124352,51.448176,57.110051,52.072539,8.603438,4.756667,0.12684,0.15417,3.927222,0.125874,0.135804,50.407241,56.575634,49.851785,9.02546,4.6825,0.132248,0.149493,4.23375,0.116445,0.112194,4.6825,0.132248,0.149493,4.23375,0.116445,0.112194,50.909452,56.474007,50.031878,8.643084,4.692,0.127408,0.144928,4.306333,0.132828,0.125397,1.0,CGY_2017-10-04,44.264093,34.35374,50.196078,6.184936,6.0,0.095,0.0,3.35,0.253731,0.298507,44.065149,47.90146,43.499044,9.144748,5.316667,0.136677,0.062696,5.644444,0.08622,0.059055,2017-04-08 00:00:00,179 days 00:00:00.000000000,0.0,43.520998,45.427286,40.305523,9.286882,4.816667,0.153218,0.16609,5.853333,0.112415,0.068337,46.986842,40.884428,42.363112,7.444549,5.438333,0.111983,0.147104,5.585,0.10188,0.089526,48.845951,54.384033,46.495147,8.859738,4.541667,0.118899,0.13211,4.8525,0.099227,0.082432,50.449932,54.47604,48.071667,9.030071,4.732778,0.12755,0.119732,4.905,0.116276,0.095141,50.595552,50.499508,49.136336,7.879167,4.921667,0.120843,0.11683,5.185417,0.107127,0.106067,4.921667,0.120843,0.11683,5.185417,0.107127,0.106067,51.141322,49.110119,49.803726,7.689586,5.240333,0.126366,0.133579,5.553667,0.100942,0.093632,1522.11,1496.85,Neither,2017-2018
3,3,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,0.934447,-0.164139,0.869266,Brian Elliott,0.938305,-0.080476,0.848,1.0,S.J_2017-10-04,55.735907,65.646259,49.803922,9.52616,3.35,0.253731,0.298507,6.0,0.095,0.0,54.394882,62.234534,49.575372,11.527279,4.45,0.098876,0.074906,6.3,0.115344,0.10582,2017-04-08 00:00:00,179 days 00:00:00.000000000,0.0,54.316401,57.771883,52.571429,9.804628,4.62,0.118615,0.04329,4.763333,0.137299,0.125962,53.012374,45.443587,52.992908,8.268632,3.926667,0.129117,0.101868,4.883333,0.13802,0.163823,52.824405,48.85344,52.718405,7.64124,4.8,0.1075,0.083333,4.711667,0.123629,0.127343,53.049309,52.608934,52.598249,7.963363,4.742778,0.1095,0.084339,4.49,0.131552,0.13363,52.890654,53.260259,52.809227,7.970138,4.778333,0.105738,0.099407,4.379167,0.120913,0.125595,4.778333,0.105738,0.099407,4.379167,0.120913,0.125595,52.379287,54.209026,51.733083,8.489525,4.903,0.102794,0.09382,4.416333,0.117247,0.117745,1.0,PHI_2017-10-04,55.269201,56.93878,59.50783,8.184218,2.0,0.055,0.0,5.783333,0.216138,0.172911,55.404341,64.739229,56.31068,7.457229,4.672222,0.11415,0.071344,5.777778,0.110192,0.115385,2017-04-09 00:00:00,178 days 00:00:00.000000000,0.0,51.909534,56.272661,49.941995,6.524847,5.173333,0.137242,0.115979,5.963333,0.086864,0.100615,52.461604,60.718636,53.426249,8.037406,4.32,0.172222,0.115741,4.793333,0.117872,0.104312,51.361625,58.109135,52.868013,7.967239,5.54,0.146841,0.072202,4.888333,0.096966,0.153427,51.47572,49.303136,51.280317,6.34076,5.412222,0.141778,0.092383,4.803333,0.093963,0.131853,51.197815,45.246898,50.855171,5.932286,5.57125,0.143998,0.103208,5.305,0.093779,0.131951,5.57125,0.143998,0.103208,5.305,0.093779,0.131951,50.880317,43.291024,50.88389,5.745645,5.329667,0.143349,0.097567,5.097667,0.092709,0.129471,1525.37,1496.86,Neither,2017-2018
4,4,2017020005,2017-10-05,TD Garden,BOS,NSH,2017-10-05 23:00:00,4,3,Final,1,BOS_2017-10-05,NSH_2017-10-05,Tuukka Rask,0.933383,-0.310233,0.830721,Pekka Rinne,0.939698,-0.346771,0.839117,1.0,BOS_2017-10-05,42.398042,22.384428,45.864662,5.085683,4.0,0.0525,0.0,6.583333,0.04557,0.0,52.619501,30.693069,50.320807,2.564103,2.35,0.055319,0.283688,5.194444,0.046203,0.0,2017-04-08 00:00:00,180 days 00:00:00.000000000,0.0,52.400715,48.959081,50.929752,5.518246,2.69,0.098885,0.297398,5.446667,0.067197,0.03672,51.890397,48.997853,50.060569,6.215664,4.028333,0.11146,0.198593,6.178333,0.067818,0.064742,54.492135,54.287823,51.801536,7.607504,3.994167,0.127937,0.175256,5.6325,0.090546,0.088771,54.721558,53.584008,53.144172,8.31625,4.248333,0.123735,0.196155,5.275,0.089921,0.094787,55.762037,48.882718,54.871795,7.303942,4.482083,0.129293,0.189644,5.193333,0.084868,0.101091,4.482083,0.129293,0.189644,5.193333,0.084868,0.101091,56.472532,49.064324,55.811512,6.98974,4.848333,0.124785,0.165005,5.357,0.095352,0.089602,1.0,NSH_2017-10-05,45.775604,3.861645e-13,40.153453,1.298032e-14,10.016667,0.10183,0.0,4.0,0.0225,0.0,46.836396,61.161826,42.857143,9.393321,6.416667,0.125714,0.051948,3.333333,0.125,0.0,2017-04-08 00:00:00,180 days 00:00:00.000000000,0.0,47.102597,52.130045,43.6373,7.311321,6.066667,0.142088,0.065934,3.63,0.107438,0.0,50.832621,58.359804,49.720854,8.509832,5.921667,0.092035,0.067549,4.27,0.082436,0.070258,50.902461,56.250879,50.239234,8.154113,4.87,0.09384,0.082136,4.055,0.11381,0.135635,50.995332,52.058455,49.453715,8.414545,4.602222,0.094085,0.137615,4.324444,0.10722,0.154162,51.309591,52.122642,50.381002,7.885816,4.720833,0.087855,0.121801,4.475833,0.102718,0.128468,4.720833,0.087855,0.121801,4.475833,0.102718,0.128468,50.295362,52.858171,50.045846,7.866908,4.911667,0.084086,0.109942,4.781,0.095796,0.11713,1521.29,1545.81,Neither,2017-2018


In [10]:
# List every column name in the SVA data (used to build the feature lists below).
list(sva.columns)

['Unnamed: 0',
 'game_id',
 'date',
 'venue',
 'home_team',
 'away_team',
 'start_time',
 'home_score',
 'away_score',
 'status',
 'Home_Team_Won',
 'Home_Team_Key',
 'Away_Team_Key',
 'home_goalie',
 'home_Goalie_FenwickSV%',
 'home_Goalie_GSAx/60',
 'home_Goalie_HDCSV%',
 'away_goalie',
 'away_Goalie_FenwickSV%',
 'away_Goalie_GSAx/60',
 'away_Goalie_HDCSV%',
 'home_Game_Number',
 'home_Team_Key',
 'home_last_1_FF%_5v5',
 'home_last_1_GF%_5v5',
 'home_last_1_xGF%_5v5',
 'home_last_1_SH%',
 'home_last1_pp_TOI_per_game',
 'home_last1_xGF_per_min_pp',
 'home_last1_GF_per_min_pp',
 'home_last1_pk_TOI_per_game',
 'home_last1_xGA_per_min_pk',
 'home_last1_GA_per_min_pk',
 'home_last_3_FF%_5v5',
 'home_last_3_GF%_5v5',
 'home_last_3_xGF%_5v5',
 'home_last_3_SH%',
 'home_last3_pp_TOI_per_game',
 'home_last3_xGF_per_min_pp',
 'home_last3_GF_per_min_pp',
 'home_last3_pk_TOI_per_game',
 'home_last3_xGA_per_min_pk',
 'home_last3_GA_per_min_pk',
 'home_Last_Game_Date',
 'home_Days_Since_Last_Game

In [76]:
# Columns appended to every window's feature list.
common = ['home_Goalie_FenwickSV%',
 'home_Goalie_GSAx/60',
 'home_Goalie_HDCSV%',
 'away_Goalie_FenwickSV%',
 'away_Goalie_GSAx/60',
 'away_Goalie_HDCSV%',
 'home_Rating.A.Pre',
 'away_Rating.A.Pre',
 'B2B_Status']


def _window_features(window):
    """Return the feature columns for one rolling-window size.

    Parameters
    ----------
    window : int
        Number of games in the rolling window (e.g. 5 for the last 5 games).

    Returns
    -------
    list of str
        The eight per-team statistics for the home side, then the same eight
        for the away side, followed by the entries of ``common``.
    """
    # The 5v5 / SH% columns are named 'last_<n>_...', while the special-teams
    # columns are named 'last<n>_...' (no underscore before the number).
    side_stats = [
        f'last_{window}_FF%_5v5',
        f'last_{window}_GF%_5v5',
        f'last_{window}_xGF%_5v5',
        f'last_{window}_SH%',
        f'last{window}_xGF_per_min_pp',
        f'last{window}_GF_per_min_pp',
        f'last{window}_xGA_per_min_pk',
        f'last{window}_GA_per_min_pk',
    ]
    return [f'{side}_{stat}' for side in ('home', 'away') for stat in side_stats] + common


def _merge_features(*feature_lists):
    """Concatenate feature lists, dropping duplicates but keeping first-seen order.

    ``list(set(...))`` would also de-duplicate, but its ordering depends on
    string hashing and can differ between kernel sessions; ``dict.fromkeys``
    keeps the column order reproducible.
    """
    return list(dict.fromkeys(col for features in feature_lists for col in features))


r1 = _window_features(1)
r3 = _window_features(3)
r5 = _window_features(5)
r10 = _window_features(10)
r20 = _window_features(20)
r30 = _window_features(30)
r40 = _window_features(40)
r50 = _window_features(50)

# NOTE: despite its name, all_r combines only the 3, 5, 10, 20 and 30 game
# windows; the 1, 40 and 50 game windows are not included.
all_r = _merge_features(r3, r5, r10, r20, r30)

# Mixed-window candidates.
r3_30 = _merge_features(r3, r30)
r5_30 = _merge_features(r5, r30)
r10_30 = _merge_features(r10, r30)
r_3_5_30 = _merge_features(r3, r5, r30)
r_5_20 = _merge_features(r5, r20)
r_5_40 = _merge_features(r5, r40)

In [77]:
# Label -> feature list for every candidate window (or window mix) to evaluate.
# Insertion order is the order in which results are printed.
windows = {
    '1': r1,
    '3': r3,
    '5': r5,
    '10': r10,
    '20': r20,
    '30': r30,
    '40': r40,
    '50': r50,
    'all': all_r,
    '3+30': r3_30,
    '5+30': r5_30,
    '10+30': r10_30,
    '3+5+30': r_3_5_30,
    '5+20': r_5_20,
    '5+40': r_5_40,
}

After evaluating the data with no scoring and venue adjustment (SVA), I added a 40-game rolling window to review with the SVA data. The 40-game window is the most promising, with the lowest log loss. The combined 5- and 40-game windows also show promise. Additionally, comparing the 30-game window from the raw data with the SVA data, the SVA data has a lower log loss and appears to be superior.

In [70]:
# Find rows whose 1-game-window features contain infinite values so they can
# be dropped in the next cell.
pd.set_option("display.max_rows", 20)

# Rows without a home pre-game rating are discarded first.
sva1 = sva.dropna(subset = ['home_Rating.A.Pre'])

# np.isinf only accepts numeric input, and r1 also lists the text column
# 'B2B_Status', so restrict the check to the numeric columns.
r1_numeric = sva1[r1].select_dtypes(include='number')

# axis is passed by keyword: positional `.any(1)` is rejected by pandas >= 2.0.
r = r1_numeric.index[np.isinf(r1_numeric).any(axis=1)]

r

Int64Index([ 161,  308,  346,  399,  450,  451,  650,  872,  940, 1075, 1376,
            1608, 1768, 2008, 2223, 2244, 2265, 2410, 2461, 2535, 2548, 2728,
            2938, 3053, 3236, 3469, 3518, 3867, 3877, 3914, 3922, 4054, 4173,
            4174, 4177, 4371],
           dtype='int64')

In [71]:
# Remove the rows flagged above as containing infinite values.
sva1 = sva1.drop(labels=r, axis=0)

In [78]:
# Print the cross-validated log-loss score for every candidate feature set.
# The loop names are deliberately not `k`/`v`, so the notebook-level list `k`
# built in the SelectKBest section is not overwritten.
# NOTE(review): the printed scores are negative, presumably because
# model_and_evaluate uses scikit-learn's 'neg_log_loss' scoring -- confirm.
for window_name, window_cols in windows.items():
    print(f'{window_name} rolling cv log loss = {model_and_evaluate(sva1, window_cols)}')

1 rolling cv log loss = -0.681743840701009
3 rolling cv log loss = -0.6824750160737495
5 rolling cv log loss = -0.6806028925591109
10 rolling cv log loss = -0.6790897151577344
20 rolling cv log loss = -0.680232903596563
30 rolling cv log loss = -0.6803889424949425
40 rolling cv log loss = -0.6786612628646052
50 rolling cv log loss = -0.679133616838792
all rolling cv log loss = -0.6993469447370013
3+30 rolling cv log loss = -0.6867413175473175
5+30 rolling cv log loss = -0.6849884213438509
10+30 rolling cv log loss = -0.6854996362496376
3+5+30 rolling cv log loss = -0.6904086232018225
5+20 rolling cv log loss = -0.6852509541316462
5+40 rolling cv log loss = -0.6827305174117813


In [79]:
# Print the cross-validated accuracy for every candidate feature set.
# The loop names are deliberately not `k`/`v`, so the notebook-level list `k`
# built in the SelectKBest section is not overwritten.
for window_name, window_cols in windows.items():
    print(f'{window_name} rolling cv accuracy = {model_and_evaluate(sva1, window_cols, "accuracy")}')

1 rolling cv accuracy = 0.5711180124223602
3 rolling cv accuracy = 0.5680124223602485
5 rolling cv accuracy = 0.5686335403726708
10 rolling cv accuracy = 0.5754658385093168
20 rolling cv accuracy = 0.5686335403726708
30 rolling cv accuracy = 0.567391304347826
40 rolling cv accuracy = 0.5711180124223602
50 rolling cv accuracy = 0.5720496894409939
all rolling cv accuracy = 0.5484472049689442
3+30 rolling cv accuracy = 0.5636645962732919
5+30 rolling cv accuracy = 0.5636645962732919
10+30 rolling cv accuracy = 0.5642857142857143
3+5+30 rolling cv accuracy = 0.5531055900621118
5+20 rolling cv accuracy = 0.5546583850931677
5+40 rolling cv accuracy = 0.5664596273291925


# Conclusion

Using venue and scoring adjustment for the 5v5 data looks promising. It also seems that the more games in the rolling window, the more predictive the features are; however, log loss stops improving after 40 games (the 50-game window scores slightly worse than the 40-game window). Including a mix of 5- and 40-game rolling features could provide the best mix of capturing season-long team evaluation with short-term streakiness. When modeling, I will try using 40 games only, 5 and 40 games together, and recursive feature elimination on all windows to see if that algorithm can find patterns within the feature set.