In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import hockey_scraper
pd.set_option('display.max_columns', 100)

In [83]:
nst_to_sched = { 'Anaheim Ducks': 'ANA', 
                      'Arizona Coyotes' : 'ARI', 
                      'Boston Bruins': 'BOS', 
                      'Buffalo Sabres':'BUF',
                      'Calgary Flames': 'CGY', 
                      'Carolina Hurricanes': 'CAR', 
                      'Chicago Blackhawks': 'CHI', 
                      'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A.',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J.',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J.',
                     'St. Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B.',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                      'Vegas Golden Knights':'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}

In [15]:
goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}

In [86]:
#test if i should switch to score and venu adjusted?
#sit = sva
#test 5,10,15 rolling as well
def get_and_format_nst_team_stats(season, sit, rate):
    nst_to_sched = {'Anaheim Ducks': 'ANA',
                     'Arizona Coyotes': 'ARI',
                     'Boston Bruins': 'BOS',
                     'Buffalo Sabres': 'BUF',
                     'Calgary Flames': 'CGY',
                     'Carolina Hurricanes': 'CAR',
                     'Chicago Blackhawks': 'CHI',
                     'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J',
                     'St Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                     'Vegas Golden Knights': 'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}
    
    url = 'https://www.naturalstattrick.com/games.php?fromseason={}&thruseason={}&stype=2&sit={}&loc=B&team=All&rate={}'.format(
        season,
        season,
        sit,
        rate)
    df = pd.read_html(url, header=0, index_col = 0, na_values=["-"])[0]
    df.reset_index(inplace = True)
    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Game_Number'] = df.groupby('Team').cumcount() + 1
    #rename Team_Date to team key or something like that
    df = df.replace({'Team': nst_to_sched})

    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)
    return df

In [17]:
def merge_team_stats(primary_df, pp_df, pk_df):
    primary_df = primary_df.merge(pk_df[['Team_Key', 'TOI', 'xGA']], on = 'Team_Key', how = 'left', suffixes = ('','_pk') )
    primary_df = primary_df.merge(pp_df[['Team_Key', 'TOI', 'xGF']], on = 'Team_Key', how = 'left', suffixes = ('','_pp') )
    return primary_df

In [111]:
def calculate_team_features(df, rolling_games = 20):
    df['sum_rolling20_TOI_5v5'] = df.groupby('Team')['TOI'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_GF_5v5'] = df.groupby('Team')['GF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_GA_5v5'] = df.groupby('Team')['GA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGF_5v5'] = df.groupby('Team')['xGF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGA_5v5'] = df.groupby('Team')['xGA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_SF_5v5'] = df.groupby('Team')['SF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last_20_FF%_5v5'] = df['sum_rolling20_FF_5v5']*100/ (df['sum_rolling20_FF_5v5']+df['sum_rolling20_FA_5v5'])
    df['last_20_GF%_5v5'] = df['sum_rolling20_GF_5v5']*100/ (df['sum_rolling20_GF_5v5']+df['sum_rolling20_GA_5v5'])
    df['last_20_xGF%_5v5'] = df['sum_rolling20_xGF_5v5']*100/ (df['sum_rolling20_xGF_5v5']+df['sum_rolling20_xGA_5v5'])
    df['last_20_SH%'] = df['sum_rolling20_GF_5v5']*100 / df['sum_rolling20_SF_5v5']
    
    
    #fix NaNs in pp and pk features
    df['TOI_pp'] = np.where(df['TOI_pp'].isna(), 0, df['TOI_pp'])
    df['TOI_pk'] = np.where(df['TOI_pk'].isna(), 0, df['TOI_pk'])
    df['xGF_pp'] = np.where(df['xGF_pp'].isna(), 0, df['xGF_pp'])
    df['xGA_pk'] = np.where(df['xGA_pk'].isna(), 0, df['xGA_pk'])
    
    #pp features
    df['sum_rolling20_TOI_pp'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGF_pp'] = df.groupby('Team')['xGF_pp'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last20_pp_TOI_per_game'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(20, 20).mean().shift())
    df['last20_xGF_per_min_pp'] = df['sum_rolling20_xGF_pp'] / df['sum_rolling20_TOI_pp'] 
    
    #pk features
    df['sum_rolling20_TOI_pk'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGA_pk'] = df.groupby('Team')['xGA_pk'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last20_pk_TOI_per_game'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(20, 20).mean().shift())
    df['last20_xGA_per_min_pk'] = df['sum_rolling20_xGA_pk'] / df['sum_rolling20_TOI_pk'] 
    
    #to get back to back category
    df['Last_Game_Date'] = df.groupby('Team')['Date'].shift()
    df['Days_Since_Last_Game'] = df['Date'] - df['Last_Game_Date']
    df['B2B'] = np.where(df['Days_Since_Last_Game'] == '1 days', 1, 0)
    
    return df

In [51]:
#get starters
def get_starters(year):
    goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}
    counter = 0
    for k,v in goalie_table_teams.items():
        print(k)
        starter_url = 'http://hockeygoalies.org/bio/nhl/logs/{}{}.html'.format(v, year)
        goalies = pd.read_html(starter_url)[0]
        goalies.replace(to_replace=['(BU)', np.NaN], value = 'DNP', inplace = True)
        goalies.drop(columns = ['DEC'], inplace = True)
        goalies.drop(index  = goalies.iloc[-1].name, inplace = True)
        goalies['starter'] = 'placeholder'

        starter = []
        for i, row in goalies.iterrows():
            for n in range(len(row)):
                if row[n][0] == 'W' or row[n][0] == 'L':
                    starter.append(goalies.columns[n])

        goalies['starter'] = starter
        goalies['Team'] = k
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['Team_Key'] = goalies['Team'].astype(str)+'_'+goalies['DATE'].astype(str)
        columns = ['Team','DATE', 'OPPONENT', 'starter', 'Team_Key']
        if counter == 0:
            master = goalies
        if counter != 0:
            master = pd.concat( [master[columns], goalies[columns]])
        counter +=1
    return master
                

In [55]:
#'2017-10-04' to '2018-04-08'
def get_game_results(season_start, season_end):
    sched_df = hockey_scraper.scrape_schedule(season_start, season_end)
    sched_df['Home_Team_Won'] = np.where(sched_df['home_score'] > sched_df['away_score'], 1, 0)
    sched_df['Home_Team_Key'] = sched_df['home_team'].astype(str)+'_'+sched_df['date'].astype(str)
    sched_df['Away_Team_Key'] = sched_df['away_team'].astype(str)+'_'+sched_df['date'].astype(str)
    return sched_df

In [21]:
def merge_starters_and_features(game_results_df, starters_df, features_df, feature_columns):
    df = game_results_df.merge(starters_df[['Team_Key', 'starter']], left_on = 'Home_Team_Key', right_on = 'Team_Key', how = 'left').rename(columns ={'starter':'home_starter'}).drop(columns = 'Team_Key')
    df = df.merge(starters_df[['Team_Key', 'starter']], left_on = 'Away_Team_Key', right_on = 'Team_Key', how = 'left').rename(columns ={'starter':'away_starter'}).drop(columns = 'Team_Key')
    df = df.merge(features_df[feature_columns].add_prefix('home_'), left_on = 'Home_Team_Key', right_on = 'home_Team_Key', how = 'left')
    df = df.merge(features_df[feature_columns].add_prefix('away_'), left_on = 'Away_Team_Key', right_on = 'away_Team_Key', how = 'left')
    return df

In [22]:
feature_columns = ['Team_Key', 'last_20_FF%_5v5', 'last_20_GF%_5v5', 'last_20_xGF%_5v5', 'last_20_SH%', 'last20_pp_TOI_per_game', 'last20_xGF_per_min_pp','last20_pk_TOI_per_game', 'last20_xGA_per_min_pk', 'B2B']

### 2017-2018 Season

In [87]:
primary = get_and_format_nst_team_stats('20172018','5v5', 'n')
pp = get_and_format_nst_team_stats('20172018','pp', 'n')
pk = get_and_format_nst_team_stats('20172018','pk', 'n')

In [88]:
features = merge_team_stats(primary,pp,pk)

In [89]:
features = calculate_team_features(features)

In [90]:
features.tail()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling20_TOI_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_FF%_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
2537,"2018-04-07 - Canucks 2, Oilers 3",VAN,Limited ReportFull Report,51.583333,52,54,49.06,44,40,52.38,32,28,53.33,2,0,100.0,2.73,2.99,47.71,28,32,46.67,15,16,48.39,12,9,57.14,2,0,100.0,16.67,100.0,13,16,44.83,6,12,33.33,0,0,,0.0,100.0,22,16,57.89,12,5,70.59,0,0,,0.0,100.0,6.25,100.0,1.063,18347,2018-04-07,82,VAN_2018-04-07,1.516667,0.31,4.966667,0.44,997.166667,676.0,727.0,32.0,40.0,34.26,39.28,480.0,48.182466,44.444444,46.586891,6.666667,69.0,8.72,3.45,0.126377,105.366667,9.51,5.268333,0.090256,2018-04-05,2 days,0
2538,"2018-04-07 - Stars 4, Kings 2",DAL,Limited ReportFull Report,50.7,28,56,33.33,24,44,35.29,16,35,31.37,4,2,66.67,2.15,1.75,55.18,22,21,51.16,9,5,64.29,9,5,64.29,3,1,75.0,33.33,80.0,13,16,44.83,4,7,36.36,1,0,100.0,25.0,100.0,5,32,13.51,2,21,8.7,0,1,0.0,0.0,95.24,25.0,94.29,1.193,18230,2018-04-07,82,DAL_2018-04-07,6.0,0.14,2.0,0.17,958.366667,632.0,643.0,27.0,33.0,34.14,32.47,445.0,49.568627,45.0,51.253566,6.067416,94.066667,11.12,4.703333,0.118214,108.033333,14.14,5.401667,0.130886,2018-04-06,1 days,1
2539,"2018-04-07 - Stars 4, Kings 2",L.A,Limited ReportFull Report,50.7,56,28,66.67,44,24,64.71,35,16,68.63,2,4,33.33,1.75,2.15,44.82,21,22,48.84,5,9,35.71,5,9,35.71,1,3,25.0,20.0,66.67,16,13,55.17,7,4,63.64,0,1,0.0,0.0,75.0,32,5,86.49,21,2,91.3,1,0,100.0,4.76,100.0,5.71,75.0,0.807,18230,2018-04-07,82,L.A_2018-04-07,2.0,0.17,6.0,0.14,982.116667,679.0,688.0,41.0,35.0,32.65,35.32,481.0,49.670812,53.947368,48.035898,8.523909,92.366667,13.04,4.618333,0.141176,94.516667,9.07,4.725833,0.095962,2018-04-05,2 days,0
2540,"2018-04-07 - Wild 6, Sharks 3",MIN,Limited ReportFull Report,53.466667,52,56,48.15,31,37,45.59,23,22,51.11,5,2,71.43,1.84,1.53,54.62,25,21,54.35,14,10,58.33,12,6,66.67,3,2,60.0,25.0,66.67,11,11,50.0,4,4,50.0,1,0,100.0,25.0,100.0,22,27,44.9,7,12,36.84,1,0,100.0,14.29,100.0,21.74,90.91,1.126,17562,2018-04-07,82,MIN_2018-04-07,3.25,0.28,1.25,0.0,976.766667,677.0,660.0,37.0,34.0,36.27,30.75,492.0,50.635752,52.112676,54.118174,7.520325,90.283333,11.47,4.514167,0.127044,88.0,7.56,4.4,0.085909,2018-04-05,2 days,0
2541,"2018-04-07 - Wild 6, Sharks 3",S.J,Limited ReportFull Report,53.466667,56,52,51.85,37,31,54.41,22,23,48.89,2,5,28.57,1.53,1.84,45.38,21,25,45.65,10,14,41.67,6,12,33.33,2,3,40.0,33.33,75.0,11,11,50.0,4,4,50.0,0,1,0.0,0.0,75.0,27,22,55.1,12,7,63.16,0,1,0.0,0.0,85.71,9.09,78.26,0.874,17562,2018-04-07,82,S.J_2018-04-07,1.25,0.0,3.25,0.28,1017.316667,757.0,711.0,49.0,39.0,42.4,36.63,546.0,51.566757,55.681818,53.650512,8.974359,85.3,12.47,4.265,0.14619,68.7,7.74,3.435,0.112664,2018-04-05,2 days,0


In [91]:
starters = get_starters('2017')

ANA
ARI
BOS
BUF
CGY
CAR
CHI
COL
CBJ
DAL
DET
EDM
FLA
L.A
MIN
MTL
NSH
N.J
NYI
NYR
OTT
PHI
PIT
S.J
STL
T.B
TOR
VAN
WSH
WPG
VGK


In [92]:
starters.tail()

Unnamed: 0,Team,DATE,OPPONENT,starter,Team_Key
97,VGK,2018-05-28,vs Washington,Marc-Andre Fleury,VGK_2018-05-28
98,VGK,2018-05-30,vs Washington,Marc-Andre Fleury,VGK_2018-05-30
99,VGK,2018-06-02,at Washington,Marc-Andre Fleury,VGK_2018-06-02
100,VGK,2018-06-04,at Washington,Marc-Andre Fleury,VGK_2018-06-04
101,VGK,2018-06-07,vs Washington,Marc-Andre Fleury,VGK_2018-06-07


In [98]:
results = get_game_results('2017-10-04', '2018-04-08')

Scraping the schedule between 2017-10-04 and 2018-04-08


In [99]:
df_20172018 = merge_starters_and_features(results, starters , features, feature_columns)

In [100]:
df_20172018.isna().sum()

game_id                          0
date                             0
venue                            0
home_team                        0
away_team                        0
start_time                       0
home_score                       0
away_score                       0
status                           0
Home_Team_Won                    0
Home_Team_Key                    0
Away_Team_Key                    0
home_starter                     0
away_starter                     0
home_Team_Key                    0
home_last_20_FF%_5v5           309
home_last_20_GF%_5v5           309
home_last_20_xGF%_5v5          309
home_last_20_SH%               309
home_last20_pp_TOI_per_game    309
home_last20_xGF_per_min_pp     309
home_last20_pk_TOI_per_game    309
home_last20_xGA_per_min_pk     309
home_B2B                         0
away_Team_Key                    0
away_last_20_FF%_5v5           311
away_last_20_GF%_5v5           311
away_last_20_xGF%_5v5          311
away_last_20_SH%    

### EDA

In [109]:
df_20172018['Home_Team_Won'].value_counts(normalize = True)

1    0.563336
0    0.436664
Name: Home_Team_Won, dtype: float64

In [112]:
df_20172018.columns

Index(['game_id', 'date', 'venue', 'home_team', 'away_team', 'start_time',
       'home_score', 'away_score', 'status', 'Home_Team_Won', 'Home_Team_Key',
       'Away_Team_Key', 'home_starter', 'away_starter', 'home_Team_Key',
       'home_last_20_FF%_5v5', 'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5',
       'home_last_20_SH%', 'home_last20_pp_TOI_per_game',
       'home_last20_xGF_per_min_pp', 'home_last20_pk_TOI_per_game',
       'home_last20_xGA_per_min_pk', 'home_B2B', 'away_Team_Key',
       'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5', 'away_last_20_xGF%_5v5',
       'away_last_20_SH%', 'away_last20_pp_TOI_per_game',
       'away_last20_xGF_per_min_pp', 'away_last20_pk_TOI_per_game',
       'away_last20_xGA_per_min_pk', 'away_B2B'],
      dtype='object')

In [115]:
df_20172018[(df_20172018['home_B2B'] == 1) & (df_20172018['away_B2B'] == 0)]['Home_Team_Won'].value_counts(normalize = True)

0    0.538462
1    0.461538
Name: Home_Team_Won, dtype: float64

In [116]:
df_20172018[(df_20172018['home_B2B'] == 0) & (df_20172018['away_B2B'] == 1)]['Home_Team_Won'].value_counts(normalize = True)

1    0.607843
0    0.392157
Name: Home_Team_Won, dtype: float64

### Quick Model


In [117]:
model_df = df_20172018.dropna()

In [119]:
model_df.isna().sum()

game_id                        0
date                           0
venue                          0
home_team                      0
away_team                      0
start_time                     0
home_score                     0
away_score                     0
status                         0
Home_Team_Won                  0
Home_Team_Key                  0
Away_Team_Key                  0
home_starter                   0
away_starter                   0
home_Team_Key                  0
home_last_20_FF%_5v5           0
home_last_20_GF%_5v5           0
home_last_20_xGF%_5v5          0
home_last_20_SH%               0
home_last20_pp_TOI_per_game    0
home_last20_xGF_per_min_pp     0
home_last20_pk_TOI_per_game    0
home_last20_xGA_per_min_pk     0
home_B2B                       0
away_Team_Key                  0
away_last_20_FF%_5v5           0
away_last_20_GF%_5v5           0
away_last_20_xGF%_5v5          0
away_last_20_SH%               0
away_last20_pp_TOI_per_game    0
away_last2

In [134]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.under_sampling import TomekLinks 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.pipeline import make_pipeline

In [121]:
model_df.columns

Index(['game_id', 'date', 'venue', 'home_team', 'away_team', 'start_time',
       'home_score', 'away_score', 'status', 'Home_Team_Won', 'Home_Team_Key',
       'Away_Team_Key', 'home_starter', 'away_starter', 'home_Team_Key',
       'home_last_20_FF%_5v5', 'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5',
       'home_last_20_SH%', 'home_last20_pp_TOI_per_game',
       'home_last20_xGF_per_min_pp', 'home_last20_pk_TOI_per_game',
       'home_last20_xGA_per_min_pk', 'home_B2B', 'away_Team_Key',
       'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5', 'away_last_20_xGF%_5v5',
       'away_last_20_SH%', 'away_last20_pp_TOI_per_game',
       'away_last20_xGF_per_min_pp', 'away_last20_pk_TOI_per_game',
       'away_last20_xGA_per_min_pk', 'away_B2B'],
      dtype='object')

In [143]:
features = ['home_last_20_FF%_5v5', 'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5',
       'home_last_20_SH%', 'home_last20_pp_TOI_per_game',
       'home_last20_xGF_per_min_pp', 'home_last20_pk_TOI_per_game',
       'home_last20_xGA_per_min_pk', 'home_B2B',
       'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5', 'away_last_20_xGF%_5v5',
       'away_last_20_SH%', 'away_last20_pp_TOI_per_game',
       'away_last20_xGF_per_min_pp', 'away_last20_pk_TOI_per_game',
       'away_last20_xGA_per_min_pk', 'away_B2B']

In [144]:
X = model_df[features]
y = model_df['Home_Team_Won']

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

In [146]:
pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

In [148]:
cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'accuracy').mean()

0.5756131192750911

In [142]:
X

Unnamed: 0,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
297,53.806735,45.454545,55.847287,6.445672,6.306667,0.128990,5.492500,0.133364,0,TOR_2017-11-18,48.654709,51.648352,50.045437,10.421286,5.183333,0.169871,5.990833,0.109417,0
309,46.914557,39.726027,45.009242,6.458797,5.718333,0.121102,5.308333,0.110958,0,CBJ_2017-11-20,53.541667,57.692308,56.086649,7.772021,4.890000,0.076789,4.104167,0.115249,0
310,48.571429,53.333333,50.723709,10.434783,5.240000,0.158874,5.290833,0.109151,0,ARI_2017-11-20,47.345455,38.823529,46.200109,7.236842,5.591667,0.127064,4.570000,0.094201,0
315,50.396825,56.666667,50.305117,7.218684,5.653333,0.135849,5.315833,0.115974,0,VAN_2017-11-21,48.363339,52.380952,48.693818,7.728337,6.749167,0.090159,5.660000,0.097615,0
316,53.404744,59.459459,54.597442,8.782435,5.312500,0.131859,5.546667,0.142788,0,EDM_2017-11-21,55.049787,44.285714,55.312540,5.626134,4.815833,0.130507,5.877500,0.113484,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,58.321965,39.743590,57.717758,5.381944,4.989167,0.156539,5.335000,0.102812,0,VGK_2018-04-07,48.785714,47.887324,47.852246,6.967213,4.763333,0.136774,5.145833,0.118640,0
1267,46.707400,47.252747,45.332105,8.531746,4.456667,0.102655,4.719167,0.098958,0,VAN_2018-04-07,48.182466,44.444444,46.586891,6.666667,3.450000,0.126377,5.268333,0.090256,0
1268,49.670812,53.947368,48.035898,8.523909,4.618333,0.141176,4.725833,0.095962,0,DAL_2018-04-07,49.568627,45.000000,51.253566,6.067416,4.703333,0.118214,5.401667,0.130886,1
1269,51.566757,55.681818,53.650512,8.974359,4.265000,0.146190,3.435000,0.112664,0,MIN_2018-04-07,50.635752,52.112676,54.118174,7.520325,4.514167,0.127044,4.400000,0.085909,0
