In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import hockey_scraper
import pickle
import time
import random
pd.set_option('display.max_columns', 100)

In [3]:
nst_to_sched = { 'Anaheim Ducks': 'ANA', 
                      'Arizona Coyotes' : 'ARI', 
                      'Boston Bruins': 'BOS', 
                      'Buffalo Sabres':'BUF',
                      'Calgary Flames': 'CGY', 
                      'Carolina Hurricanes': 'CAR', 
                      'Chicago Blackhawks': 'CHI', 
                      'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A.',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J.',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J.',
                     'St. Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B.',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                      'Vegas Golden Knights':'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}

In [4]:
goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}

In [5]:
#test if i should switch to score and venu adjusted?
#sit = sva
#test 5,10,15 rolling as well
def get_and_format_nst_team_stats(season, sit, rate):
    nst_to_sched = {'Anaheim Ducks': 'ANA',
                     'Arizona Coyotes': 'ARI',
                     'Boston Bruins': 'BOS',
                     'Buffalo Sabres': 'BUF',
                     'Calgary Flames': 'CGY',
                     'Carolina Hurricanes': 'CAR',
                     'Chicago Blackhawks': 'CHI',
                     'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J',
                     'St Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                     'Vegas Golden Knights': 'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}
    
    url = 'https://www.naturalstattrick.com/games.php?fromseason={}&thruseason={}&stype=2&sit={}&loc=B&team=All&rate={}'.format(
        season,
        season,
        sit,
        rate)
    df = pd.read_html(url, header=0, index_col = 0, na_values=["-"])[0]
    df.reset_index(inplace = True)
    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Game_Number'] = df.groupby('Team').cumcount() + 1
    #rename Team_Date to team key or something like that
    df = df.replace({'Team': nst_to_sched})

    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)
    return df

In [6]:
def merge_team_stats(primary_df, pp_df, pk_df):
    primary_df = primary_df.merge(pk_df[['Team_Key', 'TOI', 'xGA']], on = 'Team_Key', how = 'left', suffixes = ('','_pk') )
    primary_df = primary_df.merge(pp_df[['Team_Key', 'TOI', 'xGF']], on = 'Team_Key', how = 'left', suffixes = ('','_pp') )
    return primary_df

In [7]:
def calculate_team_features(df, rolling_games = 20):
    df['sum_rolling20_TOI_5v5'] = df.groupby('Team')['TOI'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_GF_5v5'] = df.groupby('Team')['GF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_GA_5v5'] = df.groupby('Team')['GA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGF_5v5'] = df.groupby('Team')['xGF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGA_5v5'] = df.groupby('Team')['xGA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_SF_5v5'] = df.groupby('Team')['SF'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last_20_FF%_5v5'] = df['sum_rolling20_FF_5v5']*100/ (df['sum_rolling20_FF_5v5']+df['sum_rolling20_FA_5v5'])
    df['last_20_GF%_5v5'] = df['sum_rolling20_GF_5v5']*100/ (df['sum_rolling20_GF_5v5']+df['sum_rolling20_GA_5v5'])
    df['last_20_xGF%_5v5'] = df['sum_rolling20_xGF_5v5']*100/ (df['sum_rolling20_xGF_5v5']+df['sum_rolling20_xGA_5v5'])
    df['last_20_SH%'] = df['sum_rolling20_GF_5v5']*100 / df['sum_rolling20_SF_5v5']
    
    
    #fix NaNs in pp and pk features
    df['TOI_pp'] = np.where(df['TOI_pp'].isna(), 0, df['TOI_pp'])
    df['TOI_pk'] = np.where(df['TOI_pk'].isna(), 0, df['TOI_pk'])
    df['xGF_pp'] = np.where(df['xGF_pp'].isna(), 0, df['xGF_pp'])
    df['xGA_pk'] = np.where(df['xGA_pk'].isna(), 0, df['xGA_pk'])
    
    #pp features
    df['sum_rolling20_TOI_pp'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGF_pp'] = df.groupby('Team')['xGF_pp'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last20_pp_TOI_per_game'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(20, 20).mean().shift())
    df['last20_xGF_per_min_pp'] = df['sum_rolling20_xGF_pp'] / df['sum_rolling20_TOI_pp'] 
    
    #pk features
    df['sum_rolling20_TOI_pk'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['sum_rolling20_xGA_pk'] = df.groupby('Team')['xGA_pk'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['last20_pk_TOI_per_game'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(20, 20).mean().shift())
    df['last20_xGA_per_min_pk'] = df['sum_rolling20_xGA_pk'] / df['sum_rolling20_TOI_pk'] 
    
    #to get back to back category
    df['Last_Game_Date'] = df.groupby('Team')['Date'].shift()
    df['Days_Since_Last_Game'] = df['Date'] - df['Last_Game_Date']
    df['B2B'] = np.where(df['Days_Since_Last_Game'] == '1 days', 1, 0)
    
    return df

In [8]:
#get starters
def get_starters(year):
    goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}
    counter = 0
    for k,v in goalie_table_teams.items():
        print(k)
        starter_url = 'http://hockeygoalies.org/bio/nhl/logs/{}{}.html'.format(v, year)
        goalies = pd.read_html(starter_url)[0]
        goalies.replace(to_replace=['(BU)', np.NaN], value = 'DNP', inplace = True)
        goalies.drop(columns = ['DEC'], inplace = True)
        goalies.drop(index  = goalies.iloc[-1].name, inplace = True)
        goalies['starter'] = 'placeholder'

        starter = []
        for i, row in goalies.iterrows():
            for n in range(len(row)):
                if row[n][0] == 'W' or row[n][0] == 'L':
                    starter.append(goalies.columns[n])

        goalies['starter'] = starter
        goalies['Team'] = k
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['Team_Key'] = goalies['Team'].astype(str)+'_'+goalies['DATE'].astype(str)
        columns = ['Team','DATE', 'OPPONENT', 'starter', 'Team_Key']
        if counter == 0:
            master = goalies
        if counter != 0:
            master = pd.concat( [master[columns], goalies[columns]])
        counter +=1
    return master
                

In [9]:
#'2017-10-04' to '2018-04-08'
def get_game_results(season_start, season_end):
    sched_df = hockey_scraper.scrape_schedule(season_start, season_end)
    sched_df['Home_Team_Won'] = np.where(sched_df['home_score'] > sched_df['away_score'], 1, 0)
    sched_df['Home_Team_Key'] = sched_df['home_team'].astype(str)+'_'+sched_df['date'].astype(str)
    sched_df['Away_Team_Key'] = sched_df['away_team'].astype(str)+'_'+sched_df['date'].astype(str)
    return sched_df

In [234]:
def merge_starters_and_features(game_results_df, goalies_df, features_df, feature_columns):
    goalies_df = goalies_df[goalies_df['TOI'] >=28]
    df = game_results_df.merge(goalies_df[goalie_feature_columns].add_prefix('home_'), left_on = 'Home_Team_Key', right_on = 'home_Team_Key', how = 'left').rename(columns ={'home_Name':'home_goalie'}).drop(columns = 'home_Team_Key')
    df = df.merge(goalies_df[goalie_feature_columns].add_prefix('away_'), left_on = 'Away_Team_Key', right_on = 'away_Team_Key', how = 'left').rename(columns ={'away_Name':'away_goalie'}).drop(columns = 'away_Team_Key')
    df = df.merge(features_df[feature_columns].add_prefix('home_'), left_on = 'Home_Team_Key', right_on = 'home_Team_Key', how = 'left')
    df = df.merge(features_df[feature_columns].add_prefix('away_'), left_on = 'Away_Team_Key', right_on = 'away_Team_Key', how = 'left')
    return df

In [235]:
feature_columns = ['Team_Key', 'last_20_FF%_5v5', 'last_20_GF%_5v5', 'last_20_xGF%_5v5', 'last_20_SH%', 'last20_pp_TOI_per_game', 'last20_xGF_per_min_pp','last20_pk_TOI_per_game', 'last20_xGA_per_min_pk', 'B2B']
goalie_feature_columns = ['Team_Key', 'Name', 'Last_20_FenwickSV%', 'Last_20_GSAx/60', 'Last_20_HDCSV%']

### 2017-2018 Season

In [12]:
primary = get_and_format_nst_team_stats('20172018','5v5', 'n')
pp = get_and_format_nst_team_stats('20172018','pp', 'n')
pk = get_and_format_nst_team_stats('20172018','pk', 'n')

In [13]:
features = merge_team_stats(primary,pp,pk)

In [14]:
features = calculate_team_features(features)

In [15]:
features.tail()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling20_TOI_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_FF%_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
2537,"2018-04-07 - Canucks 2, Oilers 3",VAN,Limited ReportFull Report,51.583333,52,54,49.06,44,40,52.38,32,28,53.33,2,0,100.0,2.73,2.99,47.71,28,32,46.67,15,16,48.39,12,9,57.14,2,0,100.0,16.67,100.0,13,16,44.83,6,12,33.33,0,0,,0.0,100.0,22,16,57.89,12,5,70.59,0,0,,0.0,100.0,6.25,100.0,1.063,18347,2018-04-07,82,VAN_2018-04-07,1.516667,0.31,4.966667,0.44,997.166667,676.0,727.0,32.0,40.0,34.26,39.28,480.0,48.182466,44.444444,46.586891,6.666667,69.0,8.72,3.45,0.126377,105.366667,9.51,5.268333,0.090256,2018-04-05,2 days,0
2538,"2018-04-07 - Stars 4, Kings 2",DAL,Limited ReportFull Report,50.7,28,56,33.33,24,44,35.29,16,35,31.37,4,2,66.67,2.15,1.75,55.18,22,21,51.16,9,5,64.29,9,5,64.29,3,1,75.0,33.33,80.0,13,16,44.83,4,7,36.36,1,0,100.0,25.0,100.0,5,32,13.51,2,21,8.7,0,1,0.0,0.0,95.24,25.0,94.29,1.193,18230,2018-04-07,82,DAL_2018-04-07,6.0,0.14,2.0,0.17,958.366667,632.0,643.0,27.0,33.0,34.14,32.47,445.0,49.568627,45.0,51.253566,6.067416,94.066667,11.12,4.703333,0.118214,108.033333,14.14,5.401667,0.130886,2018-04-06,1 days,1
2539,"2018-04-07 - Stars 4, Kings 2",L.A,Limited ReportFull Report,50.7,56,28,66.67,44,24,64.71,35,16,68.63,2,4,33.33,1.75,2.15,44.82,21,22,48.84,5,9,35.71,5,9,35.71,1,3,25.0,20.0,66.67,16,13,55.17,7,4,63.64,0,1,0.0,0.0,75.0,32,5,86.49,21,2,91.3,1,0,100.0,4.76,100.0,5.71,75.0,0.807,18230,2018-04-07,82,L.A_2018-04-07,2.0,0.17,6.0,0.14,982.116667,679.0,688.0,41.0,35.0,32.65,35.32,481.0,49.670812,53.947368,48.035898,8.523909,92.366667,13.04,4.618333,0.141176,94.516667,9.07,4.725833,0.095962,2018-04-05,2 days,0
2540,"2018-04-07 - Wild 6, Sharks 3",MIN,Limited ReportFull Report,53.466667,52,56,48.15,31,37,45.59,23,22,51.11,5,2,71.43,1.84,1.53,54.62,25,21,54.35,14,10,58.33,12,6,66.67,3,2,60.0,25.0,66.67,11,11,50.0,4,4,50.0,1,0,100.0,25.0,100.0,22,27,44.9,7,12,36.84,1,0,100.0,14.29,100.0,21.74,90.91,1.126,17562,2018-04-07,82,MIN_2018-04-07,3.25,0.28,1.25,0.0,976.766667,677.0,660.0,37.0,34.0,36.27,30.75,492.0,50.635752,52.112676,54.118174,7.520325,90.283333,11.47,4.514167,0.127044,88.0,7.56,4.4,0.085909,2018-04-05,2 days,0
2541,"2018-04-07 - Wild 6, Sharks 3",S.J,Limited ReportFull Report,53.466667,56,52,51.85,37,31,54.41,22,23,48.89,2,5,28.57,1.53,1.84,45.38,21,25,45.65,10,14,41.67,6,12,33.33,2,3,40.0,33.33,75.0,11,11,50.0,4,4,50.0,0,1,0.0,0.0,75.0,27,22,55.1,12,7,63.16,0,1,0.0,0.0,85.71,9.09,78.26,0.874,17562,2018-04-07,82,S.J_2018-04-07,1.25,0.0,3.25,0.28,1017.316667,757.0,711.0,49.0,39.0,42.4,36.63,546.0,51.566757,55.681818,53.650512,8.974359,85.3,12.47,4.265,0.14619,68.7,7.74,3.435,0.112664,2018-04-05,2 days,0


In [16]:
starters = get_starters('2017')

ANA
ARI
BOS
BUF
CGY
CAR
CHI
COL
CBJ
DAL
DET
EDM
FLA
L.A
MIN
MTL
NSH
N.J
NYI
NYR
OTT
PHI
PIT
S.J
STL
T.B
TOR
VAN
WSH
WPG
VGK


In [17]:
starters.tail()

Unnamed: 0,Team,DATE,OPPONENT,starter,Team_Key
97,VGK,2018-05-28,vs Washington,Marc-Andre Fleury,VGK_2018-05-28
98,VGK,2018-05-30,vs Washington,Marc-Andre Fleury,VGK_2018-05-30
99,VGK,2018-06-02,at Washington,Marc-Andre Fleury,VGK_2018-06-02
100,VGK,2018-06-04,at Washington,Marc-Andre Fleury,VGK_2018-06-04
101,VGK,2018-06-07,vs Washington,Marc-Andre Fleury,VGK_2018-06-07


In [18]:
results = get_game_results('2017-10-04', '2018-04-08')

Scraping the schedule between 2017-10-04 and 2018-04-08


In [241]:
results.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04
3,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04
4,2017020005,2017-10-05,TD Garden,BOS,NSH,2017-10-05 23:00:00,4,3,Final,1,BOS_2017-10-05,NSH_2017-10-05


In [243]:
df_20172018 = merge_starters_and_features(results, goalie_features_df, features, feature_columns)

In [265]:
df_20172018.tail(30)

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
1241,2017021243,2018-04-05,Wells Fargo Center,PHI,CAR,2018-04-05 23:00:00,4,3,Final,1,PHI_2018-04-05,CAR_2018-04-05,,,,,,,,,PHI_2018-04-05,50.909091,48.837209,48.087655,8.267717,5.266667,0.119525,3.953333,0.095742,0,CAR_2018-04-05,54.005525,48.421053,52.547086,8.378871,4.151667,0.114532,3.551667,0.104317,0
1242,2017021244,2018-04-05,Capital One Arena,WSH,NSH,2018-04-05 23:00:00,3,4,Final,0,WSH_2018-04-05,NSH_2018-04-05,,,,,,,,,WSH_2018-04-05,49.273256,54.054054,50.321143,8.230453,4.4675,0.120761,5.623333,0.120747,0,NSH_2018-04-05,53.15446,58.227848,52.720371,8.695652,4.895,0.107865,6.375833,0.115436,0
1243,2017021245,2018-04-05,Nationwide Arena,CBJ,PIT,2018-04-05 23:00:00,4,5,Final,0,CBJ_2018-04-05,PIT_2018-04-05,,,,,,,,,CBJ_2018-04-05,52.0,58.823529,51.786834,9.689922,4.414167,0.083821,4.964167,0.116233,0,PIT_2018-04-05,53.641457,53.684211,55.853832,8.994709,4.130833,0.144039,4.47,0.127405,0
1244,2017021246,2018-04-05,Little Caesars Arena,DET,MTL,2018-04-05 23:30:00,3,4,Final,0,DET_2018-04-05,MTL_2018-04-05,,,,,,,,,DET_2018-04-05,49.664929,46.341463,50.476324,8.0,4.736667,0.100176,5.620833,0.120801,0,MTL_2018-04-05,45.790251,44.776119,45.346777,6.741573,4.6775,0.144842,4.851667,0.153968,0
1245,2017021247,2018-04-05,BB&T Center,FLA,BOS,2018-04-05 23:30:00,3,2,Final,1,FLA_2018-04-05,BOS_2018-04-05,,,,,,,,,FLA_2018-04-05,51.016393,57.142857,50.108696,8.421053,5.120833,0.109357,3.725833,0.114739,0,BOS_2018-04-05,54.041916,46.666667,55.8126,6.889764,5.358333,0.144355,5.845,0.096578,0
1246,2017021248,2018-04-05,Bell MTS Place,WPG,CGY,2018-04-06 00:00:00,2,1,Final,1,WPG_2018-04-05,CGY_2018-04-05,,,,,,,,,WPG_2018-04-05,51.310345,55.952381,49.986406,8.867925,4.3875,0.100285,5.3775,0.14821,0,CGY_2018-04-05,58.639456,38.554217,57.543017,5.536332,4.896667,0.156229,5.125,0.106634,0
1247,2017021249,2018-04-05,Rogers Place,EDM,VGK,2018-04-06 01:00:00,4,3,Final,1,EDM_2018-04-05,VGK_2018-04-05,,,,,,,,,EDM_2018-04-05,46.143345,47.252747,44.400141,8.669355,4.353333,0.097167,4.738333,0.097292,0,VGK_2018-04-05,48.794326,49.295775,48.302469,7.12831,4.696667,0.130199,5.084167,0.115555,0
1248,2017021250,2018-04-05,Rogers Arena,VAN,ARI,2018-04-06 02:00:00,4,3,Final,1,VAN_2018-04-05,ARI_2018-04-05,,,,,,,,,VAN_2018-04-05,47.619048,43.055556,45.87291,6.485356,3.468333,0.12297,5.27,0.090797,0,ARI_2018-04-05,49.393291,58.571429,51.904762,8.506224,4.6375,0.111914,4.611667,0.16339,0
1249,2017021251,2018-04-05,STAPLES Center,L.A,MIN,2018-04-06 02:30:00,5,4,Final,1,L.A_2018-04-05,MIN_2018-04-05,,,,,,,,,L.A_2018-04-05,49.419448,52.777778,47.335062,7.933194,4.340833,0.140411,4.715833,0.092242,0,MIN_2018-04-05,50.99926,54.285714,55.064483,7.554672,4.49,0.127394,4.0225,0.085022,1
1250,2017021252,2018-04-05,SAP Center at San Jose,S.J,COL,2018-04-06 02:30:00,4,2,Final,1,S.J_2018-04-05,COL_2018-04-05,,,,,,,,,S.J_2018-04-05,51.191287,52.173913,52.808704,8.921933,4.638333,0.142939,3.570833,0.12252,0,COL_2018-04-05,46.320346,55.714286,47.203924,8.387097,5.81,0.088554,4.540833,0.116168,0


In [249]:
results.shape

(1271, 12)

In [246]:
goalie_features_df[goalie_features_df['Team_Key'] == 'FLA_2018-04-08']

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%


In [248]:
all_goalies_c[all_goalies_c['Team_Key'] == 'FLA_2018-04-08']

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%


In [244]:
df_20172018.isna().sum()

game_id                           0
date                              0
venue                             0
home_team                         0
away_team                         0
start_time                        0
home_score                        0
away_score                        0
status                            0
Home_Team_Won                     0
Home_Team_Key                     0
Away_Team_Key                     0
home_goalie                    1271
home_Last_20_FenwickSV%        1271
home_Last_20_GSAx/60           1271
home_Last_20_HDCSV%            1271
away_goalie                    1271
away_Last_20_FenwickSV%        1271
away_Last_20_GSAx/60           1271
away_Last_20_HDCSV%            1271
home_Team_Key                     0
home_last_20_FF%_5v5            309
home_last_20_GF%_5v5            309
home_last_20_xGF%_5v5           309
home_last_20_SH%                309
home_last20_pp_TOI_per_game     309
home_last20_xGF_per_min_pp      309
home_last20_pk_TOI_per_game 

---
### 2018-2019 Season

In [251]:
primary1819 = get_and_format_nst_team_stats('20182019','5v5', 'n')
pp1819 = get_and_format_nst_team_stats('20182019','pp', 'n')
pk1819 = get_and_format_nst_team_stats('20182019','pk', 'n')

In [252]:
features1819 = merge_team_stats(primary1819,pp1819,pk1819)

In [253]:
features1819 = calculate_team_features(features1819)

In [140]:
# starters1819 = get_starters('2018')

ANA
ARI
BOS
BUF
CGY
CAR
CHI
COL
CBJ
DAL
DET
EDM
FLA
L.A
MIN
MTL
NSH
N.J
NYI
NYR
OTT


ValueError: Length of values does not match length of index

In [254]:
results1819 = get_game_results('2018-10-03', '2019-04-06')

Scraping the schedule between 2018-10-03 and 2019-04-06


In [None]:
df_20182019 = merge_starters_and_features(results1819, starters1819 , features1819, feature_columns)

---
### 2019-2020 Season

In [255]:
primary1920 = get_and_format_nst_team_stats('20192020','5v5', 'n')
pp1920 = get_and_format_nst_team_stats('20192020','pp', 'n')
pk1920 = get_and_format_nst_team_stats('20192020','pk', 'n')

In [256]:
features1920 = merge_team_stats(primary1920,pp1920,pk1920)

In [257]:
features1920 = calculate_team_features(features1920)

In [None]:
# starters1920 = get_starters('2019')

In [258]:
results1920 = get_game_results('2019-10-02', '2020-03-12')

Scraping the schedule between 2019-10-02 and 2020-03-12


In [None]:
df_20192020 = merge_starters_and_features(results1920, starters1920 , features1920, feature_columns)

---
### 2020-2021 Season

In [259]:
primary2021 = get_and_format_nst_team_stats('20202021','5v5', 'n')
pp2021 = get_and_format_nst_team_stats('20202021','pp', 'n')
pk2021 = get_and_format_nst_team_stats('20202021','pk', 'n')

In [260]:
features2021 = merge_team_stats(primary2021,pp2021,pk2021)

In [261]:
features2021 = calculate_team_features(features2021)

In [None]:
# starters2021 = get_starters('2020')

In [262]:
results2021 = get_game_results('2021-01-13', '2021-04-29')

Scraping the schedule between 2021-01-13 and 2021-04-29


In [263]:
results.shape[0]+results1819.shape[0]+results1920.shape[0]+results2021.shape[0]

4393

### EDA

In [None]:
df_20172018['Home_Team_Won'].value_counts(normalize = True)

In [None]:
df_20172018.columns

In [None]:
df_20172018[(df_20172018['home_B2B'] == 1) & (df_20172018['away_B2B'] == 0)]['Home_Team_Won'].value_counts(normalize = True)

In [None]:
df_20172018[(df_20172018['home_B2B'] == 0) & (df_20172018['away_B2B'] == 1)]['Home_Team_Won'].value_counts(normalize = True)

### Quick Model


In [None]:
model_df = df_20172018.dropna()

In [None]:
model_df.isna().sum()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.under_sampling import TomekLinks 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.pipeline import make_pipeline

In [None]:
model_df.columns

In [None]:
features = ['home_last_20_FF%_5v5', 'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5',
       'home_last_20_SH%', 'home_last20_pp_TOI_per_game',
       'home_last20_xGF_per_min_pp', 'home_last20_pk_TOI_per_game',
       'home_last20_xGA_per_min_pk', 'home_B2B',
       'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5', 'away_last_20_xGF%_5v5',
       'away_last_20_SH%', 'away_last20_pp_TOI_per_game',
       'away_last20_xGF_per_min_pp', 'away_last20_pk_TOI_per_game',
       'away_last20_xGA_per_min_pk', 'away_B2B']

In [None]:
X = model_df[features]
y = model_df['Home_Team_Won']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

In [None]:
pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

In [None]:
cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'accuracy').mean()

In [None]:
X

### Get Goalie Data

In [21]:
infile = open("data/goalie_ids.pickle",'rb')
goalie_ids = pickle.load(infile)
infile.close()

In [22]:
goalies = list(df_20172018['home_starter'].dropna())
away_starter = list(df_20172018['away_starter'].dropna())
goalies.extend(away_starter)

goalies = list(set(goalies))
goalies = sorted(goalies)

In [23]:
goalies[:5]

['Aaron Dell', 'Adam Wilcox', 'Adin Hill', 'Al Montoya', 'Alex Lyon']

In [24]:
missing_goalies = [goalie for goalie in goalies if goalie not in goalie_ids.keys()]

In [25]:
missing_goalies

[]

In [50]:
# for i,g in enumerate(missing_goalies[:len(missing_id)]):
#     goalie_ids[g] = missing_id[i]

In [57]:
url = 'https://www.naturalstattrick.com/playerreport.php?fromseason=20152016&thruseason=20182019&playerid=8475809&sit=all&stype=2&stdoi=oi&rate=n&v=g'

In [58]:
goalie1 = pd.read_html(url)[0]

In [59]:
goalie1.head()

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %
0,2016-03-20 CBJ at N.J,N.J,59.966667,44,47,48.35,33,38,46.48,21,28,42.86,2,1,66.67,2.23,2.05,52.11,21,19,52.5,2,0,100.0,8,9,47.06,2,0,100.00,13,10,56.52,0,0,-,23,25,47.92,0,1,0.00,9.52,96.43,1.06,1,3,0,0,100.00,20,16,27,42.55
1,2016-03-24 N.J at PIT,N.J,60.0,37,73,33.64,29,55,34.52,24,39,38.1,3,0,100.0,1.46,3.24,31.01,16,37,30.19,3,0,100.0,5,11,31.25,0,0,-,11,26,29.73,3,0,100.00,19,32,37.25,0,0,-,12.5,100.0,1.125,0,3,0,0,-,13,12,30,30.23
2,2016-03-25 WSH at N.J,N.J,62.283333,38,48,44.19,30,36,45.45,22,26,45.83,0,1,0.0,1.41,1.59,46.93,14,20,41.18,0,1,0.0,3,4,42.86,0,0,-,11,16,40.74,0,1,0.00,23,26,46.94,0,0,-,0.0,96.15,0.962,0,4,0,0,-,14,15,23,37.84
3,2016-03-27 N.J at CAR,N.J,58.5,49,39,55.68,36,31,53.73,25,23,52.08,2,3,40.0,1.86,2.52,42.44,20,21,48.78,2,3,40.0,10,9,52.63,2,2,50.00,10,12,45.45,0,1,0.00,21,15,58.33,0,0,-,8.0,86.96,0.95,0,3,0,0,-,17,18,15,53.13
4,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50.0,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98


In [60]:
url = 'https://www.naturalstattrick.com/playerreport.php?fromseason=20152016&thruseason=20182019&playerid=8468685&sit=all&stype=2&stdoi=oi&rate=n&v=g'
goalie2 = pd.read_html(url)[0]

In [61]:
pd.concat([goalie1,goalie2])

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %
0,2016-03-20 CBJ at N.J,N.J,59.966667,44,47,48.35,33,38,46.48,21,28,42.86,2,1,66.67,2.23,2.05,52.11,21,19,52.5,2,0,100,8,9,47.06,2,0,100.00,13,10,56.52,0,0,-,23,25,47.92,0,1,0.00,9.52,96.43,1.060,1,3,0,0,100.00,20,16,27,42.55
1,2016-03-24 N.J at PIT,N.J,60.000000,37,73,33.64,29,55,34.52,24,39,38.10,3,0,100.00,1.46,3.24,31.01,16,37,30.19,3,0,100,5,11,31.25,0,0,-,11,26,29.73,3,0,100.00,19,32,37.25,0,0,-,12.50,100.00,1.125,0,3,0,0,-,13,12,30,30.23
2,2016-03-25 WSH at N.J,N.J,62.283333,38,48,44.19,30,36,45.45,22,26,45.83,0,1,0.00,1.41,1.59,46.93,14,20,41.18,0,1,0,3,4,42.86,0,0,-,11,16,40.74,0,1,0.00,23,26,46.94,0,0,-,0.00,96.15,0.962,0,4,0,0,-,14,15,23,37.84
3,2016-03-27 N.J at CAR,N.J,58.500000,49,39,55.68,36,31,53.73,25,23,52.08,2,3,40.00,1.86,2.52,42.44,20,21,48.78,2,3,40,10,9,52.63,2,2,50.00,10,12,45.45,0,1,0.00,21,15,58.33,0,0,-,8.00,86.96,0.950,0,3,0,0,-,17,18,15,53.13
4,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,2018-03-20 CBJ at NYR,NYR,57.916667,62,62,50.00,40,50,44.44,30,30,50.00,2,4,33.33,2.05,3.03,40.39,35,36,49.30,1,4,20.00,11,15,42.31,1,1,50.00,24,21,53.33,0,3,0.00,18,23,43.90,1,0,100.00,6.67,86.67,0.933,1,5,0,0,100.00,21,18,14,60.00
181,2018-03-28 NYR at WSH,NYR,60.600000,63,62,50.40,51,47,52.04,37,33,52.86,2,3,40.00,3.78,3.59,51.27,39,30,56.52,2,3,40.00,16,18,47.06,1,2,33.33,23,12,65.71,1,1,50.00,21,25,45.65,0,0,-,5.41,90.91,0.963,1,4,0,0,100.00,25,17,20,55.56
182,2018-03-31 NYR at CAR,NYR,60.000000,38,84,31.15,28,64,30.43,20,41,32.79,2,1,66.67,2.24,3.95,36.20,19,37,33.93,2,1,66.67,10,15,40.00,2,1,66.67,9,22,29.03,0,0,-,16,44,26.67,0,0,-,10.00,97.56,1.076,0,3,0,0,-,12,14,40,23.08
183,2018-04-03 NYR at N.J,NYR,59.966667,49,67,42.24,40,55,42.11,24,44,35.29,2,5,28.57,1.78,3.26,35.32,17,29,36.96,2,3,40.00,5,9,35.71,2,2,50.00,12,20,37.50,0,1,0.00,28,37,43.08,0,2,0.00,8.33,88.64,0.970,1,3,0,0,100.00,20,21,21,48.78


In [91]:
counter = 0
for name, gid in goalie_ids.items():
    
    sequence = [x/10 for x in range(40, 80)]
    time.sleep(random.choice(sequence))
    url = 'https://www.naturalstattrick.com/playerreport.php?fromseason=20162017&thruseason=20172018&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(gid)
    individual_df = pd.read_html(url)[0]
    individual_df['Name'] = name
    individual_df['ID'] = gid



    if counter == 0:
        all_goalies = individual_df
        print(name)
        print(counter)
    elif counter != 0:
        all_goalies = pd.concat([all_goalies, individual_df])
        print(name)
        print(counter)

    
    counter +=1
    
    
        

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14
Jaroslav Halak
15
Tuukka Rask
16
Michael Houser
17
Ukko-Pekka Luukkonen
18
Dustin Tokarski
19
Carter Hutton
20
Linus Ullmark
21
Charlie Lindgren
22
Carey Price
23
Jake Allen
24
Cayden Primeau
25
Joey Daccord
26
Anton Forsberg
27
Matt Murray
28
Marcus Hogberg
29
Filip Gustavsson
30
Frederik Andersen
31
Jack Campbell
32
David Rittich
33
James Reimer
34
Petr Mrazek
35
Alex Nedeljkovic
36
Philippe Desrosiers
37
Sam Montembeault
38
Sergei Bobrovsky
39
Chris Driedger
40
Spencer Knight
41
Christopher Gibson
42
Curtis McElhinney
43
Andrei Vasilevskiy
44
Craig Anderson
45
Vitek Vanecek
46
Ilya Samsonov
47
Malcolm Subban
48
Collin Delia
49
Kevin Lankinen
50
Thomas Greiss
51
Jonathan Bernier
52
Kasimir Kaskisuo
53
Pekka Rinne
54
Juuse Saros
55
Jordan 

In [32]:
all_goalies1 = all_goalies.copy()

In [92]:
all_goalies

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID
0,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98,Scott Wedgewood,8475809
1,2017-10-31 ARI at DET,ARI,58.333333,54,63,46.15,41,53,43.62,32,39,45.07,3,4,42.86,1.73,2.50,41,28,32,46.67,1,2,33.33,11,15,42.31,1,1,50,17,17,50,0,1,0.00,24,29,45.28,2,2,50.00,9.38,89.74,0.991,1,4,0,0,100.00,18,22,24,42.86,Scott Wedgewood,8475809
2,2017-11-06 ARI at WSH,ARI,63.883333,49,75,39.52,38,60,38.78,26,40,39.39,2,3,40,1.81,2.84,38.95,20,31,39.22,2,1,66.67,8,10,44.44,2,1,66.67,12,21,36.36,0,0,-,26,37,41.27,0,2,0.00,7.69,92.5,1.002,1,4,0,0,100.00,14,18,26,35,Scott Wedgewood,8475809
3,2017-11-14 ARI at WPG,ARI,34.866667,32,26,55.17,20,21,48.78,18,17,51.43,1,1,50,1.15,1.41,45,17,15,53.13,0,1,0,5,7,41.67,0,1,0,12,8,60,0,0,-,14,9,60.87,1,0,100.00,5.56,94.12,0.997,0,2,0,0,-,10,14,7,58.82,Scott Wedgewood,8475809
4,2017-11-22 S.J at ARI,ARI,44.333333,32,32,50,23,24,48.94,19,16,54.29,1,2,33.33,1.22,1.41,46.35,12,16,42.86,1,2,33.33,8,7,53.33,1,2,33.33,4,9,30.77,0,0,-,15,15,50,0,0,-,5.26,87.5,0.928,0,3,1,0,0.00,9,13,22,29.03,Scott Wedgewood,8475809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2017-12-16 WPG at STL,WPG,58.850000,72,54,57.14,61,39,61,44,30,59.46,0,2,0.00,3.74,2.58,59.23,35,25,58.33,0,1,0.00,18,12,60.00,0,1,0.00,17,13,56.67,0,0,-,34,26,56.67,0,1,0.00,0,93.33,0.933,1,3,0,0,100.00,25,20,17,59.52,Steve Mason,8473461
67,2017-12-23 WPG at NYI,WPG,60.000000,71,44,61.74,55,36,60.44,40,29,57.97,2,5,28.57,3.56,2.40,59.73,41,21,66.13,1,4,20.00,23,10,69.70,1,3,25.00,18,11,62.07,0,1,0.00,26,18,59.09,1,1,50.00,5,82.76,0.878,0,3,0,0,-,28,16,15,65.12,Steve Mason,8473461
68,2018-01-09 WPG at BUF,WPG,60.000000,54,56,49.09,47,47,50,37,34,52.11,7,4,63.64,3.04,2.72,52.78,21,25,45.65,2,3,40.00,10,13,43.48,2,2,50.00,11,12,47.83,0,1,0.00,30,28,51.72,4,1,80.00,18.92,88.24,1.072,0,3,0,0,-,10,28,25,28.57,Steve Mason,8473461
69,2018-03-06 WPG at NYR,WPG,60.000000,63,60,51.22,45,46,49.45,34,31,52.31,3,0,100.00,2.45,2.46,49.99,22,30,42.31,2,0,100.00,6,13,31.58,0,0,-,16,17,48.48,2,0,100.00,31,26,54.39,0,0,-,8.82,100,1.088,0,3,0,0,-,24,15,18,57.14,Steve Mason,8473461


In [37]:
missing_goalies = [goalie for goalie in goalie_ids.keys() if goalie not in list(all_goalies1['Name'])]

In [44]:
missing_goalies.index('Adin Hill')

37

In [45]:
# counter = 0
# for name, gid in goalie_ids.items():
#     if name in missing_goalies[37:]:
#         sequence = [x/10 for x in range(40, 80)]
#         time.sleep(random.choice(sequence))
#         url = 'https://www.naturalstattrick.com/playerreport.php?fromseason=20162017&thruseason=20172018&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(gid)
#         individual_df = pd.read_html(url)[0]
#         individual_df['Name'] = name
#         individual_df['ID'] = gid



#         if counter == 0:
#             all_goalies2 = individual_df
#             print(name)
#             print(counter)
#         elif counter != 0:
#             all_goalies2 = pd.concat([all_goalies, individual_df])
#             print(name)
#             print(counter)


#         counter +=1
    

Adin Hill
0
Marc-Andre Fleury
1
Robin Lehner
2
Adam Wilcox
3
Al Montoya
4
Anders Nilsson
5
Andrei Vasilevski
6
Antti Niemi
7
Calvin Pickard
8
Cam Ward
9
Cameron Talbot
10
Chad Johnson
11
Corey Crawford
12
Daniel Taylor
13
Eddie Lack
14
Eric Comrie
15
Harri Sateri
16
Henrik Lundqvist
17
Jared Coreau
18
Jean-Francois Berube
19
Jeff Glass
20
Jimmy Howard
21
Jon Gillies
22
Kari Lehtonen
23
Ken Appleby
24
Maxime Lagace
25
Michael Hutchinson
26
Michal Neuvirth
27
Mike Condon
28
Mike McKenna
29
Ondrej Pavelec
30
Oscar Dansk
31
Peter Budaj
32
Reto Berra
33
Roberto Luongo
34
Scott Darling
35
Steve Mason
36


In [47]:
all_goalies_20172018 = pd.concat([all_goalies1, all_goalies2])

In [48]:
all_goalies_20172018 

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID
0,2016-03-20 CBJ at N.J,N.J,59.966667,44,47,48.35,33,38,46.48,21,28,42.86,2,1,66.67,2.23,2.05,52.11,21,19,52.5,2,0,100,8,9,47.06,2,0,100.00,13,10,56.52,0,0,-,23,25,47.92,0,1,0.00,9.52,96.43,1.06,1,3,0,0,100.00,20,16,27,42.55,Scott Wedgewood,8475809
1,2016-03-24 N.J at PIT,N.J,60.000000,37,73,33.64,29,55,34.52,24,39,38.1,3,0,100,1.46,3.24,31.01,16,37,30.19,3,0,100,5,11,31.25,0,0,-,11,26,29.73,3,0,100.00,19,32,37.25,0,0,-,12.5,100,1.125,0,3,0,0,-,13,12,30,30.23,Scott Wedgewood,8475809
2,2016-03-25 WSH at N.J,N.J,62.283333,38,48,44.19,30,36,45.45,22,26,45.83,0,1,0,1.41,1.59,46.93,14,20,41.18,0,1,0,3,4,42.86,0,0,-,11,16,40.74,0,1,0.00,23,26,46.94,0,0,-,0,96.15,0.962,0,4,0,0,-,14,15,23,37.84,Scott Wedgewood,8475809
3,2016-03-27 N.J at CAR,N.J,58.500000,49,39,55.68,36,31,53.73,25,23,52.08,2,3,40,1.86,2.52,42.44,20,21,48.78,2,3,40,10,9,52.63,2,2,50.00,10,12,45.45,0,1,0.00,21,15,58.33,0,0,-,8,86.96,0.95,0,3,0,0,-,17,18,15,53.13,Scott Wedgewood,8475809
0,2016-10-18 S.J at NYI,S.J,60.000000,54,44,55.1,37,31,54.41,27,23,54,3,2,60,2.58,1.70,60.28,25,16,60.98,3,2,60,11,5,68.75,3,2,60,14,11,56,0,0,-,22,22,50,0,0,-,11.11,91.3,1.024,0,3,0,0,-,18,25,20,47.37,Aaron Dell,8477180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,2017-03-26 PHI at PIT,PHI,60.000000,56,51,52.34,43,33,56.58,33,27,55,6,2,75.00,2.67,2.42,52.40,26,19,57.78,4,2,66.67,12,8,60.00,3,1,75.00,14,11,56.00,1,1,50.00,23,26,46.94,2,0,100.00,18.18,92.59,1.108,0,3,0,0,-,17,24,19,47.22,Steve Mason,8473461
159,2017-03-28 OTT at PHI,PHI,64.950000,70,62,53.03,53,42,55.79,35,28,55.56,2,2,50.00,3.25,2.40,57.55,35,23,60.34,2,1,66.67,15,8,65.22,2,0,100.00,20,15,57.14,0,1,0.00,31,25,55.36,0,1,0.00,5.71,92.86,0.986,1,4,0,0,100.00,17,17,20,45.95,Steve Mason,8473461
160,2017-03-30 NYI at PHI,PHI,60.000000,46,80,36.51,34,53,39.08,30,41,42.25,6,3,66.67,2.80,3.02,48.11,19,25,43.18,4,3,57.14,8,9,47.06,3,2,60.00,11,16,40.74,1,1,50.00,20,43,31.75,2,0,100.00,20,92.68,1.127,0,3,0,0,-,10,25,33,23.26,Steve Mason,8473461
161,2017-04-04 PHI at N.J,PHI,60.300000,52,40,56.52,43,32,57.33,34,27,55.74,0,1,0.00,2.49,1.84,57.46,25,17,59.52,0,1,0.00,10,10,50.00,0,0,-,15,7,68.18,0,1,0.00,23,21,52.27,0,0,-,0,96.3,0.963,1,4,0,0,100.00,15,17,19,44.12,Steve Mason,8473461


In [93]:
all_goalies['Date'] = all_goalies['Game'].apply(lambda x: pd.to_datetime(x[0:10]))

all_goalies['Team_Key'] = all_goalies['Team'].astype(str)+'_'+all_goalies['Date'].astype(str)

In [50]:
df_20172018.dropna()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_starter,away_starter,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,Frederik Andersen,WPG_2017-10-04,,,,,,,,,0,TOR_2017-10-04,,,,,,,,,0
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,Jake Allen,PIT_2017-10-04,,,,,,,,,0,STL_2017-10-04,,,,,,,,,0
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,Mike Smith,EDM_2017-10-04,,,,,,,,,0,CGY_2017-10-04,,,,,,,,,0
3,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,Brian Elliott,S.J_2017-10-04,,,,,,,,,0,PHI_2017-10-04,,,,,,,,,0
4,2017020005,2017-10-05,TD Garden,BOS,NSH,2017-10-05 23:00:00,4,3,Final,1,BOS_2017-10-05,NSH_2017-10-05,Tuukka Rask,Pekka Rinne,BOS_2017-10-05,,,,,,,,,0,NSH_2017-10-05,,,,,,,,,0


In [51]:
tk = list(df_20172018.dropna()['Home_Team_Key'])

In [53]:
tk.extend(list(df_20172018.dropna()['Away_Team_Key']))

In [104]:
agtk = all_goalies['Team_Key']

In [107]:
agtk

0     ARI_2017-10-30
1     ARI_2017-10-31
2     ARI_2017-11-06
3     ARI_2017-11-14
4     ARI_2017-11-22
           ...      
66    WPG_2017-12-16
67    WPG_2017-12-23
68    WPG_2018-01-09
69    WPG_2018-03-06
70    WPG_2018-04-03
Name: Team_Key, Length: 5596, dtype: object

In [109]:
'WPG_2017-12-16' in tk

True

In [103]:
'MTL_2017-11-18' in agtk

False

In [96]:
missing_tk = [key for key in tk if key not in agtk]

In [97]:
missing_tk

['MTL_2017-11-18',
 'BUF_2017-11-20',
 'TOR_2017-11-20',
 'PHI_2017-11-21',
 'STL_2017-11-21',
 'DAL_2017-11-21',
 'BUF_2017-11-22',
 'DET_2017-11-22',
 'FLA_2017-11-22',
 'NYI_2017-11-22',
 'PIT_2017-11-22',
 'CBJ_2017-11-22',
 'T.B_2017-11-22',
 'NSH_2017-11-22',
 'L.A_2017-11-22',
 'BOS_2017-11-24',
 'ANA_2017-11-24',
 'MIN_2017-11-24',
 'PHI_2017-11-24',
 'WSH_2017-11-24',
 'VGK_2017-11-24',
 'BUF_2017-11-24',
 'N.J_2017-11-24',
 'NYR_2017-11-24',
 'CBJ_2017-11-24',
 'CAR_2017-11-24',
 'STL_2017-11-24',
 'ARI_2017-11-24',
 'DAL_2017-11-24',
 'TOR_2017-11-25',
 'MTL_2017-11-25',
 'OTT_2017-11-25',
 'DET_2017-11-25',
 'FLA_2017-11-25',
 'PIT_2017-11-25',
 'ARI_2017-11-25',
 'STL_2017-11-25',
 'S.J_2017-11-25',
 'COL_2017-11-25',
 'L.A_2017-11-25',
 'CAR_2017-11-26',
 'NYR_2017-11-26',
 'BOS_2017-11-26',
 'N.J_2017-11-27',
 'PIT_2017-11-27',
 'MTL_2017-11-27',
 'WPG_2017-11-27',
 'CHI_2017-11-27',
 'BUF_2017-11-28',
 'NYI_2017-11-28',
 'NYR_2017-11-28',
 'PHI_2017-11-28',
 'CBJ_2017-1

In [70]:
df_20172018[df_20172018['Home_Team_Key'] == 'MTL_2017-11-18']

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_starter,away_starter,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
297,2017020298,2017-11-18,Centre Bell,MTL,TOR,2017-11-19,0,6,Final,0,MTL_2017-11-18,TOR_2017-11-18,Charlie Lindgren,Frederik Andersen,MTL_2017-11-18,53.806735,45.454545,55.847287,6.445672,6.306667,0.12899,5.4925,0.133364,0,TOR_2017-11-18,48.654709,51.648352,50.045437,10.421286,5.183333,0.169871,5.990833,0.109417,0


In [101]:
all_goalies[all_goalies['Team_Key'] == 'MTL_2017-11-18']

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key
8,2017-11-18 TOR at MTL,MTL,48.983333,49,51,49.0,35,40,46.67,28,27,50.91,0,5,0.0,1.4,3.17,30.59,18,32,36,0,5,0.0,8,16,33.33,0,3,0.0,10,16,38.46,0,2,0.00,28,16,63.64,0,0,-,0,81.48,0.815,0,3,0,0,-,20,14,15,57.14,Charlie Lindgren,8479292,2017-11-18,MTL_2017-11-18
42,2017-11-18 TOR at MTL,MTL,11.016667,11,8,57.89,6,4,60.0,5,4,55.56,0,1,0.0,0.25,0.17,59.39,4,4,50,0,1,0.0,1,2,33.33,0,1,0.0,3,2,60.0,0,0,-,7,2,77.78,0,0,-,0,75.0,0.75,0,1,0,0,-,7,6,3,70.0,Antti Niemi,8474550,2017-11-18,MTL_2017-11-18


In [98]:
all_goalies[all_goalies['Date'] == '2017-11-18']

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key
26,2017-11-18 BOS at S.J,S.J,57.516667,73,47,60.83,47,35,57.32,30,20,60.0,1,3,25.00,2.91,1.89,60.64,26,21,55.32,1,3,25.00,14,12,53.85,1,3,25.00,12,9,57.14,0,0,-,29,21,58.0,0,0,-,3.33,85,0.883,0,3,0,0,-,28,25,17,62.22,Aaron Dell,8477180,2017-11-18,S.J_2017-11-18
73,2017-11-18 N.J at WPG,N.J,40.0,47,40,54.02,38,29,56.72,28,22,56.0,1,5,16.67,1.54,1.24,55.34,17,10,62.96,0,3,0,5,5,50.00,0,2,0.00,12,5,70.59,0,1,0.00,20,20,50.0,1,2,33.33,3.57,77.27,0.808,0,2,0,0,-,17,11,12,58.62,Cory Schneider,8471239,2017-11-18,N.J_2017-11-18
35,2017-11-18 COL at NSH,COL,60.0,56,43,56.57,38,32,54.29,31,24,56.36,2,5,28.57,2.24,1.7,56.91,26,15,63.41,1,3,25.00,7,6,53.85,1,3,25.00,19,9,67.86,0,0,-,28,23,54.9,1,2,33.33,6.45,79.17,0.856,0,3,0,0,-,28,27,8,77.78,Semyon Varlamov,8473575,2017-11-18,COL_2017-11-18
32,2017-11-18 N.J at WPG,N.J,20.0,17,17,50.0,11,15,42.31,8,12,40.0,1,0,100,0.73,0.4,64.84,7,5,58.33,1,0,100,4,2,66.67,1,0,100.00,3,3,50.0,0,0,-,6,9,40.0,0,0,-,12.5,100,1.125,0,1,0,0,-,2,5,6,25.0,Keith Kinkaid,8476234,2017-11-18,N.J_2017-11-18
62,2017-11-18 CGY at PHI,PHI,61.033333,62,59,51.24,46,42,52.27,39,31,55.71,4,5,44.44,2.51,2.9,46.45,25,26,49.02,2,4,33.33,9,11,45,2,1,66.67,16,15,51.61,0,3,0.00,29,26,52.73,2,1,66.67,10.26,83.87,0.941,1,4,0,0,100.00,22,21,19,53.66,Brian Elliott,8470880,2017-11-18,PHI_2017-11-18
8,2017-11-18 TOR at MTL,MTL,48.983333,49,51,49.0,35,40,46.67,28,27,50.91,0,5,0.00,1.4,3.17,30.59,18,32,36.0,0,5,0.00,8,16,33.33,0,3,0.00,10,16,38.46,0,2,0.00,28,16,63.64,0,0,-,0.0,81.48,0.815,0,3,0,0,-,20,14,15,57.14,Charlie Lindgren,8479292,2017-11-18,MTL_2017-11-18
77,2017-11-18 STL at VAN,STL,62.683333,57,54,51.35,39,36,52.0,31,23,57.41,4,3,57.14,1.62,2.07,43.84,25,23,52.08,2,3,40.00,5,10,33.33,0,2,0.00,20,13,60.61,2,1,66.67,25,27,48.08,2,0,100.00,12.9,86.96,0.999,0,4,0,0,-,21,17,19,52.5,Jake Allen,8474596,2017-11-18,STL_2017-11-18
6,2017-11-18 CHI at PIT,CHI,2.866667,3,1,75.0,1,0,100.0,1,0,100.0,0,0,-,0.03,0.0,100.0,1,1,50.0,0,0,-,0,0,-,0,0,-,1,1,50.0,0,0,-,2,0,100.0,0,0,-,0.0,-,-,1,0,0,0,100.00,1,0,0,100.0,Anton Forsberg,8476341,2017-11-18,CHI_2017-11-18
66,2017-11-18 CHI at PIT,PIT,58.45,56,67,45.53,42,46,47.73,35,38,47.95,1,2,33.33,2.37,4.26,35.74,24,41,36.92,0,2,0.00,7,15,31.82,0,0,-,17,26,39.53,0,2,0.00,28,23,54.9,1,0,100.00,2.86,94.74,0.976,0,3,0,0,-,24,19,24,50.0,Matt Murray,8476899,2017-11-18,PIT_2017-11-18
83,2017-11-18 TOR at MTL,TOR,60.0,59,60,49.58,44,41,51.76,31,33,48.44,6,0,100,3.33,1.64,67.01,36,22,62.07,6,0,100.00,18,9,66.67,4,0,100.00,18,13,58.06,2,0,100.00,18,35,33.96,0,0,-,19.35,100,1.194,0,3,0,0,-,18,20,27,40.0,Frederik Andersen,8475883,2017-11-18,TOR_2017-11-18


In [100]:
all_goalies[all_goalies['Name'] == 'Charlie Lindgren']

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key
0,2017-04-03 MTL at FLA,MTL,60.0,56,49,53.33,47,44,51.65,31,32,49.21,4,1,80.00,3.24,2.57,55.78,27,27,50.0,2,0,100.00,12,12,50.0,2,0,100.00,15,15,50.0,0,0,-,25,20,55.56,1,1,50.00,12.9,96.88,1.098,0,3,0,0,-,16,20,22,42.11,Charlie Lindgren,8479292,2017-04-03,MTL_2017-04-03
1,2017-04-08 MTL at DET,MTL,61.7,48,44,52.17,35,36,49.3,30,27,52.63,3,2,60.00,2.33,2.02,53.54,22,20,52.38,2,1,66.67,13,8,61.9,1,1,50.00,9,12,42.86,1,0,100.00,22,20,52.38,1,1,50.00,10.0,92.59,1.026,0,4,0,0,-,16,14,19,45.71,Charlie Lindgren,8479292,2017-04-08,MTL_2017-04-08
2,2017-11-05 MTL at CHI,MTL,60.0,64,69,48.12,47,51,47.96,35,38,47.95,2,0,100.00,2.79,2.64,51.38,23,35,39.66,1,0,100.00,13,8,61.9,1,0,100.00,10,27,27.03,0,0,-,37,34,52.11,1,0,100.00,5.71,100.0,1.057,0,3,0,0,-,19,18,23,45.24,Charlie Lindgren,8479292,2017-11-05,MTL_2017-11-05
3,2017-11-07 VGK at MTL,MTL,60.0,51,65,43.97,41,48,46.07,28,31,47.46,3,2,60.00,2.84,2.13,57.13,25,32,43.86,3,2,60.00,12,9,57.14,2,1,66.67,13,23,36.11,1,1,50.00,24,29,45.28,0,0,-,10.71,93.55,1.043,0,3,0,0,-,13,27,21,38.24,Charlie Lindgren,8479292,2017-11-07,MTL_2017-11-07
4,2017-11-09 MIN at MTL,MTL,58.166667,76,73,51.01,49,51,49.0,39,34,53.42,0,2,0.00,3.04,3.05,49.92,31,34,47.69,0,2,0.00,8,18,30.77,0,2,0.00,23,16,58.97,0,0,-,39,31,55.71,0,0,-,0.0,94.12,0.941,0,4,0,0,-,25,21,22,53.19,Charlie Lindgren,8479292,2017-11-09,MTL_2017-11-09
5,2017-11-11 BUF at MTL,MTL,63.133333,53,50,51.46,41,44,48.24,29,35,45.31,2,1,66.67,1.42,3.52,28.8,20,29,40.82,1,1,50.00,6,14,30.0,1,1,50.00,14,15,48.28,0,0,-,32,21,60.38,1,0,100.00,6.9,97.14,1.04,0,4,0,0,-,18,18,18,50.0,Charlie Lindgren,8479292,2017-11-11,MTL_2017-11-11
6,2017-11-14 CBJ at MTL,MTL,61.15,67,58,53.6,41,36,53.25,29,25,53.7,1,2,33.33,3.3,2.1,61.07,35,29,54.69,1,2,33.33,16,11,59.26,1,1,50.00,19,18,51.35,0,1,0.00,25,23,52.08,0,0,-,3.45,92.0,0.954,0,4,0,0,-,27,16,18,60.0,Charlie Lindgren,8479292,2017-11-14,MTL_2017-11-14
7,2017-11-16 ARI at MTL,MTL,58.383333,68,55,55.28,49,44,52.69,36,32,52.94,4,5,44.44,2.35,3.53,39.95,26,28,48.15,2,4,33.33,7,17,29.17,1,4,20.00,19,11,63.33,1,0,100.00,37,23,61.67,2,1,66.67,11.11,84.38,0.955,0,4,0,0,-,24,26,13,64.86,Charlie Lindgren,8479292,2017-11-16,MTL_2017-11-16
8,2017-11-18 TOR at MTL,MTL,48.983333,49,51,49.0,35,40,46.67,28,27,50.91,0,5,0.00,1.4,3.17,30.59,18,32,36.0,0,5,0.00,8,16,33.33,0,3,0.00,10,16,38.46,0,2,0.00,28,16,63.64,0,0,-,0.0,81.48,0.815,0,3,0,0,-,20,14,15,57.14,Charlie Lindgren,8479292,2017-11-18,MTL_2017-11-18
9,2017-11-21 MTL at DAL,MTL,59.216667,56,58,49.12,47,43,52.22,30,28,51.72,1,2,33.33,3.13,3.01,50.98,26,30,46.43,1,2,33.33,17,17,50.0,1,2,33.33,9,13,40.91,0,0,-,24,23,51.06,0,0,-,3.33,92.86,0.962,0,5,0,0,-,28,26,23,54.9,Charlie Lindgren,8479292,2017-11-21,MTL_2017-11-21


In [80]:
all_goalies[all_goalies['Name'] == 'Charlie Lindgren']

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID
0,2016-04-07 MTL at CAR,MTL,60.0,57,49,53.77,43,41,51.19,31,28,52.54,4,2,66.67,2.36,3.1,43.24,25,22,53.19,2,1,66.67,9,12,42.86,2,0,100,16,10,61.54,0,1,0.00,29,24,54.72,2,1,66.67,12.9,92.86,1.058,0,3,0,0,-,27,21,17,61.36,Charlie Lindgren,8479292
1,2017-04-03 MTL at FLA,MTL,60.0,56,49,53.33,47,44,51.65,31,32,49.21,4,1,80.0,3.24,2.57,55.78,27,27,50.0,2,0,100.0,12,12,50.0,2,0,100,15,15,50.0,0,0,-,25,20,55.56,1,1,50.0,12.9,96.88,1.098,0,3,0,0,-,16,20,22,42.11,Charlie Lindgren,8479292
2,2017-04-08 MTL at DET,MTL,61.7,48,44,52.17,35,36,49.3,30,27,52.63,3,2,60.0,2.33,2.02,53.54,22,20,52.38,2,1,66.67,13,8,61.9,1,1,50,9,12,42.86,1,0,100.00,22,20,52.38,1,1,50.0,10.0,92.59,1.026,0,4,0,0,-,16,14,19,45.71,Charlie Lindgren,8479292


In [130]:
all_goalies = all_goalies[all_goalies['Name'] != 'Andrei Vasilevski']

In [131]:
agtoi = all_goalies[all_goalies['TOI'] >= 28]

In [132]:
agtoi

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key
0,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98,Scott Wedgewood,8475809,2017-10-30,ARI_2017-10-30
1,2017-10-31 ARI at DET,ARI,58.333333,54,63,46.15,41,53,43.62,32,39,45.07,3,4,42.86,1.73,2.50,41,28,32,46.67,1,2,33.33,11,15,42.31,1,1,50,17,17,50,0,1,0.00,24,29,45.28,2,2,50.00,9.38,89.74,0.991,1,4,0,0,100.00,18,22,24,42.86,Scott Wedgewood,8475809,2017-10-31,ARI_2017-10-31
2,2017-11-06 ARI at WSH,ARI,63.883333,49,75,39.52,38,60,38.78,26,40,39.39,2,3,40,1.81,2.84,38.95,20,31,39.22,2,1,66.67,8,10,44.44,2,1,66.67,12,21,36.36,0,0,-,26,37,41.27,0,2,0.00,7.69,92.5,1.002,1,4,0,0,100.00,14,18,26,35,Scott Wedgewood,8475809,2017-11-06,ARI_2017-11-06
3,2017-11-14 ARI at WPG,ARI,34.866667,32,26,55.17,20,21,48.78,18,17,51.43,1,1,50,1.15,1.41,45,17,15,53.13,0,1,0,5,7,41.67,0,1,0,12,8,60,0,0,-,14,9,60.87,1,0,100.00,5.56,94.12,0.997,0,2,0,0,-,10,14,7,58.82,Scott Wedgewood,8475809,2017-11-14,ARI_2017-11-14
4,2017-11-22 S.J at ARI,ARI,44.333333,32,32,50,23,24,48.94,19,16,54.29,1,2,33.33,1.22,1.41,46.35,12,16,42.86,1,2,33.33,8,7,53.33,1,2,33.33,4,9,30.77,0,0,-,15,15,50,0,0,-,5.26,87.5,0.928,0,3,1,0,0.00,9,13,22,29.03,Scott Wedgewood,8475809,2017-11-22,ARI_2017-11-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,2017-12-16 WPG at STL,WPG,58.850000,72,54,57.14,61,39,61,44,30,59.46,0,2,0.00,3.74,2.58,59.23,35,25,58.33,0,1,0.00,18,12,60.00,0,1,0.00,17,13,56.67,0,0,-,34,26,56.67,0,1,0.00,0,93.33,0.933,1,3,0,0,100.00,25,20,17,59.52,Steve Mason,8473461,2017-12-16,WPG_2017-12-16
67,2017-12-23 WPG at NYI,WPG,60.000000,71,44,61.74,55,36,60.44,40,29,57.97,2,5,28.57,3.56,2.40,59.73,41,21,66.13,1,4,20.00,23,10,69.70,1,3,25.00,18,11,62.07,0,1,0.00,26,18,59.09,1,1,50.00,5,82.76,0.878,0,3,0,0,-,28,16,15,65.12,Steve Mason,8473461,2017-12-23,WPG_2017-12-23
68,2018-01-09 WPG at BUF,WPG,60.000000,54,56,49.09,47,47,50,37,34,52.11,7,4,63.64,3.04,2.72,52.78,21,25,45.65,2,3,40.00,10,13,43.48,2,2,50.00,11,12,47.83,0,1,0.00,30,28,51.72,4,1,80.00,18.92,88.24,1.072,0,3,0,0,-,10,28,25,28.57,Steve Mason,8473461,2018-01-09,WPG_2018-01-09
69,2018-03-06 WPG at NYR,WPG,60.000000,63,60,51.22,45,46,49.45,34,31,52.31,3,0,100.00,2.45,2.46,49.99,22,30,42.31,2,0,100.00,6,13,31.58,0,0,-,16,17,48.48,2,0,100.00,31,26,54.39,0,0,-,8.82,100,1.088,0,3,0,0,-,24,15,18,57.14,Steve Mason,8473461,2018-03-06,WPG_2018-03-06


In [113]:
df_20172018.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_starter,away_starter,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,Frederik Andersen,WPG_2017-10-04,,,,,,,,,0,TOR_2017-10-04,,,,,,,,,0
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,Jake Allen,PIT_2017-10-04,,,,,,,,,0,STL_2017-10-04,,,,,,,,,0
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,Mike Smith,EDM_2017-10-04,,,,,,,,,0,CGY_2017-10-04,,,,,,,,,0
3,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,Brian Elliott,S.J_2017-10-04,,,,,,,,,0,PHI_2017-10-04,,,,,,,,,0
4,2017020005,2017-10-05,TD Garden,BOS,NSH,2017-10-05 23:00:00,4,3,Final,1,BOS_2017-10-05,NSH_2017-10-05,Tuukka Rask,Pekka Rinne,BOS_2017-10-05,,,,,,,,,0,NSH_2017-10-05,,,,,,,,,0


In [133]:
test = df_20172018.merge(agtoi[['Team_Key', 'Name']].add_prefix('ghome_'), left_on = 'Home_Team_Key', right_on = 'ghome_Team_Key', how = 'left', suffixes = ('', '_goalie'))
test = test.merge(agtoi[['Team_Key', 'Name']].add_prefix('gaway_'), left_on = 'Away_Team_Key', right_on = 'gaway_Team_Key', how = 'left', suffixes = ('', '_goalie'))

In [134]:
test.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_starter,away_starter,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B,ghome_Team_Key,ghome_Name,gaway_Team_Key,gaway_Name
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,Frederik Andersen,WPG_2017-10-04,,,,,,,,,0,TOR_2017-10-04,,,,,,,,,0,WPG_2017-10-04,Steve Mason,TOR_2017-10-04,Frederik Andersen
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,Jake Allen,PIT_2017-10-04,,,,,,,,,0,STL_2017-10-04,,,,,,,,,0,PIT_2017-10-04,Matt Murray,STL_2017-10-04,Jake Allen
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,Mike Smith,EDM_2017-10-04,,,,,,,,,0,CGY_2017-10-04,,,,,,,,,0,EDM_2017-10-04,Cam Talbot,CGY_2017-10-04,Mike Smith
3,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,Mike Smith,EDM_2017-10-04,,,,,,,,,0,CGY_2017-10-04,,,,,,,,,0,EDM_2017-10-04,Cameron Talbot,CGY_2017-10-04,Mike Smith
4,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,Brian Elliott,S.J_2017-10-04,,,,,,,,,0,PHI_2017-10-04,,,,,,,,,0,S.J_2017-10-04,Martin Jones,PHI_2017-10-04,Brian Elliott


In [136]:
test[test['gaway_Name'].isna()]

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_starter,away_starter,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B,ghome_Team_Key,ghome_Name,gaway_Team_Key,gaway_Name


In [141]:
goalie_list = pd.read_html('https://www.naturalstattrick.com/playerteams.php?fromseason=20182019&thruseason=20202021&stype=2&sit=5v5&score=all&stdoi=g&rate=n&team=ALL&pos=S&loc=B&toi=0&gpfilt=none&fd=&td=&tgp=410&lines=single&draftteam=ALL')[0]

Unnamed: 0.1,Unnamed: 0,Player,Team,GP,TOI,Shots Against,Saves,Goals Against,SV%,GAA,GSAA,xG Against,HD Shots Against,HD Saves,HD Goals Against,HDSV%,HDGAA,HDGSAA,MD Shots Against,MD Saves,MD Goals Against,MDSV%,MDGAA,MDGSAA,LD Shots Against,LD Saves,LD Goals Against,LDSV%,LDGAA,LDGSAA,Rush Attempts Against,Rebound Attempts Against,Avg. Shot Distance,Avg. Goal Distance
0,1,Roberto Luongo,FLA,43,1906.016667,994,901,93,0.906,2.93,-12.34,76.55,263,207,56,0.787,1.76,-9.95,241,215,26,0.892,0.82,-4.6,419,409,10,0.976,0.31,1.7,58,106,37.84,20.38
1,2,Craig Anderson,"OTT, WSH",87,3817.316667,2184,1990,194,0.911,3.05,-16.78,163.65,564,462,102,0.819,1.6,-3.25,529,467,62,0.883,0.97,-15.03,1007,978,29,0.971,0.46,-0.88,94,285,36.47,22.63
2,3,Ryan Miller,ANA,57,2450.516667,1256,1149,107,0.915,2.62,-5.08,100.64,352,291,61,0.827,1.49,0.63,352,326,26,0.926,0.64,5.25,516,496,20,0.961,0.49,-5.59,57,158,33.84,22.62
3,4,Henrik Lundqvist,NYR,82,3726.666667,1972,1815,157,0.92,2.53,3.02,154.99,544,456,88,0.838,1.42,7.24,495,448,47,0.905,0.76,-3.05,833,811,22,0.974,0.35,1.26,118,242,36.52,22.08
4,5,Peter Budaj,L.A,3,59.733333,21,18,3,0.857,3.01,-1.3,1.37,8,7,1,0.875,1.0,0.4,3,2,1,0.667,1.0,-0.73,8,7,1,0.875,1.0,-0.78,1,2,41.24,27.0


In [143]:
missing_goalies2 = [g for g in list(goalie_list['Player']) if g not in goalie_ids.keys() ]

In [146]:
missing_goalies2.remove('Cal Petersen')

In [147]:
missing_goalies2

['Richard Bachman',
 'Edward Pasquale',
 'Garret Sparks',
 'Antoine Bibeau',
 'Pheonix Copley',
 'Dan Vladar',
 'Landon Bow',
 'David Ayres',
 'Kevin Boyle',
 'Stuart Skinner',
 'Hunter Miska',
 'Matiss Kivlenieks',
 'Gilles Senn',
 'Jeremy Swayman',
 'Logan Thompson',
 'Kaden Fulcher',
 'Veini Vehvilainen',
 'Ivan Prosvetov',
 'Alexei Melnichuk']

In [145]:
goalie_ids['Calvin Petersen']

8477361

In [148]:
missing_id = ['8473614', '8475277', '8476343', '8477312', '8477831', '8478435', '8479016', '8479188', '8479294', '8479973', '8480112',  '8480162', '8480213', '8480280',  '8480313', '8480363',  '8481001', '8481031', '8482246']

In [149]:
for i,g in enumerate(missing_goalies2[:len(missing_id)]):
    goalie_ids[g] = missing_id[i]

In [167]:
counter = 0
for name, gid in goalie_ids.items():
    
    sequence = [x/10 for x in range(40, 80)]
    time.sleep(random.choice(sequence))
    url = 'https://www.naturalstattrick.com/playerreport.php?fromseason=20182019&thruseason=20202021&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(gid)
    individual_df4 = pd.read_html(url)[0]
    individual_df4['Name'] = name
    individual_df4['ID'] = gid



    if counter == 0:
        all_goalies4 = individual_df4
        print(name)
        print(counter)
    elif counter != 0:
        all_goalies4 = pd.concat([all_goalies4, individual_df4])
        print(name)
        print(counter)

    
    counter +=1
    

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14
Jaroslav Halak
15
Tuukka Rask
16
Michael Houser
17
Ukko-Pekka Luukkonen
18
Dustin Tokarski
19
Carter Hutton
20
Linus Ullmark
21
Charlie Lindgren
22
Carey Price
23
Jake Allen
24
Cayden Primeau
25
Joey Daccord
26
Anton Forsberg
27
Matt Murray
28
Marcus Hogberg
29
Filip Gustavsson
30
Frederik Andersen
31
Jack Campbell
32
David Rittich
33
James Reimer
34
Petr Mrazek
35
Alex Nedeljkovic
36
Philippe Desrosiers
37
Sam Montembeault
38
Sergei Bobrovsky
39
Chris Driedger
40
Spencer Knight
41
Christopher Gibson
42
Curtis McElhinney
43
Andrei Vasilevskiy
44
Craig Anderson
45
Vitek Vanecek
46
Ilya Samsonov
47
Malcolm Subban
48
Collin Delia
49
Kevin Lankinen
50
Thomas Greiss
51
Jonathan Bernier
52
Kasimir Kaskisuo
53
Pekka Rinne
54
Juuse Saros
55
Jordan 

In [209]:
t = goalie_ids

In [210]:
del t['Andrei Vasilevski']

In [214]:
goalie_ids['Andrei Vasilevskiy']

8476883

In [215]:
pickle_out = open("data/goalie_ids.pickle","wb")
pickle.dump(goalie_ids, pickle_out)
pickle_out.close()

In [168]:
all_goalies4

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID
0,2021-01-21 N.J at NYI,N.J,60.000000,59,61,49.17,37,47,44.05,31,35,46.97,1,4,20,1.44,3.21,30.98,21,34,38.18,1,4,20,2,16,11.11,0,2,0.00,19,18,51.35,1,2,33.33,31,26,54.39,0,0,-,3.23,88.57,0.918,0,3,0,0,-,24,15,21,53.33,Scott Wedgewood,8475809
1,2021-01-24 NYI at N.J,N.J,60.000000,46,46,50,37,40,48.05,24,28,46.15,2,0,100,2.73,2.54,51.77,24,23,51.06,2,0,100,10,9,52.63,2,0,100.00,14,14,50,0,0,-,19,22,46.34,0,0,-,8.33,100,1.083,0,3,0,0,-,17,16,26,39.53,Scott Wedgewood,8475809
2,2021-01-26 PHI at N.J,N.J,58.066667,46,44,51.11,36,40,47.37,26,30,46.43,3,4,42.86,2.39,3.89,38.13,25,29,46.3,3,4,42.86,8,20,28.57,3,4,42.86,17,9,65.38,0,0,-,20,14,58.82,0,0,-,11.54,86.67,0.982,0,4,0,0,-,18,17,24,42.86,Scott Wedgewood,8475809
3,2021-01-28 PHI at N.J,N.J,57.350000,54,36,60,39,27,59.09,31,17,64.58,1,3,25,2.30,1.85,55.41,25,23,52.08,0,3,0,9,8,52.94,0,3,0.00,16,15,51.61,0,0,-,28,13,68.29,1,0,100.00,3.23,82.35,0.856,3,5,0,0,100.00,24,14,10,70.59,Scott Wedgewood,8475809
4,2021-01-30 N.J at BUF,N.J,64.950000,59,53,52.68,47,41,53.41,34,31,52.31,3,3,50,2.56,2.48,50.78,21,23,47.73,2,2,50,8,13,38.1,1,0,100.00,13,10,56.52,1,2,33.33,34,28,54.84,1,1,50.00,8.82,90.32,0.991,0,4,0,0,-,23,16,24,48.94,Scott Wedgewood,8475809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2021-03-06 CBJ at DAL,CBJ,10.666667,5,6,45.45,3,5,37.5,3,4,42.86,0,1,0,0.06,0.15,26.35,3,2,60,0,1,0,0,0,-,0,0,-,3,2,60,0,1,0,1,3,25,0,0,-,0,75,0.75,1,1,0,0,100,2,5,4,33.33,Veini Vehvilainen,8481001
0,2021-03-31 ARI at COL,ARI,53.633333,42,52,44.68,34,39,46.58,29,28,50.88,2,5,28.57,1.42,2.32,37.94,15,29,34.09,1,5,16.67,6,11,35.29,0,1,0,9,18,33.33,1,4,20,26,21,55.32,1,0,100.00,6.9,82.14,0.89,1,3,0,0,100.00,22,18,12,64.71,Ivan Prosvetov,8481031
1,2021-04-09 ARI at VGK,ARI,18.416667,18,10,64.29,12,5,70.59,7,4,63.64,3,1,75,0.63,0.44,58.94,10,8,55.56,3,1,75,3,3,50,1,1,50,7,5,58.33,2,0,100,7,0,100,0,0,-,42.86,75,1.179,0,2,0,0,-,5,10,2,71.43,Ivan Prosvetov,8481031
2,2021-04-12 ARI at COL,ARI,57.966667,58,51,53.21,47,32,59.49,33,19,63.46,2,3,40,2.87,1.80,61.36,19,29,39.58,2,3,40,9,7,56.25,1,1,50,10,22,31.25,1,2,33.33,36,21,63.16,0,0,-,6.06,84.21,0.903,0,4,0,0,-,18,18,22,45,Ivan Prosvetov,8481031


In [216]:
all_goalies_c = pd.concat([all_goalies4, all_goalies_20172018])

all_goalies_c = all_goalies_c[all_goalies_c['Name'] != 'Andrei Vasilevski']

all_goalies_c = all_goalies_c.sort_values('Date').reset_index()

In [217]:
all_goalies_c.head()

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key
0,0,2014-10-08 PHI at BOS,BOS,59.95,58,44,56.86,43,29,59.72,33,20,62.26,2,1,66.67,1.73,1.44,54.61,17,14,54.84,2,1,66.67,4,8,33.33,2,1,66.67,13,6,68.42,0,0,-,37,28,56.92,0,0,-,6.06,95.0,1.011,1,3,0,0,100.00,23,14,24,48.94,Tuukka Rask,8471695,2014-10-08,BOS_2014-10-08
1,0,2014-10-08 VAN at CGY,VAN,60.0,50,45,52.63,39,32,54.93,33,25,56.9,4,2,66.67,2.79,1.61,63.39,19,17,52.78,4,2,66.67,9,6,60.0,4,0,100.0,10,11,47.62,0,2,0.00,28,24,53.85,0,0,-,12.12,92.0,1.041,0,3,0,0,-,24,22,24,50.0,Ryan Miller,8468011,2014-10-08,VAN_2014-10-08
2,0,2014-10-08 S.J at L.A,L.A,40.0,48,45,51.61,37,36,50.68,24,27,47.06,0,4,0.0,1.49,2.54,36.93,14,23,37.84,0,4,0.0,5,8,38.46,0,2,0.0,9,15,37.5,0,2,0.00,33,21,61.11,0,0,-,0.0,85.19,0.852,0,2,0,0,-,17,16,18,48.57,Jonathan Quick,8471734,2014-10-08,L.A_2014-10-08
3,0,2014-10-08 MTL at TOR,MTL,59.333333,56,57,49.56,45,37,54.88,31,27,53.45,4,3,57.14,2.37,2.0,54.29,24,27,47.06,2,3,40.0,9,10,47.37,0,2,0.0,15,17,46.88,2,1,66.67,27,26,50.94,2,0,100.00,12.9,88.89,1.018,1,3,0,0,100.00,18,20,25,41.86,Carey Price,8471679,2014-10-08,MTL_2014-10-08
4,0,2014-10-08 MTL at TOR,TOR,59.483333,56,57,49.56,36,47,43.37,26,32,44.83,3,4,42.86,1.9,2.51,43.08,26,25,50.98,3,2,60.0,9,10,47.37,2,0,100.0,17,15,53.13,1,2,33.33,26,27,49.06,0,2,0.00,11.54,87.5,0.99,1,3,0,0,100.00,24,20,18,57.14,Jonathan Bernier,8473541,2014-10-08,TOR_2014-10-08


In [233]:
def goalie_features(df):
    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)
    
    df['Last_20_TOI'] = df.groupby('ID')['TOI'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_FA'] = df.groupby('ID')['FA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_SA'] = df.groupby('ID')['SA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_GA'] = df.groupby('ID')['GA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_xGA'] = df.groupby('ID')['xGA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_HDCA'] = df.groupby('ID')['HDCA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    df['Last_20_HDGA'] = df.groupby('ID')['HDGA'].transform(lambda x: x.rolling(20, 20).sum().shift())
    
    df['Last_20_FenwickSV%'] =  (df['Last_20_FA'] - df['Last_20_GA']) /  df['Last_20_FA']
    df['Last_20_GSAx'] = df['Last_20_xGA'] - df['Last_20_GA']
    df['Last_20_GSAx/60'] =  df['Last_20_GSAx']*60 /  df['Last_20_TOI']
    df['Last_20_HDCSV%'] = (df['Last_20_HDCA'] - df['Last_20_HDGA'] ) / df['Last_20_HDCA'] 
    return df

In [236]:
goalie_features_df = goalie_features(all_goalies_c)

In [192]:
all_goalies_c['Last_20_TOI'] = all_goalies_c.groupby('ID')['TOI'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_FA'] = all_goalies_c.groupby('ID')['FA'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_SA'] = all_goalies_c.groupby('ID')['SA'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_GA'] = all_goalies_c.groupby('ID')['GA'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_xGA'] = all_goalies_c.groupby('ID')['xGA'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_HDCA'] = all_goalies_c.groupby('ID')['HDCA'].transform(lambda x: x.rolling(20, 20).sum().shift())
all_goalies_c['Last_20_HDGA'] = all_goalies_c.groupby('ID')['HDGA'].transform(lambda x: x.rolling(20, 20).sum().shift())

In [193]:
all_goalies_c['Last_20_FenwickSV%'] =  (all_goalies_c['Last_20_FA'] - all_goalies_c['Last_20_GA']) /  all_goalies_c['Last_20_FA']
all_goalies_c['Last_20_GSAx'] = all_goalies_c['Last_20_xGA'] - all_goalies_c['Last_20_GA']
all_goalies_c['Last_20_GSAx/60'] =  all_goalies_c['Last_20_GSAx']*60 /  all_goalies_c['Last_20_TOI']
all_goalies_c['Last_20_HDCSV%'] = (all_goalies_c['Last_20_HDCA'] - all_goalies_c['Last_20_HDGA'] ) / all_goalies_c['Last_20_HDCA'] 

In [194]:
all_goalies_c

Unnamed: 0,level_0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%
0,12153,0,2014-10-08 PHI at BOS,BOS,59.950000,58,44,56.86,43,29,59.72,33,20,62.26,2,1,66.67,1.73,1.44,54.61,17,14,54.84,2,1,66.67,4,8,33.33,2,1,66.67,13,6,68.42,0,0,-,37,28,56.92,0,0,-,6.06,95,1.011,1,3,0,0,100.00,23,14,24,48.94,Tuukka Rask,8471695,2014-10-08,BOS_2014-10-08,,,,,,,,,,,
1,10179,0,2014-10-08 VAN at CGY,VAN,60.000000,50,45,52.63,39,32,54.93,33,25,56.9,4,2,66.67,2.79,1.61,63.39,19,17,52.78,4,2,66.67,9,6,60,4,0,100.00,10,11,47.62,0,2,0.00,28,24,53.85,0,0,-,12.12,92,1.041,0,3,0,0,-,24,22,24,50,Ryan Miller,8468011,2014-10-08,VAN_2014-10-08,,,,,,,,,,,
2,15414,0,2014-10-08 S.J at L.A,L.A,40.000000,48,45,51.61,37,36,50.68,24,27,47.06,0,4,0.00,1.49,2.54,36.93,14,23,37.84,0,4,0.00,5,8,38.46,0,2,0.00,9,15,37.5,0,2,0.00,33,21,61.11,0,0,-,0,85.19,0.852,0,2,0,0,-,17,16,18,48.57,Jonathan Quick,8471734,2014-10-08,L.A_2014-10-08,,,,,,,,,,,
3,7725,0,2014-10-08 MTL at TOR,MTL,59.333333,56,57,49.56,45,37,54.88,31,27,53.45,4,3,57.14,2.37,2.00,54.29,24,27,47.06,2,3,40.00,9,10,47.37,0,2,0.00,15,17,46.88,2,1,66.67,27,26,50.94,2,0,100.00,12.9,88.89,1.018,1,3,0,0,100.00,18,20,25,41.86,Carey Price,8471679,2014-10-08,MTL_2014-10-08,,,,,,,,,,,
4,13807,0,2014-10-08 MTL at TOR,TOR,59.483333,56,57,49.56,36,47,43.37,26,32,44.83,3,4,42.86,1.90,2.51,43.08,26,25,50.98,3,2,60.00,9,10,47.37,2,0,100.00,17,15,53.13,1,2,33.33,26,27,49.06,0,2,0.00,11.54,87.50,0.990,1,3,0,0,100.00,24,20,18,57.14,Jonathan Bernier,8473541,2014-10-08,TOR_2014-10-08,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16373,6730,0,2021-03-06 CBJ at DAL,CBJ,10.666667,5,6,45.45,3,5,37.5,3,4,42.86,0,1,0,0.06,0.15,26.35,3,2,60,0,1,0,0,0,-,0,0,-,3,2,60,0,1,0,1,3,25,0,0,-,0,75,0.75,1,1,0,0,100,2,5,4,33.33,Veini Vehvilainen,8481001,NaT,,,,,,,,,,,,
16374,6731,0,2021-03-31 ARI at COL,ARI,53.633333,42,52,44.68,34,39,46.58,29,28,50.88,2,5,28.57,1.42,2.32,37.94,15,29,34.09,1,5,16.67,6,11,35.29,0,1,0,9,18,33.33,1,4,20,26,21,55.32,1,0,100.00,6.9,82.14,0.89,1,3,0,0,100.00,22,18,12,64.71,Ivan Prosvetov,8481031,NaT,,,,,,,,,,,,
16375,6732,1,2021-04-09 ARI at VGK,ARI,18.416667,18,10,64.29,12,5,70.59,7,4,63.64,3,1,75,0.63,0.44,58.94,10,8,55.56,3,1,75,3,3,50,1,1,50,7,5,58.33,2,0,100,7,0,100,0,0,-,42.86,75,1.179,0,2,0,0,-,5,10,2,71.43,Ivan Prosvetov,8481031,NaT,,,,,,,,,,,,
16376,6733,2,2021-04-12 ARI at COL,ARI,57.966667,58,51,53.21,47,32,59.49,33,19,63.46,2,3,40,2.87,1.80,61.36,19,29,39.58,2,3,40,9,7,56.25,1,1,50,10,22,31.25,1,2,33.33,36,21,63.16,0,0,-,6.06,84.21,0.903,0,4,0,0,-,18,18,22,45,Ivan Prosvetov,8481031,NaT,,,,,,,,,,,,


In [250]:
goalie_features_df.()

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%
16373,0,2021-03-06 CBJ at DAL,CBJ,10.666667,5,6,45.45,3,5,37.5,3,4,42.86,0,1,0,0.06,0.15,26.35,3,2,60.0,0,1,0,0,0,-,0,0,-,3,2,60.0,0,1,0,1,3,25.0,0,0,-,0.0,75.0,0.75,1,1,0,0,100,2,5,4,33.33,Veini Vehvilainen,8481001,2021-03-06,CBJ_2021-03-06,,,,,,,,,,,
16374,0,2021-03-31 ARI at COL,ARI,53.633333,42,52,44.68,34,39,46.58,29,28,50.88,2,5,28.57,1.42,2.32,37.94,15,29,34.09,1,5,16.67,6,11,35.29,0,1,0,9,18,33.33,1,4,20,26,21,55.32,1,0,100.00,6.9,82.14,0.89,1,3,0,0,100.00,22,18,12,64.71,Ivan Prosvetov,8481031,2021-03-31,ARI_2021-03-31,,,,,,,,,,,
16375,1,2021-04-09 ARI at VGK,ARI,18.416667,18,10,64.29,12,5,70.59,7,4,63.64,3,1,75,0.63,0.44,58.94,10,8,55.56,3,1,75,3,3,50,1,1,50,7,5,58.33,2,0,100,7,0,100.0,0,0,-,42.86,75.0,1.179,0,2,0,0,-,5,10,2,71.43,Ivan Prosvetov,8481031,2021-04-09,ARI_2021-04-09,,,,,,,,,,,
16376,2,2021-04-12 ARI at COL,ARI,57.966667,58,51,53.21,47,32,59.49,33,19,63.46,2,3,40,2.87,1.8,61.36,19,29,39.58,2,3,40,9,7,56.25,1,1,50,10,22,31.25,1,2,33.33,36,21,63.16,0,0,-,6.06,84.21,0.903,0,4,0,0,-,18,18,22,45.0,Ivan Prosvetov,8481031,2021-04-12,ARI_2021-04-12,,,,,,,,,,,
16377,0,2021-02-11 S.J at L.A,S.J,9.183333,10,6,62.5,10,6,62.5,7,5,58.33,0,0,-,0.59,0.15,80.17,3,1,75.0,0,0,-,3,0,100,0,0,-,0,1,0.0,0,0,-,7,2,77.78,0,0,-,0.0,100.0,1.0,0,1,0,0,-,3,2,0,100.0,Alexei Melnichuk,8482246,2021-02-11,S.J_2021-02-11,,,,,,,,,,,


In [240]:
goalie_features_df[goalie_features_df['Name'] == 'Carter Hart'].iloc[0:21
                                                              ]

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%
10278,0,2018-12-18 DET at PHI,PHI,60.0,59,51,53.64,46,33,58.23,33,22,60.0,3,2,60.0,2.29,1.07,68.28,21,13,61.76,2,2,50.0,10,4,71.43,1,1,50.00,11,9,55.0,1,1,50.00,31,31,50.0,1,0,100.00,9.09,90.91,1.0,0,3,0,0,-,20,19,21,48.78,Carter Hart,8479394,2018-12-18,PHI_2018-12-18,,,,,,,,,,,
10279,1,2018-12-20 NSH at PHI,PHI,60.0,47,65,41.96,41,41,50.0,31,32,49.21,2,1,66.67,1.9,2.17,46.67,15,24,38.46,1,1,50.0,3,9,25.0,1,1,50.00,12,15,44.44,0,0,-,25,35,41.67,1,0,100.00,6.45,96.88,1.033,0,3,0,0,-,13,10,23,36.11,Carter Hart,8479394,2018-12-20,PHI_2018-12-20,,,,,,,,,,,
10280,2,2018-12-22 CBJ at PHI,PHI,57.883333,59,30,66.29,45,23,66.18,34,19,64.15,2,4,33.33,3.67,1.34,73.22,26,12,68.42,2,4,33.33,11,5,68.75,1,2,33.33,15,7,68.18,1,2,33.33,24,16,60.0,0,0,-,5.88,78.95,0.848,2,4,0,0,100.00,17,19,16,51.52,Carter Hart,8479394,2018-12-22,PHI_2018-12-22,,,,,,,,,,,
10281,3,2018-12-29 PHI at FLA,PHI,59.066667,56,67,45.53,48,46,51.06,33,36,47.83,1,2,33.33,2.65,3.12,45.9,23,42,35.38,0,2,0.0,12,8,60.0,0,0,-,11,34,24.44,0,2,0.00,31,22,58.49,1,0,100.00,3.03,94.44,0.975,0,3,0,0,-,23,15,21,52.27,Carter Hart,8479394,2018-12-29,PHI_2018-12-29,,,,,,,,,,,
10282,4,2018-12-31 PHI at CAR,PHI,22.316667,27,18,60.0,17,13,56.67,12,10,54.55,0,3,0.0,0.95,0.65,59.47,12,6,66.67,0,2,0.0,4,2,66.67,0,1,0.00,8,4,66.67,0,1,0.00,15,8,65.22,0,1,0.00,0.0,70.0,0.7,0,2,0,0,-,7,6,8,46.67,Carter Hart,8479394,2018-12-31,PHI_2018-12-31,,,,,,,,,,,
10283,5,2019-01-05 CGY at PHI,PHI,61.983333,66,63,51.16,51,46,52.58,34,32,51.52,2,3,40.0,2.82,2.02,58.24,29,18,61.7,0,2,0.0,9,7,56.25,0,1,0.00,20,11,64.52,0,1,0.00,31,40,43.66,2,1,66.67,5.88,90.63,0.965,0,4,0,0,-,21,14,21,50.0,Carter Hart,8479394,2019-01-05,PHI_2019-01-05,,,,,,,,,,,
10284,6,2019-01-07 STL at PHI,PHI,58.783333,48,55,46.6,33,44,42.86,24,36,40.0,0,2,0.0,1.09,2.42,30.99,19,21,47.5,0,1,0.0,5,11,31.25,0,1,0.00,14,10,58.33,0,0,-,23,28,45.1,0,1,0.00,0.0,94.44,0.944,0,4,0,0,-,15,19,25,37.5,Carter Hart,8479394,2019-01-07,PHI_2019-01-07,,,,,,,,,,,
10285,7,2019-01-10 DAL at PHI,PHI,59.333333,49,67,42.24,34,49,40.96,21,38,35.59,1,1,50.0,1.7,2.72,38.41,21,27,43.75,1,1,50.0,7,10,41.18,1,0,100.00,14,17,45.16,0,1,0.00,18,34,34.62,0,0,-,4.76,97.37,1.021,1,4,0,0,100.00,13,20,26,33.33,Carter Hart,8479394,2019-01-10,PHI_2019-01-10,,,,,,,,,,,
10286,8,2019-01-12 PHI at N.J,PHI,57.233333,59,51,53.64,47,37,55.95,32,26,55.17,1,3,25.0,3.11,2.09,59.8,27,16,62.79,1,2,33.33,13,6,68.42,1,2,33.33,14,10,58.33,0,0,-,23,23,50.0,0,1,0.00,3.13,88.46,0.916,0,3,0,0,-,21,18,24,46.67,Carter Hart,8479394,2019-01-12,PHI_2019-01-12,,,,,,,,,,,
10287,9,2019-01-14 MIN at PHI,PHI,59.95,50,69,42.02,41,49,45.56,27,38,41.54,7,4,63.64,2.82,3.6,43.93,20,28,41.67,6,4,60.0,9,15,37.5,5,3,62.50,11,13,45.83,1,1,50.00,25,34,42.37,1,0,100.00,25.93,89.47,1.154,1,3,0,0,100.00,15,24,20,42.86,Carter Hart,8479394,2019-01-14,PHI_2019-01-14,,,,,,,,,,,


In [267]:
goalie_features_df[goalie_features_df['Date'] == '2018-04-05']

Unnamed: 0,index,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Last_20_TOI,Last_20_FA,Last_20_SA,Last_20_GA,Last_20_xGA,Last_20_HDCA,Last_20_HDGA,Last_20_FenwickSV%,Last_20_GSAx,Last_20_GSAx/60,Last_20_HDCSV%


In [268]:
all_goalies_c['Date']

0       2014-10-08
1       2014-10-08
2       2014-10-08
3       2014-10-08
4       2014-10-08
           ...    
16373   2021-03-06
16374   2021-03-31
16375   2021-04-09
16376   2021-04-12
16377   2021-02-11
Name: Date, Length: 16378, dtype: datetime64[ns]

In [272]:
all_goalies_c['Year'] = all_goalies_c['Date'].apply(lambda x: x.year)
all_goalies_c['Month'] = all_goalies_c['Date'].apply(lambda x: x.month)

In [274]:
all_goalies_c[all_goalies_c['Year'] == 2018]['Month'].value_counts()

12    477
11    470
10    388
Name: Month, dtype: int64

In [275]:
def get_goalie_data(goalie_ids, start_year, end_year):
    counter = 0
    for name, gid in goalie_ids.items():

        sequence = [x/10 for x in range(40, 80)]
        time.sleep(random.choice(sequence))
        url = 'https://www.naturalstattrick.com/playerreport.php?fromseason={}&thruseason={}&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(start_year, end_year, gid)
        individual_df = pd.read_html(url)[0]
        individual_df['Name'] = name
        individual_df['ID'] = gid



        if counter == 0:
            all_goalies4 = individual_df
            print(name)
            print(counter)
        elif counter != 0:
            all_goalies4 = pd.concat([all_goalies4, individual_df])
            print(name)
            print(counter)


        counter +=1
    
    return all_goalies4

In [None]:
goalies_161718 = get_goalie_data(goalie_ids, 20162017, 20172018)

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14


In [None]:
goalies_181920 = get_goalie_data(goalie_ids, 20182019, 20192020)

In [None]:
goalies_2021 = get_goalie_data(goalie_ids, 20202021, 20202021)

In [None]:
goalies_all_B = pd.concat([goalies_161718, goalies_181920, goalies_2021])

In [None]:
goalie_features_dfB = goalie_features(goalies_all_B)