In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import hockey_scraper
import pickle
import time
import random
import datetime
pd.set_option('display.max_columns', None)

In [50]:
#game boxscore url
# NOTE(review): `game_id` is not assigned in any cell visible above this one, so on a
# fresh kernel this line raises NameError unless `game_id` is defined first.
url = 'https://statsapi.web.nhl.com/api/v1/game/{}/boxscore'.format(game_id)

In [None]:
# Roster URL (teams expanded with their rosters for the 2014-2015 season), kept for future use
'https://statsapi.web.nhl.com/api/v1/teams?expand=team.roster&season=20142015'

In [2]:
#schedule url, date format: 2018-01-09

def get_yesterdays_game_ids():
    """Return the ``gamePk`` IDs of the games on yesterday's NHL schedule.

    "Yesterday" is computed from the local system date at call time.

    Returns:
        list: One ``gamePk`` value per game found in the first entry of the
        schedule's ``dates`` list, or an empty list when that list is empty.

    Raises:
        requests.HTTPError: If the schedule endpoint answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    url = 'https://statsapi.web.nhl.com/api/v1/schedule?date={}'.format(yesterday.isoformat())

    # A timeout keeps the notebook from hanging forever on a stalled connection;
    # raise_for_status surfaces HTTP errors here instead of as a confusing
    # JSON/KeyError further down.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    y_sched = r.json()

    # Indexing [0] unconditionally raised IndexError whenever 'dates' was empty
    # (presumably a day with no games scheduled -- TODO confirm against the API).
    dates = y_sched['dates']
    if not dates:
        return []

    return [game['gamePk'] for game in dates[0]['games']]

In [3]:
# Display the game IDs found on yesterday's schedule.
get_yesterdays_game_ids()

[2020030323]

In [73]:
#get goalie ids from yesterdays games
def get_goalie_ids():
    """Build a ``{full name: player id}`` dict of goalies from yesterday's box scores.

    For every game ID returned by ``get_yesterdays_game_ids()`` the game's
    boxscore JSON is downloaded, and each ID in the home and away teams'
    ``'goalies'`` lists is looked up in that team's ``'players'`` map to get
    the player's ``fullName``.

    Returns:
        dict: Goalie full name -> player ID. If the same name occurs more than
        once, the last ID seen is kept.
    """
    ids_by_name = {}
    for game_pk in get_yesterdays_game_ids():
        boxscore_url = 'https://statsapi.web.nhl.com/api/v1/game/{}/boxscore'.format(game_pk)
        teams = requests.get(boxscore_url).json()['teams']
        for side in ('home', 'away'):
            team = teams[side]
            for player_id in list(team['goalies']):
                # Player entries are keyed as 'ID<player id>' in the boxscore payload.
                player_key = 'ID' + str(player_id)
                full_name = team['players'][player_key]['person']['fullName']
                ids_by_name[full_name] = player_id
    return ids_by_name
    

        
    
    

In [74]:
# Display the goalie name -> player ID mapping built from yesterday's box scores.
get_goalie_ids()

{'Semyon Varlamov': 8473575, 'Andrei Vasilevskiy': 8476883}

In [28]:
def get_goalie_data(goalie_ids, start_year=2021, end_year=2022):
    """Scrape the Natural Stat Trick player report for each goalie and stack the results.

    Args:
        goalie_ids (dict): Mapping of goalie name -> player ID. Each ID is
            interpolated into the report URL as ``playerid``.
        start_year: Value interpolated into the URL as ``fromseason``.
        end_year: Value interpolated into the URL as ``thruseason``.
            NOTE(review): both values are inserted verbatim; confirm whether the
            site expects eight-digit season codes (e.g. 20202021) rather than a
            single year.

    Returns:
        pandas.DataFrame: The first HTML table of every successfully fetched
        report, concatenated in request order, with ``Name`` and ``ID`` columns
        added. If a fetch fails, scraping stops and only the frames collected
        so far are returned; call again with the remaining goalies to resume.
        An empty DataFrame is returned when nothing was collected (empty
        ``goalie_ids`` or a failure on the very first goalie).
    """
    # Candidate pauses of 6.0-11.9 seconds between requests; built once rather
    # than on every iteration.
    delays = [x/10 for x in range(60, 120)]

    # Frames are collected in a list and concatenated once at the end, which
    # avoids re-copying the growing table on every iteration.
    frames = []

    for counter, (name, gid) in enumerate(goalie_ids.items()):
        time.sleep(random.choice(delays))
        url = 'https://www.naturalstattrick.com/playerreport.php?fromseason={}&thruseason={}&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(start_year, end_year, gid)
        # Due to the number of HTTP requests, NST may ban your IP before the loop
        # finishes (a VPN was needed to get around this). When a request fails we
        # stop and return what has been gathered so far.
        try:
            individual_df = pd.read_html(url)[0]
            individual_df['Name'] = name
            individual_df['ID'] = gid
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts.
            print(f'Ended before {name}')
            break

        frames.append(individual_df)
        print(name)
        print(counter)

    # Previously an empty input or a failure on the first goalie hit an
    # unassigned local and raised UnboundLocalError.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [75]:
# Fetch the Natural Stat Trick data for yesterday's goalies and display the combined table.
# NOTE(review): the result is not bound to a name, so reusing it later means scraping again.
get_goalie_data(get_goalie_ids())

Semyon Varlamov
0
Andrei Vasilevskiy
1


Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID


In [None]:
def goalie_features(df, rolling_games = 40, min_games = 10):
    """Add per-goalie rolling totals and derived save metrics to ``df``.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned.

    Args:
        df (pandas.DataFrame): Must contain the columns ``Game``, ``Team``,
            ``ID``, ``TOI``, ``FA``, ``SA``, ``GA``, ``xGA``, ``HDCA`` and
            ``HDGA``. The first 10 characters of ``Game`` are parsed as a date.
            NOTE(review): rolling windows follow the existing row order within
            each ``ID``; this assumes rows are already chronological -- confirm.
        rolling_games (int): Window length, in rows, for the rolling sums.
        min_games (int): Minimum rows required in a window before a sum is
            produced. Capped at ``rolling_games``, since pandas rejects a
            ``min_periods`` larger than the window.

    Returns:
        pandas.DataFrame: ``df`` with these columns added:
        ``Date``, ``Team_Key``, ``Rolling_<stat>`` for each of the seven stats,
        ``Goalie_FenwickSV%``, ``Goalie_GSAx``, ``Goalie_GSAx/60`` and
        ``Goalie_HDCSV%``.
    """
    # With the old hard-coded minimum of 10, any rolling_games < 10 raised
    # ValueError inside pandas; cap the minimum at the window size instead.
    min_periods = min(min_games, rolling_games)

    def prior_rolling_sum(stat):
        # .shift() moves each window total down one row, so a row's value is
        # built only from the rows that precede it within the same goalie.
        return stat.rolling(rolling_games, min_periods=min_periods).sum().shift()

    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)

    for stat in ('TOI', 'FA', 'SA', 'GA', 'xGA', 'HDCA', 'HDGA'):
        df['Rolling_' + stat] = df.groupby('ID')[stat].transform(prior_rolling_sum)

    df['Goalie_FenwickSV%'] = (df['Rolling_FA'] - df['Rolling_GA']) / df['Rolling_FA']
    df['Goalie_GSAx'] = df['Rolling_xGA'] - df['Rolling_GA']
    df['Goalie_GSAx/60'] = df['Goalie_GSAx']*60 / df['Rolling_TOI']
    df['Goalie_HDCSV%'] = (df['Rolling_HDCA'] - df['Rolling_HDGA']) / df['Rolling_HDCA']
    return df