In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import hockey_scraper
import pickle
import time
import random
pd.set_option('display.max_columns', None)

### Define Helper Functions

In [3]:
nst_to_sched = { 'Anaheim Ducks': 'ANA', 
                      'Arizona Coyotes' : 'ARI', 
                      'Boston Bruins': 'BOS', 
                      'Buffalo Sabres':'BUF',
                      'Calgary Flames': 'CGY', 
                      'Carolina Hurricanes': 'CAR', 
                      'Chicago Blackhawks': 'CHI', 
                      'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A.',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J.',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J.',
                     'St. Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B.',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                      'Vegas Golden Knights':'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}

In [4]:
goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}

In [5]:
#test if i should switch to score and venu adjusted?
#sit = sva
#test 5,10,15 rolling as well
def get_and_format_nst_team_stats(season, sit, rate):
    nst_to_sched = {'Anaheim Ducks': 'ANA',
                     'Arizona Coyotes': 'ARI',
                     'Boston Bruins': 'BOS',
                     'Buffalo Sabres': 'BUF',
                     'Calgary Flames': 'CGY',
                     'Carolina Hurricanes': 'CAR',
                     'Chicago Blackhawks': 'CHI',
                     'Colorado Avalanche': 'COL',
                     'Columbus Blue Jackets': 'CBJ',
                     'Dallas Stars': 'DAL',
                     'Detroit Red Wings': 'DET',
                     'Edmonton Oilers': 'EDM',
                     'Florida Panthers': 'FLA',
                     'Los Angeles Kings': 'L.A',
                     'Minnesota Wild': 'MIN',
                     'Montreal Canadiens': 'MTL',
                     'Nashville Predators': 'NSH',
                     'New Jersey Devils': 'N.J',
                     'New York Islanders': 'NYI',
                     'New York Rangers': 'NYR',
                     'Ottawa Senators': 'OTT',
                     'Philadelphia Flyers': 'PHI',
                     'Pittsburgh Penguins': 'PIT',
                     'San Jose Sharks': 'S.J',
                     'St Louis Blues': 'STL',
                     'Tampa Bay Lightning': 'T.B',
                     'Toronto Maple Leafs': 'TOR',
                     'Vancouver Canucks': 'VAN',
                     'Vegas Golden Knights': 'VGK',
                     'Washington Capitals': 'WSH',
                     'Winnipeg Jets': 'WPG'}
    
    url = 'https://www.naturalstattrick.com/games.php?fromseason={}&thruseason={}&stype=2&sit={}&loc=B&team=All&rate={}'.format(
        season,
        season,
        sit,
        rate)
    df = pd.read_html(url, header=0, index_col = 0, na_values=["-"])[0]
    df.reset_index(inplace = True)
    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Game_Number'] = df.groupby('Team').cumcount() + 1
    #rename Team_Date to team key or something like that
    df = df.replace({'Team': nst_to_sched})

    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)
    return df

In [6]:
def merge_team_stats(primary_df, pp_df, pk_df):
    primary_df = primary_df.merge(pk_df[['Team_Key', 'TOI', 'xGA']], on = 'Team_Key', how = 'left', suffixes = ('','_pk') )
    primary_df = primary_df.merge(pp_df[['Team_Key', 'TOI', 'xGF']], on = 'Team_Key', how = 'left', suffixes = ('','_pp') )
    return primary_df

In [7]:
def calculate_team_features(df, rolling_games = 20):
    rolling_games, rolling_games 
    df['sum_rolling20_TOI_5v5'] = df.groupby('Team')['TOI'].transform(lambda x: x.rolling(rolling_games, rolling_games).sum().shift())
    df['sum_rolling20_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_GF_5v5'] = df.groupby('Team')['GF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_GA_5v5'] = df.groupby('Team')['GA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_xGF_5v5'] = df.groupby('Team')['xGF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_xGA_5v5'] = df.groupby('Team')['xGA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_SF_5v5'] = df.groupby('Team')['SF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['last_20_FF%_5v5'] = df['sum_rolling20_FF_5v5']*100/ (df['sum_rolling20_FF_5v5']+df['sum_rolling20_FA_5v5'])
    df['last_20_GF%_5v5'] = df['sum_rolling20_GF_5v5']*100/ (df['sum_rolling20_GF_5v5']+df['sum_rolling20_GA_5v5'])
    df['last_20_xGF%_5v5'] = df['sum_rolling20_xGF_5v5']*100/ (df['sum_rolling20_xGF_5v5']+df['sum_rolling20_xGA_5v5'])
    df['last_20_SH%'] = df['sum_rolling20_GF_5v5']*100 / df['sum_rolling20_SF_5v5']
    
    
    #fix NaNs in pp and pk features
    df['TOI_pp'] = np.where(df['TOI_pp'].isna(), 0, df['TOI_pp'])
    df['TOI_pk'] = np.where(df['TOI_pk'].isna(), 0, df['TOI_pk'])
    df['xGF_pp'] = np.where(df['xGF_pp'].isna(), 0, df['xGF_pp'])
    df['xGA_pk'] = np.where(df['xGA_pk'].isna(), 0, df['xGA_pk'])
    
    #pp features
    df['sum_rolling20_TOI_pp'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_xGF_pp'] = df.groupby('Team')['xGF_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['last20_pp_TOI_per_game'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df['last20_xGF_per_min_pp'] = df['sum_rolling20_xGF_pp'] / df['sum_rolling20_TOI_pp'] 
    
    #pk features
    df['sum_rolling20_TOI_pk'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['sum_rolling20_xGA_pk'] = df.groupby('Team')['xGA_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df['last20_pk_TOI_per_game'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df['last20_xGA_per_min_pk'] = df['sum_rolling20_xGA_pk'] / df['sum_rolling20_TOI_pk'] 
    
    #to get back to back category
    df['Last_Game_Date'] = df.groupby('Team')['Date'].shift()
    df['Days_Since_Last_Game'] = df['Date'] - df['Last_Game_Date']
    df['B2B'] = np.where(df['Days_Since_Last_Game'] == '1 days', 1, 0)
    
    return df

In [8]:
#get starters
def get_starters(year):
    goalie_table_teams = { 'ANA': 'ANA', 
                      'ARI' : 'ARI',
                      'BOS': 'BOS', 
                      'BUF':'BUF',
                      'CGY': 'CGY', 
                      'CAR': 'CAR', 
                      'CHI': 'CHI', 
                      'COL': 'COL',
                     'CBJ': 'CBJ',
                     'DAL': 'DAL',
                     'DET': 'DET',
                     'EDM': 'EDM',
                     'FLA': 'FLO',
                     'L.A': 'LOS',
                     'MIN': 'MIN',
                     'MTL': 'MON',
                     'NSH': 'NSH',
                     'N.J': 'NJD',
                     'NYI': 'NYI',
                     'NYR': 'NYR',
                     'OTT': 'OTT',
                     'PHI': 'PHI',
                     'PIT': 'PIT',
                     'S.J': 'SJS',
                     'STL': 'STL',
                     'T.B': 'TBL',
                     'TOR': 'TOR',
                     'VAN': 'VAN',
                     'WSH': 'WSH',
                     'WPG': 'WPG',
                     'VGK':'VGK'}
    counter = 0
    for k,v in goalie_table_teams.items():
        print(k)
        starter_url = 'http://hockeygoalies.org/bio/nhl/logs/{}{}.html'.format(v, year)
        goalies = pd.read_html(starter_url)[0]
        goalies.replace(to_replace=['(BU)', np.NaN], value = 'DNP', inplace = True)
        goalies.drop(columns = ['DEC'], inplace = True)
        goalies.drop(index  = goalies.iloc[-1].name, inplace = True)
        goalies['starter'] = 'placeholder'

        starter = []
        for i, row in goalies.iterrows():
            for n in range(len(row)):
                if row[n][0] == 'W' or row[n][0] == 'L':
                    starter.append(goalies.columns[n])

        goalies['starter'] = starter
        goalies['Team'] = k
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['DATE'] = pd.to_datetime(goalies['DATE'])
        goalies['Team_Key'] = goalies['Team'].astype(str)+'_'+goalies['DATE'].astype(str)
        columns = ['Team','DATE', 'OPPONENT', 'starter', 'Team_Key']
        if counter == 0:
            master = goalies
        if counter != 0:
            master = pd.concat( [master[columns], goalies[columns]])
        counter +=1
    return master
                

In [9]:
#'2017-10-04' to '2018-04-08'
def get_game_results(season_start, season_end):
    sched_df = hockey_scraper.scrape_schedule(season_start, season_end)
    sched_df['Home_Team_Won'] = np.where(sched_df['home_score'] > sched_df['away_score'], 1, 0)
    sched_df['Home_Team_Key'] = sched_df['home_team'].astype(str)+'_'+sched_df['date'].astype(str)
    sched_df['Away_Team_Key'] = sched_df['away_team'].astype(str)+'_'+sched_df['date'].astype(str)
    return sched_df

In [10]:
def merge_starters_and_features(game_results_df, goalies_df, features_df, feature_columns, goalie_feature_columns):
    goalies_df = goalies_df[goalies_df['TOI'] >=28]
    df = game_results_df.merge(goalies_df[goalie_feature_columns].add_prefix('home_'), left_on = 'Home_Team_Key', right_on = 'home_Team_Key', how = 'left').rename(columns ={'home_Name':'home_goalie'}).drop(columns = 'home_Team_Key')
    df = df.merge(goalies_df[goalie_feature_columns].add_prefix('away_'), left_on = 'Away_Team_Key', right_on = 'away_Team_Key', how = 'left').rename(columns ={'away_Name':'away_goalie'}).drop(columns = 'away_Team_Key')
    df = df.merge(features_df[feature_columns].add_prefix('home_'), left_on = 'Home_Team_Key', right_on = 'home_Team_Key', how = 'left')
    df = df.merge(features_df[feature_columns].add_prefix('away_'), left_on = 'Away_Team_Key', right_on = 'away_Team_Key', how = 'left')
    return df

In [11]:
feature_columns = ['Team_Key', 'last_20_FF%_5v5', 'last_20_GF%_5v5', 'last_20_xGF%_5v5', 'last_20_SH%', 'last20_pp_TOI_per_game', 'last20_xGF_per_min_pp','last20_pk_TOI_per_game', 'last20_xGA_per_min_pk', 'B2B']
goalie_feature_columns = ['Team_Key', 'Name', 'Goalie_FenwickSV%', 'Goalie_GSAx/60', 'Goalie_HDCSV%']

### Get Goalie Data

In [21]:
#import dictionary with goalie names and IDs from NHL API
infile = open("data/goalie_ids.pickle",'rb')
goalie_ids = pickle.load(infile)
infile.close()

In [141]:
## scrape season long stats to get name of all goalies who played in time frame
goalie_list = pd.read_html('https://www.naturalstattrick.com/playerteams.php?fromseason=20182019&thruseason=20202021&stype=2&sit=5v5&score=all&stdoi=g&rate=n&team=ALL&pos=S&loc=B&toi=0&gpfilt=none&fd=&td=&tgp=410&lines=single&draftteam=ALL')[0]

In [143]:
#find which goalies are missing from goalie_ids dictionary
missing_goalies2 = [g for g in list(goalie_list['Player']) if g not in goalie_ids.keys() ]

In [146]:
## Cal Petersen already in Data Dictionary as Calvin Petersen 
missing_goalies2.remove('Cal Petersen')

In [147]:
missing_goalies2

['Richard Bachman',
 'Edward Pasquale',
 'Garret Sparks',
 'Antoine Bibeau',
 'Pheonix Copley',
 'Dan Vladar',
 'Landon Bow',
 'David Ayres',
 'Kevin Boyle',
 'Stuart Skinner',
 'Hunter Miska',
 'Matiss Kivlenieks',
 'Gilles Senn',
 'Jeremy Swayman',
 'Logan Thompson',
 'Kaden Fulcher',
 'Veini Vehvilainen',
 'Ivan Prosvetov',
 'Alexei Melnichuk']

In [145]:
goalie_ids['Calvin Petersen']

8477361

In [148]:
#manually looked up missing ids. Will write code in future to grab any missing active IDs
missing_id = ['8473614', '8475277', '8476343', '8477312', '8477831', '8478435', '8479016', '8479188', '8479294', '8479973', '8480112',  '8480162', '8480213', '8480280',  '8480313', '8480363',  '8481001', '8481031', '8482246']

In [149]:
for i,g in enumerate(missing_goalies2[:len(missing_id)]):
    goalie_ids[g] = missing_id[i]

In [548]:
def goalie_features(df, rolling_games = 30):
    rolling_games = rolling_games
    min_games = 10
    df['Date'] = df['Game'].apply(lambda x: pd.to_datetime(x[0:10]))
    df['Team_Key'] = df['Team'].astype(str)+'_'+df['Date'].astype(str)
    
    df['Rolling_TOI'] = df.groupby('ID')['TOI'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_FA'] = df.groupby('ID')['FA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_SA'] = df.groupby('ID')['SA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_GA'] = df.groupby('ID')['GA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_xGA'] = df.groupby('ID')['xGA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_HDCA'] = df.groupby('ID')['HDCA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    df['Rolling_HDGA'] = df.groupby('ID')['HDGA'].transform(lambda x: x.rolling(rolling_games, min_games).sum().shift())
    
    df['Goalie_FenwickSV%'] =  (df['Rolling_FA'] - df['Rolling_GA']) /  df['Rolling_FA']
    df['Goalie_GSAx'] = df['Rolling_xGA'] - df['Rolling_GA']
    df['Goalie_GSAx/60'] =  df['Goalie_GSAx']*60 /  df['Rolling_TOI']
    df['Goalie_HDCSV%'] = (df['Rolling_HDCA'] - df['Rolling_HDGA'] ) / df['Rolling_HDCA'] 
    return df

In [289]:
# only use at most 2 seasons
def get_goalie_data(goalie_ids, start_year, end_year):
    counter = 0
    for name, gid in goalie_ids.items():

        sequence = [x/10 for x in range(60, 120)]
        time.sleep(random.choice(sequence))
        url = 'https://www.naturalstattrick.com/playerreport.php?fromseason={}&thruseason={}&playerid={}&sit=all&stype=2&stdoi=oi&rate=n&v=g'.format(start_year, end_year, gid)
        #due to number of http requests, NST may ban your IP before the loop finishes. I needed to use a VPN to get around this. If IP gets banned, this function will still return the current DF and you can call the function again and pass in an updated goalie dictionary to get the rest
        try:
            individual_df = pd.read_html(url)[0]
            individual_df['Name'] = name
            individual_df['ID'] = gid
        except:
            print(f'Ended before {name}')
            return all_goalies4 

        if counter == 0:
            all_goalies4 = individual_df
            print(name)
            print(counter)
        elif counter != 0:
            all_goalies4 = pd.concat([all_goalies4, individual_df])
            print(name)
            print(counter)


        counter +=1
    
    return all_goalies4

In [278]:
goalies_161718 = get_goalie_data(goalie_ids, 20162017, 20172018)

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14
Jaroslav Halak
15
Tuukka Rask
16
Michael Houser
17
Ukko-Pekka Luukkonen
18
Dustin Tokarski
19
Carter Hutton
20
Linus Ullmark
21
Charlie Lindgren
22
Carey Price
23
Jake Allen
24
Cayden Primeau
25
Joey Daccord
26
Anton Forsberg
27
Matt Murray
28
Marcus Hogberg
29
Filip Gustavsson
30
Frederik Andersen
31
Jack Campbell
32
David Rittich
33
James Reimer
34
Petr Mrazek
35
Alex Nedeljkovic
36
Philippe Desrosiers
37
Sam Montembeault
38
Sergei Bobrovsky
39
Chris Driedger
40
Spencer Knight
41
Christopher Gibson
42
Curtis McElhinney
43
Andrei Vasilevskiy
44
Craig Anderson
45
Vitek Vanecek
46
Ilya Samsonov
47
Malcolm Subban
48
Collin Delia
49
Kevin Lankinen
50
Thomas Greiss
51
Jonathan Bernier
52
Kasimir Kaskisuo
53
Pekka Rinne
54
Juuse Saros
55
Jordan 

In [283]:
missing1617 = [g for g in goalie_ids.keys() if g not in list(goalies_161718['Name'])]

In [284]:
len(missing1617)

83

In [287]:
missing1617_dict = {k:v for k,v in goalie_ids.items() if k in missing1617}

In [291]:
goalies_161718b = get_goalie_data(missing1617_dict, 20162017, 20172018)

Mackenzie Blackwood
0
Ilya Sorokin
1
Igor Shesterkin
2
Carter Hart
3
Emil Larmi
4
Michael Houser
5
Ukko-Pekka Luukkonen
6
Cayden Primeau
7
Joey Daccord
8
Marcus Hogberg
9
Filip Gustavsson
10
Philippe Desrosiers
11
Sam Montembeault
12
Spencer Knight
13
Vitek Vanecek
14
Ilya Samsonov
15
Kevin Lankinen
16
Kasimir Kaskisuo
17
Jordan Binnington
18
Ville Husso
19
Artyom Zagidulin
20
Pavel Francouz
21
Adam Werner
22
Peyton Jones
23
Jonas Johansson
24
Mikko Koskinen
25
Michael Dipietro
26
Arturs Silovs
27
Jake Oettinger
28
Troy Grosenick
29
Calvin Petersen
30
Josef Korenar
31
Elvis Merzlikins
32
Kaapo Kahkonen
33
Antti Niemi
34
Calvin Pickard
35
Cam Ward
36
Cameron Talbot
37
Chad Johnson
38
Corey Crawford
39
Daniel Taylor
40
Eddie Lack
41
Eric Comrie
42
Harri Sateri
43
Henrik Lundqvist
44
Jared Coreau
45
Jean-Francois Berube
46
Jeff Glass
47
Jimmy Howard
48
Jon Gillies
49
Kari Lehtonen
50
Ken Appleby
51
Maxime Lagace
52
Michael Hutchinson
53
Michal Neuvirth
54
Mike Condon
55
Mike McKenna
56
On

In [292]:
goalies_181920 = get_goalie_data(goalie_ids, 20182019, 20192020)

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14
Jaroslav Halak
15
Tuukka Rask
16
Michael Houser
17
Ukko-Pekka Luukkonen
18
Dustin Tokarski
19
Carter Hutton
20
Linus Ullmark
21
Charlie Lindgren
22
Carey Price
23
Jake Allen
24
Cayden Primeau
25
Joey Daccord
26
Anton Forsberg
27
Matt Murray
28
Marcus Hogberg
29
Filip Gustavsson
30
Frederik Andersen
31
Jack Campbell
32
David Rittich
33
James Reimer
34
Petr Mrazek
35
Alex Nedeljkovic
36
Philippe Desrosiers
37
Sam Montembeault
38
Sergei Bobrovsky
39
Chris Driedger
40
Spencer Knight
41
Christopher Gibson
42
Curtis McElhinney
43
Andrei Vasilevskiy
44
Craig Anderson
45
Vitek Vanecek
46
Ilya Samsonov
47
Malcolm Subban
48
Collin Delia
49
Kevin Lankinen
50
Thomas Greiss
51
Jonathan Bernier
52
Kasimir Kaskisuo
53
Pekka Rinne
54
Juuse Saros
55
Jordan 

In [293]:
goalies_2021 = get_goalie_data(goalie_ids, 20202021, 20202021)

Scott Wedgewood
0
Aaron Dell
1
Mackenzie Blackwood
2
Cory Schneider
3
Semyon Varlamov
4
Ilya Sorokin
5
Keith Kinkaid
6
Igor Shesterkin
7
Alexandar Georgiev
8
Brian Elliott
9
Alex Lyon
10
Carter Hart
11
Emil Larmi
12
Tristan Jarry
13
Casey DeSmith
14
Jaroslav Halak
15
Tuukka Rask
16
Michael Houser
17
Ukko-Pekka Luukkonen
18
Dustin Tokarski
19
Carter Hutton
20
Linus Ullmark
21
Charlie Lindgren
22
Carey Price
23
Jake Allen
24
Cayden Primeau
25
Joey Daccord
26
Anton Forsberg
27
Matt Murray
28
Marcus Hogberg
29
Filip Gustavsson
30
Frederik Andersen
31
Jack Campbell
32
David Rittich
33
James Reimer
34
Petr Mrazek
35
Alex Nedeljkovic
36
Philippe Desrosiers
37
Sam Montembeault
38
Sergei Bobrovsky
39
Chris Driedger
40
Spencer Knight
41
Christopher Gibson
42
Curtis McElhinney
43
Andrei Vasilevskiy
44
Craig Anderson
45
Vitek Vanecek
46
Ilya Samsonov
47
Malcolm Subban
48
Collin Delia
49
Kevin Lankinen
50
Thomas Greiss
51
Jonathan Bernier
52
Kasimir Kaskisuo
53
Pekka Rinne
54
Juuse Saros
55
Jordan 

In [549]:
goalies_all_B = pd.concat([goalies_161718, goalies_161718b, goalies_181920, goalies_2021])

In [550]:
goalie_features_dfB = goalie_features(goalies_all_B)

In [551]:
goalie_features_dfB

Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Rolling_TOI,Rolling_FA,Rolling_SA,Rolling_GA,Rolling_xGA,Rolling_HDCA,Rolling_HDGA,Goalie_FenwickSV%,Goalie_GSAx,Goalie_GSAx/60,Goalie_HDCSV%
0,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98,Scott Wedgewood,8475809,2017-10-30,ARI_2017-10-30,,,,,,,,,,,
1,2017-10-31 ARI at DET,ARI,58.333333,54,63,46.15,41,53,43.62,32,39,45.07,3,4,42.86,1.73,2.50,41,28,32,46.67,1,2,33.33,11,15,42.31,1,1,50,17,17,50,0,1,0.00,24,29,45.28,2,2,50.00,9.38,89.74,0.991,1,4,0,0,100.00,18,22,24,42.86,Scott Wedgewood,8475809,2017-10-31,ARI_2017-10-31,,,,,,,,,,,
2,2017-11-06 ARI at WSH,ARI,63.883333,49,75,39.52,38,60,38.78,26,40,39.39,2,3,40,1.81,2.84,38.95,20,31,39.22,2,1,66.67,8,10,44.44,2,1,66.67,12,21,36.36,0,0,-,26,37,41.27,0,2,0.00,7.69,92.5,1.002,1,4,0,0,100.00,14,18,26,35,Scott Wedgewood,8475809,2017-11-06,ARI_2017-11-06,,,,,,,,,,,
3,2017-11-14 ARI at WPG,ARI,34.866667,32,26,55.17,20,21,48.78,18,17,51.43,1,1,50,1.15,1.41,45,17,15,53.13,0,1,0,5,7,41.67,0,1,0,12,8,60,0,0,-,14,9,60.87,1,0,100.00,5.56,94.12,0.997,0,2,0,0,-,10,14,7,58.82,Scott Wedgewood,8475809,2017-11-14,ARI_2017-11-14,,,,,,,,,,,
4,2017-11-22 S.J at ARI,ARI,44.333333,32,32,50,23,24,48.94,19,16,54.29,1,2,33.33,1.22,1.41,46.35,12,16,42.86,1,2,33.33,8,7,53.33,1,2,33.33,4,9,30.77,0,0,-,15,15,50,0,0,-,5.26,87.5,0.928,0,3,1,0,0.00,9,13,22,29.03,Scott Wedgewood,8475809,2017-11-22,ARI_2017-11-22,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,2021-04-16 NYI at BOS,BOS,60.000000,46,44,51.11,35,35,50,28,25,52.83,3,0,100,2.58,2.17,54.29,22,25,46.81,2,0,100,10,8,55.56,2,0,100.00,12,17,41.38,0,0,-,24,17,58.54,1,0,100.00,10.71,100,1.107,0,3,0,0,-,15,13,27,35.71,Jeremy Swayman,8480280,2021-04-16,BOS_2021-04-16,,,,,,,,,,,
5,2021-04-22 BOS at BUF,BOS,59.983333,55,54,50.46,46,42,52.27,38,30,55.88,5,1,83.33,1.96,2.23,46.77,19,20,48.72,4,1,80,7,11,38.89,1,1,50.00,12,9,57.14,3,0,100.00,34,29,53.97,1,0,100.00,13.16,96.67,1.098,1,3,0,0,100.00,20,18,19,51.28,Jeremy Swayman,8480280,2021-04-22,BOS_2021-04-22,,,,,,,,,,,
6,2021-04-25 BOS at PIT,BOS,58.166667,43,52,45.26,35,38,47.95,28,29,49.12,0,1,0,1.28,2.07,38.25,13,30,30.23,0,1,0,2,11,15.38,0,0,-,11,19,36.67,0,1,0.00,25,22,53.19,0,0,-,0,96.55,0.966,0,3,1,0,0.00,19,11,24,44.19,Jeremy Swayman,8480280,2021-04-25,BOS_2021-04-25,,,,,,,,,,,
0,2021-03-10 VGK at MIN,VGK,8.250000,6,5,54.55,6,3,66.67,6,2,75,1,0,100,0.27,0.15,64.31,5,3,62.5,1,0,100,0,1,0,0,0,-,5,2,71.43,1,0,100,1,2,33.33,0,0,-,16.67,100,1.167,0,1,1,2,0,2,4,1,66.67,Logan Thompson,8480313,2021-03-10,VGK_2021-03-10,,,,,,,,,,,


In [552]:
#save to CSV
goalie_features_dfB.to_csv('goalie_game_logs.csv')

In [13]:
goalie_features_dfB = pd.read_csv('goalie_game_logs.csv', low_memory=False)

### Determine Goalie Stats to Impute for Goalies Having Player Less Than 10 Games

In [15]:
goalie_features_dfB.isna().sum()

Unnamed: 0              0
Game                    0
Team                    0
TOI                     0
CF                      0
                     ... 
Rolling_HDGA         1212
Goalie_FenwickSV%    1212
Goalie_GSAx          1212
Goalie_GSAx/60       1212
Goalie_HDCSV%        1212
Length: 70, dtype: int64

In [16]:
#set df for goalies who hadnt played 10 games at that point
ig_df = goalie_features_dfB[(goalie_features_dfB['Goalie_FenwickSV%'].isna()) & (goalie_features_dfB['Date'] >= '2017-10-04')]

In [17]:
ig_df.head()

Unnamed: 0.1,Unnamed: 0,Game,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,SCGF,SCGA,SCGF%,HDCF,HDCA,HDCF%,HDGF,HDGA,HDGF%,MDCF,MDCA,MDCF%,MDGF,MDGA,MDGF%,LDCF,LDCA,LDCF%,LDGF,LDGA,LDGF%,On-Ice SH%,On-Ice SV%,PDO,Off. Zone Starts,Neu. Zone Starts,Def. Zone Starts,On The Fly Starts,Off. Zone Start %,Off. Zone Faceoffs,Neu. Zone Faceoffs,Def. Zone Faceoffs,Off. Zone Faceoff %,Name,ID,Date,Team_Key,Rolling_TOI,Rolling_FA,Rolling_SA,Rolling_GA,Rolling_xGA,Rolling_HDCA,Rolling_HDGA,Goalie_FenwickSV%,Goalie_GSAx,Goalie_GSAx/60,Goalie_HDCSV%
0,0,2017-10-30 ARI at PHI,ARI,64.666667,58,60,49.15,42,48,46.67,33,31,51.56,4,3,57.14,2.98,2.27,56.78,32,21,60.38,3,3,50.0,13,8,61.9,1,2,33.33,19,13,59.38,2,1,66.67,18,35,33.96,1,0,100.00,12.12,90.32,1.024,1,4,0,0,100.00,24,23,25,48.98,Scott Wedgewood,8475809,2017-10-30,ARI_2017-10-30,,,,,,,,,,,
1,1,2017-10-31 ARI at DET,ARI,58.333333,54,63,46.15,41,53,43.62,32,39,45.07,3,4,42.86,1.73,2.5,41.0,28,32,46.67,1,2,33.33,11,15,42.31,1,1,50.0,17,17,50.0,0,1,0.00,24,29,45.28,2,2,50.00,9.38,89.74,0.991,1,4,0,0,100.00,18,22,24,42.86,Scott Wedgewood,8475809,2017-10-31,ARI_2017-10-31,,,,,,,,,,,
2,2,2017-11-06 ARI at WSH,ARI,63.883333,49,75,39.52,38,60,38.78,26,40,39.39,2,3,40.0,1.81,2.84,38.95,20,31,39.22,2,1,66.67,8,10,44.44,2,1,66.67,12,21,36.36,0,0,-,26,37,41.27,0,2,0.00,7.69,92.5,1.002,1,4,0,0,100.00,14,18,26,35.0,Scott Wedgewood,8475809,2017-11-06,ARI_2017-11-06,,,,,,,,,,,
3,3,2017-11-14 ARI at WPG,ARI,34.866667,32,26,55.17,20,21,48.78,18,17,51.43,1,1,50.0,1.15,1.41,45.0,17,15,53.13,0,1,0.0,5,7,41.67,0,1,0.0,12,8,60.0,0,0,-,14,9,60.87,1,0,100.00,5.56,94.12,0.997,0,2,0,0,-,10,14,7,58.82,Scott Wedgewood,8475809,2017-11-14,ARI_2017-11-14,,,,,,,,,,,
4,4,2017-11-22 S.J at ARI,ARI,44.333333,32,32,50.0,23,24,48.94,19,16,54.29,1,2,33.33,1.22,1.41,46.35,12,16,42.86,1,2,33.33,8,7,53.33,1,2,33.33,4,9,30.77,0,0,-,15,15,50.0,0,0,-,5.26,87.5,0.928,0,3,1,0,0.00,9,13,22,29.03,Scott Wedgewood,8475809,2017-11-22,ARI_2017-11-22,,,,,,,,,,,


In [18]:
ig_TOI = ig_df['TOI'].sum()
ig_FA = ig_df['FA'].sum()
ig_GA = ig_df['GA'].sum()
ig_xGA = ig_df['xGA'].sum()
ig_HDCA = ig_df['HDCA'].sum()
ig_HDGA = ig_df['HDGA'].sum()

In [19]:
ig_FenwickSV = (ig_FA - ig_GA) /ig_FA
ig_GSAx = ig_xGA - ig_GA
ig_GSAx60 = (ig_GSAx*60) / ig_TOI
ig_HDCSV = (ig_HDCA - ig_HDGA )/ ig_HDCA

In [20]:
#experience goalie df for comparison
eg_df = goalie_features_dfB[(~goalie_features_dfB['Goalie_FenwickSV%'].isna()) & (goalie_features_dfB['Date'] >= '2017-10-04')]

In [21]:
eg_TOI = eg_df['TOI'].sum()
eg_FA = eg_df['FA'].sum()
eg_GA = eg_df['GA'].sum()
eg_xGA = eg_df['xGA'].sum()
eg_HDCA = eg_df['HDCA'].sum()
eg_HDGA = eg_df['HDGA'].sum()
eg_FenwickSV = (eg_FA - eg_GA) /eg_FA
eg_GSAx = eg_xGA - eg_GA
eg_GSAx60 = (eg_GSAx*60) / eg_TOI
eg_HDCSV = (eg_HDCA - eg_HDGA )/ eg_HDCA

In [22]:
display(ig_FenwickSV)
display(eg_FenwickSV)
display(goalie_features_dfB['Goalie_FenwickSV%'].quantile(.5))
display(goalie_features_dfB['Goalie_FenwickSV%'].mean())

0.9346311576658926

0.9347083882774769

0.936355290183019

0.9363198791489435

In [23]:
goalie_features_dfB['Goalie_FenwickSV%'].std()

0.00861252815571879

In [24]:
display(ig_GSAx60)
display(eg_GSAx60)
display(goalie_features_dfB['Goalie_GSAx/60'].quantile(.5))
display(goalie_features_dfB['Goalie_GSAx/60'].mean())

-0.2815221605089356

-0.23263350120211818

-0.16603633214085703

-0.17256252989388246

In [25]:
goalie_features_dfB['Goalie_GSAx/60'].std()

0.36999829059900696

In [26]:
display(ig_HDCSV )
display(eg_HDCSV)
display(goalie_features_dfB['Goalie_HDCSV%'].quantile(.5))
display(goalie_features_dfB['Goalie_HDCSV%'].mean())

0.8591820987654321

0.8618742661970328

0.8652772175855543

0.8642324173784127

In [28]:
ig = [ig_FenwickSV,
ig_GSAx ,
ig_GSAx60 ,
ig_HDCSV ]

In [31]:
pickle_out = open("data/inexperienced_goalie_imputes.pickle","wb")
pickle.dump(ig, pickle_out)
pickle_out.close()

### 2017-2018 Season

In [27]:
primary1718 = get_and_format_nst_team_stats('20172018','5v5', 'n')
pp1718 = get_and_format_nst_team_stats('20172018','pp', 'n')
pk1718 = get_and_format_nst_team_stats('20172018','pk', 'n')

In [13]:
features1718 = merge_team_stats(primary1718 ,pp1718 ,pk1718  )

features1718 = calculate_team_features(features1718 )

In [15]:
features1718.tail()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling20_TOI_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_FF%_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
2537,"2018-04-07 - Canucks 2, Oilers 3",VAN,Limited ReportFull Report,51.583333,52,54,49.06,44,40,52.38,32,28,53.33,2,0,100.0,2.73,2.99,47.71,28,32,46.67,15,16,48.39,12,9,57.14,2,0,100.0,16.67,100.0,13,16,44.83,6,12,33.33,0,0,,0.0,100.0,22,16,57.89,12,5,70.59,0,0,,0.0,100.0,6.25,100.0,1.063,18347,2018-04-07,82,VAN_2018-04-07,1.516667,0.31,4.966667,0.44,997.166667,676.0,727.0,32.0,40.0,34.26,39.28,480.0,48.182466,44.444444,46.586891,6.666667,69.0,8.72,3.45,0.126377,105.366667,9.51,5.268333,0.090256,2018-04-05,2 days,0
2538,"2018-04-07 - Stars 4, Kings 2",DAL,Limited ReportFull Report,50.7,28,56,33.33,24,44,35.29,16,35,31.37,4,2,66.67,2.15,1.75,55.18,22,21,51.16,9,5,64.29,9,5,64.29,3,1,75.0,33.33,80.0,13,16,44.83,4,7,36.36,1,0,100.0,25.0,100.0,5,32,13.51,2,21,8.7,0,1,0.0,0.0,95.24,25.0,94.29,1.193,18230,2018-04-07,82,DAL_2018-04-07,6.0,0.14,2.0,0.17,958.366667,632.0,643.0,27.0,33.0,34.14,32.47,445.0,49.568627,45.0,51.253566,6.067416,94.066667,11.12,4.703333,0.118214,108.033333,14.14,5.401667,0.130886,2018-04-06,1 days,1
2539,"2018-04-07 - Stars 4, Kings 2",L.A,Limited ReportFull Report,50.7,56,28,66.67,44,24,64.71,35,16,68.63,2,4,33.33,1.75,2.15,44.82,21,22,48.84,5,9,35.71,5,9,35.71,1,3,25.0,20.0,66.67,16,13,55.17,7,4,63.64,0,1,0.0,0.0,75.0,32,5,86.49,21,2,91.3,1,0,100.0,4.76,100.0,5.71,75.0,0.807,18230,2018-04-07,82,L.A_2018-04-07,2.0,0.17,6.0,0.14,982.116667,679.0,688.0,41.0,35.0,32.65,35.32,481.0,49.670812,53.947368,48.035898,8.523909,92.366667,13.04,4.618333,0.141176,94.516667,9.07,4.725833,0.095962,2018-04-05,2 days,0
2540,"2018-04-07 - Wild 6, Sharks 3",MIN,Limited ReportFull Report,53.466667,52,56,48.15,31,37,45.59,23,22,51.11,5,2,71.43,1.84,1.53,54.62,25,21,54.35,14,10,58.33,12,6,66.67,3,2,60.0,25.0,66.67,11,11,50.0,4,4,50.0,1,0,100.0,25.0,100.0,22,27,44.9,7,12,36.84,1,0,100.0,14.29,100.0,21.74,90.91,1.126,17562,2018-04-07,82,MIN_2018-04-07,3.25,0.28,1.25,0.0,976.766667,677.0,660.0,37.0,34.0,36.27,30.75,492.0,50.635752,52.112676,54.118174,7.520325,90.283333,11.47,4.514167,0.127044,88.0,7.56,4.4,0.085909,2018-04-05,2 days,0
2541,"2018-04-07 - Wild 6, Sharks 3",S.J,Limited ReportFull Report,53.466667,56,52,51.85,37,31,54.41,22,23,48.89,2,5,28.57,1.53,1.84,45.38,21,25,45.65,10,14,41.67,6,12,33.33,2,3,40.0,33.33,75.0,11,11,50.0,4,4,50.0,0,1,0.0,0.0,75.0,27,22,55.1,12,7,63.16,0,1,0.0,0.0,85.71,9.09,78.26,0.874,17562,2018-04-07,82,S.J_2018-04-07,1.25,0.0,3.25,0.28,1017.316667,757.0,711.0,49.0,39.0,42.4,36.63,546.0,51.566757,55.681818,53.650512,8.974359,85.3,12.47,4.265,0.14619,68.7,7.74,3.435,0.112664,2018-04-05,2 days,0


In [18]:
results = get_game_results('2017-10-04', '2018-04-08')

Scraping the schedule between 2017-10-04 and 2018-04-08


In [241]:
results.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04
3,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04
4,2017020005,2017-10-05,TD Garden,BOS,NSH,2017-10-05 23:00:00,4,3,Final,1,BOS_2017-10-05,NSH_2017-10-05


In [301]:
df_20172018 = merge_starters_and_features(results, goalie_features_dfB, features1718 , feature_columns, goalie_feature_columns)

In [265]:
df_20172018.tail(30)

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
1241,2017021243,2018-04-05,Wells Fargo Center,PHI,CAR,2018-04-05 23:00:00,4,3,Final,1,PHI_2018-04-05,CAR_2018-04-05,,,,,,,,,PHI_2018-04-05,50.909091,48.837209,48.087655,8.267717,5.266667,0.119525,3.953333,0.095742,0,CAR_2018-04-05,54.005525,48.421053,52.547086,8.378871,4.151667,0.114532,3.551667,0.104317,0
1242,2017021244,2018-04-05,Capital One Arena,WSH,NSH,2018-04-05 23:00:00,3,4,Final,0,WSH_2018-04-05,NSH_2018-04-05,,,,,,,,,WSH_2018-04-05,49.273256,54.054054,50.321143,8.230453,4.4675,0.120761,5.623333,0.120747,0,NSH_2018-04-05,53.15446,58.227848,52.720371,8.695652,4.895,0.107865,6.375833,0.115436,0
1243,2017021245,2018-04-05,Nationwide Arena,CBJ,PIT,2018-04-05 23:00:00,4,5,Final,0,CBJ_2018-04-05,PIT_2018-04-05,,,,,,,,,CBJ_2018-04-05,52.0,58.823529,51.786834,9.689922,4.414167,0.083821,4.964167,0.116233,0,PIT_2018-04-05,53.641457,53.684211,55.853832,8.994709,4.130833,0.144039,4.47,0.127405,0
1244,2017021246,2018-04-05,Little Caesars Arena,DET,MTL,2018-04-05 23:30:00,3,4,Final,0,DET_2018-04-05,MTL_2018-04-05,,,,,,,,,DET_2018-04-05,49.664929,46.341463,50.476324,8.0,4.736667,0.100176,5.620833,0.120801,0,MTL_2018-04-05,45.790251,44.776119,45.346777,6.741573,4.6775,0.144842,4.851667,0.153968,0
1245,2017021247,2018-04-05,BB&T Center,FLA,BOS,2018-04-05 23:30:00,3,2,Final,1,FLA_2018-04-05,BOS_2018-04-05,,,,,,,,,FLA_2018-04-05,51.016393,57.142857,50.108696,8.421053,5.120833,0.109357,3.725833,0.114739,0,BOS_2018-04-05,54.041916,46.666667,55.8126,6.889764,5.358333,0.144355,5.845,0.096578,0
1246,2017021248,2018-04-05,Bell MTS Place,WPG,CGY,2018-04-06 00:00:00,2,1,Final,1,WPG_2018-04-05,CGY_2018-04-05,,,,,,,,,WPG_2018-04-05,51.310345,55.952381,49.986406,8.867925,4.3875,0.100285,5.3775,0.14821,0,CGY_2018-04-05,58.639456,38.554217,57.543017,5.536332,4.896667,0.156229,5.125,0.106634,0
1247,2017021249,2018-04-05,Rogers Place,EDM,VGK,2018-04-06 01:00:00,4,3,Final,1,EDM_2018-04-05,VGK_2018-04-05,,,,,,,,,EDM_2018-04-05,46.143345,47.252747,44.400141,8.669355,4.353333,0.097167,4.738333,0.097292,0,VGK_2018-04-05,48.794326,49.295775,48.302469,7.12831,4.696667,0.130199,5.084167,0.115555,0
1248,2017021250,2018-04-05,Rogers Arena,VAN,ARI,2018-04-06 02:00:00,4,3,Final,1,VAN_2018-04-05,ARI_2018-04-05,,,,,,,,,VAN_2018-04-05,47.619048,43.055556,45.87291,6.485356,3.468333,0.12297,5.27,0.090797,0,ARI_2018-04-05,49.393291,58.571429,51.904762,8.506224,4.6375,0.111914,4.611667,0.16339,0
1249,2017021251,2018-04-05,STAPLES Center,L.A,MIN,2018-04-06 02:30:00,5,4,Final,1,L.A_2018-04-05,MIN_2018-04-05,,,,,,,,,L.A_2018-04-05,49.419448,52.777778,47.335062,7.933194,4.340833,0.140411,4.715833,0.092242,0,MIN_2018-04-05,50.99926,54.285714,55.064483,7.554672,4.49,0.127394,4.0225,0.085022,1
1250,2017021252,2018-04-05,SAP Center at San Jose,S.J,COL,2018-04-06 02:30:00,4,2,Final,1,S.J_2018-04-05,COL_2018-04-05,,,,,,,,,S.J_2018-04-05,51.191287,52.173913,52.808704,8.921933,4.638333,0.142939,3.570833,0.12252,0,COL_2018-04-05,46.320346,55.714286,47.203924,8.387097,5.81,0.088554,4.540833,0.116168,0


In [302]:

df_20172018.isna().sum()

game_id                          0
date                             0
venue                            0
home_team                        0
away_team                        0
start_time                       0
home_score                       0
away_score                       0
status                           0
Home_Team_Won                    0
Home_Team_Key                    0
Away_Team_Key                    0
home_goalie                      0
home_Last_20_FenwickSV%        126
home_Last_20_GSAx/60           126
home_Last_20_HDCSV%            126
away_goalie                      0
away_Last_20_FenwickSV%        162
away_Last_20_GSAx/60           162
away_Last_20_HDCSV%            162
home_Team_Key                    0
home_last_20_FF%_5v5           329
home_last_20_GF%_5v5           329
home_last_20_xGF%_5v5          329
home_last_20_SH%               329
home_last20_pp_TOI_per_game    329
home_last20_xGF_per_min_pp     329
home_last20_pk_TOI_per_game    329
home_last20_xGA_per_

### 2018-2019 Season

In [251]:
primary1819 = get_and_format_nst_team_stats('20182019','5v5', 'n')
pp1819 = get_and_format_nst_team_stats('20182019','pp', 'n')
pk1819 = get_and_format_nst_team_stats('20182019','pk', 'n')

In [252]:
features1819 = merge_team_stats(primary1819,pp1819,pk1819)

In [253]:
features1819 = calculate_team_features(features1819)

In [254]:
results1819 = get_game_results('2018-10-03', '2019-04-06')

Scraping the schedule between 2018-10-03 and 2019-04-06


In [307]:
df_20182019 = merge_starters_and_features(results1819, goalie_features_dfB , features1819, feature_columns, goalie_feature_columns)

In [308]:
df_20182019.tail()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
1318,2018021267,2019-04-06,American Airlines Center,DAL,MIN,2019-04-07 00:00:00,3,0,Final,1,DAL_2019-04-06,MIN_2019-04-06,Ben Bishop,0.958922,0.689728,0.923077,Alex Stalock,0.927746,-0.614854,0.808219,DAL_2019-04-06,49.125364,54.098361,52.671958,6.470588,4.323333,0.120625,4.574167,0.12188,1.0,MIN_2019-04-06,49.752999,45.16129,52.003938,5.645161,5.189167,0.097318,4.146667,0.082838,0.0
1319,2018021269,2019-04-06,Gila River Arena,ARI,WPG,2019-04-07 02:00:00,2,4,Final,0,ARI_2019-04-06,WPG_2019-04-06,Calvin Pickard,0.908805,-1.222352,0.811429,Connor Hellebuyck,0.937294,-0.151051,0.866071,ARI_2019-04-06,49.018182,56.140351,51.445636,6.543967,5.61,0.112121,4.285,0.102917,0.0,WPG_2019-04-06,46.690647,50.0,43.688785,8.510638,5.008333,0.147854,4.606667,0.113097,0.0
1320,2018021268,2019-04-06,Scotiabank Saddledome,CGY,EDM,2019-04-07 02:00:00,1,3,Final,0,CGY_2019-04-06,EDM_2019-04-06,Mike Smith,0.935829,0.043749,0.890995,Mikko Koskinen,0.929716,-0.480499,0.869565,CGY_2019-04-06,56.752768,61.363636,57.474336,9.782609,5.12,0.09248,4.489167,0.123408,0.0,EDM_2019-04-06,45.687812,44.94382,44.327894,8.350731,3.325833,0.104635,4.376667,0.118812,0.0
1321,2018021270,2019-04-06,STAPLES Center,L.A,VGK,2019-04-07 02:30:00,5,2,Final,1,L.A_2019-04-06,VGK_2019-04-06,Jonathan Quick,0.907407,-1.504394,0.823834,Marc-Andre Fleury,0.936629,0.047983,0.833333,L.A_2019-04-06,46.001367,39.784946,47.427932,8.078603,4.2225,0.12907,4.0325,0.125728,1.0,VGK_2019-04-06,55.547898,58.823529,56.39079,8.605852,3.981667,0.135245,4.005,0.10362,0.0
1322,2018021271,2019-04-06,SAP Center at San Jose,S.J,COL,2019-04-07 02:30:00,5,2,Final,1,S.J_2019-04-06,COL_2019-04-06,Martin Jones,0.921196,-0.641724,0.82266,Semyon Varlamov,0.93253,-0.217921,0.861607,S.J_2019-04-06,53.20959,46.590909,53.004873,8.418891,4.920833,0.124776,4.365,0.118442,0.0,COL_2019-04-06,49.443207,50.0,49.039005,6.150794,5.243333,0.101271,4.666667,0.136821,0.0


In [310]:
df_20182019.isna().sum()

game_id                          0
date                             0
venue                            0
home_team                        0
away_team                        0
start_time                       0
home_score                       0
away_score                       0
status                           0
Home_Team_Won                    0
Home_Team_Key                    0
Away_Team_Key                    0
home_goalie                      1
home_Last_20_FenwickSV%        119
home_Last_20_GSAx/60           119
home_Last_20_HDCSV%            119
away_goalie                      2
away_Last_20_FenwickSV%        137
away_Last_20_GSAx/60           137
away_Last_20_HDCSV%            137
home_Team_Key                    1
home_last_20_FF%_5v5           328
home_last_20_GF%_5v5           328
home_last_20_xGF%_5v5          328
home_last_20_SH%               328
home_last20_pp_TOI_per_game    328
home_last20_xGF_per_min_pp     328
home_last20_pk_TOI_per_game    328
home_last20_xGA_per_

### 2019-2020 Season

In [255]:
primary1920 = get_and_format_nst_team_stats('20192020','5v5', 'n')
pp1920 = get_and_format_nst_team_stats('20192020','pp', 'n')
pk1920 = get_and_format_nst_team_stats('20192020','pk', 'n')

In [256]:
features1920 = merge_team_stats(primary1920,pp1920,pk1920)

In [257]:
features1920 = calculate_team_features(features1920)

In [258]:
results1920 = get_game_results('2019-10-02', '2020-03-12')

Scraping the schedule between 2019-10-02 and 2020-03-12


In [320]:
df_20192020 = merge_starters_and_features(results1920, goalie_features_dfB , features1920, feature_columns, goalie_feature_columns)

In [321]:
df_20192020.isna().sum()

game_id                          0
date                             0
venue                            0
home_team                        0
away_team                        0
start_time                       0
home_score                       0
away_score                       0
status                           0
Home_Team_Won                    0
Home_Team_Key                    0
Away_Team_Key                    0
home_goalie                      1
home_Last_20_FenwickSV%         78
home_Last_20_GSAx/60            78
home_Last_20_HDCSV%             78
away_goalie                      2
away_Last_20_FenwickSV%         82
away_Last_20_GSAx/60            82
away_Last_20_HDCSV%             82
home_Team_Key                    1
home_last_20_FF%_5v5           320
home_last_20_GF%_5v5           320
home_last_20_xGF%_5v5          320
home_last_20_SH%               320
home_last20_pp_TOI_per_game    320
home_last20_xGF_per_min_pp     320
home_last20_pk_TOI_per_game    320
home_last20_xGA_per_

In [344]:
df_20192020[df_20192020['home_Team_Key'].isna()]

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
792,2019120001,2020-01-24,Enterprise Center,AMERICAN ALL-STARS,CANADIAN ALL-STARS,2020-01-25 02:30:00,1,2,Final,0,AMERICAN ALL-STARS_2020-01-24,CANADIAN ALL-STARS_2020-01-24,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### 2020-2021 Season

In [328]:
primary2021 = get_and_format_nst_team_stats('20202021','5v5', 'n')
pp2021 = get_and_format_nst_team_stats('20202021','pp', 'n')
pk2021 = get_and_format_nst_team_stats('20202021','pk', 'n')

In [331]:
pp2021.shape

(1506, 62)

In [332]:
features2021 = merge_team_stats(primary2021,pp2021,pk2021)

In [333]:
features2021 = calculate_team_features(features2021)

In [335]:
features2021.head()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling20_TOI_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_FF%_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
0,"2021-01-13 - Penguins 3, Flyers 6",PHI,Limited ReportFull Report,51.933333,38,48,44.19,30,37,44.78,22,30,42.31,4,2,66.67,1.94,1.58,55.04,18,19,48.65,10,7,58.82,8,6,57.14,4,1,80.0,50.0,83.33,8,12,40.0,2,7,22.22,0,1,0.0,0.0,85.71,16,23,41.03,10,15,40.0,0,0,,0.0,100.0,18.18,93.33,1.115,0,2021-01-13,1,PHI_2021-01-13,5.783333,0.28,2.133333,0.28,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0
1,"2021-01-13 - Penguins 3, Flyers 6",PIT,Limited ReportFull Report,51.933333,48,38,55.81,37,30,55.22,30,22,57.69,2,4,33.33,1.58,1.94,44.96,19,18,51.35,7,10,41.18,6,8,42.86,1,4,20.0,16.67,50.0,12,8,60.0,7,2,77.78,1,0,100.0,14.29,100.0,23,16,58.97,15,10,60.0,0,0,,0.0,100.0,6.67,81.82,0.885,0,2021-01-13,1,PIT_2021-01-13,2.133333,0.28,5.783333,0.28,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0
2,"2021-01-13 - Blackhawks 1, Lightning 5",CHI,Limited ReportFull Report,45.583333,33,34,49.25,23,27,46.0,19,22,46.34,0,3,0.0,1.07,1.79,37.29,19,24,44.19,2,7,22.22,2,7,22.22,0,2,0.0,0.0,71.43,17,17,50.0,12,9,57.14,0,1,0.0,0.0,88.89,8,9,47.06,5,6,45.45,0,0,,0.0,100.0,0.0,86.36,0.864,0,2021-01-13,1,CHI_2021-01-13,7.35,0.93,6.3,0.49,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0
3,"2021-01-13 - Blackhawks 1, Lightning 5",T.B,Limited ReportFull Report,45.583333,34,33,50.75,27,23,54.0,22,19,53.66,3,0,100.0,1.79,1.07,62.71,24,19,55.81,7,2,77.78,7,2,77.78,2,0,100.0,28.57,100.0,17,17,50.0,9,12,42.86,1,0,100.0,11.11,100.0,9,8,52.94,6,5,54.55,0,0,,0.0,100.0,13.64,100.0,1.136,0,2021-01-13,1,T.B_2021-01-13,6.3,0.49,7.35,0.93,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0
4,"2021-01-13 - Canadiens 4, Maple Leafs 5",MTL,Limited ReportFull Report,48.6,47,52,47.47,33,33,50.0,22,20,52.38,2,2,50.0,1.78,1.77,50.06,25,30,45.45,9,6,60.0,5,4,55.56,1,1,50.0,20.0,75.0,16,24,40.0,7,9,43.75,1,0,100.0,14.29,100.0,18,20,47.37,8,7,53.33,0,1,0.0,0.0,85.71,9.09,90.0,0.991,0,2021-01-13,1,MTL_2021-01-13,5.983333,1.64,5.416667,0.88,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0


In [336]:
results2021 = get_game_results('2021-01-13', '2021-04-29')

Scraping the schedule between 2021-01-13 and 2021-04-29


In [337]:
results2021.shape

(767, 12)

In [338]:
df_20202021 = merge_starters_and_features(results2021, goalie_features_dfB, features2021, feature_columns, goalie_feature_columns)

In [339]:
df_20202021.isna().sum()

game_id                          0
date                             0
venue                            0
home_team                        0
away_team                        0
start_time                       0
home_score                       0
away_score                       0
status                           0
Home_Team_Won                    0
Home_Team_Key                    0
Away_Team_Key                    0
home_goalie                     10
home_Last_20_FenwickSV%        110
home_Last_20_GSAx/60           110
home_Last_20_HDCSV%            110
away_goalie                     12
away_Last_20_FenwickSV%        115
away_Last_20_GSAx/60           115
away_Last_20_HDCSV%            115
home_Team_Key                    1
home_last_20_FF%_5v5           324
home_last_20_GF%_5v5           324
home_last_20_xGF%_5v5          324
home_last_20_SH%               324
home_last20_pp_TOI_per_game    324
home_last20_xGF_per_min_pp     324
home_last20_pk_TOI_per_game    324
home_last20_xGA_per_

In [340]:
#folling up with NST about this missing game
df_20202021[df_20202021['home_Team_Key'].isna()]

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
659,2020020651,2021-04-11,Nassau Veterans Memorial Coliseum,NYI,NYR,2021-04-11 23:00:00,3,2,Final,1,NYI_2021-04-11,NYR_2021-04-11,Ilya Sorokin,,,,Igor Shesterkin,0.94335,0.061426,0.896714,,,,,,,,,,,,,,,,,,,,


### Combine and Save

In [None]:
all_games_rolling20_noSVA = pd.concat([df_20172018, df_20182019, df_20192020, df_20202021])

In [345]:
#impute goalie stats where lack of games causing NaN
all_games_rolling20_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_rolling20_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling20_noSVA['away_Last_20_FenwickSV%'])
all_games_rolling20_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_rolling20_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling20_noSVA['away_Last_20_GSAx/60'])
all_games_rolling20_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_rolling20_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling20_noSVA['away_Last_20_HDCSV%'])
all_games_rolling20_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_rolling20_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling20_noSVA['home_Last_20_FenwickSV%'])
all_games_rolling20_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_rolling20_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling20_noSVA['home_Last_20_GSAx/60'])
all_games_rolling20_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_rolling20_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling20_noSVA['home_Last_20_HDCSV%'])

In [380]:
all_games_rolling20_noSVA.isna().sum()

game_id                           0
date                              0
venue                             0
home_team                         0
away_team                         0
start_time                        0
home_score                        0
away_score                        0
status                            0
Home_Team_Won                     0
Home_Team_Key                     0
Away_Team_Key                     0
home_goalie                      12
home_Last_20_FenwickSV%           0
home_Last_20_GSAx/60              0
home_Last_20_HDCSV%               0
away_goalie                      16
away_Last_20_FenwickSV%           0
away_Last_20_GSAx/60              0
away_Last_20_HDCSV%               0
home_Team_Key                     3
home_last_20_FF%_5v5           1301
home_last_20_GF%_5v5           1301
home_last_20_xGF%_5v5          1301
home_last_20_SH%               1301
home_last20_pp_TOI_per_game    1301
home_last20_xGF_per_min_pp     1301
home_last20_pk_TOI_per_game 

In [381]:
all_games_rolling20_noSVA.to_csv('data/all_games_rolling20_noSVA.csv')

### EDA

In [None]:
df_20172018['Home_Team_Won'].value_counts(normalize = True)

In [None]:
df_20172018.columns

In [None]:
df_20172018[(df_20172018['home_B2B'] == 1) & (df_20172018['away_B2B'] == 0)]['Home_Team_Won'].value_counts(normalize = True)

In [None]:
df_20172018[(df_20172018['home_B2B'] == 0) & (df_20172018['away_B2B'] == 1)]['Home_Team_Won'].value_counts(normalize = True)

### Quick Model

Evaluating whether rolling 5, 10, 15, 20, or 25 games provides most accurate results. Will use CV results from simple Logistic Regression to Determine

In [383]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.under_sampling import TomekLinks 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.pipeline import make_pipeline

#### 20 Game Rolling

In [385]:
model_df = all_games_rolling20_noSVA.dropna()

In [386]:
model_df.isna().sum()

game_id                        0
date                           0
venue                          0
home_team                      0
away_team                      0
start_time                     0
home_score                     0
away_score                     0
status                         0
Home_Team_Won                  0
Home_Team_Key                  0
Away_Team_Key                  0
home_goalie                    0
home_Last_20_FenwickSV%        0
home_Last_20_GSAx/60           0
home_Last_20_HDCSV%            0
away_goalie                    0
away_Last_20_FenwickSV%        0
away_Last_20_GSAx/60           0
away_Last_20_HDCSV%            0
home_Team_Key                  0
home_last_20_FF%_5v5           0
home_last_20_GF%_5v5           0
home_last_20_xGF%_5v5          0
home_last_20_SH%               0
home_last20_pp_TOI_per_game    0
home_last20_xGF_per_min_pp     0
home_last20_pk_TOI_per_game    0
home_last20_xGA_per_min_pk     0
home_B2B                       0
away_Team_

In [387]:
model_df.columns

Index(['game_id', 'date', 'venue', 'home_team', 'away_team', 'start_time',
       'home_score', 'away_score', 'status', 'Home_Team_Won', 'Home_Team_Key',
       'Away_Team_Key', 'home_goalie', 'home_Last_20_FenwickSV%',
       'home_Last_20_GSAx/60', 'home_Last_20_HDCSV%', 'away_goalie',
       'away_Last_20_FenwickSV%', 'away_Last_20_GSAx/60',
       'away_Last_20_HDCSV%', 'home_Team_Key', 'home_last_20_FF%_5v5',
       'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5', 'home_last_20_SH%',
       'home_last20_pp_TOI_per_game', 'home_last20_xGF_per_min_pp',
       'home_last20_pk_TOI_per_game', 'home_last20_xGA_per_min_pk', 'home_B2B',
       'away_Team_Key', 'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5',
       'away_last_20_xGF%_5v5', 'away_last_20_SH%',
       'away_last20_pp_TOI_per_game', 'away_last20_xGF_per_min_pp',
       'away_last20_pk_TOI_per_game', 'away_last20_xGA_per_min_pk',
       'away_B2B'],
      dtype='object')

In [388]:
features = ['home_Last_20_FenwickSV%',
       'home_Last_20_GSAx/60', 'home_Last_20_HDCSV%',
       'away_Last_20_FenwickSV%', 'away_Last_20_GSAx/60',
       'away_Last_20_HDCSV%', 'home_last_20_FF%_5v5',
       'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5', 'home_last_20_SH%',
       'home_last20_pp_TOI_per_game', 'home_last20_xGF_per_min_pp',
       'home_last20_pk_TOI_per_game', 'home_last20_xGA_per_min_pk', 'home_B2B', 'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5',
       'away_last_20_xGF%_5v5', 'away_last_20_SH%',
       'away_last20_pp_TOI_per_game', 'away_last20_xGF_per_min_pp',
       'away_last20_pk_TOI_per_game', 'away_last20_xGA_per_min_pk',
       'away_B2B']

In [432]:
X = model_df[features]
y = model_df['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_20 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_20

-0.6698997296268989

In [403]:
model_df.shape

(3241, 40)

### 15 Game Rolling

In [397]:
#17-18
features_b = merge_team_stats(primary,pp,pk)
features_b = calculate_team_features(features_b, 15)
df_20172018_b = merge_starters_and_features(results, goalie_features_dfB, features_b, feature_columns, goalie_feature_columns)

In [398]:
#18-19
features_1819_b = merge_team_stats(primary1819,pp1819,pk1819)
features_1819_b = calculate_team_features(features_1819_b, 15)
df_20182019_b = merge_starters_and_features(results1819, goalie_features_dfB, features_1819_b, feature_columns, goalie_feature_columns)

In [402]:
#19-20
features_1920_b = merge_team_stats(primary1920,pp1920,pk1920)
features_1920_b = calculate_team_features(features_1920_b, 15)
df_20192020_b = merge_starters_and_features(results1920, goalie_features_dfB, features_1920_b, feature_columns, goalie_feature_columns)

In [404]:
#20-21
features_2021_b = merge_team_stats(primary2021,pp2021,pk2021)
features_2021_b = calculate_team_features(features_2021_b, 15)
df_20202021_b = merge_starters_and_features(results2021, goalie_features_dfB, features_2021_b, feature_columns, goalie_feature_columns)

In [405]:
all_games_rolling15_noSVA = pd.concat([df_20172018_b, df_20182019_b, df_20192020_b, df_20202021_b])

#impute goalie stats where lack of games causing NaN
all_games_rolling15_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_rolling15_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling15_noSVA['away_Last_20_FenwickSV%'])
all_games_rolling15_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_rolling15_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling15_noSVA['away_Last_20_GSAx/60'])
all_games_rolling15_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_rolling15_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling15_noSVA['away_Last_20_HDCSV%'])
all_games_rolling15_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_rolling15_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling15_noSVA['home_Last_20_FenwickSV%'])
all_games_rolling15_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_rolling15_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling15_noSVA['home_Last_20_GSAx/60'])
all_games_rolling15_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_rolling15_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling15_noSVA['home_Last_20_HDCSV%'])

In [431]:
model_df_b = all_games_rolling15_noSVA.dropna()

X = model_df_b[features]
y = model_df_b['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_15 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_15

-0.6738516184149099

### 10 Game Rolling

In [412]:
#17-18
features_c = merge_team_stats(primary,pp,pk)
features_c = calculate_team_features(features_c, 10)
df_20172018_c = merge_starters_and_features(results, goalie_features_dfB, features_c, feature_columns, goalie_feature_columns)

#18-19
features_1819_c = merge_team_stats(primary1819,pp1819,pk1819)
features_1819_c = calculate_team_features(features_1819_c, 10)
df_20182019_c = merge_starters_and_features(results1819, goalie_features_dfB, features_1819_c, feature_columns, goalie_feature_columns)

#19-20
features_1920_c = merge_team_stats(primary1920,pp1920,pk1920)
features_1920_c = calculate_team_features(features_1920_c, 10)
df_20192020_c = merge_starters_and_features(results1920, goalie_features_dfB, features_1920_c, feature_columns, goalie_feature_columns)

#20-21
features_2021_c = merge_team_stats(primary2021,pp2021,pk2021)
features_2021_c = calculate_team_features(features_2021_c, 10)
df_20202021_c = merge_starters_and_features(results2021, goalie_features_dfB, features_2021_c, feature_columns, goalie_feature_columns)

In [416]:
all_games_rolling10_noSVA = pd.concat([df_20172018_c, df_20182019_c, df_20192020_c, df_20202021_c])

#impute goalie stats where lack of games causing NaN
all_games_rolling10_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_rolling10_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling10_noSVA['away_Last_20_FenwickSV%'])
all_games_rolling10_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_rolling10_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling10_noSVA['away_Last_20_GSAx/60'])
all_games_rolling10_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_rolling10_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling10_noSVA['away_Last_20_HDCSV%'])
all_games_rolling10_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_rolling10_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling10_noSVA['home_Last_20_FenwickSV%'])
all_games_rolling10_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_rolling10_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling10_noSVA['home_Last_20_GSAx/60'])
all_games_rolling10_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_rolling10_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling10_noSVA['home_Last_20_HDCSV%'])

In [433]:
model_df_c = all_games_rolling10_noSVA.dropna()

X = model_df_c[features]
y = model_df_c['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_10 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_10

-0.6888885394085568

### 5 Game Rolling

In [423]:
#17-18
features_d = merge_team_stats(primary,pp,pk)
features_d = calculate_team_features(features_d, 5)
df_20172018_d = merge_starters_and_features(results, goalie_features_dfB, features_d, feature_columns, goalie_feature_columns)

#18-19
features_1819_d = merge_team_stats(primary1819,pp1819,pk1819)
features_1819_d = calculate_team_features(features_1819_d, 5)
df_20182019_d = merge_starters_and_features(results1819, goalie_features_dfB, features_1819_d, feature_columns, goalie_feature_columns)

#19-20
features_1920_d = merge_team_stats(primary1920,pp1920,pk1920)
features_1920_d = calculate_team_features(features_1920_d, 5)
df_20192020_d = merge_starters_and_features(results1920, goalie_features_dfB, features_1920_d, feature_columns, goalie_feature_columns)

#20-21
features_2021_d = merge_team_stats(primary2021,pp2021,pk2021)
features_2021_d = calculate_team_features(features_2021_d, 5)
df_20202021_d = merge_starters_and_features(results2021, goalie_features_dfB, features_2021_d, feature_columns, goalie_feature_columns)

In [424]:
all_games_rolling5_noSVA = pd.concat([df_20172018_d, df_20182019_d, df_20192020_d, df_20202021_d])

#impute goalie stats where lack of games causing NaN
all_games_rolling5_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_rolling5_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling5_noSVA['away_Last_20_FenwickSV%'])
all_games_rolling5_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_rolling5_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling5_noSVA['away_Last_20_GSAx/60'])
all_games_rolling5_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_rolling5_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling5_noSVA['away_Last_20_HDCSV%'])
all_games_rolling5_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_rolling5_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling5_noSVA['home_Last_20_FenwickSV%'])
all_games_rolling5_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_rolling5_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling5_noSVA['home_Last_20_GSAx/60'])
all_games_rolling5_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_rolling5_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling5_noSVA['home_Last_20_HDCSV%'])

In [439]:
model_df_d = all_games_rolling5_noSVA.dropna()

X = model_df_d[features]
y = model_df_d['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_5 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_5

-0.6820499931059161

### 25 Game Rolling

In [435]:
#17-18
features_e = merge_team_stats(primary,pp,pk)
features_e = calculate_team_features(features_d, 25)
df_20172018_e = merge_starters_and_features(results, goalie_features_dfB, features_e, feature_columns, goalie_feature_columns)

#18-19
features_1819_e = merge_team_stats(primary1819,pp1819,pk1819)
features_1819_e = calculate_team_features(features_1819_d, 25)
df_20182019_e = merge_starters_and_features(results1819, goalie_features_dfB, features_1819_e, feature_columns, goalie_feature_columns)

#19-20
features_1920_e = merge_team_stats(primary1920,pp1920,pk1920)
features_1920_e = calculate_team_features(features_1920_d, 25)
df_20192020_e = merge_starters_and_features(results1920, goalie_features_dfB, features_1920_e, feature_columns, goalie_feature_columns)

#20-21
features_2021_e = merge_team_stats(primary2021,pp2021,pk2021)
features_2021_e = calculate_team_features(features_2021_d, 25)
df_20202021_e = merge_starters_and_features(results2021, goalie_features_dfB, features_2021_e, feature_columns, goalie_feature_columns)

In [436]:
all_games_rolling25_noSVA = pd.concat([df_20172018_e, df_20182019_e, df_20192020_e, df_20202021_e])

#impute goalie stats where lack of games causing NaN
all_games_rolling25_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_rolling25_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling25_noSVA['away_Last_20_FenwickSV%'])
all_games_rolling25_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_rolling25_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling25_noSVA['away_Last_20_GSAx/60'])
all_games_rolling25_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_rolling25_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling25_noSVA['away_Last_20_HDCSV%'])
all_games_rolling25_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_rolling25_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_rolling25_noSVA['home_Last_20_FenwickSV%'])
all_games_rolling25_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_rolling25_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_rolling25_noSVA['home_Last_20_GSAx/60'])
all_games_rolling25_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_rolling25_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_rolling25_noSVA['home_Last_20_HDCSV%'])

In [453]:
model_df_e = all_games_rolling25_noSVA.dropna()

X = model_df_e[features]
y = model_df_e['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_25 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_25

-0.6680030103766225

In [440]:
model_df_e.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Last_20_FenwickSV%,home_Last_20_GSAx/60,home_Last_20_HDCSV%,away_goalie,away_Last_20_FenwickSV%,away_Last_20_GSAx/60,away_Last_20_HDCSV%,home_Team_Key,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,home_last_20_SH%,home_last20_pp_TOI_per_game,home_last20_xGF_per_min_pp,home_last20_pk_TOI_per_game,home_last20_xGA_per_min_pk,home_B2B,away_Team_Key,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_B2B
403,2017020381,2017-11-30,Capital One Arena,WSH,L.A,2017-12-01 00:00:00,2,5,Final,0,WSH_2017-11-30,L.A_2017-11-30,Braden Holtby,0.945755,0.294764,0.866359,Jonathan Quick,0.949599,0.875378,0.89243,WSH_2017-11-30,46.68693,51.086957,46.558973,8.514493,5.373333,0.093945,6.384,0.119361,0.0,L.A_2017-11-30,50.208209,53.012048,46.730975,7.30897,5.546667,0.130673,5.939333,0.126344,0.0
404,2017020382,2017-11-30,Little Caesars Arena,DET,MTL,2017-12-01 00:30:00,3,6,Final,0,DET_2017-11-30,MTL_2017-11-30,Jimmy Howard,0.941103,0.148957,0.887324,Carey Price,0.933076,-0.432705,0.840426,DET_2017-11-30,49.794239,47.252747,48.002201,7.142857,4.915333,0.107744,5.530667,0.135029,0.0,MTL_2017-11-30,52.767962,43.181818,54.134771,5.783866,5.832667,0.135581,5.605333,0.12117,1.0
409,2017020387,2017-11-30,Rogers Place,EDM,TOR,2017-12-01 02:00:00,4,6,Final,0,EDM_2017-11-30,TOR_2017-11-30,Laurent Brossoit,0.934156,-0.280234,0.861425,Frederik Andersen,0.942826,0.105719,0.877934,EDM_2017-11-30,53.841743,47.42268,53.265391,6.824926,4.652667,0.128099,5.521333,0.110553,0.0,TOR_2017-11-30,48.672055,54.205607,50.583617,9.91453,4.768667,0.159206,5.196,0.113549,0.0
410,2017020388,2017-12-01,KeyBank Center,BUF,PIT,2017-12-02 00:00:00,0,4,Final,0,BUF_2017-12-01,PIT_2017-12-01,Robin Lehner,0.932895,-0.306537,0.850962,Tristan Jarry,0.934156,-0.280234,0.861425,BUF_2017-12-01,47.828823,41.111111,45.993469,6.390328,5.686,0.120718,4.930667,0.114224,0.0,PIT_2017-12-01,52.240566,35.555556,52.426362,4.819277,5.996,0.144963,6.118,0.110951,0.0
414,2017020391,2017-12-01,Nationwide Arena,CBJ,ANA,2017-12-02 00:00:00,4,2,Final,1,CBJ_2017-12-01,ANA_2017-12-01,Joonas Korpisalo,0.934156,-0.280234,0.861425,John Gibson,0.942246,0.27614,0.857143,CBJ_2017-12-01,54.117647,57.894737,55.726022,7.544582,4.780667,0.07957,4.068,0.117502,0.0,ANA_2017-12-01,45.782432,50.0,44.695763,8.078995,5.162,0.134134,7.034667,0.129644,0.0


In [441]:
feature_columns

['Team_Key',
 'last_20_FF%_5v5',
 'last_20_GF%_5v5',
 'last_20_xGF%_5v5',
 'last_20_SH%',
 'last20_pp_TOI_per_game',
 'last20_xGF_per_min_pp',
 'last20_pk_TOI_per_game',
 'last20_xGA_per_min_pk',
 'B2B']

In [442]:
model_df.columns

Index(['game_id', 'date', 'venue', 'home_team', 'away_team', 'start_time',
       'home_score', 'away_score', 'status', 'Home_Team_Won', 'Home_Team_Key',
       'Away_Team_Key', 'home_goalie', 'home_Last_20_FenwickSV%',
       'home_Last_20_GSAx/60', 'home_Last_20_HDCSV%', 'away_goalie',
       'away_Last_20_FenwickSV%', 'away_Last_20_GSAx/60',
       'away_Last_20_HDCSV%', 'home_Team_Key', 'home_last_20_FF%_5v5',
       'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5', 'home_last_20_SH%',
       'home_last20_pp_TOI_per_game', 'home_last20_xGF_per_min_pp',
       'home_last20_pk_TOI_per_game', 'home_last20_xGA_per_min_pk', 'home_B2B',
       'away_Team_Key', 'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5',
       'away_last_20_xGF%_5v5', 'away_last_20_SH%',
       'away_last20_pp_TOI_per_game', 'away_last20_xGF_per_min_pp',
       'away_last20_pk_TOI_per_game', 'away_last20_xGA_per_min_pk',
       'away_B2B'],
      dtype='object')

In [444]:
temp_features = ['game_id','home_last_20_FF%_5v5',
       'home_last_20_GF%_5v5', 'home_last_20_xGF%_5v5', 'home_last_20_SH%',
       'home_last20_pp_TOI_per_game', 'home_last20_xGF_per_min_pp',
       'home_last20_pk_TOI_per_game', 'home_last20_xGA_per_min_pk', 'home_B2B',
        'away_last_20_FF%_5v5', 'away_last_20_GF%_5v5',
       'away_last_20_xGF%_5v5', 'away_last_20_SH%',
       'away_last20_pp_TOI_per_game', 'away_last20_xGF_per_min_pp',
       'away_last20_pk_TOI_per_game', 'away_last20_xGA_per_min_pk',
       'away_B2B']

In [447]:
test_20v25 = pd.merge(left = model_df_e, right = model_df[temp_features], how = 'left', on = 'game_id', suffixes = ('_25', '_20'))

In [448]:
test_20v25.columns

Index(['game_id', 'date', 'venue', 'home_team', 'away_team', 'start_time',
       'home_score', 'away_score', 'status', 'Home_Team_Won', 'Home_Team_Key',
       'Away_Team_Key', 'home_goalie', 'home_Last_20_FenwickSV%',
       'home_Last_20_GSAx/60', 'home_Last_20_HDCSV%', 'away_goalie',
       'away_Last_20_FenwickSV%', 'away_Last_20_GSAx/60',
       'away_Last_20_HDCSV%', 'home_Team_Key', 'home_last_20_FF%_5v5_25',
       'home_last_20_GF%_5v5_25', 'home_last_20_xGF%_5v5_25',
       'home_last_20_SH%_25', 'home_last20_pp_TOI_per_game_25',
       'home_last20_xGF_per_min_pp_25', 'home_last20_pk_TOI_per_game_25',
       'home_last20_xGA_per_min_pk_25', 'home_B2B_25', 'away_Team_Key',
       'away_last_20_FF%_5v5_25', 'away_last_20_GF%_5v5_25',
       'away_last_20_xGF%_5v5_25', 'away_last_20_SH%_25',
       'away_last20_pp_TOI_per_game_25', 'away_last20_xGF_per_min_pp_25',
       'away_last20_pk_TOI_per_game_25', 'away_last20_xGA_per_min_pk_25',
       'away_B2B_25', 'home_last_20_FF%_

In [449]:
test_20v25_features = ['home_Last_20_FenwickSV%',
       'home_Last_20_GSAx/60', 'home_Last_20_HDCSV%',
       'away_Last_20_FenwickSV%', 'away_Last_20_GSAx/60',
       'away_Last_20_HDCSV%', 'home_last_20_FF%_5v5_20', 'home_last_20_GF%_5v5_20',
       'home_last_20_xGF%_5v5_20', 'home_last_20_SH%_20',
       'home_last20_pp_TOI_per_game_20', 'home_last20_xGF_per_min_pp_20',
       'home_last20_pk_TOI_per_game_20', 'home_last20_xGA_per_min_pk_20',
       'home_B2B_20', 'away_last_20_FF%_5v5_20', 'away_last_20_GF%_5v5_20',
       'away_last_20_xGF%_5v5_20', 'away_last_20_SH%_20',
       'away_last20_pp_TOI_per_game_20', 'away_last20_xGF_per_min_pp_20',
       'away_last20_pk_TOI_per_game_20', 'away_last20_xGA_per_min_pk_20',
       'away_B2B_20']

In [454]:
model_df_e_20= test_20v25.dropna()

X = model_df_e_20[test_20v25_features]
y = model_df_e_20['Home_Team_Won']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2021,  max_iter=1000))

accuracy_20v25 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring = 'neg_log_loss').mean()
accuracy_20v25

-0.6710427962623445

In [455]:
def calculate_weighted_team_features(df, rolling_games = 20):
    weights = np.arange(1,rolling_games)
    df[f'sum_rolling{rolling_games}_TOI_5v5'] = df.groupby('Team')['TOI'].transform(lambda x: x.rolling(rolling_games, rolling_games).sum().shift())
    df[f'sum_rolling{rolling_games}_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_GF_5v5'] = df.groupby('Team')['GF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_GA_5v5'] = df.groupby('Team')['GA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGF_5v5'] = df.groupby('Team')['xGF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGA_5v5'] = df.groupby('Team')['xGA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_SF_5v5'] = df.groupby('Team')['SF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last_{rolling_games}_FF%_5v5'] = df[f'sum_rolling{rolling_games}_FF_5v5']*100/ (df[f'sum_rolling{rolling_games}_FF_5v5']+df[f'sum_rolling{rolling_games}_FA_5v5'])
    df[f'last_{rolling_games}_GF%_5v5'] = df[f'sum_rolling{rolling_games}_GF_5v5']*100/ (df[f'sum_rolling{rolling_games}_GF_5v5']+df['sum_rolling20_GA_5v5'])
    df[f'last_{rolling_games}_xGF%_5v5'] = df['sum_rolling20_xGF_5v5']*100/ (df['sum_rolling20_xGF_5v5']+df[f'sum_rolling{rolling_games}_GA_5v5'])
    df[f'last_{rolling_games}_SH%'] = df[f'sum_rolling{rolling_games}_GF_5v5']*100 / df[f'sum_rolling{rolling_games}_SF_5v5']
    
    
    #fix NaNs in pp and pk features
    df['TOI_pp'] = np.where(df['TOI_pp'].isna(), 0, df['TOI_pp'])
    df['TOI_pk'] = np.where(df['TOI_pk'].isna(), 0, df['TOI_pk'])
    df['xGF_pp'] = np.where(df['xGF_pp'].isna(), 0, df['xGF_pp'])
    df['xGA_pk'] = np.where(df['xGA_pk'].isna(), 0, df['xGA_pk'])
    
    #pp features
    df[f'sum_rolling{rolling_games}_TOI_pp'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGF_pp'] = df.groupby('Team')['xGF_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last{rolling_games}_pp_TOI_per_game'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df[f'last{rolling_games}_xGF_per_min_pp'] = df[f'sum_rolling{rolling_games}_xGF_pp'] / df[f'sum_rolling{rolling_games}_TOI_pp'] 
    
    #pk features
    df[f'sum_rolling{rolling_games}_TOI_pk'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGA_pk'] = df.groupby('Team')['xGA_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last{rolling_games}_pk_TOI_per_game'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df[f'last{rolling_games}_xGA_per_min_pk'] = df[f'sum_rolling{rolling_games}_xGA_pk'] / df[f'sum_rolling{rolling_games}_TOI_pk'] 
    
    #to get back to back category
    df['Last_Game_Date'] = df.groupby('Team')['Date'].shift()
    df['Days_Since_Last_Game'] = df['Date'] - df['Last_Game_Date']
    df['B2B'] = np.where(df['Days_Since_Last_Game'] == '1 days', 1, 0)
    
    return df

In [471]:
df = primary
w = np.arange(1, 21)
rolling_games = 20
df[f'sum_rolling{rolling_games}_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).apply(lambda x: (x * w).sum()).shift())
df[f'sum_rolling{rolling_games}_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).apply(lambda x: (x * w).sum()).shift())
df[f'last_{rolling_games}_FF%_5v5'] = df[f'sum_rolling{rolling_games}_FF_5v5']*100/ (df[f'sum_rolling{rolling_games}_FF_5v5']+df[f'sum_rolling{rolling_games}_FA_5v5'])


In [472]:
df.head()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,sum_rolling5_FF_5v5,sum_rolling5_FA_5v5,last_5_FF%_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,last_20_FF%_5v5
0,"2017-10-04 - Maple Leafs 7, Jets 2",TOR,Limited ReportFull Report,39.133333,37,38,49.33,24,24,50.0,17,18,48.57,5,2,71.43,1.38,1.56,46.86,20,16,55.56,7,8,46.67,6,8,42.86,2,2,50.0,33.33,75.0,13,8,61.9,6,3,66.67,2,0,100.0,33.33,100.0,13,16,44.83,4,6,40.0,1,0,100.0,25.0,100.0,29.41,88.89,1.183,15321,2017-10-04,1,TOR_2017-10-04,,,,,,
1,"2017-10-04 - Maple Leafs 7, Jets 2",WPG,Limited ReportFull Report,39.133333,38,37,50.67,24,24,50.0,18,17,51.43,2,5,28.57,1.56,1.38,53.14,16,20,44.44,8,7,53.33,8,6,57.14,2,2,50.0,25.0,66.67,8,13,38.1,3,6,33.33,0,2,0.0,0.0,66.67,16,13,55.17,6,4,60.0,0,1,0.0,0.0,75.0,11.11,70.59,0.817,15321,2017-10-04,1,WPG_2017-10-04,,,,,,
2,"2017-10-04 - Blues 5, Penguins 4",PIT,Limited ReportFull Report,46.366667,58,48,54.72,43,38,53.09,28,27,50.91,3,2,60.0,2.63,2.28,53.56,26,25,50.98,10,11,47.62,4,9,30.77,0,1,0.0,0.0,88.89,16,14,53.33,7,8,46.67,0,1,0.0,0.0,87.5,28,21,57.14,14,10,58.33,3,0,100.0,21.43,100.0,10.71,92.59,1.033,18652,2017-10-04,1,PIT_2017-10-04,,,,,,
3,"2017-10-04 - Blues 5, Penguins 4",STL,Limited ReportFull Report,46.366667,48,58,45.28,38,43,46.91,27,28,49.09,2,3,40.0,2.28,2.63,46.44,25,26,49.02,11,10,52.38,9,4,69.23,1,0,100.0,11.11,100.0,14,16,46.67,8,7,53.33,1,0,100.0,12.5,100.0,21,28,42.86,10,14,41.67,0,3,0.0,0.0,78.57,7.41,89.29,0.967,18652,2017-10-04,1,STL_2017-10-04,,,,,,
4,"2017-10-04 - Flames 0, Oilers 3",CGY,Limited ReportFull Report,52.95,53,71,42.74,37,57,39.36,24,41,36.92,0,2,0.0,1.81,3.49,34.12,20,29,40.82,9,18,33.33,7,14,33.33,0,2,0.0,0.0,85.71,11,11,50.0,4,5,44.44,0,0,,0.0,100.0,21,36,36.84,9,22,29.03,0,0,,0.0,100.0,0.0,95.12,0.951,18347,2017-10-04,1,CGY_2017-10-04,,,,,,


In [473]:
df[df['Team'] == 'NYI'][['FF', 'sum_rolling20_FF_5v5', 'last_20_FF%_5v5']].head(22)

Unnamed: 0,FF,sum_rolling20_FF_5v5,last_20_FF%_5v5
25,40,,
39,32,,
64,33,,
97,38,,
150,24,,
157,34,,
190,37,,
228,30,,
251,28,,
291,34,,


In [464]:
40+32+33+38+24

167

In [466]:
(40*1+32*2+33*3+38*4+24*5) / 

95.0

In [478]:
def calculate_team_features(df, rolling_games = 20):

    df[f'sum_rolling{rolling_games}_TOI_5v5'] = df.groupby('Team')['TOI'].transform(lambda x: x.rolling(rolling_games, rolling_games).sum().shift())
    df[f'sum_rolling{rolling_games}_FF_5v5'] = df.groupby('Team')['FF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_FA_5v5'] = df.groupby('Team')['FA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_GF_5v5'] = df.groupby('Team')['GF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_GA_5v5'] = df.groupby('Team')['GA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGF_5v5'] = df.groupby('Team')['xGF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGA_5v5'] = df.groupby('Team')['xGA'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_SF_5v5'] = df.groupby('Team')['SF'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last_{rolling_games}_FF%_5v5'] = df[f'sum_rolling{rolling_games}_FF_5v5']*100/ (df[f'sum_rolling{rolling_games}_FF_5v5']+df[f'sum_rolling{rolling_games}_FA_5v5'])
    df[f'last_{rolling_games}_GF%_5v5'] = df[f'sum_rolling{rolling_games}_GF_5v5']*100/ (df[f'sum_rolling{rolling_games}_GF_5v5']+df[f'sum_rolling{rolling_games}_GA_5v5'])
    df[f'last_{rolling_games}_xGF%_5v5'] = df[f'sum_rolling{rolling_games}_xGF_5v5']*100/ (df[f'sum_rolling{rolling_games}_xGF_5v5']+df[f'sum_rolling{rolling_games}_xGA_5v5'])
    df[f'last_{rolling_games}_SH%'] = df[f'sum_rolling{rolling_games}_GF_5v5']*100 / df[f'sum_rolling{rolling_games}_SF_5v5']
    
    
    #fix NaNs in pp and pk features
    df['TOI_pp'] = np.where(df['TOI_pp'].isna(), 0, df['TOI_pp'])
    df['TOI_pk'] = np.where(df['TOI_pk'].isna(), 0, df['TOI_pk'])
    df['xGF_pp'] = np.where(df['xGF_pp'].isna(), 0, df['xGF_pp'])
    df['xGA_pk'] = np.where(df['xGA_pk'].isna(), 0, df['xGA_pk'])
    
    #pp features
    df[f'sum_rolling{rolling_games}_TOI_pp'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGF_pp'] = df.groupby('Team')['xGF_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last{rolling_games}_pp_TOI_per_game'] = df.groupby('Team')['TOI_pp'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df[f'last{rolling_games}_xGF_per_min_pp'] = df[f'sum_rolling{rolling_games}_xGF_pp'] / df[f'sum_rolling{rolling_games}_TOI_pp'] 
    
    #pk features
    df[f'sum_rolling{rolling_games}_TOI_pk'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'sum_rolling{rolling_games}_xGA_pk'] = df.groupby('Team')['xGA_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).sum().shift())
    df[f'last{rolling_games}_pk_TOI_per_game'] = df.groupby('Team')['TOI_pk'].transform(lambda x: x.rolling(rolling_games, rolling_games ).mean().shift())
    df[f'last{rolling_games}_xGA_per_min_pk'] = df[f'sum_rolling{rolling_games}_xGA_pk'] / df[f'sum_rolling{rolling_games}_TOI_pk'] 
    
    #to get back to back category
    df['Last_Game_Date'] = df.groupby('Team')['Date'].shift()
    df['Days_Since_Last_Game'] = df['Date'] - df['Last_Game_Date']
    df['B2B'] = np.where(df['Days_Since_Last_Game'] == '1 days', 1, 0)
    
    return df

In [480]:
primary

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,sum_rolling5_FF_5v5,sum_rolling5_FA_5v5,last_5_FF%_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,last_20_FF%_5v5,sum_rolling10_TOI_5v5,sum_rolling10_FF_5v5,sum_rolling10_FA_5v5,sum_rolling10_GF_5v5,sum_rolling10_GA_5v5,sum_rolling10_xGF_5v5,sum_rolling10_xGA_5v5,sum_rolling10_SF_5v5,last_10_FF%_5v5,last_10_GF%_5v5,last_10_xGF%_5v5,last_10_SH%
0,"2017-10-04 - Maple Leafs 7, Jets 2",TOR,Limited ReportFull Report,39.133333,37,38,49.33,24,24,50.00,17,18,48.57,5,2,71.43,1.38,1.56,46.86,20,16,55.56,7,8,46.67,6,8,42.86,2,2,50.0,33.33,75.00,13,8,61.90,6,3,66.67,2,0,100.0,33.33,100.00,13,16,44.83,4,6,40.00,1,0,100.0,25.00,100.00,29.41,88.89,1.183,15321,2017-10-04,1,TOR_2017-10-04,,,,,,,,,,,,,,,,,,
1,"2017-10-04 - Maple Leafs 7, Jets 2",WPG,Limited ReportFull Report,39.133333,38,37,50.67,24,24,50.00,18,17,51.43,2,5,28.57,1.56,1.38,53.14,16,20,44.44,8,7,53.33,8,6,57.14,2,2,50.0,25.00,66.67,8,13,38.10,3,6,33.33,0,2,0.0,0.00,66.67,16,13,55.17,6,4,60.00,0,1,0.0,0.00,75.00,11.11,70.59,0.817,15321,2017-10-04,1,WPG_2017-10-04,,,,,,,,,,,,,,,,,,
2,"2017-10-04 - Blues 5, Penguins 4",PIT,Limited ReportFull Report,46.366667,58,48,54.72,43,38,53.09,28,27,50.91,3,2,60.00,2.63,2.28,53.56,26,25,50.98,10,11,47.62,4,9,30.77,0,1,0.0,0.00,88.89,16,14,53.33,7,8,46.67,0,1,0.0,0.00,87.50,28,21,57.14,14,10,58.33,3,0,100.0,21.43,100.00,10.71,92.59,1.033,18652,2017-10-04,1,PIT_2017-10-04,,,,,,,,,,,,,,,,,,
3,"2017-10-04 - Blues 5, Penguins 4",STL,Limited ReportFull Report,46.366667,48,58,45.28,38,43,46.91,27,28,49.09,2,3,40.00,2.28,2.63,46.44,25,26,49.02,11,10,52.38,9,4,69.23,1,0,100.0,11.11,100.00,14,16,46.67,8,7,53.33,1,0,100.0,12.50,100.00,21,28,42.86,10,14,41.67,0,3,0.0,0.00,78.57,7.41,89.29,0.967,18652,2017-10-04,1,STL_2017-10-04,,,,,,,,,,,,,,,,,,
4,"2017-10-04 - Flames 0, Oilers 3",CGY,Limited ReportFull Report,52.950000,53,71,42.74,37,57,39.36,24,41,36.92,0,2,0.00,1.81,3.49,34.12,20,29,40.82,9,18,33.33,7,14,33.33,0,2,0.0,0.00,85.71,11,11,50.00,4,5,44.44,0,0,,0.00,100.00,21,36,36.84,9,22,29.03,0,0,,0.00,100.00,0.00,95.12,0.951,18347,2017-10-04,1,CGY_2017-10-04,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,"2018-04-07 - Canucks 2, Oilers 3",VAN,Limited ReportFull Report,51.583333,52,54,49.06,44,40,52.38,32,28,53.33,2,0,100.00,2.73,2.99,47.71,28,32,46.67,15,16,48.39,12,9,57.14,2,0,100.0,16.67,100.00,13,16,44.83,6,12,33.33,0,0,,0.00,100.00,22,16,57.89,12,5,70.59,0,0,,0.00,100.00,6.25,100.00,1.063,18347,2018-04-07,82,VAN_2018-04-07,511.0,487.0,51.202405,7070.0,7414.0,48.812483,506.100000,333.0,353.0,23.0,20.0,16.43,19.30,237.0,48.542274,53.488372,45.983767,9.704641
2538,"2018-04-07 - Stars 4, Kings 2",DAL,Limited ReportFull Report,50.700000,28,56,33.33,24,44,35.29,16,35,31.37,4,2,66.67,2.15,1.75,55.18,22,21,51.16,9,5,64.29,9,5,64.29,3,1,75.0,33.33,80.00,13,16,44.83,4,7,36.36,1,0,100.0,25.00,100.00,5,32,13.51,2,21,8.70,0,1,0.0,0.00,95.24,25.00,94.29,1.193,18230,2018-04-07,82,DAL_2018-04-07,460.0,528.0,46.558704,6517.0,6874.0,48.667015,482.550000,305.0,325.0,16.0,16.0,16.69,16.23,208.0,48.412698,50.000000,50.698663,7.692308
2539,"2018-04-07 - Stars 4, Kings 2",L.A,Limited ReportFull Report,50.700000,56,28,66.67,44,24,64.71,35,16,68.63,2,4,33.33,1.75,2.15,44.82,21,22,48.84,5,9,35.71,5,9,35.71,1,3,25.0,20.00,66.67,16,13,55.17,7,4,63.64,0,1,0.0,0.00,75.00,32,5,86.49,21,2,91.30,1,0,100.0,4.76,100.00,5.71,75.00,0.807,18230,2018-04-07,82,L.A_2018-04-07,477.0,491.0,49.276860,6958.0,7150.0,49.319535,496.366667,326.0,339.0,18.0,15.0,14.83,17.01,223.0,49.022556,54.545455,46.576633,8.071749
2540,"2018-04-07 - Wild 6, Sharks 3",MIN,Limited ReportFull Report,53.466667,52,56,48.15,31,37,45.59,23,22,51.11,5,2,71.43,1.84,1.53,54.62,25,21,54.35,14,10,58.33,12,6,66.67,3,2,60.0,25.00,66.67,11,11,50.00,4,4,50.00,1,0,100.0,25.00,100.00,22,27,44.90,7,12,36.84,1,0,100.0,14.29,100.00,21.74,90.91,1.126,17562,2018-04-07,82,MIN_2018-04-07,511.0,483.0,51.408451,6886.0,6920.0,49.876865,486.883333,311.0,332.0,14.0,14.0,15.10,15.72,230.0,48.367030,50.000000,48.994160,6.086957


In [484]:
test = merge_team_stats(primary, pp, pk)

In [485]:
test.head()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,sum_rolling5_FF_5v5,sum_rolling5_FA_5v5,last_5_FF%_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,last_20_FF%_5v5,sum_rolling10_TOI_5v5,sum_rolling10_FF_5v5,sum_rolling10_FA_5v5,sum_rolling10_GF_5v5,sum_rolling10_GA_5v5,sum_rolling10_xGF_5v5,sum_rolling10_xGA_5v5,sum_rolling10_SF_5v5,last_10_FF%_5v5,last_10_GF%_5v5,last_10_xGF%_5v5,last_10_SH%,TOI_pk,xGA_pk,TOI_pp,xGF_pp
0,"2017-10-04 - Maple Leafs 7, Jets 2",TOR,Limited ReportFull Report,39.133333,37,38,49.33,24,24,50.0,17,18,48.57,5,2,71.43,1.38,1.56,46.86,20,16,55.56,7,8,46.67,6,8,42.86,2,2,50.0,33.33,75.0,13,8,61.9,6,3,66.67,2,0,100.0,33.33,100.0,13,16,44.83,4,6,40.0,1,0,100.0,25.0,100.0,29.41,88.89,1.183,15321,2017-10-04,1,TOR_2017-10-04,,,,,,,,,,,,,,,,,,,16.0,1.61,4.866667,1.87
1,"2017-10-04 - Maple Leafs 7, Jets 2",WPG,Limited ReportFull Report,39.133333,38,37,50.67,24,24,50.0,18,17,51.43,2,5,28.57,1.56,1.38,53.14,16,20,44.44,8,7,53.33,8,6,57.14,2,2,50.0,25.0,66.67,8,13,38.1,3,6,33.33,0,2,0.0,0.0,66.67,16,13,55.17,6,4,60.0,0,1,0.0,0.0,75.0,11.11,70.59,0.817,15321,2017-10-04,1,WPG_2017-10-04,,,,,,,,,,,,,,,,,,,4.866667,1.87,16.0,1.61
2,"2017-10-04 - Blues 5, Penguins 4",PIT,Limited ReportFull Report,46.366667,58,48,54.72,43,38,53.09,28,27,50.91,3,2,60.0,2.63,2.28,53.56,26,25,50.98,10,11,47.62,4,9,30.77,0,1,0.0,0.0,88.89,16,14,53.33,7,8,46.67,0,1,0.0,0.0,87.5,28,21,57.14,14,10,58.33,3,0,100.0,21.43,100.0,10.71,92.59,1.033,18652,2017-10-04,1,PIT_2017-10-04,,,,,,,,,,,,,,,,,,,5.366667,0.54,6.316667,1.02
3,"2017-10-04 - Blues 5, Penguins 4",STL,Limited ReportFull Report,46.366667,48,58,45.28,38,43,46.91,27,28,49.09,2,3,40.0,2.28,2.63,46.44,25,26,49.02,11,10,52.38,9,4,69.23,1,0,100.0,11.11,100.0,14,16,46.67,8,7,53.33,1,0,100.0,12.5,100.0,21,28,42.86,10,14,41.67,0,3,0.0,0.0,78.57,7.41,89.29,0.967,18652,2017-10-04,1,STL_2017-10-04,,,,,,,,,,,,,,,,,,,6.316667,1.02,5.366667,0.54
4,"2017-10-04 - Flames 0, Oilers 3",CGY,Limited ReportFull Report,52.95,53,71,42.74,37,57,39.36,24,41,36.92,0,2,0.0,1.81,3.49,34.12,20,29,40.82,9,18,33.33,7,14,33.33,0,2,0.0,0.0,85.71,11,11,50.0,4,5,44.44,0,0,,0.0,100.0,21,36,36.84,9,22,29.03,0,0,,0.0,100.0,0.0,95.12,0.951,18347,2017-10-04,1,CGY_2017-10-04,,,,,,,,,,,,,,,,,,,4.0,0.19,2.0,0.29


In [487]:
test = calculate_team_features(test, 10)

In [489]:
test.tail()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,sum_rolling5_FF_5v5,sum_rolling5_FA_5v5,last_5_FF%_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,last_20_FF%_5v5,sum_rolling10_TOI_5v5,sum_rolling10_FF_5v5,sum_rolling10_FA_5v5,sum_rolling10_GF_5v5,sum_rolling10_GA_5v5,sum_rolling10_xGF_5v5,sum_rolling10_xGA_5v5,sum_rolling10_SF_5v5,last_10_FF%_5v5,last_10_GF%_5v5,last_10_xGF%_5v5,last_10_SH%,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling10_TOI_pp,sum_rolling10_xGF_pp,last10_pp_TOI_per_game,last10_xGF_per_min_pp,sum_rolling10_TOI_pk,sum_rolling10_xGA_pk,last10_pk_TOI_per_game,last10_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
2537,"2018-04-07 - Canucks 2, Oilers 3",VAN,Limited ReportFull Report,51.583333,52,54,49.06,44,40,52.38,32,28,53.33,2,0,100.0,2.73,2.99,47.71,28,32,46.67,15,16,48.39,12,9,57.14,2,0,100.0,16.67,100.0,13,16,44.83,6,12,33.33,0,0,,0.0,100.0,22,16,57.89,12,5,70.59,0,0,,0.0,100.0,6.25,100.0,1.063,18347,2018-04-07,82,VAN_2018-04-07,511.0,487.0,51.202405,7070.0,7414.0,48.812483,506.1,333.0,353.0,23.0,20.0,16.43,19.3,237.0,48.542274,53.488372,45.983767,9.704641,1.516667,0.31,4.966667,0.44,36.4,3.95,3.64,0.108516,49.466667,3.25,4.946667,0.065701,2018-04-05,2 days,0
2538,"2018-04-07 - Stars 4, Kings 2",DAL,Limited ReportFull Report,50.7,28,56,33.33,24,44,35.29,16,35,31.37,4,2,66.67,2.15,1.75,55.18,22,21,51.16,9,5,64.29,9,5,64.29,3,1,75.0,33.33,80.0,13,16,44.83,4,7,36.36,1,0,100.0,25.0,100.0,5,32,13.51,2,21,8.7,0,1,0.0,0.0,95.24,25.0,94.29,1.193,18230,2018-04-07,82,DAL_2018-04-07,460.0,528.0,46.558704,6517.0,6874.0,48.667015,482.55,305.0,325.0,16.0,16.0,16.69,16.23,208.0,48.412698,50.0,50.698663,7.692308,6.0,0.14,2.0,0.17,43.433333,5.83,4.343333,0.134229,53.866667,6.04,5.386667,0.112129,2018-04-06,1 days,1
2539,"2018-04-07 - Stars 4, Kings 2",L.A,Limited ReportFull Report,50.7,56,28,66.67,44,24,64.71,35,16,68.63,2,4,33.33,1.75,2.15,44.82,21,22,48.84,5,9,35.71,5,9,35.71,1,3,25.0,20.0,66.67,16,13,55.17,7,4,63.64,0,1,0.0,0.0,75.0,32,5,86.49,21,2,91.3,1,0,100.0,4.76,100.0,5.71,75.0,0.807,18230,2018-04-07,82,L.A_2018-04-07,477.0,491.0,49.27686,6958.0,7150.0,49.319535,496.366667,326.0,339.0,18.0,15.0,14.83,17.01,223.0,49.022556,54.545455,46.576633,8.071749,2.0,0.17,6.0,0.14,42.55,5.5,4.255,0.12926,44.216667,3.97,4.421667,0.089785,2018-04-05,2 days,0
2540,"2018-04-07 - Wild 6, Sharks 3",MIN,Limited ReportFull Report,53.466667,52,56,48.15,31,37,45.59,23,22,51.11,5,2,71.43,1.84,1.53,54.62,25,21,54.35,14,10,58.33,12,6,66.67,3,2,60.0,25.0,66.67,11,11,50.0,4,4,50.0,1,0,100.0,25.0,100.0,22,27,44.9,7,12,36.84,1,0,100.0,14.29,100.0,21.74,90.91,1.126,17562,2018-04-07,82,MIN_2018-04-07,511.0,483.0,51.408451,6886.0,6920.0,49.876865,486.883333,311.0,332.0,14.0,14.0,15.1,15.72,230.0,48.36703,50.0,48.99416,6.086957,3.25,0.28,1.25,0.0,40.95,3.7,4.095,0.090354,51.616667,3.99,5.161667,0.077301,2018-04-05,2 days,0
2541,"2018-04-07 - Wild 6, Sharks 3",S.J,Limited ReportFull Report,53.466667,56,52,51.85,37,31,54.41,22,23,48.89,2,5,28.57,1.53,1.84,45.38,21,25,45.65,10,14,41.67,6,12,33.33,2,3,40.0,33.33,75.0,11,11,50.0,4,4,50.0,0,1,0.0,0.0,75.0,27,22,55.1,12,7,63.16,0,1,0.0,0.0,85.71,9.09,78.26,0.874,17562,2018-04-07,82,S.J_2018-04-07,561.0,518.0,51.992586,7803.0,7415.0,51.274806,497.566667,366.0,354.0,23.0,18.0,19.91,16.93,259.0,50.833333,56.097561,54.044517,8.880309,1.25,0.0,3.25,0.28,46.933333,8.49,4.693333,0.180895,42.35,5.35,4.235,0.126328,2018-04-05,2 days,0


### Get All Team Stats

In [518]:
primary1617 = get_and_format_nst_team_stats('20162017','5v5', 'n')
pp1617 = get_and_format_nst_team_stats('20162017','pp', 'n')
pk1617 = get_and_format_nst_team_stats('20162017','pk', 'n')

In [519]:
primary1718 = get_and_format_nst_team_stats('20172018','5v5', 'n')
pp1718 = get_and_format_nst_team_stats('20172018','pp', 'n')
pk1718 = get_and_format_nst_team_stats('20172018','pk', 'n')

In [520]:
features1617 = merge_team_stats(primary1617, pp1617, pk1617)
features1718 = merge_team_stats(primary1718, pp1718, pk1718)
features1819 = merge_team_stats(primary1819, pp1819, pk1819)
features1920 = merge_team_stats(primary1920, pp1920, pk1920)
features2021 = merge_team_stats(primary2021, pp2021, pk2021)

In [521]:
team_stats_all_seasons = pd.concat([features1617, features1718, features1819, features1920, features2021]).sort_values('Date')

In [498]:
team_stats_all_seasons[(team_stats_all_seasons['last20_pk_TOI_per_game'].isna()) & (team_stats_all_seasons['Date'] > '2016-12-15')]

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,...,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key,TOI_pk,xGA_pk,TOI_pp,xGF_pp,sum_rolling5_FF_5v5,sum_rolling5_FA_5v5,last_5_FF%_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,last_20_FF%_5v5,sum_rolling10_TOI_5v5,sum_rolling10_FF_5v5,sum_rolling10_FA_5v5,sum_rolling10_GF_5v5,sum_rolling10_GA_5v5,sum_rolling10_xGF_5v5,sum_rolling10_xGA_5v5,sum_rolling10_SF_5v5,last_10_FF%_5v5,last_10_GF%_5v5,last_10_xGF%_5v5,last_10_SH%,sum_rolling20_TOI_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,Last_Game_Date,Days_Since_Last_Game,B2B
29,"2017-10-06 - Golden Knights 2, Stars 1",VGK,Limited ReportFull Report,36.616667,31,56,35.63,21,45,31.82,19,35,35.19,1,0,100.0,1.07,2.53,29.82,12,26,31.58,8,13,38.1,6,10,37.5,0,0,,0.0,100.0,4,13,23.53,2,8,20.0,1,0,100.0,50.0,100.0,14,22,38.89,9,13,40.91,...,100.0,5.26,100.0,1.053,18532,2017-10-06,1,VGK_2017-10-06,6.7,0.94,14.0,1.06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,0
51,"2017-10-07 - Golden Knights 2, Coyotes 1",VGK,Limited ReportFull Report,44.166667,56,43,56.57,39,31,55.71,25,23,52.08,0,1,0.0,2.08,2.21,48.49,28,28,50.0,11,8,57.89,5,5,50.0,0,1,0.0,0.0,80.0,17,20,45.95,6,14,30.0,0,0,,0.0,100.0,22,11,66.67,13,3,81.25,...,100.0,0.0,95.65,0.957,17125,2017-10-07,2,VGK_2017-10-07,7.95,0.63,6.783333,1.41,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-06,1 days,1
89,"2017-10-10 - Coyotes 2, Golden Knights 5",VGK,Limited ReportFull Report,49.933333,45,48,48.39,32,36,47.06,25,27,48.08,4,2,66.67,1.15,1.64,41.23,17,19,47.22,4,8,33.33,3,5,37.5,1,1,50.0,33.33,80.0,13,11,54.17,10,6,62.5,2,0,100.0,20.0,100.0,23,24,48.94,11,15,42.31,...,93.33,16.0,92.59,1.086,18191,2017-10-10,3,VGK_2017-10-10,5.05,0.45,5.016667,0.68,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-07,3 days,0
123,"2017-10-13 - Red Wings 6, Golden Knights 3",VGK,Limited ReportFull Report,51.566667,43,43,50.0,34,34,50.0,24,25,48.98,3,6,33.33,1.53,2.16,41.52,21,19,52.5,8,12,40.0,6,10,37.5,2,5,28.57,33.33,50.0,13,7,65.0,8,5,61.54,1,1,50.0,12.5,80.0,20,22,47.62,8,9,47.06,...,100.0,12.5,76.0,0.885,17645,2017-10-13,4,VGK_2017-10-13,0.166667,0.0,4.166667,0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-10,3 days,0
153,"2017-10-15 - Bruins 1, Golden Knights 3",VGK,Limited ReportFull Report,42.833333,42,36,53.85,31,28,52.54,18,17,51.43,2,0,100.0,1.65,0.82,66.8,21,11,65.63,6,0,100.0,4,0,100.0,1,0,100.0,25.0,,15,11,57.69,5,5,50.0,1,0,100.0,20.0,100.0,19,19,50.0,8,11,42.11,...,100.0,11.11,100.0,1.111,17562,2017-10-15,5,VGK_2017-10-15,4.266667,0.14,10.016667,0.32,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-13,2 days,0
179,"2017-10-17 - Sabres 4, Golden Knights 5",VGK,Limited ReportFull Report,40.5,31,45,40.79,23,33,41.07,18,18,50.0,3,0,100.0,1.14,1.82,38.52,18,18,50.0,7,5,58.33,6,5,54.55,2,0,100.0,33.33,100.0,11,13,45.83,7,6,53.85,0,0,,0.0,100.0,7,22,24.14,4,6,40.0,...,100.0,16.67,100.0,1.167,17617,2017-10-17,6,VGK_2017-10-17,8.2,0.86,9.133333,0.87,486.0,491.0,49.744115,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-15,2 days,0
241,"2017-10-21 - Blues 2, Golden Knights 3",VGK,Limited ReportFull Report,47.966667,32,59,35.16,26,44,37.14,14,34,29.17,0,2,0.0,0.98,2.4,28.97,13,28,31.71,1,10,9.09,1,10,9.09,0,1,0.0,0.0,90.0,12,18,40.0,5,8,38.46,0,0,,0.0,100.0,18,26,40.91,8,13,38.1,...,92.31,0.0,94.12,0.941,17883,2017-10-21,7,VGK_2017-10-21,8.0,0.89,4.033333,0.41,444.0,482.0,47.948164,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-17,4 days,0
269,"2017-10-24 - Blackhawks 2, Golden Knights 4",VGK,Limited ReportFull Report,44.45,46,43,51.69,32,29,52.46,23,23,50.0,3,1,75.0,2.04,1.77,53.59,23,25,47.92,10,12,45.45,7,8,46.67,1,1,50.0,14.29,87.5,13,13,50.0,9,8,52.94,2,0,100.0,22.22,100.0,21,16,56.76,6,7,46.15,...,100.0,13.04,95.65,1.087,18108,2017-10-24,8,VGK_2017-10-24,5.6,0.52,8.966667,1.41,415.0,540.0,43.455497,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-21,3 days,0
297,"2017-10-27 - Avalanche 0, Golden Knights 7",VGK,Limited ReportFull Report,45.716667,42,44,48.84,31,32,49.21,17,27,38.64,6,0,100.0,1.74,1.52,53.36,20,15,57.14,8,5,61.54,6,3,66.67,5,0,100.0,83.33,100.0,12,10,54.55,6,10,37.5,0,0,,0.0,100.0,17,20,45.95,4,13,23.53,...,100.0,35.29,100.0,1.353,17702,2017-10-27,9,VGK_2017-10-27,6.016667,0.64,8.216667,0.32,429.0,510.0,45.686901,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-24,3 days,0
339,"2017-10-30 - Golden Knights 3, Islanders 6",VGK,Limited ReportFull Report,43.966667,50,42,54.35,35,30,53.85,25,22,53.19,0,4,0.0,1.92,1.72,52.75,28,19,59.57,10,10,50.0,8,8,50.0,0,2,0.0,0.0,75.0,18,9,66.67,9,5,64.29,0,1,0.0,0.0,80.0,18,18,50.0,6,6,50.0,...,83.33,0.0,81.82,0.818,11113,2017-10-30,10,VGK_2017-10-30,7.366667,1.2,8.666667,0.64,438.0,502.0,46.595745,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-10-27,3 days,0


In [522]:
pd.options.display.max_rows = 100
team_stats_all_seasons.isna().sum()

Game              0
Team              0
Unnamed: 2        0
TOI               0
CF                0
CA                0
CF%               0
FF                0
FA                0
FF%               0
SF                0
SA                0
SF%               0
GF                0
GA                0
GF%             206
xGF               0
xGA               0
xGF%             50
SCF               0
SCA               0
SCF%              0
HDCF              0
HDCA              0
HDCF%            50
HDSF              0
HDSA              0
HDSF%            50
HDGF              0
HDGA              0
HDGF%          1362
HDSH%           101
HDSV%           101
MDCF              0
MDCA              0
MDCF%             0
MDSF              0
MDSA              0
MDSF%             2
MDGF              0
MDGA              0
MDGF%          4044
MDSH%            44
MDSV%            44
LDCF              0
LDCA              0
LDCF%             0
LDSF              0
LDSA              0
LDSF%             0


In [523]:
for games in [3,5,10,20,30]:
    team_stats_all_seasons = calculate_team_features(team_stats_all_seasons, games)

In [524]:
team_stats_all_seasons.tail()

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,...,last_10_xGF%_5v5,last_10_SH%,sum_rolling10_TOI_pp,sum_rolling10_xGF_pp,last10_pp_TOI_per_game,last10_xGF_per_min_pp,sum_rolling10_TOI_pk,sum_rolling10_xGA_pk,last10_pk_TOI_per_game,last10_xGA_per_min_pk,sum_rolling20_TOI_5v5,sum_rolling20_FF_5v5,sum_rolling20_FA_5v5,sum_rolling20_GF_5v5,sum_rolling20_GA_5v5,sum_rolling20_xGF_5v5,sum_rolling20_xGA_5v5,sum_rolling20_SF_5v5,last_20_FF%_5v5,last_20_GF%_5v5,last_20_xGF%_5v5,last_20_SH%,sum_rolling20_TOI_pp,sum_rolling20_xGF_pp,last20_pp_TOI_per_game,last20_xGF_per_min_pp,sum_rolling20_TOI_pk,sum_rolling20_xGA_pk,last20_pk_TOI_per_game,last20_xGA_per_min_pk,sum_rolling30_TOI_5v5,sum_rolling30_FF_5v5,sum_rolling30_FA_5v5,sum_rolling30_GF_5v5,sum_rolling30_GA_5v5,sum_rolling30_xGF_5v5,sum_rolling30_xGA_5v5,sum_rolling30_SF_5v5,last_30_FF%_5v5,last_30_GF%_5v5,last_30_xGF%_5v5,last_30_SH%,sum_rolling30_TOI_pp,sum_rolling30_xGF_pp,last30_pp_TOI_per_game,last30_xGF_per_min_pp,sum_rolling30_TOI_pk,sum_rolling30_xGA_pk,last30_pk_TOI_per_game,last30_xGA_per_min_pk
1511,"2021-04-29 - Flyers 3, Devils 5",PHI,Limited ReportFull Report,50.1,46,31,59.74,33,22,60.0,20,13,60.61,1,2,33.33,1.68,1.19,58.56,25,18,58.14,5,5,50.0,3,3,50.0,1,1,50.0,33.33,66.67,20,13,60.61,7,3,70.0,0,1,0.0,0.0,66.67,20,13,60.61,9,7,56.25,...,49.882145,6.048387,40.15,5.62,4.015,0.139975,51.966667,5.17,5.196667,0.099487,985.616667,701.0,604.0,29.0,43.0,34.97,32.9,521.0,53.716475,40.277778,51.524974,5.566219,89.766667,11.72,4.488333,0.130561,103.766667,9.97,5.188333,0.096081,1460.466667,1005.0,870.0,47.0,82.0,50.03,48.32,746.0,53.6,36.434109,50.869344,6.300268,150.533333,16.17,5.017778,0.107418,142.083333,14.13,4.736111,0.099449
1509,"2021-04-29 - Sabres 2, Bruins 5",BUF,Limited ReportFull Report,50.1,40,48,45.45,32,43,42.67,27,30,47.37,2,3,40.0,1.98,1.57,55.75,22,16,57.89,7,8,46.67,7,6,53.85,2,2,50.0,28.57,66.67,15,8,65.22,9,5,64.29,0,1,0.0,0.0,80.0,16,30,34.78,10,17,37.04,...,46.831337,6.923077,53.616667,4.06,5.361667,0.075723,36.6,5.59,3.66,0.152732,977.35,642.0,804.0,37.0,44.0,35.41,42.34,495.0,44.39834,45.679012,45.543408,7.474747,95.216667,7.09,4.760833,0.074462,91.316667,11.99,4.565833,0.131301,1492.183333,945.0,1171.0,53.0,76.0,52.51,64.06,698.0,44.659735,41.085271,45.045895,7.593123,122.783333,8.39,4.092778,0.068332,136.633333,17.86,4.554444,0.130715
1508,"2021-04-29 - Sabres 2, Bruins 5",BOS,Limited ReportFull Report,50.1,48,40,54.55,43,32,57.33,30,27,52.63,3,2,60.0,1.57,1.98,44.25,16,22,42.11,8,7,53.33,6,7,46.15,2,2,50.0,33.33,71.43,8,15,34.78,5,9,35.71,1,0,100.0,20.0,100.0,30,16,65.22,17,10,62.96,...,59.129173,8.041958,45.983333,3.32,4.598333,0.0722,68.583333,6.43,6.858333,0.093755,945.433333,729.0,600.0,42.0,35.0,35.77,30.12,548.0,54.853273,54.545455,54.287449,7.664234,102.383333,8.54,5.119167,0.083412,117.666667,9.77,5.883333,0.083031,1423.75,1058.0,895.0,55.0,46.0,49.38,45.18,788.0,54.173067,54.455446,52.220812,6.979695,154.966667,12.35,5.165556,0.079695,171.583333,14.9,5.719444,0.086838
1519,"2021-04-29 - Panthers 4, Blackhawks 3",FLA,Limited ReportFull Report,46.95,51,29,63.75,40,23,63.49,27,19,58.7,2,2,50.0,2.2,1.3,62.91,24,11,68.57,7,5,58.33,5,5,50.0,1,2,33.33,20.0,60.0,17,6,73.91,7,5,58.33,1,0,100.0,14.29,100.0,25,16,60.98,14,9,60.87,...,57.146845,7.067138,66.15,6.01,6.615,0.090854,38.2,4.13,3.82,0.108115,950.083333,700.0,592.0,37.0,33.0,39.34,34.09,534.0,54.179567,52.857143,53.574833,6.928839,126.3,12.59,6.315,0.099683,88.483333,10.63,4.424167,0.120136,1406.05,1019.0,907.0,59.0,54.0,58.57,51.79,780.0,52.90758,52.212389,53.071765,7.564103,193.533333,20.46,6.451111,0.105718,144.85,15.11,4.828333,0.104315
1510,"2021-04-29 - Flyers 3, Devils 5",N.J,Limited ReportFull Report,50.1,31,46,40.26,22,33,40.0,13,20,39.39,2,1,66.67,1.19,1.68,41.44,18,25,41.86,5,5,50.0,3,3,50.0,1,1,50.0,33.33,66.67,13,20,39.39,3,7,30.0,1,0,100.0,33.33,100.0,13,20,39.39,7,9,43.75,...,51.397558,7.725322,48.816667,5.36,4.881667,0.109799,31.3,3.5,3.13,0.111821,992.366667,676.0,647.0,40.0,54.0,38.55,34.52,468.0,51.095994,42.553191,52.75763,8.547009,102.033333,11.45,5.101667,0.112218,77.533333,10.62,3.876667,0.136973,1468.583333,1003.0,990.0,60.0,75.0,55.72,53.49,708.0,50.326141,44.444444,51.020969,8.474576,146.966667,14.75,4.898889,0.100363,136.383333,16.68,4.546111,0.122302


In [525]:
team_stats_all_seasons.isna().sum()[team_stats_all_seasons.isna().sum() >100]

GF%                        206
HDGF%                     1362
HDSH%                      101
HDSV%                      101
MDGF%                     4044
LDGF%                     6230
sum_rolling5_TOI_5v5       155
sum_rolling5_FF_5v5        155
sum_rolling5_FA_5v5        155
sum_rolling5_GF_5v5        155
sum_rolling5_GA_5v5        155
sum_rolling5_xGF_5v5       155
sum_rolling5_xGA_5v5       155
sum_rolling5_SF_5v5        155
last_5_FF%_5v5             155
last_5_GF%_5v5             155
last_5_xGF%_5v5            155
last_5_SH%                 155
sum_rolling5_TOI_pp        155
sum_rolling5_xGF_pp        155
last5_pp_TOI_per_game      155
last5_xGF_per_min_pp       155
sum_rolling5_TOI_pk        155
sum_rolling5_xGA_pk        155
last5_pk_TOI_per_game      155
last5_xGA_per_min_pk       155
sum_rolling10_TOI_5v5      310
sum_rolling10_FF_5v5       310
sum_rolling10_FA_5v5       310
sum_rolling10_GF_5v5       310
sum_rolling10_GA_5v5       310
sum_rolling10_xGF_5v5      310
sum_roll

In [527]:
list(team_stats_all_seasons.columns)

['Game',
 'Team',
 'Unnamed: 2',
 'TOI',
 'CF',
 'CA',
 'CF%',
 'FF',
 'FA',
 'FF%',
 'SF',
 'SA',
 'SF%',
 'GF',
 'GA',
 'GF%',
 'xGF',
 'xGA',
 'xGF%',
 'SCF',
 'SCA',
 'SCF%',
 'HDCF',
 'HDCA',
 'HDCF%',
 'HDSF',
 'HDSA',
 'HDSF%',
 'HDGF',
 'HDGA',
 'HDGF%',
 'HDSH%',
 'HDSV%',
 'MDCF',
 'MDCA',
 'MDCF%',
 'MDSF',
 'MDSA',
 'MDSF%',
 'MDGF',
 'MDGA',
 'MDGF%',
 'MDSH%',
 'MDSV%',
 'LDCF',
 'LDCA',
 'LDCF%',
 'LDSF',
 'LDSA',
 'LDSF%',
 'LDGF',
 'LDGA',
 'LDGF%',
 'LDSH%',
 'LDSV%',
 'SH%',
 'SV%',
 'PDO',
 'Attendance',
 'Date',
 'Game_Number',
 'Team_Key',
 'TOI_pk',
 'xGA_pk',
 'TOI_pp',
 'xGF_pp',
 'sum_rolling3_TOI_5v5',
 'sum_rolling3_FF_5v5',
 'sum_rolling3_FA_5v5',
 'sum_rolling3_GF_5v5',
 'sum_rolling3_GA_5v5',
 'sum_rolling3_xGF_5v5',
 'sum_rolling3_xGA_5v5',
 'sum_rolling3_SF_5v5',
 'last_3_FF%_5v5',
 'last_3_GF%_5v5',
 'last_3_xGF%_5v5',
 'last_3_SH%',
 'sum_rolling3_TOI_pp',
 'sum_rolling3_xGF_pp',
 'last3_pp_TOI_per_game',
 'last3_xGF_per_min_pp',
 'sum_rolling3_TOI_pk

In [532]:
feature_columns_all_seasons = ['Team_Key',
 'last_3_FF%_5v5',
 'last_3_GF%_5v5',
 'last_3_xGF%_5v5',
 'last_3_SH%',
 'last3_pp_TOI_per_game',
 'last3_xGF_per_min_pp',
 'last3_pk_TOI_per_game',
 'last3_xGA_per_min_pk',
 'B2B',
 'last_5_FF%_5v5',
 'last_5_GF%_5v5',
 'last_5_xGF%_5v5',
 'last_5_SH%',
 'last5_pp_TOI_per_game',
 'last5_xGF_per_min_pp',
 'last5_pk_TOI_per_game',
 'last5_xGA_per_min_pk',
 'last_10_FF%_5v5',
 'last_10_GF%_5v5',
 'last_10_xGF%_5v5',
 'last_10_SH%',
 'last10_pp_TOI_per_game',
 'last10_xGF_per_min_pp',
 'last10_pk_TOI_per_game',
 'last10_xGA_per_min_pk',
 'last_20_FF%_5v5',
 'last_20_GF%_5v5',
 'last_20_xGF%_5v5',
 'last_20_SH%',
 'last20_pp_TOI_per_game',
 'last20_xGF_per_min_pp',
 'last20_pk_TOI_per_game',
 'last20_xGA_per_min_pk',
 'last_30_FF%_5v5',
 'last_30_GF%_5v5',
 'last_30_xGF%_5v5',
 'last_30_SH%',
 'last30_pp_TOI_per_game',
 'last30_xGF_per_min_pp',
 'last30_pk_TOI_per_game',
 'last30_xGA_per_min_pk']

In [533]:
df_20172018_B = merge_starters_and_features(results, goalie_features_dfB, team_stats_all_seasons, feature_columns_all_seasons, goalie_feature_columns)
df_20182019_B = merge_starters_and_features(results1819, goalie_features_dfB, team_stats_all_seasons, feature_columns_all_seasons, goalie_feature_columns)
df_20192020_B = merge_starters_and_features(results1920, goalie_features_dfB, team_stats_all_seasons, feature_columns_all_seasons, goalie_feature_columns)
df_20202021_B = merge_starters_and_features(results2021, goalie_features_dfB, team_stats_all_seasons, feature_columns_all_seasons, goalie_feature_columns)

In [537]:
all_games_multirolling_noSVA = pd.concat([df_20172018_B, df_20182019_B, df_20192020_B, df_20202021_B])

In [538]:
#impute goalie stats where lack of games causing NaN
all_games_multirolling_noSVA['away_Last_20_FenwickSV%'] = np.where(all_games_multirolling_noSVA['away_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_multirolling_noSVA['away_Last_20_FenwickSV%'])
all_games_multirolling_noSVA['away_Last_20_GSAx/60'] = np.where(all_games_multirolling_noSVA['away_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_multirolling_noSVA['away_Last_20_GSAx/60'])
all_games_multirolling_noSVA['away_Last_20_HDCSV%'] = np.where(all_games_multirolling_noSVA['away_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_multirolling_noSVA['away_Last_20_HDCSV%'])
all_games_multirolling_noSVA['home_Last_20_FenwickSV%'] = np.where(all_games_multirolling_noSVA['home_Last_20_FenwickSV%'].isna(), ig_FenwickSV,all_games_multirolling_noSVA['home_Last_20_FenwickSV%'])
all_games_multirolling_noSVA['home_Last_20_GSAx/60'] = np.where(all_games_multirolling_noSVA['home_Last_20_GSAx/60'].isna(), ig_GSAx60, all_games_multirolling_noSVA['home_Last_20_GSAx/60'])
all_games_multirolling_noSVA['home_Last_20_HDCSV%'] = np.where(all_games_multirolling_noSVA['home_Last_20_HDCSV%'].isna(), ig_HDCSV, all_games_multirolling_noSVA['home_Last_20_HDCSV%'])

In [541]:
all_games_multirolling_noSVA.to_csv('data/all_games_multirolling_noSVA.csv')

In [540]:
pd.options.display.max_rows = 104
all_games_multirolling_noSVA.isna().sum()

game_id                         0
date                            0
venue                           0
home_team                       0
away_team                       0
start_time                      0
home_score                      0
away_score                      0
status                          0
Home_Team_Won                   0
Home_Team_Key                   0
Away_Team_Key                   0
home_goalie                    12
home_Last_20_FenwickSV%         0
home_Last_20_GSAx/60            0
home_Last_20_HDCSV%             0
away_goalie                    16
away_Last_20_FenwickSV%         0
away_Last_20_GSAx/60            0
away_Last_20_HDCSV%             0
home_Team_Key                   3
home_last_3_FF%_5v5             4
home_last_3_GF%_5v5             5
home_last_3_xGF%_5v5            4
home_last_3_SH%                 4
home_last3_pp_TOI_per_game      4
home_last3_xGF_per_min_pp       4
home_last3_pk_TOI_per_game      4
home_last3_xGA_per_min_pk       4
home_B2B      

### Getting Data With Scoring and Venue Adjustments

In [576]:
sequence = [x/10 for x in range(60, 120)]
time.sleep(random.choice(sequence))

primarysva1617 = get_and_format_nst_team_stats('20162017', 'sva', 'n')
primarysva1718 = get_and_format_nst_team_stats('20172018','sva', 'n')
primarysva1819 = get_and_format_nst_team_stats('20182019','sva', 'n')
primarysva1920 = get_and_format_nst_team_stats('20192020','sva', 'n')
primarysva2021 = get_and_format_nst_team_stats('20202021','sva', 'n')

time.sleep(random.choice(sequence))

pp1617 = get_and_format_nst_team_stats('20162017','pp', 'n')
pp1718 = get_and_format_nst_team_stats('20172018','pp', 'n')
pp1819 = get_and_format_nst_team_stats('20182019','pp', 'n')
pp1920 = get_and_format_nst_team_stats('20192020','pp', 'n')
pp2021 = get_and_format_nst_team_stats('20202021','pp', 'n')

time.sleep(random.choice(sequence))

pk1718 = get_and_format_nst_team_stats('20172018','pk', 'n')


In [577]:
featuressva1617 = merge_team_stats(primarysva1617, pp1617, pk1617)
featuressva1718 = merge_team_stats(primarysva1718, pp1718, pk1718)
featuressva1819 = merge_team_stats(primarysva1819, pp1819, pk1819)
featuressva1920 = merge_team_stats(primarysva1920, pp1920, pk1920)
featuressva2021 = merge_team_stats(primarysva2021, pp2021, pk2021)

In [578]:
team_stats_all_seasons_sva = pd.concat([featuressva1617, featuressva1718, featuressva1819, featuressva1920, featuressva2021]).sort_values('Date')

In [579]:
for games in [3,5,10,20,30,40]:
    team_stats_all_seasons_sva = calculate_team_features(team_stats_all_seasons_sva, games)

In [581]:
list(team_stats_all_seasons_sva.columns)

['Game',
 'Team',
 'Unnamed: 2',
 'TOI',
 'CF',
 'CA',
 'CF%',
 'FF',
 'FA',
 'FF%',
 'SF',
 'SA',
 'SF%',
 'GF',
 'GA',
 'GF%',
 'xGF',
 'xGA',
 'xGF%',
 'SCF',
 'SCA',
 'SCF%',
 'HDCF',
 'HDCA',
 'HDCF%',
 'HDSF',
 'HDSA',
 'HDSF%',
 'HDGF',
 'HDGA',
 'HDGF%',
 'HDSH%',
 'HDSV%',
 'MDCF',
 'MDCA',
 'MDCF%',
 'MDSF',
 'MDSA',
 'MDSF%',
 'MDGF',
 'MDGA',
 'MDGF%',
 'MDSH%',
 'MDSV%',
 'LDCF',
 'LDCA',
 'LDCF%',
 'LDSF',
 'LDSA',
 'LDSF%',
 'LDGF',
 'LDGA',
 'LDGF%',
 'LDSH%',
 'LDSV%',
 'SH%',
 'SV%',
 'PDO',
 'Attendance',
 'Date',
 'Game_Number',
 'Team_Key',
 'TOI_pk',
 'xGA_pk',
 'TOI_pp',
 'xGF_pp',
 'sum_rolling3_TOI_5v5',
 'sum_rolling3_FF_5v5',
 'sum_rolling3_FA_5v5',
 'sum_rolling3_GF_5v5',
 'sum_rolling3_GA_5v5',
 'sum_rolling3_xGF_5v5',
 'sum_rolling3_xGA_5v5',
 'sum_rolling3_SF_5v5',
 'last_3_FF%_5v5',
 'last_3_GF%_5v5',
 'last_3_xGF%_5v5',
 'last_3_SH%',
 'sum_rolling3_TOI_pp',
 'sum_rolling3_xGF_pp',
 'last3_pp_TOI_per_game',
 'last3_xGF_per_min_pp',
 'sum_rolling3_TOI_pk

In [586]:
feature_columns_all_seasons = ['Team_Key', 'Game_Number',
 'last_3_FF%_5v5',
 'last_3_GF%_5v5',
 'last_3_xGF%_5v5',
 'last_3_SH%',
 'last3_pp_TOI_per_game',
 'last3_xGF_per_min_pp',
 'last3_pk_TOI_per_game',
 'last3_xGA_per_min_pk',
 'B2B',
 'last_5_FF%_5v5',
 'last_5_GF%_5v5',
 'last_5_xGF%_5v5',
 'last_5_SH%',
 'last5_pp_TOI_per_game',
 'last5_xGF_per_min_pp',
 'last5_pk_TOI_per_game',
 'last5_xGA_per_min_pk',
 'last_10_FF%_5v5',
 'last_10_GF%_5v5',
 'last_10_xGF%_5v5',
 'last_10_SH%',
 'last10_pp_TOI_per_game',
 'last10_xGF_per_min_pp',
 'last10_pk_TOI_per_game',
 'last10_xGA_per_min_pk',
 'last_20_FF%_5v5',
 'last_20_GF%_5v5',
 'last_20_xGF%_5v5',
 'last_20_SH%',
 'last20_pp_TOI_per_game',
 'last20_xGF_per_min_pp',
 'last20_pk_TOI_per_game',
 'last20_xGA_per_min_pk',
 'last_30_FF%_5v5',
 'last_30_GF%_5v5',
 'last_30_xGF%_5v5',
 'last_30_SH%',
 'last30_pp_TOI_per_game',
 'last30_xGF_per_min_pp',
 'last30_pk_TOI_per_game',
 'last30_xGA_per_min_pk',
 'last_40_FF%_5v5',
 'last_40_GF%_5v5',
 'last_40_xGF%_5v5',
 'last_40_SH%',
 'last40_pp_TOI_per_game',
 'last40_xGF_per_min_pp',
 'last40_pk_TOI_per_game',
 'last40_xGA_per_min_pk']

In [587]:
df_20172018_C = merge_starters_and_features(results, goalie_features_dfB, team_stats_all_seasons_sva, feature_columns_all_seasons, goalie_feature_columns)
df_20182019_C = merge_starters_and_features(results1819, goalie_features_dfB, team_stats_all_seasons_sva, feature_columns_all_seasons, goalie_feature_columns)
df_20192020_C = merge_starters_and_features(results1920, goalie_features_dfB, team_stats_all_seasons_sva, feature_columns_all_seasons, goalie_feature_columns)
df_20202021_C = merge_starters_and_features(results2021, goalie_features_dfB, team_stats_all_seasons_sva, feature_columns_all_seasons, goalie_feature_columns)

In [588]:
all_games_multirolling_SVA = pd.concat([df_20172018_C, df_20182019_C, df_20192020_C, df_20202021_C])

In [589]:
all_games_multirolling_SVA.head()

Unnamed: 0,game_id,date,venue,home_team,away_team,start_time,home_score,away_score,status,Home_Team_Won,Home_Team_Key,Away_Team_Key,home_goalie,home_Goalie_FenwickSV%,home_Goalie_GSAx/60,home_Goalie_HDCSV%,away_goalie,away_Goalie_FenwickSV%,away_Goalie_GSAx/60,away_Goalie_HDCSV%,home_Team_Key,home_Game_Number,home_last_3_FF%_5v5,home_last_3_GF%_5v5,home_last_3_xGF%_5v5,home_last_3_SH%,home_last3_pp_TOI_per_game,home_last3_xGF_per_min_pp,home_last3_pk_TOI_per_game,home_last3_xGA_per_min_pk,home_B2B,home_last_5_FF%_5v5,home_last_5_GF%_5v5,home_last_5_xGF%_5v5,home_last_5_SH%,home_last5_pp_TOI_per_game,home_last5_xGF_per_min_pp,home_last5_pk_TOI_per_game,home_last5_xGA_per_min_pk,home_last_10_FF%_5v5,home_last_10_GF%_5v5,home_last_10_xGF%_5v5,home_last_10_SH%,home_last10_pp_TOI_per_game,home_last10_xGF_per_min_pp,home_last10_pk_TOI_per_game,home_last10_xGA_per_min_pk,home_last_20_FF%_5v5,home_last_20_GF%_5v5,home_last_20_xGF%_5v5,...,away_Game_Number,away_last_3_FF%_5v5,away_last_3_GF%_5v5,away_last_3_xGF%_5v5,away_last_3_SH%,away_last3_pp_TOI_per_game,away_last3_xGF_per_min_pp,away_last3_pk_TOI_per_game,away_last3_xGA_per_min_pk,away_B2B,away_last_5_FF%_5v5,away_last_5_GF%_5v5,away_last_5_xGF%_5v5,away_last_5_SH%,away_last5_pp_TOI_per_game,away_last5_xGF_per_min_pp,away_last5_pk_TOI_per_game,away_last5_xGA_per_min_pk,away_last_10_FF%_5v5,away_last_10_GF%_5v5,away_last_10_xGF%_5v5,away_last_10_SH%,away_last10_pp_TOI_per_game,away_last10_xGF_per_min_pp,away_last10_pk_TOI_per_game,away_last10_xGA_per_min_pk,away_last_20_FF%_5v5,away_last_20_GF%_5v5,away_last_20_xGF%_5v5,away_last_20_SH%,away_last20_pp_TOI_per_game,away_last20_xGF_per_min_pp,away_last20_pk_TOI_per_game,away_last20_xGA_per_min_pk,away_last_30_FF%_5v5,away_last_30_GF%_5v5,away_last_30_xGF%_5v5,away_last_30_SH%,away_last30_pp_TOI_per_game,away_last30_xGF_per_min_pp,away_last30_pk_TOI_per_game,away_last30_xGA_per_min_pk,away_last_40_FF%_5v5,away_last_40_GF%_5v5,away_last_40_xGF%_5v5,away_last_40_SH%,away_last40_pp_TOI_per_game,away_last40_xGF_per_min_pp,away_last40_pk_TOI_per_game,away_last40_xGA_per_min_pk
0,2017020001,2017-10-04,Bell MTS Place,WPG,TOR,2017-10-04 23:00:00,2,7,Final,0,WPG_2017-10-04,TOR_2017-10-04,Steve Mason,0.932657,-0.33494,0.866667,Frederik Andersen,0.942629,0.027934,0.872792,WPG_2017-10-04,1.0,52.073795,62.416999,48.839009,10.258649,3.444444,0.063871,4.005556,0.090707,0.0,52.399869,57.080799,51.663405,9.426112,4.19,0.079714,3.693333,0.098556,50.977189,50.738779,51.924105,7.380972,5.128333,0.091453,4.315,0.128158,49.296838,51.281437,51.260619,...,1.0,51.594385,47.355164,48.770492,8.692972,6.45,0.10646,3.394444,0.111948,0.0,52.562502,45.9375,48.770492,6.967375,5.893333,0.06991,3.07,0.074267,50.792085,48.572198,49.886878,7.837427,5.073333,0.08042,4.235,0.114522,49.687136,49.188289,49.131362,7.552033,4.525833,0.118431,4.195,0.123361,50.085902,51.013795,50.0,7.493438,4.538889,0.128519,4.363889,0.133978,49.991679,51.399425,49.339386,8.124451,4.646667,0.1224,4.54,0.133976
1,2017020002,2017-10-04,PPG Paints Arena,PIT,STL,2017-10-05 00:00:00,4,5,Final,0,PIT_2017-10-04,STL_2017-10-04,Matt Murray,0.941176,0.205712,0.869942,Jake Allen,0.945897,-0.138771,0.882353,PIT_2017-10-04,1.0,44.169509,52.837327,49.560117,13.82154,2.761111,0.182294,3.683333,0.171041,0.0,42.564205,59.064609,46.860987,12.093988,3.336667,0.143856,3.546667,0.153383,43.807042,45.254958,43.294064,8.673423,3.633333,0.176422,3.358333,0.113151,48.750126,52.677702,49.122002,...,1.0,45.437316,48.43562,49.468085,10.395145,7.283333,0.113043,5.25,0.067937,0.0,46.882217,49.927641,51.204482,11.358025,6.0,0.096,4.966667,0.109128,51.762995,60.235018,56.574746,9.833864,5.313333,0.101255,4.9,0.088571,50.637795,64.851704,52.974711,8.595064,4.81,0.102079,5.041667,0.07914,50.657828,62.391842,53.831533,8.068254,4.613889,0.096376,4.832222,0.087882,50.633643,58.184556,52.486645,8.420932,4.315417,0.102018,4.92875,0.097844
2,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cam Talbot,0.942539,0.312441,0.89645,Mike Smith,0.940136,0.041876,0.891688,EDM_2017-10-04,1.0,61.871942,67.470882,66.234888,8.840201,6.222222,0.109286,5.166667,0.078065,0.0,60.511924,58.385392,60.180542,8.478124,6.283333,0.113316,4.54,0.131278,54.257609,57.390731,54.170931,10.202647,6.091667,0.11409,4.815,0.132087,52.167745,59.967217,53.263517,...,1.0,44.065149,47.90146,43.499044,9.144748,5.316667,0.136677,5.644444,0.08622,0.0,43.520998,45.427286,40.305523,9.286882,4.816667,0.153218,5.853333,0.112415,46.986842,40.884428,42.363112,7.444549,5.438333,0.111983,5.585,0.10188,48.845951,54.384033,46.495147,8.859738,4.541667,0.118899,4.8525,0.099227,50.449932,54.47604,48.071667,9.030071,4.732778,0.12755,4.905,0.116276,50.595552,50.499508,49.136336,7.879167,4.921667,0.120843,5.185417,0.107127
3,2017020003,2017-10-04,Rogers Place,EDM,CGY,2017-10-05 02:00:00,3,0,Final,1,EDM_2017-10-04,CGY_2017-10-04,Cameron Talbot,0.942539,0.312441,0.89645,Mike Smith,0.940136,0.041876,0.891688,EDM_2017-10-04,1.0,61.871942,67.470882,66.234888,8.840201,6.222222,0.109286,5.166667,0.078065,0.0,60.511924,58.385392,60.180542,8.478124,6.283333,0.113316,4.54,0.131278,54.257609,57.390731,54.170931,10.202647,6.091667,0.11409,4.815,0.132087,52.167745,59.967217,53.263517,...,1.0,44.065149,47.90146,43.499044,9.144748,5.316667,0.136677,5.644444,0.08622,0.0,43.520998,45.427286,40.305523,9.286882,4.816667,0.153218,5.853333,0.112415,46.986842,40.884428,42.363112,7.444549,5.438333,0.111983,5.585,0.10188,48.845951,54.384033,46.495147,8.859738,4.541667,0.118899,4.8525,0.099227,50.449932,54.47604,48.071667,9.030071,4.732778,0.12755,4.905,0.116276,50.595552,50.499508,49.136336,7.879167,4.921667,0.120843,5.185417,0.107127
4,2017020004,2017-10-04,SAP Center at San Jose,S.J,PHI,2017-10-05 02:30:00,3,5,Final,0,S.J_2017-10-04,PHI_2017-10-04,Martin Jones,0.932564,-0.23218,0.852201,Brian Elliott,0.940035,0.009622,0.852632,S.J_2017-10-04,1.0,54.394882,62.234534,49.575372,11.527279,4.45,0.098876,6.3,0.115344,0.0,54.316401,57.771883,52.571429,9.804628,4.62,0.118615,4.763333,0.137299,53.012374,45.443587,52.992908,8.268632,3.926667,0.129117,4.883333,0.13802,52.824405,48.85344,52.718405,...,1.0,55.404341,64.739229,56.31068,7.457229,4.672222,0.11415,5.777778,0.110192,0.0,51.909534,56.272661,49.941995,6.524847,5.173333,0.137242,5.963333,0.086864,52.461604,60.718636,53.426249,8.037406,4.32,0.172222,4.793333,0.117872,51.361625,58.109135,52.868013,7.967239,5.54,0.146841,4.888333,0.096966,51.47572,49.303136,51.280317,6.34076,5.412222,0.141778,4.803333,0.093963,51.197815,45.246898,50.855171,5.932286,5.57125,0.143998,5.305,0.093779


In [590]:
#impute goalie stats where lack of games causing NaN
all_games_multirolling_SVA['away_Goalie_FenwickSV%'] = np.where(all_games_multirolling_SVA['away_Goalie_FenwickSV%'].isna(), ig_FenwickSV,all_games_multirolling_SVA['away_Goalie_FenwickSV%'])
all_games_multirolling_SVA['away_Goalie_GSAx/60'] = np.where(all_games_multirolling_SVA['away_Goalie_GSAx/60'].isna(), ig_GSAx60, all_games_multirolling_SVA['away_Goalie_GSAx/60'])
all_games_multirolling_SVA['away_Goalie_HDCSV%'] = np.where(all_games_multirolling_SVA['away_Goalie_HDCSV%'].isna(), ig_HDCSV, all_games_multirolling_SVA['away_Goalie_HDCSV%'])
all_games_multirolling_SVA['home_Goalie_FenwickSV%'] = np.where(all_games_multirolling_SVA['home_Goalie_FenwickSV%'].isna(), ig_FenwickSV,all_games_multirolling_SVA['home_Goalie_FenwickSV%'])
all_games_multirolling_SVA['home_Goalie_GSAx/60'] = np.where(all_games_multirolling_SVA['home_Goalie_GSAx/60'].isna(), ig_GSAx60, all_games_multirolling_SVA['home_Goalie_GSAx/60'])
all_games_multirolling_SVA['home_Goalie_HDCSV%'] = np.where(all_games_multirolling_SVA['home_Goalie_HDCSV%'].isna(), ig_HDCSV, all_games_multirolling_SVA['home_Goalie_HDCSV%'])

In [None]:
conditions = [((all_games_multirolling_SVA['date'] >= '2017-10-04') & (all_games_multirolling_SVA['date'] <= '2018-04-08')),
              ((all_games_multirolling_SVA['date'] >= '2018-10-03') & (all_games_multirolling_SVA['date'] <= '2019-04-06')),
              ((all_games_multirolling_SVA['date'] >= '2019-10-02') & (all_games_multirolling_SVA['date'] <= '2020-03-12')),
              ((all_games_multirolling_SVA['date'] >= '2021-01-13') & (all_games_multirolling_SVA['date'] <= '2021-04-29'))
             ]
, 
choices = ['2017-2018',
           '2018-2019',
           '2019-2020',
           '2020-2021']
           
    

all_games_multirolling_SVA['Season'] = np.select(conditions, choices)

In [591]:
all_games_multirolling_SVA.to_csv('data/all_games_multirolling_SVA.csv')

In [595]:
pd.options.display.max_rows = 120
all_games_multirolling_SVA.isna().sum()

game_id                         0
date                            0
venue                           0
home_team                       0
away_team                       0
                               ..
away_last_40_SH%               24
away_last40_pp_TOI_per_game    24
away_last40_xGF_per_min_pp     24
away_last40_pk_TOI_per_game    24
away_last40_xGA_per_min_pk     24
Length: 122, dtype: int64