# WPA Model for NHL Games

This Notebook creates a Win-Probability Model for NHL Games using XGBoost and this [Kaggle](<https://www.kaggle.com/datasets/s903124/nhl-playbyplay-data-from-2007?resource=download>) for training and testing data.

## 1. Importing Data and Packages

In [1]:
import pandas as pd
import numpy as np

import xgboost as xg
from xgboost import XGBClassifier, plot_importance, plot_tree

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
from matplotlib.pyplot import style
import seaborn as sns
%matplotlib inline

In [3]:
sns.set_style('darkgrid')

In [5]:
pbp = pd.read_csv('/Users/dB/Documents/repos/github/hacklytics-nhl-dashboard/.data/nhl_pbp20192020.csv')
pbp.head()

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
0,20001,2019-10-02,1,PSTR,Period Start- Local time: 7:13 EDT,0:00,0.0,5x5,,,...,0,0,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,,,MIKE BABCOCK,D.J. SMITH
1,20001,2019-10-02,1,FAC,TOR won Neu. Zone - OTT #36 WHITE vs TOR #91 T...,0:00,0.0,5x5,Neu,,...,0,0,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,0.0,0.0,MIKE BABCOCK,D.J. SMITH
2,20001,2019-10-02,1,GOAL,"OTT #7 TKACHUK(1), Tip-In, Off. Zone, 4 ft.Ass...",0:25,25.0,5x5,Off,TIP-IN,...,0,0,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,85.0,-1.0,MIKE BABCOCK,D.J. SMITH
3,20001,2019-10-02,1,FAC,OTT won Neu. Zone - OTT #71 TIERNEY vs TOR #88...,0:25,25.0,5x5,Neu,,...,1,0,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,0.0,0.0,MIKE BABCOCK,D.J. SMITH
4,20001,2019-10-02,1,MISS,"OTT #22 ZAITSEV, Slap, Wide of Net, Neu. Zone,...",0:38,38.0,5x5,Neu,SLAP SHOT,...,1,0,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,28.0,-37.0,MIKE BABCOCK,D.J. SMITH


In [6]:
print(pbp.columns.tolist())

['Game_Id', 'Date', 'Period', 'Event', 'Description', 'Time_Elapsed', 'Seconds_Elapsed', 'Strength', 'Ev_Zone', 'Type', 'Ev_Team', 'Home_Zone', 'Away_Team', 'Home_Team', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID', 'awayPlayer1', 'awayPlayer1_id', 'awayPlayer2', 'awayPlayer2_id', 'awayPlayer3', 'awayPlayer3_id', 'awayPlayer4', 'awayPlayer4_id', 'awayPlayer5', 'awayPlayer5_id', 'awayPlayer6', 'awayPlayer6_id', 'homePlayer1', 'homePlayer1_id', 'homePlayer2', 'homePlayer2_id', 'homePlayer3', 'homePlayer3_id', 'homePlayer4', 'homePlayer4_id', 'homePlayer5', 'homePlayer5_id', 'homePlayer6', 'homePlayer6_id', 'Away_Players', 'Home_Players', 'Away_Score', 'Home_Score', 'Away_Goalie', 'Away_Goalie_Id', 'Home_Goalie', 'Home_Goalie_Id', 'xC', 'yC', 'Home_Coach', 'Away_Coach']


## 2. Preparing Data for WP Model

In [7]:
# Count Num. of Instances of data per game
# maybe change Ev_Team
pbp.groupby(['Game_Id', 'Ev_Team'])['Game_Id'].count()

Game_Id  Ev_Team
20001    OTT        154
         TOR        156
20002    STL        113
         WSH        128
20003    EDM        125
                   ... 
30321    VGK        144
30322    DAL        137
         VGK        161
30323    DAL        153
         VGK        162
Name: Game_Id, Length: 2402, dtype: int64

In [8]:
# Calculate Max Goals for Home and Away Teams for Binary Classifiers for W/L
pbp['home_max_goal'] = pbp['Home_Score'].groupby(pbp['Game_Id']).transform('max')
pbp['away_max_goal'] = pbp['Away_Score'].groupby(pbp['Game_Id']).transform('max')
pbp.head()

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach,home_max_goal,away_max_goal
0,20001,2019-10-02,1,PSTR,Period Start- Local time: 7:13 EDT,0:00,0.0,5x5,,,...,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,,,MIKE BABCOCK,D.J. SMITH,5,3
1,20001,2019-10-02,1,FAC,TOR won Neu. Zone - OTT #36 WHITE vs TOR #91 T...,0:00,0.0,5x5,Neu,,...,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,0.0,0.0,MIKE BABCOCK,D.J. SMITH,5,3
2,20001,2019-10-02,1,GOAL,"OTT #7 TKACHUK(1), Tip-In, Off. Zone, 4 ft.Ass...",0:25,25.0,5x5,Off,TIP-IN,...,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,85.0,-1.0,MIKE BABCOCK,D.J. SMITH,5,3
3,20001,2019-10-02,1,FAC,OTT won Neu. Zone - OTT #71 TIERNEY vs TOR #88...,0:25,25.0,5x5,Neu,,...,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,0.0,0.0,MIKE BABCOCK,D.J. SMITH,5,3
4,20001,2019-10-02,1,MISS,"OTT #22 ZAITSEV, Slap, Wide of Net, Neu. Zone,...",0:38,38.0,5x5,Neu,SLAP SHOT,...,CRAIG ANDERSON,8467950.0,FREDERIK ANDERSEN,8475883.0,28.0,-37.0,MIKE BABCOCK,D.J. SMITH,5,3


Now we can create the second DataFrame to use in the WP Model

In [15]:
wp_df = pd.DataFrame()

wp_df['game_id'] = pbp['Game_Id']
wp_df['home_team'] = pbp['Home_Team']
wp_df['home_team'] = pbp['Away_Team']
wp_df['player1'] = pbp['p1_name'] # does this need to be numeric?
# Do we need these? The data has it (assuming they are event players)
wp_df['player2'] = pbp['p2_name']
wp_df['player3'] = pbp['p3_name']

wp_df['off_team'] = pbp['Ev_Team']
wp_df['def_team'] = np.where(pbp['Ev_Team'] != pbp['Home_Team'], pbp['Home_Team'], pbp['Away_Team'])

# 3600s - time elapsed in seconds, OT games are negative values
wp_df['time_remaining'] = pbp["Seconds_Elapsed"].apply(lambda x: 3600 - x if x <= 3600 else -(x - 3600))

pbp['num_home_skaters'] = pbp[['homePlayer1','homePlayer2','homePlayer3','homePlayer4','homePlayer5','homePlayer6']].notna().sum(axis=1)
pbp['num_away_skaters'] = pbp[['awayPlayer1','awayPlayer2','awayPlayer3','awayPlayer4','awayPlayer5','awayPlayer6']].notna().sum(axis=1)

wp_df['off_skaters'] = np.where(pbp['Ev_Team'] == pbp['Home_Team'], pbp['num_home_skaters'], pbp['num_away_skaters'])
wp_df['def_skaters'] = np.where(pbp['Ev_Team'] != pbp['Home_Team'], pbp['num_home_skaters'], pbp['num_away_skaters'])

wp_df['skater_diff'] = wp_df['off_skaters'] - wp_df['def_skaters']
wp_df['goalie_pulled'] = np.where(wp_df['off_skaters'] == 6, 1, 0)
wp_df['off_team_score'] = np.where(pbp['Ev_Team'] == pbp['Home_Team'], pbp['Home_Score'], pbp['Away_Score'])
wp_df['def_team_score'] = np.where(pbp['Ev_Team'] != pbp['Home_Team'], pbp['Home_Score'], pbp['Away_Score'])
wp_df['score_diff'] = wp_df['off_team_score'] - wp_df['def_team_score']
wp_df['xT'] = 0 # ???
wp_df['off_team_final_score'] = np.where(pbp['Ev_Team'] == pbp['Home_Team'], pbp['home_max_goal'], pbp['away_max_goal'])
wp_df['def_team_final_score'] = np.where(pbp['Ev_Team'] != pbp['Home_Team'], pbp['home_max_goal'], pbp['away_max_goal'])
wp_df['win'] = np.where(wp_df['off_team_final_score'] > wp_df['def_team_final_score'],1,0)
wp_df.head()

Unnamed: 0,game_id,home_team,player1,player2,player3,off_team,def_team,time_remaining,off_skaters,def_skaters,skater_diff,goalie_pulled,off_team_score,def_team_score,score_diff,xT,off_team_final_score,def_team_final_score,win
0,20001,OTT,,,,,TOR,3600.0,6,6,0,1,0,0,0,0,3,5,0
1,20001,OTT,JOHN TAVARES,COLIN WHITE,,TOR,OTT,3600.0,6,6,0,1,0,0,0,0,5,3,1
2,20001,OTT,BRADY TKACHUK,CONNOR BROWN,COLIN WHITE,OTT,TOR,3575.0,6,6,0,1,0,0,0,0,3,5,0
3,20001,OTT,CHRIS TIERNEY,WILLIAM NYLANDER,,OTT,TOR,3575.0,6,6,0,1,1,0,1,0,3,5,0
4,20001,OTT,NIKITA ZAITSEV,,,OTT,TOR,3562.0,6,6,0,1,1,0,1,0,3,5,0


In [17]:
# Not sure why the source makes this df but we'll do it anyways...

# df for splitting
spl_df = pd.DataFrame()

spl_df['game_id'] = wp_df['game_id']
spl_df['player1'] = wp_df['player1']
spl_df['player2'] = wp_df['player2']
spl_df['player3'] = wp_df['player3']
spl_df['team'] = wp_df['off_team']
spl_df['time_remaining(s)'] = wp_df['time_remaining']
spl_df['skater_diff'] = wp_df['skater_diff']
spl_df['score_diff'] = wp_df['skater_diff']
spl_df['goalie_pulled'] = wp_df['goalie_pulled']
spl_df['xT'] = wp_df['xT']
spl_df['win'] = wp_df['win']

In [None]:
# Create the Test-Train Split
# Not sure what to put here so I just added all of it

X = spl_df.drop(['win','player1','team','game_id'],axis=1)
Y = pd.DataFrame(spl_df['win'], columns=['win'])

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.3, random_state=2142,stratify=Y)

# This does not perform the split by game_id

In [None]:
# Convert the splits from np.ndarrays to pd.DataFrames

col_names = train_df.columns.to_list