In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

%matplotlib inline



In [4]:
pd.set_option('display.max_columns', 500)

### Importing Data

In [5]:
team_stats = pd.read_csv('./Data/clean NHL data.csv', index_col=0)
player_stats = pd.read_csv('./Data/full player stats.csv', index_col=0)
advanced_stats = pd.read_csv('./Data/full advanced stats.csv', index_col=0)

cup_champs = pd.read_csv('./Data/cup champs.csv', index_col=0)
team_ranks = pd.read_csv('./Data/NHL Rankings 2008-2018 vertical.csv', index_col=0)

full_df = pd.read_csv('./Data/model data.csv', index_col=0)

#### Few more items to clean up

- Renaming the 'team_name' column in the team stats dataframe to just 'team' to match the player and advanced stats dfs.
- There is a discrepancy in the dataframes between the St Louis Blues name. The 'advanced_stat' and 'player_stat' dataframes have a period after St(.). I'm removing that so the team names match
- resetting index and cleaning the team ranks dataframe

In [6]:
full_df['ind'] = full_df['team_name'] + '_' + full_df['year'].astype(str)

full_df.set_index('ind', inplace=True)

full_df.head()

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,year,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,fenwick_pct
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Nashville Predators_2018,82.0,267.0,145.0,193.0,58.0,18.0,211.0,54.0,101.6,81.94,117.0,0.713,21.17,0.923,9.9,0.03,0.71,Nashville Predators,53.0,2018,0.0,5,49.66,25.481481,9.185185,1.692593,1.7,49.836
Winnipeg Jets_2018,82.0,277.0,159.0,200.0,64.0,20.0,218.0,50.0,101.0,81.75,114.0,0.695,23.36,0.917,10.3,0.02,0.74,Winnipeg Jets,52.0,2018,0.0,3,51.061538,26.285714,7.25,1.792857,1.567857,51.357692
Tampa Bay Lightning_2018,82.0,296.0,172.0,216.0,66.0,23.0,236.0,64.0,102.0,76.03,113.0,0.689,23.91,0.912,10.7,-0.07,0.66,Tampa Bay Lightning,54.0,2018,0.0,4,51.536,28.962963,9.333333,2.103704,1.407407,51.576
Boston Bruins_2018,82.0,270.0,161.0,197.0,61.0,20.0,214.0,40.0,100.2,83.67,112.0,0.683,23.64,0.912,9.9,-0.07,0.62,Boston Bruins,50.0,2018,0.0,7,53.162069,23.16129,5.419355,1.554839,1.509677,53.32069
Vegas Golden Knights_2018,82.0,272.0,182.0,218.0,53.0,24.0,228.0,44.0,100.5,81.43,109.0,0.665,21.37,0.911,10.1,-0.01,0.52,Vegas Golden Knights,51.0,2018,0.0,2,50.779167,26.0,5.037037,1.788889,1.518519,50.770833


In [7]:
team_stats.groupby('year').count()

Unnamed: 0_level_0,average_age,chances_pp,games,goals,goals_against_ev,goals_ev,goals_pp,goals_sh,losses,losses_ot,losses_shootout,opp_chances_pp,opp_goals,opp_goals_pp,opp_goals_sh,pdo,pen_kill_pct,pen_min_per_game,pen_min_per_game_opp,points,points_pct,power_play_pct,save_pct,shot_pct,shots,shots_against,sos,srs,team_name,total_goals_per_game,wins,wins_shootout,cup_champs,rank
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
2008,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2009,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2010,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2011,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2012,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2013,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2014,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2015,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2016,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2017,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30


### Setting up feature and target variables

In [8]:
X = full_df.drop(columns=['rank', 'cup_champs', 'team_name'])
y = full_df[['rank', 'year']]

#### Train, Test Split

- Train, test, split is a little tricky due to the dataset. The purpose of the model is to predict playoff performance based on stats from the regular season. So, I cannot use an automated train, test, split here, as I need training data that contains all of the observations from a given year. Instead I have decided to manually select 8 whole years of data to use as my training data, while holding out 2 whole years to use as my testing data.

In [9]:
X_train = X[(X['year'] != 2016) & (X['year'] != 2009)].drop(columns='year')

X_test = X[X['year'].isin([2016, 2009])].drop(columns='year')

y_test = y[y['year'].isin([2016, 2009])].drop(columns='year')

y_train = y[(y['year'] != 2016) & (y['year'] != 2009)].drop(columns='year')

#### Multiclass Logistic Regression

- Initially I had planned to predict only the Stanley Cup winner. This presented a big problem as there is only 1 cup winner in a given year out of 30 or 31 teams depending on the season. This is a huge class imbalance, coupled with the small number of observations (30/31) in a given year, creating a workable model from that data would be extremely difficult.
- Instead I have chosen to assign a rank to each team in every season for which I have data. The teams are ranked based on where they finished. The Stanley Cup winner is ranked at 1, the runner up at 2, followed by conference final runners-up and so on down to 31. For teams that exited the playoffs in the same round, the teams with the higher point totals in the regular season were ranked higher. This rank value is dropped from me feature set and is the main target variable.
- The Multiclass Logistic Regression will allow me to predict every teams final season ranking based on regular season statistics. In addition to receiving the numerical ranking for each team, I will be able to see the probabilities assigned to those predictions. While predicting how far each team will get in the playoffs is very difficult and very high accuracy is unlikely, assigning probabilities to those predictions is necessary for interpreting results.  

In [10]:
# ss = StandardScaler()
# X_scaled = ss.fit_transform(X)
# pd.DataFrame(X_scaled, columns=features).head()

In [11]:
logreg = LogisticRegression(random_state=28, multi_class='multinomial', solver='lbfgs')
model = logreg.fit(X_train, y_train);

  y = column_or_1d(y, warn=True)


In [12]:
model.predict(X_train)
model.score(X_train, y_train)

0.36900369003690037

In [13]:
model.predict_proba(X_train)

array([[1.01205182e-02, 5.35164378e-03, 1.69701487e-01, ...,
        5.71134146e-24, 1.11065312e-29, 3.55276048e-20],
       [8.56058937e-03, 6.96796052e-02, 1.57015106e-01, ...,
        1.13309152e-22, 5.71758749e-28, 9.57594881e-18],
       [7.51800589e-04, 1.03696392e-02, 1.70095649e-02, ...,
        5.76705337e-21, 8.35775048e-27, 8.18602891e-17],
       ...,
       [8.02838229e-22, 1.72900757e-18, 5.48345658e-20, ...,
        2.18097697e-01, 3.03092699e-01, 3.31574072e-08],
       [1.18342178e-19, 1.16312188e-14, 3.87650591e-18, ...,
        2.38393448e-01, 4.85344383e-01, 1.10734938e-06],
       [5.60295777e-21, 4.29093303e-16, 5.53420282e-20, ...,
        1.51282677e-01, 1.11606994e-01, 1.77788243e-04]])

In [14]:
model.predict(X_test)
model.score(X_test, y_test)

0.15

In [15]:
model.coef_.round(decimals=3)

array([[ 0.077,  0.156, -0.204,  0.059,  0.043, -0.035, -0.182, -0.082,
        -0.108, -0.011,  0.193,  0.001, -0.063, -0.001, -0.042,  0.001,
         0.007,  0.082,  0.129,  0.079,  0.044,  0.001,  0.014,  0.123],
       [ 0.014,  0.208, -0.072,  0.04 ,  0.14 ,  0.03 , -0.228, -0.131,
        -0.028,  0.083,  0.056,  0.   , -0.017, -0.   ,  0.   , -0.004,
         0.002,  0.072, -0.041, -0.038,  0.027, -0.   , -0.   , -0.05 ],
       [-0.004,  0.171, -0.168,  0.026,  0.055, -0.129, -0.151, -0.098,
        -0.008,  0.015,  0.231,  0.002,  0.04 , -0.   ,  0.029,  0.003,
         0.009,  0.106, -0.065,  0.063,  0.041,  0.006,  0.005, -0.066],
       [-0.094,  0.07 , -0.134,  0.025,  0.009, -0.159, -0.008, -0.031,
         0.012,  0.009,  0.175,  0.002,  0.033,  0.   ,  0.015, -0.004,
        -0.002,  0.11 , -0.041, -0.015,  0.023,  0.002, -0.005, -0.026],
       [ 0.061,  0.144, -0.115,  0.013,  0.04 , -0.138, -0.236,  0.018,
        -0.002,  0.028,  0.301,  0.001,  0.011,  0.   , -0.0

In [16]:
predictions = model.predict(X_test)

probs = model.predict_proba(X_test)

In [17]:
predictions

array([ 5,  7,  6,  2,  6,  9,  6, 10,  8,  8, 11, 11, 14, 15, 16, 22, 13,
       22, 19, 25, 20, 23, 20, 24, 24, 23, 23, 27, 28, 28,  6,  5,  7,  9,
        9,  7,  9, 12, 12, 16,  4, 17, 17, 18, 18, 17, 15, 14,  2, 21, 24,
       24, 23, 26, 29, 26, 29, 29, 30, 30])

In [18]:
ind = pd.Series(y_test.index)
y_t = pd.Series(y_test['rank'])
preds = pd.Series(predictions)

In [19]:
probs_df = pd.DataFrame(probs).round(decimals=3)

In [20]:
probs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,0.017,0.011,0.159,0.001,0.326,0.211,0.011,0.0,0.167,0.087,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.031,0.061,0.235,0.107,0.011,0.019,0.288,0.001,0.021,0.018,0.088,0.034,0.039,0.043,0.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013,0.021,0.042,0.039,0.061,0.369,0.041,0.02,0.091,0.209,0.072,0.004,0.0,0.005,0.011,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.049,0.203,0.064,0.014,0.032,0.078,0.122,0.016,0.025,0.125,0.178,0.067,0.001,0.005,0.02,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.012,0.075,0.025,0.009,0.034,0.662,0.054,0.008,0.011,0.051,0.048,0.001,0.0,0.002,0.008,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.023,0.017,0.057,0.034,0.102,0.005,0.01,0.029,0.327,0.071,0.113,0.143,0.015,0.031,0.021,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.003,0.056,0.019,0.032,0.015,0.363,0.087,0.025,0.084,0.148,0.101,0.015,0.0,0.01,0.04,0.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.14,0.008,0.017,0.014,0.063,0.009,0.013,0.031,0.166,0.291,0.111,0.084,0.004,0.026,0.021,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.001,0.002,0.008,0.061,0.006,0.002,0.004,0.219,0.107,0.036,0.067,0.109,0.14,0.14,0.06,0.005,0.0,0.033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.001,0.021,0.005,0.032,0.001,0.12,0.036,0.301,0.005,0.09,0.06,0.026,0.003,0.016,0.093,0.13,0.003,0.055,0.002,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pd.DataFrame(data=[ind, y_t.values, preds]).T #columns=['ind', 'actual_rank', 'predicted_rank'])

  ---