# Modeling the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# lives in `sklearn.model_selection`, which this notebook already imports from.
from sklearn.model_selection import train_test_split

%matplotlib inline



In [2]:
# Show up to 500 columns so wide frames are displayed without truncation.
pd.options.display.max_columns = 500

### Importing Data

In [3]:
# Load the modelling dataset; the first CSV column becomes the row index.
model_data_path = './Data/model data.csv'
full_df = pd.read_csv(model_data_path, index_col=0)

#### One more item to clean up

- In order to model this correctly using Logistic Regression I need to remove the 'team_name' column from the dataset. However, I still need to know which team the results are associated with. To do this I am resetting the index as a combination of team_name and year. This way the information will remain as part of the data, but will not be included in the model.

In [4]:
# Label every row as "<team_name>_<year>" and use that label as the index, so
# each row stays identifiable once `team_name` is dropped from the features.
team_season = full_df['team_name'] + '_' + full_df['year'].astype(str)
full_df = full_df.assign(ind=team_season).set_index('ind')

full_df.head()

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,year,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
Nashville Predators_2018,82.0,267.0,145.0,193.0,58.0,18.0,211.0,54.0,101.6,81.94,117.0,0.713,21.17,0.923,9.9,0.03,0.71,Nashville Predators,53.0,2018,0.0,5,49.66,25.481481,9.185185,1.692593,1.7,4.103704,49.836,0.388889,48.0,4.0
Winnipeg Jets_2018,82.0,277.0,159.0,200.0,64.0,20.0,218.0,50.0,101.0,81.75,114.0,0.695,23.36,0.917,10.3,0.02,0.74,Winnipeg Jets,52.0,2018,0.0,3,51.061538,26.285714,7.25,1.792857,1.567857,3.925,51.357692,0.277778,41.0,14.0
Tampa Bay Lightning_2018,82.0,296.0,172.0,216.0,66.0,23.0,236.0,64.0,102.0,76.03,113.0,0.689,23.91,0.912,10.7,-0.07,0.66,Tampa Bay Lightning,54.0,2018,0.0,4,51.536,28.962963,9.333333,2.103704,1.407407,4.085185,51.576,0.388889,44.0,2.0
Boston Bruins_2018,82.0,270.0,161.0,197.0,61.0,20.0,214.0,40.0,100.2,83.67,112.0,0.683,23.64,0.912,9.9,-0.07,0.62,Boston Bruins,50.0,2018,0.0,7,53.162069,23.16129,5.419355,1.554839,1.509677,3.512903,53.32069,0.277778,36.0,21.0
Vegas Golden Knights_2018,82.0,272.0,182.0,218.0,53.0,24.0,228.0,44.0,100.5,81.43,109.0,0.665,21.37,0.911,10.1,-0.01,0.52,Vegas Golden Knights,51.0,2018,0.0,2,50.779167,26.0,5.037037,1.788889,1.518519,3.807407,50.770833,0.277778,36.0,9.0


 - Verifying I did not lose any information

In [5]:
# Non-null count of every column per season; a count lower than its
# neighbours points at a missing value in that column.
full_df.groupby('year').count()

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2008,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2009,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2010,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2011,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2012,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2013,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2014,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,29,30,30
2015,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2016,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2017,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30


In [6]:
# Show the row(s) whose `score_balance_pct` is missing.
missing_score_balance = full_df['score_balance_pct'].isna()
full_df.loc[missing_score_balance]

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,year,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
Florida Panthers_2014,82.0,196.0,193.0,153.0,27.0,45.0,268.0,63.0,98.5,75.95,66.0,0.402,10.04,0.897,7.7,0.01,-0.87,Florida Panthers,29.0,2014,0.0,29,50.382143,15.21875,-5.8125,0.80625,0.728125,1.90625,50.082143,,-40.0,-36.0


I checked this and the Florida Panthers did not have any high scorers in 2014, thus I am filling this NaN with a 0.

In [7]:
# Only `score_balance_pct` was inspected and found to contain a missing value
# (Florida Panthers, 2014), so fill that column alone. A frame-wide fillna
# would also silently zero out any other missing statistic.
full_df['score_balance_pct'] = full_df['score_balance_pct'].fillna(0)

In [8]:
# Keep only the rows ranked 1-16 (rank strictly below 17).
rank_cutoff = 17
ranked_inside_cutoff = full_df['rank'] < rank_cutoff
full_df = full_df.loc[ranked_inside_cutoff]

### Setting up feature and target variables

In [9]:
# Features: everything except the target (`rank`), the champion flag and the
# team name. `year` stays in for now because the split below filters on it.
non_feature_columns = ['rank', 'cup_champs', 'team_name']
X = full_df.drop(columns=non_feature_columns)

# Target: final rank, with `year` carried along for the same split.
y = full_df.loc[:, ['rank', 'year']]

#### Train, Test Split

- Train, test, split is a little tricky due to the dataset. The purpose of the model is to predict playoff performance based on stats from the regular season. So, I cannot use an automated train, test, split here, as I need training data that contains all of the observations from a given year. Instead I have decided to manually hold out 2 whole years of data (2009 and 2016) to use as my testing data, while using all of the remaining years as my training data.

In [10]:
# Hold out two complete seasons for testing; every other season is training
# data. `year` is only needed for this split, so it is dropped afterwards.
held_out_years = [2009, 2016]
X_is_held_out = X['year'].isin(held_out_years)
y_is_held_out = y['year'].isin(held_out_years)

X_train = X[~X_is_held_out].drop(columns='year')

X_test = X[X_is_held_out].drop(columns='year')

y_test = y[y_is_held_out].drop(columns='year')

y_train = y[~y_is_held_out].drop(columns='year')

#### Multiclass Logistic Regression

- Initially I had planned to predict only the Stanley Cup winner. This presented a big problem as there is only 1 cup winner in a given year out of 30 or 31 teams depending on the season. This is a huge class imbalance, coupled with the small number of observations (30/31) in a given year, creating a workable model from that data would be extremely difficult.
- Instead I have chosen to assign a rank to each team in every season for which I have data. The teams are ranked based on where they finished. The Stanley Cup winner is ranked at 1, the runner up at 2, followed by conference final runners-up and so on down to 31. For teams that exited the playoffs in the same round, the teams with the higher point totals in the regular season were ranked higher. This rank value is dropped from my feature set and is the main target variable.
- The Multiclass Logistic Regression will allow me to predict every team's final season ranking based on regular season statistics. In addition to receiving the numerical ranking for each team, I will be able to see the probabilities assigned to those predictions. While predicting how far each team will get in the playoffs is very difficult and very high accuracy is unlikely, assigning probabilities to those predictions is necessary for interpreting results.

In [11]:
# Multinomial logistic regression over the rank classes; random_state is
# pinned so the fit is reproducible.
logreg = LogisticRegression(random_state=28, multi_class='multinomial', solver='lbfgs')
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not have to coerce a column vector (which emits a DataConversionWarning).
model = logreg.fit(X_train, np.ravel(y_train));

  y = column_or_1d(y, warn=True)


In [12]:
# Mean accuracy on the training seasons. `score` makes its own predictions,
# so a separate `predict` call whose result is discarded is not needed.
model.score(X_train, y_train)

0.3888888888888889

In [13]:
# Per-class probabilities for every training row (one column per rank class).
model.predict_proba(X_train)

array([[1.71818428e-02, 6.07476803e-03, 1.55469218e-01, ...,
        1.71202193e-04, 4.13179734e-05, 1.54125039e-07],
       [3.20813070e-02, 4.27659325e-02, 1.31184083e-01, ...,
        8.75697229e-04, 4.26525947e-04, 3.01915212e-06],
       [3.60592861e-03, 9.94193406e-03, 1.28725419e-02, ...,
        1.66700924e-03, 8.07642285e-04, 1.61834055e-05],
       ...,
       [1.40829143e-03, 4.22292271e-03, 1.94765377e-03, ...,
        3.09149337e-01, 1.28802369e-01, 1.24939112e-01],
       [3.81071010e-03, 1.07959396e-02, 8.60752216e-04, ...,
        2.53440457e-01, 2.47668349e-01, 1.02185296e-01],
       [2.81314816e-04, 6.23672422e-03, 2.04967554e-04, ...,
        5.20762270e-02, 1.33021827e-01, 6.35369436e-01]])

In [14]:
# Mean accuracy on the held-out seasons. `score` makes its own predictions,
# so a separate `predict` call whose result is discarded is not needed.
model.score(X_test, y_test)

0.1875

##### Initial Analysis:

 - Our model is not very predictive, as expected. The model is tasked with predicting 16 different outcomes for each of the 2 held-out years, so 32 predictions in total. The complicated part, however, is that the model has very little data to train on. Because only the 16 playoff teams from each season are kept, it has just 16 observations per training year to learn from. Compounding that difficulty is the fact that many of these teams are so tightly packed with very little separating them.
 
 - Let's put together a dataframe of the actual ranks vs. the predicted ranks with the probabilities of the predictions.

In [15]:
# Predicted rank and the full per-class probability matrix for the test rows.
predictions, probs = model.predict(X_test), model.predict_proba(X_test)

In [27]:
# Actual ranks of the held-out rows, indexed by "<team_name>_<year>".
y_test

Unnamed: 0_level_0,rank
ind,Unnamed: 1_level_1
Washington Capitals_2016,5
Dallas Stars_2016,6
St Louis Blues_2016,3
Pittsburgh Penguins_2016,1
Anaheim Ducks_2016,9
Florida Panthers_2016,10
Chicago Blackhawks_2016,11
Los Angeles Kings_2016,12
New York Rangers_2016,13
New York Islanders_2016,7


In [16]:
# Raw class-probability matrix for the test rows (one row per team-season).
probs

array([[3.53612607e-02, 5.00843378e-03, 1.47933342e-01, 3.97109183e-04,
        3.69507032e-01, 1.19726856e-01, 5.24469427e-03, 1.67535406e-05,
        2.37520639e-01, 7.07462277e-02, 8.42869726e-03, 9.44788126e-05,
        5.22416930e-07, 1.01503291e-05, 3.79591696e-06, 6.78060173e-09],
       [8.36058140e-02, 5.83461049e-02, 1.65169336e-01, 1.49297774e-01,
        8.79923918e-03, 2.29462279e-02, 2.68037121e-01, 2.24751150e-03,
        9.82715456e-03, 5.96796666e-03, 7.72301084e-02, 2.64017046e-02,
        6.90135925e-02, 4.38944426e-02, 8.67989761e-03, 5.36004750e-04],
       [3.29552131e-02, 1.80664290e-02, 5.75723728e-02, 2.53351543e-02,
        9.03524485e-02, 3.07254619e-01, 3.70987291e-02, 6.80900671e-03,
        2.04912517e-01, 1.50117170e-01, 5.93145146e-02, 4.77585224e-03,
        3.21584178e-04, 2.53431248e-03, 2.49566813e-03, 8.44094774e-05],
       [8.88279836e-02, 1.95184316e-01, 5.58364373e-02, 1.67170146e-02,
        3.39023352e-02, 8.05456093e-02, 1.11132537e-01, 1.640

In [17]:
# Class probabilities as a frame, rounded to three decimals for display.
probs_df = pd.DataFrame(probs).round(3)

# Pieces of the comparison table, each re-indexed 0..n-1 or used by value
# so they line up positionally with `probs_df`.
preds = pd.Series(predictions)
y_t = pd.Series(y_test['rank'])
ind = pd.Series(y_test.index)

In [18]:
# Rounded class probabilities; columns are the positional class labels 0-15.
probs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.035,0.005,0.148,0.0,0.37,0.12,0.005,0.0,0.238,0.071,0.008,0.0,0.0,0.0,0.0,0.0
1,0.084,0.058,0.165,0.149,0.009,0.023,0.268,0.002,0.01,0.006,0.077,0.026,0.069,0.044,0.009,0.001
2,0.033,0.018,0.058,0.025,0.09,0.307,0.037,0.007,0.205,0.15,0.059,0.005,0.0,0.003,0.002,0.0
3,0.089,0.195,0.056,0.017,0.034,0.081,0.111,0.016,0.03,0.096,0.156,0.097,0.001,0.004,0.017,0.0
4,0.054,0.071,0.058,0.014,0.082,0.444,0.06,0.006,0.068,0.065,0.068,0.004,0.0,0.002,0.004,0.0
5,0.027,0.06,0.065,0.027,0.096,0.043,0.026,0.051,0.194,0.087,0.151,0.126,0.007,0.017,0.022,0.0
6,0.011,0.047,0.023,0.03,0.029,0.32,0.067,0.014,0.184,0.138,0.085,0.024,0.0,0.007,0.017,0.001
7,0.133,0.022,0.021,0.009,0.051,0.043,0.033,0.032,0.133,0.292,0.135,0.068,0.002,0.013,0.014,0.0
8,0.003,0.01,0.013,0.06,0.01,0.026,0.014,0.302,0.062,0.029,0.1,0.056,0.126,0.121,0.058,0.008
9,0.006,0.04,0.013,0.073,0.006,0.181,0.064,0.227,0.017,0.091,0.083,0.047,0.011,0.034,0.067,0.04


In [19]:
# Team-season label, actual rank and predicted rank side by side. Naming the
# columns explicitly avoids depending on the placeholder labels pandas
# generates for unnamed rows, and keeps the rank columns numeric instead of
# the object dtype that transposing mixed-type rows produces.
results = pd.DataFrame({
    'ind': ind.values,
    'Actual Rank': y_t.values,
    'Predicted Rank': preds.values,
})

# Largest rounded class probability in each row; the single column is
# labelled 0 until it is renamed after the merge.
probs_max = pd.DataFrame(probs_df.max(axis=1))

In [20]:
# Attach each row's top probability (joined on the positional index) and give
# the new column a readable name.
results = (
    results
    .merge(probs_max, left_index=True, right_index=True)
    .rename(columns={0: 'Probability'})
)

results

Unnamed: 0,ind,Actual Rank,Predicted Rank,Probability
0,Washington Capitals_2016,5,5,0.37
1,Dallas Stars_2016,6,7,0.268
2,St Louis Blues_2016,3,6,0.307
3,Pittsburgh Penguins_2016,1,2,0.195
4,Anaheim Ducks_2016,9,6,0.444
5,Florida Panthers_2016,10,9,0.194
6,Chicago Blackhawks_2016,11,6,0.32
7,Los Angeles Kings_2016,12,10,0.292
8,New York Rangers_2016,13,8,0.302
9,New York Islanders_2016,7,8,0.227


In [21]:
# Playoff results for the 2016 held-out season, ordered by actual finish.
# Rows are selected by the season suffix of `ind` rather than by position, so
# all 16 playoff teams of that season are kept regardless of row order.
# NOTE(review): the variable name says 2018 but the rows are from 2016; the
# name is kept so later cells that reference it still work.
is_2016 = results['ind'].str.endswith('_2016')
results_2018 = results[is_2016]
results_2018 = results_2018[results_2018['Actual Rank'] < 17].sort_values(by='Actual Rank', ascending=True)
results_2018

Unnamed: 0,ind,Actual Rank,Predicted Rank,Probability
3,Pittsburgh Penguins_2016,1,2,0.195
10,San Jose Sharks_2016,2,2,0.213
2,St Louis Blues_2016,3,6,0.307
11,Tampa Bay Lightning_2016,4,8,0.252
0,Washington Capitals_2016,5,5,0.37
1,Dallas Stars_2016,6,7,0.268
9,New York Islanders_2016,7,8,0.227
13,Nashville Predators_2016,8,15,0.165
4,Anaheim Ducks_2016,9,6,0.444
5,Florida Panthers_2016,10,9,0.194


  ---

In [25]:
# Sanity check: each row's class probabilities should total 1, give or take
# the error introduced by rounding to three decimals.
probs_df.sum(axis=1)

0     1.000
1     1.000
2     0.999
3     1.000
4     1.000
5     0.999
6     0.997
7     1.001
8     0.998
9     1.000
10    1.001
11    1.000
12    1.000
13    1.000
14    0.999
15    1.001
16    0.999
17    0.998
18    0.999
19    1.000
20    1.000
21    1.000
22    1.000
23    0.999
24    0.999
25    1.000
26    1.001
27    0.999
28    1.000
29    0.999
30    1.000
31    1.000
dtype: float64

### Comparing my predictions to Vegas for 2018

In [28]:
# Column totals: the probability mass assigned to each class across all test rows.
probs_df.sum()

0     1.053
1     1.424
2     1.636
3     2.167
4     1.892
5     2.313
6     2.467
7     1.998
8     2.526
9     1.926
10    1.748
11    2.011
12    1.452
13    2.471
14    2.425
15    2.479
dtype: float64

In [None]:
# Cumulative probability of finishing within the top 1, 2, 4 and 8 ranks.
# Assumes column k holds the probability of rank k + 1 (i.e. the fitted
# classes are ranks 1..16 in order) -- TODO confirm against model.classes_.
top_one = probs_df[0]
top_two = top_one + probs_df[1]
top_four = top_two + probs_df[2] + probs_df[3]
top_eight = top_four + probs_df[4] + probs_df[5] + probs_df[6] + probs_df[7]

probs_df['round_one_advance_%'] = top_eight
probs_df['round_two_advance_%'] = top_four
probs_df['win_conference_%'] = top_two
probs_df['win_cup_%'] = top_one

In [None]:
# Class probabilities together with the derived advancement columns.
probs_df

In [None]:
# Drop the per-rank probability columns (labelled 0 .. n_classes - 1), leaving
# only the derived advancement columns. The labels come from the width of
# `probs`, because asking `drop` for labels that are not present raises a
# KeyError. errors='ignore' also makes re-running this cell a no-op.
rank_columns = list(range(probs.shape[1]))
probs_df.drop(columns=rank_columns, inplace=True, errors='ignore')

In [None]:
# Preview the first rows of the advancement-probability frame.
probs_df.head()

In [None]:
# Join the actual/predicted ranks with the advancement probabilities on the
# shared positional index.
odds_results = results.merge(probs_df, left_index=True, right_index=True)

In [None]:
# Advancement odds for one held-out season. Rows are selected by the season
# suffix of `ind`: a fixed positional slice of 30 rows would mix two seasons
# when each season contributes 16 playoff teams.
# NOTE(review): the names say 2018, but the held-out seasons in this notebook
# are 2009 and 2016; 2016 is used here -- confirm the intended season.
is_2016 = odds_results['ind'].str.endswith('_2016')
odds_2018 = odds_results[is_2016]
playoff_odds_2018 = odds_2018[odds_2018['Actual Rank'] < 17]
playoff_odds_2018