# Modeling the Data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline



In [2]:
# Raise the column display limit to 500 so wide frames are shown in full.
pd.options.display.max_columns = 500

### Importing Data

In [3]:
# Read the modeling dataset; the first CSV column is used as the row index.
full_df = pd.read_csv(filepath_or_buffer='./Data/model data.csv', index_col=0)

#### One more item to clean up

- In order to model this correctly using Logistic Regression I need to remove the 'team_name' column from the dataset. However, I still need to know which team the results are associated with. To do this I am resetting the index as a combination of team_name and year. This way the information will remain as part of the data, but will not be included in the model.

In [4]:
# Label every row "<team_name>_<year>" and use that label (named 'ind') as the
# index, so each row stays identifiable without an extra column in the frame.
team_season = full_df['team_name'] + '_' + full_df['year'].astype(str)
full_df = full_df.set_index(team_season.rename('ind'))

full_df.head()

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,year,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
Nashville Predators_2018,82.0,267.0,145.0,193.0,58.0,18.0,211.0,54.0,101.6,81.94,117.0,0.713,21.17,0.923,9.9,0.03,0.71,Nashville Predators,53.0,2018,0.0,5,49.66,25.481481,9.185185,1.692593,1.7,4.103704,49.836,0.388889,48.0,4.0
Winnipeg Jets_2018,82.0,277.0,159.0,200.0,64.0,20.0,218.0,50.0,101.0,81.75,114.0,0.695,23.36,0.917,10.3,0.02,0.74,Winnipeg Jets,52.0,2018,0.0,3,51.061538,26.285714,7.25,1.792857,1.567857,3.925,51.357692,0.277778,41.0,14.0
Tampa Bay Lightning_2018,82.0,296.0,172.0,216.0,66.0,23.0,236.0,64.0,102.0,76.03,113.0,0.689,23.91,0.912,10.7,-0.07,0.66,Tampa Bay Lightning,54.0,2018,0.0,4,51.536,28.962963,9.333333,2.103704,1.407407,4.085185,51.576,0.388889,44.0,2.0
Boston Bruins_2018,82.0,270.0,161.0,197.0,61.0,20.0,214.0,40.0,100.2,83.67,112.0,0.683,23.64,0.912,9.9,-0.07,0.62,Boston Bruins,50.0,2018,0.0,7,53.162069,23.16129,5.419355,1.554839,1.509677,3.512903,53.32069,0.277778,36.0,21.0
Vegas Golden Knights_2018,82.0,272.0,182.0,218.0,53.0,24.0,228.0,44.0,100.5,81.43,109.0,0.665,21.37,0.911,10.1,-0.01,0.52,Vegas Golden Knights,51.0,2018,0.0,2,50.779167,26.0,5.037037,1.788889,1.518519,3.807407,50.770833,0.277778,36.0,9.0


 - Verifying I did not lose any information

In [5]:
# Non-null count of every column per season; a count below the others flags missing data.
full_df.groupby(by='year').count()

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2008,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2009,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2010,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2011,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2012,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2013,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2014,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,29,30,30
2015,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2016,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
2017,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30


In [6]:
# Show the rows whose score_balance_pct is missing.
missing_balance = full_df['score_balance_pct'].isna()
full_df.loc[missing_balance]

Unnamed: 0_level_0,games,goals,goals_against_ev,goals_ev,goals_pp,losses,opp_goals,opp_goals_pp,pdo,pen_kill_pct,points,points_pct,power_play_pct,save_pct,shot_pct,sos,srs,team_name,wins,year,cup_champs,rank,avg_corsi_pct,player_point_avg,avg_plus_minus,avg_ops,avg_dps,avg_ps,fenwick_pct,score_balance_pct,ev_goal_diff,special_teams_diff
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
Florida Panthers_2014,82.0,196.0,193.0,153.0,27.0,45.0,268.0,63.0,98.5,75.95,66.0,0.402,10.04,0.897,7.7,0.01,-0.87,Florida Panthers,29.0,2014,0.0,29,50.382143,15.21875,-5.8125,0.80625,0.728125,1.90625,50.082143,,-40.0,-36.0


I checked this and the Florida Panthers did not have any high scorers in 2014, thus I am filling this NaN with a 0.

In [7]:
# Replace missing values with 0. Note this applies to every column of the frame,
# not only score_balance_pct.
full_df = full_df.fillna(0)

### Setting up feature and target variables

In [8]:
# Columns that must not be used as predictors: the target ('rank'), the
# champion flag and the team name.
non_feature_cols = ['rank', 'cup_champs', 'team_name']
X = full_df.drop(columns=non_feature_cols)

# 'year' is kept next to the target (and in X) so both can be split by season;
# it is dropped from each piece when the split is made.
y = full_df.loc[:, ['rank', 'year']]

#### Train, Test Split

- The train/test split is a little tricky due to the dataset. The purpose of the model is to predict playoff performance based on stats from the regular season. So, I cannot use an automated (random) train/test split here, as I need training data that contains all of the observations from a given year. Instead I have decided to manually hold out 2 whole years (2011 and 2018) to use as my testing data, while using the remaining 9 whole years of data as my training data.

In [9]:
# Whole seasons held out for testing; every other season is used for training.
holdout_years = [2018, 2011]
X_is_holdout = X['year'].isin(holdout_years)
y_is_holdout = y['year'].isin(holdout_years)

# 'year' is only needed to make the split, so it is dropped from every piece.
X_train = X.loc[~X_is_holdout].drop(columns='year')
X_test = X.loc[X_is_holdout].drop(columns='year')

y_train = y.loc[~y_is_holdout].drop(columns='year')
y_test = y.loc[y_is_holdout].drop(columns='year')

#### Multiclass Logistic Regression

- Initially I had planned to predict only the Stanley Cup winner. This presented a big problem as there is only 1 cup winner in a given year out of 30 or 31 teams depending on the season. This is a huge class imbalance, coupled with the small number of observations (30/31) in a given year, creating a workable model from that data would be extremely difficult.
- Instead I have chosen to assign a rank to each team in every season for which I have data. The teams are ranked based on where they finished. The Stanley Cup winner is ranked at 1, the runner-up at 2, followed by the conference final runners-up and so on down to 31. For teams that exited the playoffs in the same round, the teams with the higher point totals in the regular season were ranked higher. This rank value is dropped from my feature set and is the main target variable.
- The Multiclass Logistic Regression will allow me to predict every team's final season ranking based on regular season statistics. In addition to receiving the numerical ranking for each team, I will be able to see the probabilities assigned to those predictions. While predicting how far each team will get in the playoffs is very difficult and very high accuracy is unlikely, assigning probabilities to those predictions is necessary for interpreting results.  

In [10]:
# Multinomial logistic regression with the lbfgs solver; random_state is fixed
# so the fit is repeatable.
logreg = LogisticRegression(random_state=28, multi_class='multinomial', solver='lbfgs')

# y_train is a one-column DataFrame. scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning, so flatten it before fitting.
model = logreg.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


In [11]:
# Mean accuracy on the training seasons. score() runs the prediction itself,
# so a separate predict() call whose result is discarded is not needed.
model.score(X_train, y_train)

0.37407407407407406

In [12]:
# Per-class probabilities for each training row (one column per class in model.classes_).
model.predict_proba(X=X_train)

array([[2.81684090e-02, 5.49799027e-05, 8.53423465e-02, ...,
        4.27894638e-53, 2.41429029e-54, 1.68071855e-58],
       [1.04201895e-01, 1.17938183e-01, 1.74678548e-01, ...,
        8.10092351e-37, 1.08009565e-38, 3.22377453e-41],
       [4.57982612e-03, 2.03155590e-03, 5.05010609e-02, ...,
        1.41462234e-27, 2.48576926e-28, 6.41693626e-32],
       ...,
       [2.18834691e-34, 5.15498678e-28, 7.90669978e-32, ...,
        1.61729165e-01, 1.93694051e-01, 1.38074814e-01],
       [1.34384409e-28, 1.64056789e-21, 3.76464656e-27, ...,
        1.75804245e-01, 2.40049842e-01, 2.50611120e-01],
       [2.05874217e-31, 4.26314332e-24, 3.61778922e-30, ...,
        2.12623449e-01, 1.80861432e-01, 1.84361856e-01]])

In [13]:
# Mean accuracy on the held-out seasons. score() runs the prediction itself,
# so a separate predict() call whose result is discarded is not needed.
model.score(X_test, y_test)

0.08196721311475409

##### Initial Analysis:

 - As expected, our model is not very predictive. The model is tasked with predicting a final rank (one of 30 or 31 possible outcomes) for every team in the 2 held-out years, so 61 predictions in total (30 teams in 2011 and 31 in 2018). The complicated part, however, is that the model has very little data to train on. It is essentially using 270 (9 years x 30 teams) observations to make those 61 predictions. Compounding that difficulty is the fact that many of these teams are so tightly packed with very little separating them.
 
 - Let's put together a dataframe of the actual ranks vs. the predicted ranks, along with the probabilities of the predictions.

In [14]:
# Predicted rank and the full per-class probability matrix for the held-out rows.
predictions, probs = model.predict(X_test), model.predict_proba(X_test)

In [15]:
# Row labels of the held-out rows as a positionally indexed Series.
ind = y_test.index.to_series().reset_index(drop=True)

# Actual ranks (still indexed by row label) and predicted ranks (positional index).
y_t = y_test['rank']
preds = pd.Series(data=predictions)

# Class probabilities rounded to 3 decimals for display.
probs_df = pd.DataFrame(data=probs).round(3)

In [16]:
# Build the comparison table with explicit column names. Raw values are used
# so the three inputs are combined by position rather than by index label
# (y_t is indexed by row label, the other two positionally). This avoids
# relying on pandas' auto-generated 'Unnamed N' labels and keeps numeric dtypes
# instead of the all-object columns produced by transposing mixed rows.
results = pd.DataFrame(
    {
        'ind': ind.values,
        'Actual Rank': y_t.values,
        'Predicted Rank': preds.values,
    },
    columns=['ind', 'Actual Rank', 'Predicted Rank'],
)

# Highest class probability per row, i.e. the probability of the predicted rank.
probs_max = pd.DataFrame(probs_df.max(axis=1))

In [17]:
# Attach each row's top probability (column 0 of probs_max) by index and give it
# a readable column name.
results = (
    pd.merge(results, probs_max, left_index=True, right_index=True)
    .rename(columns={0: 'Probability'})
)

results

Unnamed: 0,ind,Actual Rank,Predicted Rank,Probability
0,Nashville Predators_2018,5,9,0.658
1,Winnipeg Jets_2018,3,9,0.374
2,Tampa Bay Lightning_2018,4,10,0.396
3,Boston Bruins_2018,7,6,0.470
4,Vegas Golden Knights_2018,2,7,0.341
5,Washington Capitals_2018,1,7,0.243
6,Toronto Maple Leafs_2018,10,7,0.332
7,Anaheim Ducks_2018,13,8,0.336
8,Minnesota Wild_2018,14,8,0.257
9,Pittsburgh Penguins_2018,8,7,0.260


  ---