In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

%matplotlib inline



In [2]:
pd.set_option('display.max_columns', 500)

### Importing data

In [3]:
team_stats = pd.read_csv('./Data/full team stats.csv', index_col=0)
player_stats = pd.read_csv('./Data/full player stats.csv', index_col=0)
advanced_stats = pd.read_csv('./Data/full advanced stats.csv', index_col=0)

cup_champs = pd.read_csv('./Data/cup champs.csv', index_col=0)
team_ranks = pd.read_csv('./Data/NHL Rankings 2008-2018 vertical.csv', index_col=0)

#### Couple more items to clean up

- Renaming the 'team_name' column in the team stats dataframe to just 'team' to match the player and advanced stats dfs.
- There is a discrepancy in the dataframes between the St Louis Blues name. The 'advanced_stat' and 'player_stat' dataframes have a period after St(.). I'm removing that so the team names match
- resetting index and cleaning the team ranks dataframe

In [4]:
team_stats.rename(columns={'team_name': 'team'}, inplace=True)

advanced_stats['team'] = advanced_stats['team'].replace('[^A-Za-z ]', '', regex=True)
player_stats['team'] = player_stats['team'].replace('[^A-Za-z ]', '', regex=True)

In [5]:
team_ranks.reset_index(inplace=True)

team_ranks.rename(columns={'Team': 'team', 'Year': 'year', 'Rank': 'rank'}, inplace=True)

team_ranks['team'].map(lambda x: 'Winnipeg Jets' if 'Jets' in x else x)

0        Washington Capitals
1       Vegas Golden Knights
2              Winnipeg Jets
3        Tampa Bay Lightning
4        Nashville Predators
5            San Jose Sharks
6              Boston Bruins
7        Pittsburgh Penguins
8        Philadelphia Flyers
9        Toronto Maple Leafs
10     Columbus Blue Jackets
11          New Jersey Devis
12             Anaheim Ducks
13            Minnesota Wild
14         Los Angeles Kings
15        Colorado Avalanche
16           Ottawa Senators
17           St. Louis Blues
18           Edmonton Oilers
19          New York Rangers
20        Montreal Canadians
21            Calgary Flames
22        Chicago Blackhawks
23              Dallas Stars
24        New York Islanders
25         Detroit Red Wings
26          Florida Panthers
27         Vancouver Canucks
28           Arizona Coyotes
29            Buffalo Sabres
               ...          
301        Detroit Red Wings
302      Pittsburgh Penguins
303             Dallas Stars
304      Phila

In [6]:
stats_rank = pd.merge(team_stats, team_ranks, on=['team', 'year'])

In [7]:
stats_rank['ind'] = stats_rank['team'] + '_' + stats_rank['year'].astype(str)

In [8]:
stats_rank.set_index('ind', inplace=True)

In [9]:
stats_rank.head()

Unnamed: 0_level_0,average_age,chances_pp,games,goals,goals_against_ev,goals_ev,goals_pp,goals_sh,losses,losses_ot,losses_shootout,opp_chances_pp,opp_goals,opp_goals_pp,opp_goals_sh,pdo,pen_kill_pct,pen_min_per_game,pen_min_per_game_opp,points,points_pct,power_play_pct,save_pct,shot_pct,shots,shots_against,sos,srs,team,total_goals_per_game,wins,wins_shootout,year,is_champ,rank
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Nashville Predators_2018,28.4,274,82,267,145,193,58,10,18,11,7,299,211,54,5,101.6,81.94,11.3,9.6,117,0.713,21.17,0.923,9.9,2641,2659,0.03,0.71,Nashville Predators,5.83,53,6,2018,0,5
Tampa Bay Lightning_2018,27.5,276,82,296,172,216,66,9,23,5,2,267,236,64,3,102.0,76.03,10.1,10.4,113,0.689,23.91,0.912,10.7,2737,2756,-0.07,0.66,Tampa Bay Lightning,6.49,54,6,2018,0,4
Boston Bruins_2018,28.6,258,82,270,161,197,61,9,20,12,3,245,214,40,10,100.2,83.67,9.5,9.6,112,0.683,23.64,0.912,9.9,2703,2399,-0.07,0.62,Boston Bruins,5.9,50,3,2018,0,7
Vegas Golden Knights_2018,28.0,248,82,272,182,218,53,8,24,7,3,237,228,44,5,100.5,81.43,7.1,7.8,109,0.665,21.37,0.911,10.1,2774,2619,-0.01,0.52,Vegas Golden Knights,6.1,51,4,2018,0,2
Washington Capitals_2018,28.4,244,82,259,178,197,55,4,26,7,1,269,239,53,8,101.4,80.3,9.9,9.3,105,0.64,22.54,0.909,10.7,2400,2637,-0.04,0.21,Washington Capitals,6.07,49,3,2018,1,1


In [10]:
X = stats_rank.drop(columns=['rank', 'is_champ', 'team'])
y = stats_rank[['rank', 'year']]

In [11]:
X_train = X[(X['year'] != 2016) & (X['year'] != 2009)].drop(columns='year')

In [12]:
X_test = X[X['year'].isin([2016, 2009])].drop(columns='year')

In [13]:
y_test = y[y['year'].isin([2016, 2009])].drop(columns='year')

In [14]:
y_train = y[(y['year'] != 2016) & (y['year'] != 2009)].drop(columns='year')

In [15]:
y_train.head()

Unnamed: 0_level_0,rank
ind,Unnamed: 1_level_1
Nashville Predators_2018,5
Tampa Bay Lightning_2018,4
Boston Bruins_2018,7
Vegas Golden Knights_2018,2
Washington Capitals_2018,1


In [16]:
logreg = LogisticRegression(random_state=28, multi_class='multinomial', solver='lbfgs')
model = logreg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
model.predict(X_train)
model.score(X_train, y_train)

0.2183406113537118

In [18]:
model.predict_proba(X_train)

array([[6.48818181e-02, 1.09484168e-01, 3.16599513e-02, ...,
        4.82071931e-06, 4.57718155e-09, 5.17986736e-11],
       [1.44794026e-02, 8.10556477e-02, 1.64175848e-02, ...,
        1.89926274e-05, 1.20933265e-08, 3.94211960e-10],
       [7.45788202e-02, 1.00342726e-01, 4.27152823e-02, ...,
        1.12092658e-05, 1.79964679e-08, 9.20038753e-06],
       ...,
       [3.54259750e-04, 1.31510978e-03, 3.54862048e-04, ...,
        4.20352696e-02, 1.21262399e-01, 1.22208497e-12],
       [6.70350284e-06, 6.47706909e-04, 5.62624265e-05, ...,
        5.43874178e-02, 1.95170775e-01, 2.45445317e-14],
       [6.12294914e-06, 1.54599175e-04, 1.14911867e-04, ...,
        6.90535319e-02, 3.96581639e-01, 5.22648417e-10]])

In [19]:
model.predict(X_test)
model.score(X_test, y_test)

0.11764705882352941

In [20]:
predictions = model.predict(X_test)

In [21]:
probs = model.predict_proba(X_test)

In [23]:
probs

array([[8.41593541e-02, 1.13832081e-01, 8.26990236e-02, ...,
        2.26542200e-06, 3.50088010e-09, 9.92945725e-08],
       [3.12494185e-02, 4.31072244e-02, 1.43800699e-01, ...,
        2.40767073e-05, 3.17475646e-07, 1.89112639e-03],
       [2.42699843e-01, 1.08551973e-01, 2.83925158e-02, ...,
        5.16697971e-05, 1.57709864e-07, 5.44307132e-05],
       ...,
       [8.89932789e-06, 1.03132088e-04, 2.20063124e-05, ...,
        8.24319925e-02, 5.42297257e-01, 2.36355726e-11],
       [1.12298458e-06, 3.24923682e-05, 9.69238413e-06, ...,
        6.09108720e-02, 6.14841505e-01, 1.11130366e-17],
       [8.64600063e-08, 2.85589432e-06, 1.49006298e-07, ...,
        9.62505074e-02, 7.97502330e-01, 2.65064718e-17]])

In [32]:
y_test.values

array([[ 5],
       [ 6],
       [ 1],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [ 7],
       [ 2],
       [ 4],
       [14],
       [ 8],
       [17],
       [15],
       [16],
       [18],
       [19],
       [22],
       [23],
       [24],
       [26],
       [27],
       [28],
       [29],
       [30],
       [ 9],
       [ 5],
       [ 2],
       [ 6],
       [ 3],
       [ 7],
       [ 1],
       [11],
       [12],
       [ 4],
       [13],
       [17],
       [15],
       [18],
       [ 8],
       [19],
       [20],
       [21],
       [22],
       [23],
       [24],
       [26],
       [28],
       [29],
       [30]])

In [31]:
ind = pd.Series(y_test.index)
y_t = pd.Series(y_test.values)
preds = pd.Series(predictions)

Exception: Data must be 1-dimensional

In [33]:
ind

0       Washington Capitals_2016
1              Dallas Stars_2016
2       Pittsburgh Penguins_2016
3             Anaheim Ducks_2016
4          Florida Panthers_2016
5        Chicago Blackhawks_2016
6         Los Angeles Kings_2016
7          New York Rangers_2016
8        New York Islanders_2016
9           San Jose Sharks_2016
10      Tampa Bay Lightning_2016
11      Philadelphia Flyers_2016
12      Nashville Predators_2016
13            Boston Bruins_2016
14        Detroit Red Wings_2016
15           Minnesota Wild_2016
16      Carolina Hurricanes_2016
17          Ottawa Senators_2016
18       Colorado Avalanche_2016
19           Buffalo Sabres_2016
20          Arizona Coyotes_2016
21           Calgary Flames_2016
22    Columbus Blue Jackets_2016
23        Vancouver Canucks_2016
24          Edmonton Oilers_2016
25      Toronto Maple Leafs_2016
26          San Jose Sharks_2009
27            Boston Bruins_2009
28        Detroit Red Wings_2009
29      Washington Capitals_2009
30       C

In [28]:
pd.concat([ind, y_t, preds], axis=1)

Unnamed: 0,ind,rank,0
0,Washington Capitals_2016,,9.0
1,Dallas Stars_2016,,3.0
2,Pittsburgh Penguins_2016,,1.0
3,Anaheim Ducks_2016,,1.0
4,Florida Panthers_2016,,9.0
5,Chicago Blackhawks_2016,,6.0
6,Los Angeles Kings_2016,,1.0
7,New York Rangers_2016,,11.0
8,New York Islanders_2016,,6.0
9,San Jose Sharks_2016,,7.0


In [None]:
champ_dict = cup_champs[cup_champs['cup_champs']==1].set_index('Year').drop(columns='cup_champs').to_dict()['Team']

In [None]:
champ_dict

In [None]:
champs_df = pd.DataFrame.from_dict(champ_dict, orient='index')
champs_df.rename(columns={0: 'team'}, inplace=True)

In [None]:
champs_df.reset_index(inplace=True)

In [None]:
champs_df.rename(columns={'index': 'year'}, inplace=True)

In [None]:
champs_df['is_champ'] = 1

In [None]:
champs_df

In [None]:
player_champ_df = pd.merge(player_stats, champs_df, how='outer', on=['year', 'team'])

In [None]:
player_champ_df['is_champ'].fillna(value=0, inplace=True)

In [None]:
player_champ_df

In [None]:
correlations = player_champ_df.drop(['is_champ', 'team', 'player', 'position', 'toi_avg'], axis=1).apply(lambda x: x.corr(player_champ_df['is_champ'])).abs().sort_values(ascending=False)

In [None]:
player_stats

In [None]:
advanced_stats

In [None]:
team_stats['rank'] = 0

In [None]:
team_stats.drop(columns='rank', inplace=True)

#### Couple more items to clean up

- Renaming the 'team_name' column in the team stats dataframe to just 'team' to match the player and advanced stats dfs.
- There is a discrepancy in the dataframes between the St Louis Blues name. The 'advanced_stat' and 'player_stat' dataframes have a period after St(.). I'm removing that so the team names match

In [None]:
team_stats.rename(columns={'team_name': 'team'}, inplace=True)

advanced_stats['team'] = advanced_stats['team'].replace('[^A-Za-z ]', '', regex=True)
player_stats['team'] = player_stats['team'].replace('[^A-Za-z ]', '', regex=True)

### Function to transform individual stats into team averages to be used in the team stats dataframe

In [None]:
def agg_stat(dataframe, col_name):
    y = dataframe.groupby(by=['year', 'team'])[col_name].mean().unstack()
    y.reset_index(inplace=True)
    y = pd.melt(y, id_vars=['year'])
    return y   

In [None]:
corsi_pct = agg_stat(advanced_stats, 'corsi_pct')

In [None]:
team_stats = pd.merge(team_stats, corsi_pct, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'avg_corsi_pct'}, inplace=True)

In [None]:
corsi_for = agg_stat(advanced_stats, 'corsi_for')

In [None]:
team_stats = pd.merge(team_stats, corsi_for, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'avg_corsi_for'}, inplace=True)

In [None]:
player_point_avg = agg_stat(player_stats, 'points')

In [None]:
team_stats = pd.merge(team_stats, player_point_avg, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'player_point_avg'}, inplace=True)

In [None]:
avg_plus_minus = agg_stat(player_stats, 'plus_minus')

In [None]:
team_stats = pd.merge(team_stats, avg_plus_minus, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'avg_plus_minus'}, inplace=True)

In [None]:
avg_ops = agg_stat(player_stats, 'ops')

In [None]:
team_stats = pd.merge(team_stats, avg_ops, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'avg_ops'}, inplace=True)

In [None]:
avg_dps = agg_stat(player_stats, 'dps')

In [None]:
team_stats = pd.merge(team_stats, avg_dps, on=['year', 'team'])

In [None]:
team_stats.rename(columns={'value': 'avg_dps'}, inplace=True)

In [None]:
pd.merge(team_stats, 

- Absolute value of correlations to champion

In [None]:
correlations = team_stats.drop(['is_champ', 'team'], axis=1).apply(lambda x: x.corr(team_stats['is_champ'])).abs().sort_values(ascending=False)

In [None]:
correlations

In [None]:
pd.get_dummies(team_stats, columns=['year'])

### First Bad Model

In [None]:
team_stats = pd.get_dummies(team_stats, columns=['rank'])

In [None]:
team_stats.drop(columns='is_champ', inplace=True)

In [None]:
team_stats.columns

In [None]:
X = team_stats.drop(columns=['average_age', 'chances_pp', 'games', 'goals', 'goals_against_ev',
                             'goals_ev', 'goals_pp', 'goals_sh', 'losses', 'losses_ot',
                             'losses_shootout', 'opp_chances_pp', 'opp_goals', 'opp_goals_pp',
                             'opp_goals_sh', 'pdo', 'pen_kill_pct', 'pen_min_per_game',
                             'pen_min_per_game_opp', 'points', 'points_pct', 'power_play_pct',
                             'save_pct', 'shot_pct', 'shots', 'shots_against', 'sos', 'srs',
                             'team_name', 'total_goals_per_game', 'wins', 'wins_shootout', 'year']
            
y = team_stats['rank_1', 'rank_2', 'rank_3', 'rank_4', 'rank_5', 'rank_6', 'rank_7',
               'rank_8', 'rank_9', 'rank_10', 'rank_11', 'rank_12', 'rank_13',
               'rank_14', 'rank_15', 'rank_16', 'rank_17', 'rank_18', 'rank_19',
               'rank_20', 'rank_21', 'rank_22', 'rank_23', 'rank_24', 'rank_25',
               'rank_26', 'rank_27', 'rank_28', 'rank_29', 'rank_30', 'rank_31']

In [None]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=.3, random_state=28)

In [None]:
rid


### Creating balanced scoring feature

In [34]:
stats_2018 = player_stats[player_stats['year'] == 2018]
stats_2017 = player_stats[player_stats['year'] == 2017]
stats_2016 = player_stats[player_stats['year'] == 2016]
stats_2015 = player_stats[player_stats['year'] == 2015]
stats_2014 = player_stats[player_stats['year'] == 2014]
stats_2013 = player_stats[player_stats['year'] == 2013]
stats_2012 = player_stats[player_stats['year'] == 2012]
stats_2011 = player_stats[player_stats['year'] == 2011]
stats_2010 = player_stats[player_stats['year'] == 2010]
stats_2009 = player_stats[player_stats['year'] == 2009]
stats_2008 = player_stats[player_stats['year'] == 2008]

In [35]:
skater_2018 = stats_2018[stats_2018['position'] != 'G']
skater_2017 = stats_2017[stats_2017['position'] != 'G']
skater_2016 = stats_2016[stats_2016['position'] != 'G']
skater_2015 = stats_2015[stats_2015['position'] != 'G']
skater_2014 = stats_2014[stats_2014['position'] != 'G']
skater_2013 = stats_2013[stats_2013['position'] != 'G']
skater_2012 = stats_2012[stats_2012['position'] != 'G']
skater_2011 = stats_2011[stats_2011['position'] != 'G']
skater_2010 = stats_2010[stats_2010['position'] != 'G']
skater_2009 = stats_2009[stats_2009['position'] != 'G']
skater_2008 = stats_2008[stats_2008['position'] != 'G']

In [36]:
pts_mean_2018 = skater_2018['points'].mean()
pts_std_2018 = skater_2018['points'].std()
high_pts_2018 = pts_mean_2018 + pts_std_2018

In [37]:
high_pts_2018

46.37545359549142

In [38]:
top_scorers_2018 = skater_2018[skater_2018['points'] > high_pts_2018]

In [40]:
top_scorers_2018

Unnamed: 0,age,assists,dps,es_assists,es_blocks,es_faceoff_losses,es_faceoff_pct,es_faceoff_wins,es_goals,es_hits,games_played,goals,gw_goals,ops,penalty_minutes,player,plus_minus,point_shares,points,position,pp_assists,pp_goals,sh_assists,sh_goals,shot_pct,shots,team,toi,toi_avg,year
0,24.0,35,1.9,25.0,30.0,121.0,46.5,105.0,26,115,77,34,3,6.4,14,Rickard Rakell,6.0,8.3,69,RW,10.0,8,0.0,0,14.8,230,Anaheim Ducks,1495,19:25,2018
1,32.0,50,2.1,36.0,57.0,443.0,47.8,406.0,10,96,56,11,0,4.0,42,Ryan Getzlaf,20.0,6.2,61,C,13.0,0,1.0,1,9.4,117,Anaheim Ducks,1200,21:26,2018
2,32.0,32,1.2,25.0,36.0,14.0,12.5,2.0,13,59,71,17,1,3.3,71,Corey Perry,-4.0,4.5,49,RW,7.0,4,0.0,0,10.1,168,Anaheim Ducks,1262,17:47,2018
30,19.0,42,1.7,27.0,28.0,41.0,47.4,37.0,17,7,82,23,3,4.9,24,Clayton Keller,-7.0,6.7,65,C,14.0,6,1.0,0,10.8,212,Arizona Coyotes,1483,18:05,2018
31,27.0,42,1.9,27.0,54.0,810.0,47.7,740.0,9,52,82,14,2,2.8,26,Derek Stepan,-7.0,4.7,56,C,15.0,3,0.0,2,6.7,209,Arizona Coyotes,1580,19:16,2018
56,29.0,51,2.5,33.0,25.0,76.0,39.7,50.0,25,37,68,34,8,8.4,63,Brad Marchand,25.0,11.0,85,LW,15.0,8,3.0,1,18.7,182,Boston Bruins,1341,19:43,2018
57,21.0,45,2.1,32.0,31.0,20.0,20.0,5.0,22,55,82,35,5,7.7,37,David Pastrnak,10.0,9.7,80,RW,13.0,13,0.0,0,14.2,246,Boston Bruins,1473,17:58,2018
58,32.0,33,2.3,21.0,42.0,584.0,57.3,784.0,18,48,64,30,6,6.0,26,Patrice Bergeron,21.0,8.3,63,C,11.0,10,1.0,2,13.4,224,Boston Bruins,1244,19:26,2018
59,26.0,45,3.3,26.0,73.0,0.0,0.0,0.0,8,79,76,14,4,5.6,36,Torey Krug,0.0,8.8,59,D,19.0,5,0.0,1,7.1,197,Boston Bruins,1551,20:24,2018
60,22.0,31,1.7,21.0,33.0,13.0,43.5,10.0,13,45,77,16,4,3.2,16,Danton Heinen,10.0,4.9,47,C/LW,10.0,2,0.0,1,11.9,135,Boston Bruins,1157,15:01,2018


In [None]:
for player in top_scorers_2018:
    print(player)