In [39]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Switch TensorFlow to eager mode: the confusion-matrix helper defined later in
# this notebook calls `.numpy()` directly on a TensorFlow result.
# NOTE(review): presumably this needs a TensorFlow 1.x install, where this
# top-level function exists -- confirm the pinned version.
tf.enable_eager_execution()

In [3]:
# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 1313
np.random.seed(SEED)
tf.set_random_seed(SEED)

In [4]:
# Load the team-level game statistics from the CSV under data/.
GAME_STATS_PATH = "data/game_teams_stats.csv"
game_stats = pd.read_csv(GAME_STATS_PATH)

In [5]:
# Preview the first rows of the raw table (one row per team per game_id).
game_stats.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1


In [6]:
# Split the table by the HoA flag into the home-side rows and the away-side rows.
home = game_stats.loc[game_stats['HoA'].eq('home')]
away = game_stats.loc[game_stats['HoA'].eq('away')]

In [7]:
# Pair each home row with the away row of the same game_id; columns that exist
# on both sides get the _home / _away suffix.
games = pd.merge(home, away, on='game_id', suffixes=('_home', '_away'))

In [8]:
# Preview the merged frame (home columns suffixed _home, away columns _away).
games.head()

Unnamed: 0,game_id,team_id_home,HoA_home,won_home,settled_in_home,head_coach_home,goals_home,shots_home,hits_home,pim_home,...,head_coach_away,goals_away,shots_away,hits_away,pim_away,powerPlayOpportunities_away,powerPlayGoals_away,faceOffWinPercentage_away,giveaways_away,takeaways_away
0,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,...,Peter DeBoer,3,26,31,12,3,1,44.9,6,7
1,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,...,Peter DeBoer,4,35,32,12,4,0,50.9,8,7
2,2011030223,1,home,True,OT,Peter DeBoer,4,31,30,10,...,Peter Laviolette,3,28,28,4,5,1,50.8,2,1
3,2011030224,1,home,True,REG,Peter DeBoer,4,43,19,4,...,Peter Laviolette,2,22,23,10,2,1,62.5,20,5
4,2011030225,4,home,False,REG,Peter Laviolette,1,28,38,8,...,Peter DeBoer,3,30,26,2,4,1,55.0,6,5


In [9]:
# List all column names of the merged frame.
print(games.columns)

Index(['game_id', 'team_id_home', 'HoA_home', 'won_home', 'settled_in_home',
       'head_coach_home', 'goals_home', 'shots_home', 'hits_home', 'pim_home',
       'powerPlayOpportunities_home', 'powerPlayGoals_home',
       'faceOffWinPercentage_home', 'giveaways_home', 'takeaways_home',
       'team_id_away', 'HoA_away', 'won_away', 'settled_in_away',
       'head_coach_away', 'goals_away', 'shots_away', 'hits_away', 'pim_away',
       'powerPlayOpportunities_away', 'powerPlayGoals_away',
       'faceOffWinPercentage_away', 'giveaways_away', 'takeaways_away'],
      dtype='object')


In [10]:
# Columns kept for modelling, in the order they will appear in the frame:
# game key and outcome fields first, then home-side stats, then away-side stats.
relevant_columns = (
    ['game_id', 'won_home', 'settled_in_home']
    + [
        'shots_home',
        'hits_home',
        'pim_home',
        'powerPlayOpportunities_home',
        'faceOffWinPercentage_home',
        'giveaways_home',
        'takeaways_home',
    ]
    + [
        'shots_away',
        'hits_away',
        'pim_away',
        'powerPlayOpportunities_away',
        'giveaways_away',
        'takeaways_away',
    ]
)

In [11]:
# Keep only the columns named in relevant_columns, in that order.
# Note: this rebinds `games`, so the wider merged frame is no longer reachable
# under this name after the cell runs.
games = games.loc[:, relevant_columns]

In [12]:
# Preview the trimmed frame.
games.head()

Unnamed: 0,game_id,won_home,settled_in_home,shots_home,hits_home,pim_home,powerPlayOpportunities_home,faceOffWinPercentage_home,giveaways_home,takeaways_home,shots_away,hits_away,pim_away,powerPlayOpportunities_away,giveaways_away,takeaways_away
0,2011030221,True,OT,36,27,6,6,55.1,13,4,26,31,12,3,6,7
1,2011030222,False,REG,20,24,32,5,49.1,9,6,35,32,12,4,8,7
2,2011030223,True,OT,31,30,10,2,49.2,11,4,28,28,4,5,2,1
3,2011030224,True,REG,43,19,4,5,37.5,5,3,22,23,10,2,20,5
4,2011030225,False,REG,28,38,8,1,45.0,14,7,30,26,2,4,6,5


In [13]:
def result(x):
    """Classify one game into a three-way outcome label.

    Parameters
    ----------
    x : sequence
        Two values in the order ``(won_home, settled_in_home)``, e.g. one row
        of those two columns. Only the position of the values matters; any
        index labels on ``x`` are ignored.

    Returns
    -------
    str
        ``'b_tie'`` when the game was settled in ``'OT'`` or ``'SO'``
        (whoever won), otherwise ``'a_home_win'`` if ``won_home`` is truthy
        and ``'c_away_win'`` if it is not. The letter prefixes make the labels
        sort alphabetically as home win, tie, away win.
    """
    # Materialise the values into a plain list before indexing. Indexing a
    # label-indexed pandas row with the integers 0/1 relies on a positional
    # fallback that pandas deprecates; a list is always positional.
    values = list(x)
    won_home, settled_in = values[0], values[1]

    if settled_in in ('OT', 'SO'):
        return 'b_tie'
    return 'a_home_win' if won_home else 'c_away_win'

In [14]:
# Label every game by applying `result` row by row to the two outcome columns.
games['result'] = games[['won_home', 'settled_in_home']].apply(func=result, axis=1)

In [20]:
# Feature matrix: every column from shots_home through takeaways_away (label slice,
# both ends included), as a NumPy array.
X = np.asarray(games.loc[:, 'shots_home':'takeaways_away'])
# Target: the outcome label encoded as integer category codes.
y = (
    games['result']
    .astype('category')
    .cat.codes
)

In [22]:
# Hold out 25% of the games for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1313,
)

In [23]:
# Sanity check: shape of the training feature matrix and of the training labels.
print(X_train.shape, y_train.shape)

(8575, 13) (8575,)


In [24]:
# Baseline classifier: logistic regression fitted on the training split.
# `fit` returns the estimator itself, so the name still refers to the fitted model.
model_logistic = LogisticRegression(random_state=1313)
model_logistic = model_logistic.fit(X_train, y_train)



In [25]:
# Accuracy of the logistic baseline on the held-out split.
print(model_logistic.score(X_test, y_test))

0.5082196572228052


In [27]:
# Predicted class code for every held-out game.
y_pred_logi = model_logistic.predict(X_test)

In [30]:
# How many held-out games the logistic model assigns to each class code
# (0, 1, 2), printed one count per line.
print(
    *(np.sum(np.equal(y_pred_logi, class_code)) for class_code in range(3)),
    sep='\n',
)

1691
203
965


In [32]:
def confusion_matrix(y_test, y_pred, classes=None):
    """Plot a row-normalised confusion matrix as a heatmap and return the counts.

    NOTE: this definition rebinds the name ``confusion_matrix`` that the
    notebook's import cell takes from ``sklearn.metrics``. Every later cell
    that uses the bare name gets this helper, not the scikit-learn function.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted labels. Each may be a 1-D array of integer class
        codes, or a 2-D array with one column per class (one-hot rows or
        per-class scores); 2-D inputs are reduced with ``argmax(axis=1)``.
    classes : sequence of str, optional
        Axis tick labels, in class-code order. When given, the matrix is
        forced to ``len(classes)`` rows/columns; when omitted, the integer
        codes themselves are used as tick labels.

    Returns
    -------
    numpy.ndarray
        The raw (un-normalised) count matrix, rows = true class,
        columns = predicted class.
    """
    def _to_codes(labels):
        # Accept both encodings: per-class columns collapse to class codes,
        # 1-D codes pass through unchanged.
        labels = np.asarray(labels)
        return labels.argmax(axis=1) if labels.ndim > 1 else labels

    num_classes = len(classes) if classes is not None else None
    con_mat = tf.math.confusion_matrix(
        _to_codes(y_test), _to_codes(y_pred), num_classes=num_classes
    ).numpy()
    if classes is None:
        classes = list(range(con_mat.shape[0]))

    # Normalise each row by its number of true samples; a class with no true
    # samples keeps an all-zero row instead of producing NaN from 0/0.
    row_totals = con_mat.sum(axis=1)[:, np.newaxis]
    con_mat_norm = np.around(
        np.divide(
            con_mat.astype('float'),
            row_totals,
            out=np.zeros(con_mat.shape, dtype=float),
            where=row_totals != 0,
        ),
        decimals=2,
    )
    con_mat_df = pd.DataFrame(con_mat_norm, index=classes, columns=classes)

    plt.figure(figsize=(4, 4))
    sns.heatmap(con_mat_df, annot=True, cmap=plt.cm.coolwarm)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    return con_mat

In [38]:
# Human-readable names for the three outcome classes, in class-code order (0, 1, 2).
classes = ['home win REG', 'OT/SO', 'away win REG']
# Per-class precision / recall / F1 for the logistic baseline.
# NOTE(review): `classes` is not passed to classification_report here, so the
# report rows are labelled with the integer codes rather than these names.
print(classification_report(y_test, y_pred_logi))

              precision    recall  f1-score   support

           0       0.51      0.72      0.60      1201
           1       0.48      0.14      0.22       695
           2       0.50      0.50      0.50       963

   micro avg       0.51      0.51      0.51      2859
   macro avg       0.50      0.46      0.44      2859
weighted avg       0.50      0.51      0.48      2859



In [40]:
# Raw confusion-matrix counts for the logistic baseline (rows = true class,
# columns = predicted class). scikit-learn's function is reached through the
# `sk_metrics` module alias because the bare name `confusion_matrix` is rebound
# earlier in this notebook by the plotting helper of the same name.
print(sk_metrics.confusion_matrix(y_test, y_pred_logi))

[[870  54 277]
 [394  98 203]
 [427  51 485]]


In [46]:
# Fully connected classifier: three ReLU hidden layers, dropout before the
# output, and a 3-way softmax (one unit per outcome class).
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(512, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(3, activation='softmax')
])
# The targets fed to this model are integer class codes (built with
# `.cat.codes`), not one-hot rows, so the sparse variant of the cross-entropy
# loss is required; plain 'categorical_crossentropy' expects one-hot targets.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [47]:
# Train for 10 passes over the training split, then report the compiled loss
# and metrics on the held-out split (the cell's displayed value).
model.fit(x=X_train, y=y_train, epochs=10)
model.evaluate(x=X_test, y=y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[1.0163273110845032, 0.49038124]

In [41]:
# Network output for every held-out game; later cells reduce it with
# argmax(axis=1), i.e. it has one column per class.
y_pred = model.predict(X_test)

In [37]:
# How many held-out games the network assigns to each class code (0, 1, 2),
# taking the highest-scoring column per row; one count per line.
print(
    *(np.sum(np.equal(y_pred.argmax(axis=1), class_code)) for class_code in range(3)),
    sep='\n',
)

1100
612
1147
