In [31]:
import pandas as pd
import numpy as np
import importlib
import os
import requests
import json
from lxml import etree
from lxml import html
from sklearn.model_selection import train_test_split
import joblib
from datetime import *
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
htmlparser =  etree.HTMLParser()
from sklearn.metrics import classification_report, confusion_matrix

pd.options.mode.chained_assignment = None

import data_grab, data_proc, data_prep, data_explor

In [9]:
df_merged = pd.read_csv('data/merged_df.csv')
df_skaters = pd.read_csv('data/df_skater.csv')

### Add Calculated Columns

In [10]:
df_merged['fanPoints'] = data_explor.fan_points(df_merged)
df_merged['overPerform'] = data_explor.overperform(df_merged, 'fanPoints', 'playerId')
df_merged['overPerformDummy'] = data_explor.over_perf_dummy(df_merged, 'overPerform')
df_merged['underPerformDummy'] = data_explor.under_perf_dummy(df_merged, 'overPerform')
df_merged['samePerfDummy'] = data_explor.same_perf_dummy(df_merged, 'overPerform')
df_merged['homeRoadPerf'] = data_explor.home_away_perf(df_merged, 'overPerform', ['playerId', 'homeRoad'])
df_merged['goalDummy'] = np.where(df_merged['goals'] > 0, 1, 0)

In [11]:
better_home_skater = list(np.where((df_merged['homeRoad'] == 'H') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_away_skater = list(np.where((df_merged['homeRoad'] == 'R') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_home_skater = [*set(better_home_skater)]
better_away_skater = [*set(better_away_skater)]

In [12]:
df_skaters['OpHomeDummy'] = np.where(df_skaters['playerId'].isin(better_home_skater), 1, 0)
df_skaters['OpRoadDummy'] = np.where(df_skaters['playerId'].isin(better_away_skater), 1, 0)
df_skaters['OpNowhereDummy'] = np.where((df_skaters['OpHomeDummy'] == 0) & (df_skaters['OpRoadDummy'] == 0), 1, 0)

In [13]:
feature_list = ['assists', 'goals', 'plusMinus', 'points', 'ppPoints', 'fanPoints', 'shootingPct', 'shots', 'timeOnIcePerGame', 'ppTimeOnIce', 'timeOnIcePerShift']

In [14]:
for feature in feature_list:
    df_merged[f'{feature}Ma7'] = data_proc.moving_average(df_merged, feature, 'playerId', 7)
    df_merged[f'{feature}Ma7'] = df_merged[f'{feature}Ma7'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma3'] = data_proc.moving_average(df_merged, feature, 'playerId', 3)
    df_merged[f'{feature}Ma3'] = df_merged[f'{feature}Ma3'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}LastGame'] = df_merged[feature].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma10'] = data_proc.moving_average(df_merged, feature, 'playerId', 10)
    df_merged[f'{feature}Ma10'] = df_merged[f'{feature}Ma10'].shift(1)

for feature in feature_list:
    df_merged[f'{feature}Ma14'] = data_proc.moving_average(df_merged, feature, 'playerId', 14)
    df_merged[f'{feature}Ma14'] = df_merged[f'{feature}Ma14'].shift(1)

In [15]:
goals = ['goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap', 'goalsTipIn', 'goalsWrapAround', 'goalsWrist']

shots = ['shotsOnNetBackhand', 'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap', 'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist']

for goal in goals:
    df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')

for shot in shots:
    df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')

### Cleaning

In [16]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()
drop_cols.append('Unnamed: 0')
drop_cols

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [17]:
impute_by_player = 'hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame fanPointsMa3 fanPointsMa7 ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa10 fanPointsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame fanPointsLastGame timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14'.split()
impute_by_perf = 'missedShotCrossbar missedShotGoalpost missedShotOverNet missedShotWideOfNet goalsBackhand goalsDeflected goalsSlap goalsSnap goalsTipIn goalsWrapAround goalsWrist shootingPct shotsOnNetBackhand shotsOnNetDeflected shotsOnNetSlap shotsOnNetSnap shotsOnNetTipIn shotsOnNetWrapAround shotsOnNetWrist hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14 timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa3 fanPointsMa7 fanPointsMa10 fanPointsMa14 fanPointsLastGame timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame'.split()

In [18]:
for col in impute_by_player:
    data_prep.handle_missing(df_merged, 'playerId', col)

In [19]:
for col in impute_by_perf:
    try:
        data_prep.handle_missing(df_merged, 'overPerformDummy', col)
    except:
        continue

### Final Scrub, Rinse, and Split

In [20]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [21]:
df_merged['homeRoad1'] = df_merged['homeRoad'].copy()
df_merged['positionCode1'] = df_merged['positionCode'].copy()

  df_merged['positionCode1'] = df_merged['positionCode'].copy()


In [22]:
df_merged = pd.get_dummies(df_merged, columns=['homeRoad', 'shootsCatches', 'positionCode', 'opponentTeamAbbrev'])

In [23]:
home_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'H'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'R')]])
away_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'R'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'H')]])

#### Train Home Model

In [24]:
features = '%shotsOnNetBackhand %shotsOnNetDeflected %shotsOnNetSlap %shotsOnNetSnap %shotsOnNetTipIn %shotsOnNetWrapAround %shotsOnNetWrist homeRoad_H positionCode_C positionCode_D positionCode_R assistsMa7 goalsMa7 plusMinusMa7 pointsMa7 fanPointsMa7 shotsMa7 assistsMa3 goalsMa3 plusMinusMa3 pointsMa3 fanPointsMa3 shotsMa3 assistsLastGame goalsLastGame plusMinusLastGame pointsLastGame fanPointsLastGame shotsLastGame assistsMa10 goalsMa10 plusMinusMa10 pointsMa10 fanPointsMa10 shotsMa10 assistsMa14 goalsMa14 plusMinusMa14 pointsMa14 fanPointsMa14 shotsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame'.split()
target = 'goalDummy'

In [25]:
X = home_model_df[features].values
y = home_model_df[target].values

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [29]:
model = ensemble.GradientBoostingClassifier(n_estimators=5000, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto')

In [30]:
model.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.05, max_depth=6, max_features='auto',
                           min_samples_leaf=8, min_samples_split=5,
                           n_estimators=5000)

In [32]:
joblib.dump(model, 'nhl_home_goals.pkl')

['nhl_home_goals.pkl']

In [36]:
model.score(X_train, y_train)

0.9027782404477165

In [37]:
model.score(X_test, y_test)

0.8453915152268876

In [35]:
home_model_df.to_csv("home_model_df.csv")
away_model_df.to_csv("away_model_df.csv")

home_model_df Train Away Model

In [61]:
X = away_model_df[features].values
y = away_model_df[target].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=234)

In [63]:
model = ensemble.GradientBoostingRegressor(n_estimators=1250, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto', loss='huber')

In [64]:
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=6,
                          max_features='auto', min_samples_leaf=8,
                          min_samples_split=5, n_estimators=1250)

In [65]:
joblib.dump(model, 'nhl_away_fan.pkl')

['nhl_away_fan.pkl']

In [66]:
mse=mean_absolute_error(y_train, model.predict(X_train))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.15


In [67]:
mse=mean_absolute_error(y_test, model.predict(X_test))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.85
