# Daily Predictions

Uses models in the pickle files to make predictions on daily data that is drawn in from daily lineups and linked with the most recent statistics from the original dataframe. This new dataframe is updated with current opponent/home/away and pushed through the respective models and output is displayed.

In [2]:
import pandas as pd
import numpy as np
import importlib
import os
import requests
import json
from lxml import etree
from lxml import html
from sklearn.model_selection import train_test_split
import joblib
from datetime import *
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
htmlparser =  etree.HTMLParser()

pd.options.mode.chained_assignment = None

import data_grab, data_proc, data_prep, data_explor

### Read in Daily Lineups

In [3]:
daily_url = "https://www.rotowire.com/hockey/nhl-lineups.php"
daily_results = requests.get(daily_url)
daily_results_tree = html.fromstring(daily_results.content)

In [4]:
away_teams_xpath = '/html/body/div[1]/div/main/div[3]//div//div//div//div//a[1]//div//text()'
away_teams = daily_results_tree.xpath(away_teams_xpath)

In [5]:
home_teams_xpath = '/html/body/div[1]/div/main/div[3]//div//div//div//div//a[2]//div//text()'
home_teams = daily_results_tree.xpath(home_teams_xpath)

In [6]:
nhl_teams = 'ANA ARI BOS BUF CAR CGY CHI CLS CBJ COL DAL DET EDM FLA LA MIN MON NJ NSH NYI NYR OTT PHI PIT SEA SJ STL TB TOR VAN VGK WAS WPG'.split()

In [7]:
away_teams = [x for x in away_teams if x in nhl_teams]
home_teams = [x for x in home_teams if x in nhl_teams]

In [8]:
games_away = [(x,y) for x,y in zip(away_teams, home_teams)]
games_home = [(x,y) for x,y in zip(home_teams, away_teams)]

games_dict_away = dict(games_away)
games_dict_home = dict(games_home)

### Read in Updated Dataframe

Read in the merged dataframe and then create new dataframe for players playing on teams playing today. Will then be able to generate the feature columns and use pickled files to predict their outputs. Simple.

In [9]:
## most recent update: 11-01-2022

In [10]:
importlib.reload(data_grab)
importlib.reload(data_prep)
importlib.reload(data_explor)
importlib.reload(data_proc)

<module 'data_proc' from '/Users/nickdimmitt/hockey/data_proc.py'>

In [11]:
end_date = "2022-11-01"
yesterday = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
today = datetime.today().strftime("%Y-%m-%d")

In [12]:
df_skaters = data_grab.main(yesterday, end_date, 'skater', "~/dfs/hockey/data/df_skaters.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_misc = data_grab.main(yesterday, end_date, 'misc', "~/dfs/hockey/data/df_skaters_misc.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_shot = data_grab.main(yesterday, end_date, 'shots', "~/dfs/hockey/data/df_skaters_shot.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_toi = data_grab.main(yesterday, end_date, 'toi', "~/dfs/hockey/data/df_skaters_toi.csv", 0, 10000, 100, update=True, saveData=False)

In [13]:
df_merged = data_prep.main(df_skaters, df_skate_misc, df_skate_shot, df_skate_toi, ['gameId','playerId'], 'Unnamed: 0_y, goals_y, shots_y, gameDate_y, shootingPct_y, gamesPlayed_y, homeRoad_y, lastName_y, opponentTeamAbbrev_y, otGoals_y, positionCode_y, shootsCatches_y, skaterFullName_y, teamAbbrev_y, timeOnIcePerGame_y'.split(", "), ['Unnamed: 0', 'emptyNetAssists', 'emptyNetPoints', 'faceoffWinPct', 'shootingPctBackhand', 'shootingPctDeflected', 'shootingPctSlap', 'shootingPctSnap', 'shootingPctTipIn', 'shootingPctWrapAround', 'shootingPctWrist'], saveData=True)

  df = pd.merge(df1, df2, how='inner', left_index=True, right_index=True)


In [14]:
df_merged.shape

(586482, 69)

### Add Today's Rows

Either duplicate or generate rows for all the players playing today and append to bottom of dataframe.

In [15]:
home_teams

['CLS', 'CHI']

In [16]:
today_home_df = df_merged[(df_merged['gameDate'] > "2022-11-01") & (df_merged['teamAbbrev'].isin(home_teams + ['CBJ']))]
today_away_df = df_merged[(df_merged['gameDate'] > "2022-11-01") & (df_merged['teamAbbrev'].isin(away_teams))]

In [17]:
today_home_df['gameDate'] = today
today_away_df['gameDate'] = today

today_home_df['homeRoad'] = 'H'
today_away_df['homeRoad'] = 'A'

In [18]:
today_away_df['teamAbbrev'].value_counts()

FLA    142
PIT    141
Name: teamAbbrev, dtype: int64

In [19]:
today_home_df[['gamesPlayed', 'goals', 'evTimeOnIce', 'evTimeOnIcePerGame', 'otTimeOnIce', 'otTimeOnIcePerOtGame', 'goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap',
       'goalsTipIn', 'ppTimeOnIce',
       'ppTimeOnIcePerGame', 'shTimeOnIce', 'shTimeOnIcePerGame', 'shifts',
       'shiftsPerGame', 'goalsWrapAround', 'goalsWrist','shootingPct', 'shots', 'shotsOnNetBackhand',
       'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap',
       'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist','assists', 'evGoals', 'evPoints',
       'gameWinningGoals', 'otGoals', 'penaltyMinutes', 'plusMinus', 'points',
       'pointsPerGame', 'positionCode', 'ppGoals', 'ppPoints', 'shGoals',
       'shPoints', 'timeOnIcePerGame', 'blockedShots',
       'blockedShotsPer60', 'emptyNetGoals', 'firstGoals', 'giveaways',
       'giveawaysPer60', 'hits', 'hitsPer60', 'missedShotCrossbar',
       'missedShotGoalpost', 'missedShotOverNet', 'missedShotWideOfNet',
       'missedShots', 'takeaways', 'takeawaysPer60']] = 0

today_away_df[['gamesPlayed', 'goals', 'evTimeOnIce', 'evTimeOnIcePerGame', 'otTimeOnIce', 'otTimeOnIcePerOtGame', 'goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap',
       'goalsTipIn', 'ppTimeOnIce',
       'ppTimeOnIcePerGame', 'shTimeOnIce', 'shTimeOnIcePerGame', 'shifts',
       'shiftsPerGame', 'goalsWrapAround', 'goalsWrist','shootingPct', 'shots', 'shotsOnNetBackhand',
       'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap',
       'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist','assists', 'evGoals', 'evPoints',
       'gameWinningGoals', 'otGoals', 'penaltyMinutes', 'plusMinus', 'points',
       'pointsPerGame', 'positionCode', 'ppGoals', 'ppPoints', 'shGoals',
       'shPoints', 'timeOnIcePerGame', 'blockedShots',
       'blockedShotsPer60', 'emptyNetGoals', 'firstGoals', 'giveaways',
       'giveawaysPer60', 'hits', 'hitsPer60', 'missedShotCrossbar',
       'missedShotGoalpost', 'missedShotOverNet', 'missedShotWideOfNet',
       'missedShots', 'takeaways', 'takeawaysPer60']] = 0

In [89]:
today_away_df['opponentTeamAbbrev'] = today_away_df['teamAbbrev'].map(games_dict_away)
today_home_df['opponentTeamAbbrev'] = today_home_df['teamAbbrev'].map(games_dict_home)

In [20]:
today_df = pd.concat([today_home_df, today_away_df])
today_df.drop_duplicates(subset='playerId', inplace=True)

In [21]:
df_merged = pd.concat([df_merged, today_df])

### Add Calculated Columns

In [22]:
df_merged['fanPoints'] = data_explor.fan_points(df_merged)
df_merged['overPerform'] = data_explor.overperform(df_merged, 'fanPoints', 'playerId')
df_merged['overPerformDummy'] = data_explor.over_perf_dummy(df_merged, 'overPerform')
df_merged['underPerformDummy'] = data_explor.under_perf_dummy(df_merged, 'overPerform')
df_merged['samePerfDummy'] = data_explor.same_perf_dummy(df_merged, 'overPerform')
df_merged['homeRoadPerf'] = data_explor.home_away_perf(df_merged, 'overPerform', ['playerId', 'homeRoad'])

In [23]:
better_home_skater = list(np.where((df_merged['homeRoad'] == 'H') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_away_skater = list(np.where((df_merged['homeRoad'] == 'R') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_home_skater = [*set(better_home_skater)]
better_away_skater = [*set(better_away_skater)]

In [24]:
df_skaters['OpHomeDummy'] = np.where(df_skaters['playerId'].isin(better_home_skater), 1, 0)
df_skaters['OpRoadDummy'] = np.where(df_skaters['playerId'].isin(better_away_skater), 1, 0)
df_skaters['OpNowhereDummy'] = np.where((df_skaters['OpHomeDummy'] == 0) & (df_skaters['OpRoadDummy'] == 0), 1, 0)

In [25]:
feature_list = ['assists', 'goals', 'plusMinus', 'points', 'ppPoints', 'fanPoints', 'shootingPct', 'shots', 'timeOnIcePerGame', 'ppTimeOnIce', 'timeOnIcePerShift']

In [26]:
for feature in feature_list:
    df_merged[f'{feature}Ma7'] = data_proc.moving_average(df_merged, feature, 'playerId', 7)
    df_merged[f'{feature}Ma7'] = df_merged[f'{feature}Ma7'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma3'] = data_proc.moving_average(df_merged, feature, 'playerId', 3)
    df_merged[f'{feature}Ma3'] = df_merged[f'{feature}Ma3'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}LastGame'] = df_merged[feature].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma10'] = data_proc.moving_average(df_merged, feature, 'playerId', 10)
    df_merged[f'{feature}Ma10'] = df_merged[f'{feature}Ma10'].shift(1)

for feature in feature_list:
    df_merged[f'{feature}Ma14'] = data_proc.moving_average(df_merged, feature, 'playerId', 14)
    df_merged[f'{feature}Ma14'] = df_merged[f'{feature}Ma14'].shift(1)

In [27]:
goals = ['goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap', 'goalsTipIn', 'goalsWrapAround', 'goalsWrist']

shots = ['shotsOnNetBackhand', 'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap', 'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist']

for goal in goals:
    df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')

for shot in shots:
    df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')

  df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')
  df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')


### Cleaning

In [28]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()
drop_cols.append('Unnamed: 0')
drop_cols

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [29]:
impute_by_player = 'hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame fanPointsMa3 fanPointsMa7 ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa10 fanPointsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame fanPointsLastGame timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14'.split()
impute_by_perf = 'missedShotCrossbar missedShotGoalpost missedShotOverNet missedShotWideOfNet goalsBackhand goalsDeflected goalsSlap goalsSnap goalsTipIn goalsWrapAround goalsWrist shootingPct shotsOnNetBackhand shotsOnNetDeflected shotsOnNetSlap shotsOnNetSnap shotsOnNetTipIn shotsOnNetWrapAround shotsOnNetWrist hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14 timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa3 fanPointsMa7 fanPointsMa10 fanPointsMa14 fanPointsLastGame timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame'.split()

In [30]:
for col in impute_by_player:
    data_prep.handle_missing(df_merged, 'playerId', col)

In [31]:
for col in impute_by_perf:
    try:
        data_prep.handle_missing(df_merged, 'overPerformDummy', col)
    except:
        continue

### Final Scrub, Rinse, and Split

In [32]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [33]:
df_merged['homeRoad1'] = df_merged['homeRoad'].copy()
df_merged['positionCode1'] = df_merged['positionCode'].copy()

  df_merged['homeRoad1'] = df_merged['homeRoad'].copy()
  df_merged['positionCode1'] = df_merged['positionCode'].copy()


In [34]:
df_merged = pd.get_dummies(df_merged, columns=['homeRoad', 'shootsCatches', 'positionCode', 'opponentTeamAbbrev'])

## Pull Out Prediction DataFrame

Pull out the rows you want to predict values for and add them to a new dataframe. Drop their target columns.

In [35]:
predictable_df = df_merged[df_merged['gameDate'] == today]

In [36]:
predictable_df['teamAbbrev'].value_counts()

CBJ    24
PIT    23
CHI    21
FLA    21
Name: teamAbbrev, dtype: int64

#### Transform Predictable DataFrame

In [37]:
home_df = pd.concat([predictable_df[((predictable_df['homeRoadPerf'] > 0) & (predictable_df['homeRoad1'] == 'H'))], predictable_df[(predictable_df['homeRoadPerf'] <= 0) & (predictable_df['homeRoad1'] == 'R')]])
away_df = pd.concat([predictable_df[((predictable_df['homeRoadPerf'] > 0) & (predictable_df['homeRoad1'] == 'R'))], predictable_df[(predictable_df['homeRoadPerf'] <= 0) & (predictable_df['homeRoad1'] == 'H')]])

In [38]:
home_df.drop(['fanPoints', 'overPerform', 'overPerformDummy'], axis=1, inplace=True)
away_df.drop(['fanPoints', 'overPerform', 'overPerformDummy'], axis=1, inplace=True)

In [39]:
home_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'H'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'R')]])
away_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'R'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'H')]])

(0, 199)

### Train Models

Can skip this if already trained.

#### Train Home Model

In [53]:
features = '%shotsOnNetBackhand %shotsOnNetDeflected %shotsOnNetSlap %shotsOnNetSnap %shotsOnNetTipIn %shotsOnNetWrapAround %shotsOnNetWrist homeRoad_H positionCode_C positionCode_D positionCode_R assistsMa7 goalsMa7 plusMinusMa7 pointsMa7 fanPointsMa7 shotsMa7 assistsMa3 goalsMa3 plusMinusMa3 pointsMa3 fanPointsMa3 shotsMa3 assistsLastGame goalsLastGame plusMinusLastGame pointsLastGame fanPointsLastGame shotsLastGame assistsMa10 goalsMa10 plusMinusMa10 pointsMa10 fanPointsMa10 shotsMa10 assistsMa14 goalsMa14 plusMinusMa14 pointsMa14 fanPointsMa14 shotsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame'.split()
target = 'fanPoints'

In [54]:
X = home_model_df[features].values
y = home_model_df[target].values

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [56]:
model = ensemble.GradientBoostingRegressor(n_estimators=1250, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto', loss='huber')

In [57]:
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=6,
                          max_features='auto', min_samples_leaf=8,
                          min_samples_split=5, n_estimators=1250)

In [58]:
joblib.dump(model, 'nhl_home_fan.pkl')

['nhl_home_fan.pkl']

In [59]:
mse=mean_absolute_error(y_train, model.predict(X_train))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 5.48


In [60]:
mse=mean_absolute_error(y_test, model.predict(X_test))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 5.79


#### Train Away Model

In [61]:
X = away_model_df[features].values
y = away_model_df[target].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=234)

In [63]:
model = ensemble.GradientBoostingRegressor(n_estimators=1250, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto', loss='huber')

In [64]:
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=6,
                          max_features='auto', min_samples_leaf=8,
                          min_samples_split=5, n_estimators=1250)

In [65]:
joblib.dump(model, 'nhl_away_fan.pkl')

['nhl_away_fan.pkl']

In [66]:
mse=mean_absolute_error(y_train, model.predict(X_train))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.15


In [67]:
mse=mean_absolute_error(y_test, model.predict(X_test))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.85


### Applying Model

#### Home Split

In [68]:
model = joblib.load('nhl_home_fan.pkl')

In [69]:
predictions = model.predict(home_df[features].values)

In [70]:
home_df['prediction'] = predictions

#### Away Split

In [71]:
model = joblib.load('nhl_away_fan.pkl')

In [72]:
predictions = model.predict(away_df[features].values)

In [73]:
away_df['prediction'] = predictions

## Combine into Final Predictions

In [74]:
all_skaters_df = pd.concat([home_df, away_df])

In [75]:
final_pred_df = all_skaters_df[['skaterFullName', 'teamAbbrev', 'prediction']].sort_values(by='prediction', ascending=False)

In [76]:
final_pred_df.to_csv(f'predictions_{today}.csv')

In [77]:
final_pred_df

Unnamed: 0,skaterFullName,teamAbbrev,prediction
886559,Patrik Laine,CBJ,12.926496
886296,Patrick Kane,CHI,11.74191
886295,Jonathan Toews,CHI,10.585905
886546,Johnny Gaudreau,CBJ,10.471465
886556,Zach Werenski,CBJ,9.950972
886548,Boone Jenner,CBJ,9.731196
886542,Jakub Voracek,CBJ,9.278222
886293,Jack Johnson,CHI,8.942956
886305,Max Domi,CHI,8.205111
886543,Gustav Nyquist,CBJ,7.874062
