# Daily Predictions

Uses models in the pickle files to make predictions on daily data that is drawn in from daily lineups and linked with the most recent statistics from the original dataframe. This new dataframe is updated with current opponent/home/away and pushed through the respective models and output is displayed.

In [46]:
import pandas as pd
import numpy as np
import importlib
import os
import requests
import json
from lxml import etree
from lxml import html
from sklearn.model_selection import train_test_split
import joblib
from datetime import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
htmlparser =  etree.HTMLParser()

pd.options.mode.chained_assignment = None

import data_grab, data_proc, data_prep, data_explor

### Read in Daily Lineups

In [47]:
daily_url = "https://www.rotowire.com/hockey/nhl-lineups.php"
daily_results = requests.get(daily_url)
daily_results_tree = html.fromstring(daily_results.content)

In [48]:
away_teams_xpath = '/html/body/div[1]/div/main/div[3]//div//div//div//div//a[1]//div//text()'
away_teams = daily_results_tree.xpath(away_teams_xpath)

In [49]:
home_teams_xpath = '/html/body/div[1]/div/main/div[3]//div//div//div//div//a[2]//div//text()'
home_teams = daily_results_tree.xpath(home_teams_xpath)

In [50]:
nhl_teams = 'ANA ANH ARI BOS BUF CAR CGY CHI CLS CBJ COL DAL DET EDM FLA LA MIN MON NJ NSH NYI NYR OTT PHI PIT SEA SJ SJS STL TB TOR VAN VGK WAS WPG'.split()

In [51]:
away_teams = [x for x in away_teams if x in nhl_teams]
home_teams = [x for x in home_teams if x in nhl_teams]

In [52]:
games_away = [(x,y) for x,y in zip(away_teams, home_teams)]
games_home = [(x,y) for x,y in zip(home_teams, away_teams)]

games_dict_away = dict(games_away)
games_dict_home = dict(games_home)

### Read in Updated Dataframe

Read in the merged dataframe and then create new dataframe for players playing on teams playing today. Will then be able to generate the feature columns and use pickled files to predict their outputs. Simple.

In [53]:
## most recent update: 11-21-2022

In [54]:
importlib.reload(data_grab)
importlib.reload(data_prep)
importlib.reload(data_explor)
importlib.reload(data_proc)

<module 'data_proc' from '/Users/nickdimmitt/hockey/data_proc.py'>

In [55]:
end_date = "2022-11-19"
yesterday = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
today = datetime.today().strftime("%Y-%m-%d")

In [56]:
df_skaters = data_grab.main(yesterday, end_date, 'skater', "~/dfs/hockey/data/df_skaters.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_misc = data_grab.main(yesterday, end_date, 'misc', "~/dfs/hockey/data/df_skaters_misc.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_shot = data_grab.main(yesterday, end_date, 'shots', "~/dfs/hockey/data/df_skaters_shot.csv", 0, 10000, 100, update=True, saveData=False)
df_skate_toi = data_grab.main(yesterday, end_date, 'toi', "~/dfs/hockey/data/df_skaters_toi.csv", 0, 10000, 100, update=True, saveData=False)

In [57]:
df_merged = data_prep.main(df_skaters, df_skate_misc, df_skate_shot, df_skate_toi, ['gameId','playerId'], 'Unnamed: 0_y, goals_y, shots_y, gameDate_y, shootingPct_y, gamesPlayed_y, homeRoad_y, lastName_y, opponentTeamAbbrev_y, otGoals_y, positionCode_y, shootsCatches_y, skaterFullName_y, teamAbbrev_y, timeOnIcePerGame_y'.split(", "), ['Unnamed: 0', 'emptyNetAssists', 'emptyNetPoints', 'faceoffWinPct', 'shootingPctBackhand', 'shootingPctDeflected', 'shootingPctSlap', 'shootingPctSnap', 'shootingPctTipIn', 'shootingPctWrapAround', 'shootingPctWrist'], saveData=True)

  df = pd.merge(df1, df2, how='inner', left_index=True, right_index=True)


### Add Today's Rows

Either duplicate or generate rows for all the players playing today and append to bottom of dataframe.

In [58]:
today_home_df = df_merged[(df_merged['gameDate'] > "2022-11-01") & (df_merged['teamAbbrev'].isin(home_teams))]
today_away_df = df_merged[(df_merged['gameDate'] > "2022-11-01") & (df_merged['teamAbbrev'].isin(away_teams))]

In [59]:
today_home_df['gameDate'] = today
today_away_df['gameDate'] = today

today_home_df['homeRoad'] = 'H'
today_away_df['homeRoad'] = 'R'

In [60]:
today_away_df['teamAbbrev'].value_counts()


CGY    161
CAR    160
NYI    158
OTT    144
VGK    143
BOS    142
EDM    142
ARI    126
COL    126
Name: teamAbbrev, dtype: int64

In [61]:
today_home_df[['gamesPlayed', 'goals', 'evTimeOnIce', 'evTimeOnIcePerGame', 'otTimeOnIce', 'otTimeOnIcePerOtGame', 'goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap',
       'goalsTipIn', 'ppTimeOnIce',
       'ppTimeOnIcePerGame', 'shTimeOnIce', 'shTimeOnIcePerGame', 'shifts',
       'shiftsPerGame', 'goalsWrapAround', 'goalsWrist','shootingPct', 'shots', 'shotsOnNetBackhand',
       'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap',
       'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist','assists', 'evGoals', 'evPoints',
       'gameWinningGoals', 'otGoals', 'penaltyMinutes', 'plusMinus', 'points',
       'pointsPerGame', 'positionCode', 'ppGoals', 'ppPoints', 'shGoals',
       'shPoints', 'timeOnIcePerGame', 'blockedShots',
       'blockedShotsPer60', 'emptyNetGoals', 'firstGoals', 'giveaways',
       'giveawaysPer60', 'hits', 'hitsPer60', 'missedShotCrossbar',
       'missedShotGoalpost', 'missedShotOverNet', 'missedShotWideOfNet',
       'missedShots', 'takeaways', 'takeawaysPer60']] = 0

today_away_df[['gamesPlayed', 'goals', 'evTimeOnIce', 'evTimeOnIcePerGame', 'otTimeOnIce', 'otTimeOnIcePerOtGame', 'goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap',
       'goalsTipIn', 'ppTimeOnIce',
       'ppTimeOnIcePerGame', 'shTimeOnIce', 'shTimeOnIcePerGame', 'shifts',
       'shiftsPerGame', 'goalsWrapAround', 'goalsWrist','shootingPct', 'shots', 'shotsOnNetBackhand',
       'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap',
       'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist','assists', 'evGoals', 'evPoints',
       'gameWinningGoals', 'otGoals', 'penaltyMinutes', 'plusMinus', 'points',
       'pointsPerGame', 'positionCode', 'ppGoals', 'ppPoints', 'shGoals',
       'shPoints', 'timeOnIcePerGame', 'blockedShots',
       'blockedShotsPer60', 'emptyNetGoals', 'firstGoals', 'giveaways',
       'giveawaysPer60', 'hits', 'hitsPer60', 'missedShotCrossbar',
       'missedShotGoalpost', 'missedShotOverNet', 'missedShotWideOfNet',
       'missedShots', 'takeaways', 'takeawaysPer60']] = 0

In [62]:
today_away_df['opponentTeamAbbrev'] = today_away_df['teamAbbrev'].map(games_dict_away)
today_home_df['opponentTeamAbbrev'] = today_home_df['teamAbbrev'].map(games_dict_home)

In [63]:
today_df = pd.concat([today_home_df, today_away_df])
today_df.drop_duplicates(subset='playerId', inplace=True)

In [64]:
today_df['teamAbbrev'].value_counts()

STL    24
VAN    24
COL    24
TOR    22
PHI    22
NSH    22
BOS    22
OTT    22
CGY    22
WPG    21
CAR    21
EDM    21
DAL    20
NYI    20
VGK    19
ARI    19
Name: teamAbbrev, dtype: int64

In [65]:
df_merged = pd.concat([df_merged, today_df])

### Add Calculated Columns

In [66]:
df_merged['fanPoints'] = data_explor.fan_points(df_merged)
df_merged['overPerform'] = data_explor.overperform(df_merged, 'fanPoints', 'playerId')
df_merged['overPerformDummy'] = data_explor.over_perf_dummy(df_merged, 'overPerform')
df_merged['underPerformDummy'] = data_explor.under_perf_dummy(df_merged, 'overPerform')
df_merged['samePerfDummy'] = data_explor.same_perf_dummy(df_merged, 'overPerform')
df_merged['homeRoadPerf'] = data_explor.home_away_perf(df_merged, 'overPerform', ['playerId', 'homeRoad'])

In [67]:
better_home_skater = list(np.where((df_merged['homeRoad'] == 'H') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_away_skater = list(np.where((df_merged['homeRoad'] == 'R') & (df_merged['homeRoadPerf'] > 0), df_merged['playerId'], None))
better_home_skater = [*set(better_home_skater)]
better_away_skater = [*set(better_away_skater)]

In [68]:
df_skaters['OpHomeDummy'] = np.where(df_skaters['playerId'].isin(better_home_skater), 1, 0)
df_skaters['OpRoadDummy'] = np.where(df_skaters['playerId'].isin(better_away_skater), 1, 0)
df_skaters['OpNowhereDummy'] = np.where((df_skaters['OpHomeDummy'] == 0) & (df_skaters['OpRoadDummy'] == 0), 1, 0)

In [69]:
feature_list = ['assists', 'goals', 'plusMinus', 'points', 'ppPoints', 'fanPoints', 'shootingPct', 'shots', 'timeOnIcePerGame', 'ppTimeOnIce', 'timeOnIcePerShift']

In [70]:
for feature in feature_list:
    df_merged[f'{feature}Ma7'] = data_proc.moving_average(df_merged, feature, 'playerId', 7)
    df_merged[f'{feature}Ma7'] = df_merged[f'{feature}Ma7'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma3'] = data_proc.moving_average(df_merged, feature, 'playerId', 3)
    df_merged[f'{feature}Ma3'] = df_merged[f'{feature}Ma3'].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}LastGame'] = df_merged[feature].shift(1)
    
for feature in feature_list:
    df_merged[f'{feature}Ma10'] = data_proc.moving_average(df_merged, feature, 'playerId', 10)
    df_merged[f'{feature}Ma10'] = df_merged[f'{feature}Ma10'].shift(1)

for feature in feature_list:
    df_merged[f'{feature}Ma14'] = data_proc.moving_average(df_merged, feature, 'playerId', 14)
    df_merged[f'{feature}Ma14'] = df_merged[f'{feature}Ma14'].shift(1)

In [71]:
goals = ['goalsBackhand', 'goalsDeflected', 'goalsSlap', 'goalsSnap', 'goalsTipIn', 'goalsWrapAround', 'goalsWrist']

shots = ['shotsOnNetBackhand', 'shotsOnNetDeflected', 'shotsOnNetSlap', 'shotsOnNetSnap', 'shotsOnNetTipIn', 'shotsOnNetWrapAround', 'shotsOnNetWrist']

for goal in goals:
    df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')

for shot in shots:
    df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')

  df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')
  df_merged[f"%{goal}"] = data_proc.percShotType(df_merged, 'playerId', goal, 'goals')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')
  df_merged[f"%{shot}"] = data_proc.percShotType(df_merged, 'playerId', shot, 'shots')


### Cleaning

In [72]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()
drop_cols.append('Unnamed: 0')
drop_cols

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [73]:
impute_by_player = 'hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame fanPointsMa3 fanPointsMa7 ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa10 fanPointsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame fanPointsLastGame timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14'.split()
impute_by_perf = 'missedShotCrossbar missedShotGoalpost missedShotOverNet missedShotWideOfNet goalsBackhand goalsDeflected goalsSlap goalsSnap goalsTipIn goalsWrapAround goalsWrist shootingPct shotsOnNetBackhand shotsOnNetDeflected shotsOnNetSlap shotsOnNetSnap shotsOnNetTipIn shotsOnNetWrapAround shotsOnNetWrist hitsPer60 blockedShotsPer60 giveawaysPer60 takeawaysPer60 assistsMa3 assistsMa7 assistsMa10 assistsMa14 assistsLastGame goalsMa3 goalsMa7 goalsMa10 goalsMa14 goalsLastGame shotsMa3 shotsMa7 shotsMa10 shotsMa14 shotsLastGame plusMinusMa3 plusMinusMa7 plusMinusMa10 plusMinusMa14 plusMinusLastGame ppPointsLastGame ppPointsMa3 ppPointsMa7 ppPointsMa10 ppPointsMa14 timeOnIcePerGameMa3 timeOnIcePerGameMa7 timeOnIcePerGameMa10 timeOnIcePerGameMa14 timeOnIcePerGameLastGame pointsMa3 pointsMa7 pointsMa10 pointsMa14 pointsLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame fanPointsMa3 fanPointsMa7 fanPointsMa10 fanPointsMa14 fanPointsLastGame timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame'.split()

In [74]:
for col in impute_by_player:
    data_prep.handle_missing(df_merged, 'playerId', col)

In [75]:
for col in impute_by_perf:
    try:
        data_prep.handle_missing(df_merged, 'overPerformDummy', col)
    except:
        continue

### Final Scrub

In [76]:
drop_cols = 'shootingPctMa7 shootingPctMa3 shootingPctLastGame shootingPctMa10 shootingPctMa14'.split()

df_merged = data_prep.remove_columns(df_merged, drop_cols)

In [77]:
df_merged['homeRoad1'] = df_merged['homeRoad'].copy()
df_merged['positionCode1'] = df_merged['positionCode'].copy()

  df_merged['homeRoad1'] = df_merged['homeRoad'].copy()
  df_merged['positionCode1'] = df_merged['positionCode'].copy()


In [78]:
df_merged = pd.get_dummies(df_merged, columns=['homeRoad', 'shootsCatches', 'positionCode', 'opponentTeamAbbrev'])

## Pull Out Prediction DataFrame

Pull out the rows you want to predict values for and add them to a new dataframe. Drop their target columns.

In [79]:
predictable_df = df_merged[df_merged['gameDate'] == today]

#### Transform Predictable DataFrame

In [80]:
home_df = pd.concat([predictable_df[((predictable_df['homeRoadPerf'] > 0) & (predictable_df['homeRoad1'] == 'H'))], predictable_df[(predictable_df['homeRoadPerf'] <= 0) & (predictable_df['homeRoad1'] == 'R')]])
away_df = pd.concat([predictable_df[((predictable_df['homeRoadPerf'] > 0) & (predictable_df['homeRoad1'] == 'R'))], predictable_df[(predictable_df['homeRoadPerf'] <= 0) & (predictable_df['homeRoad1'] == 'H')]])

In [81]:
home_df.drop(['fanPoints', 'overPerform', 'overPerformDummy'], axis=1, inplace=True)
away_df.drop(['fanPoints', 'overPerform', 'overPerformDummy'], axis=1, inplace=True)

In [82]:
home_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'H'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'R')]])
away_model_df = pd.concat([df_merged[((df_merged['homeRoadPerf'] > 0) & (df_merged['homeRoad1'] == 'R'))], df_merged[(df_merged['homeRoadPerf'] <= 0) & (df_merged['homeRoad1'] == 'H')]])

### Train Models

Can skip this if already trained.

#### Train Home Model

In [83]:
features = '%shotsOnNetBackhand %shotsOnNetDeflected %shotsOnNetSlap %shotsOnNetSnap %shotsOnNetTipIn %shotsOnNetWrapAround %shotsOnNetWrist homeRoad_H positionCode_C positionCode_D positionCode_R assistsMa7 goalsMa7 plusMinusMa7 pointsMa7 fanPointsMa7 shotsMa7 assistsMa3 goalsMa3 plusMinusMa3 pointsMa3 fanPointsMa3 shotsMa3 assistsLastGame goalsLastGame plusMinusLastGame pointsLastGame fanPointsLastGame shotsLastGame assistsMa10 goalsMa10 plusMinusMa10 pointsMa10 fanPointsMa10 shotsMa10 assistsMa14 goalsMa14 plusMinusMa14 pointsMa14 fanPointsMa14 shotsMa14 timeOnIcePerShiftMa3 timeOnIcePerShiftMa7 timeOnIcePerShiftMa10 timeOnIcePerShiftMa14 timeOnIcePerShiftLastGame ppTimeOnIceMa3 ppTimeOnIceMa7 ppTimeOnIceMa10 ppTimeOnIceMa14 ppTimeOnIceLastGame'.split()
target = 'fanPoints'

In [84]:
X = home_model_df[features].values
y = home_model_df[target].values

In [85]:
scaler = MinMaxScaler()
model = scaler.fit(X)
scaled_X = model.transform(X)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.3, random_state=4)

In [87]:
model = ensemble.GradientBoostingRegressor(n_estimators=1500, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto', loss='huber')

In [88]:
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=6,
                          max_features='auto', min_samples_leaf=8,
                          min_samples_split=5, n_estimators=1500)

In [89]:
joblib.dump(model, 'nhl_home_fan_scaled.pkl')

['nhl_home_fan_scaled.pkl']

In [90]:
mse=mean_absolute_error(y_train, model.predict(X_train))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 5.40


In [91]:
mse=mean_absolute_error(y_test, model.predict(X_test))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 5.81


#### Train Away Model

In [92]:
X = away_model_df[features].values
y = away_model_df[target].values

In [93]:
scaler = MinMaxScaler()
model = scaler.fit(X)
scaled_X = model.transform(X)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.3, random_state=4)

In [95]:
model = ensemble.GradientBoostingRegressor(n_estimators=1500, learning_rate=0.05, max_depth=6, min_samples_split=5, min_samples_leaf=8, max_features='auto', loss='huber')

In [96]:
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=6,
                          max_features='auto', min_samples_leaf=8,
                          min_samples_split=5, n_estimators=1500)

In [97]:
joblib.dump(model, 'nhl_away_fan_scaled.pkl')

['nhl_away_fan_scaled.pkl']

In [98]:
mse=mean_absolute_error(y_train, model.predict(X_train))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.05


In [99]:
mse=mean_absolute_error(y_test, model.predict(X_test))
print("Training Set MSE: %.2f" %mse)

Training Set MSE: 4.85


### Applying Model

#### Home Split

In [133]:
scaler = MinMaxScaler()
model = scaler.fit(home_df[features].values)
scaled_X = model.transform(home_df[features])



In [151]:
X = home_df[features].values

In [152]:
model = joblib.load('nhl_home_fan.pkl')

In [153]:
predictions = model.predict(X)

In [154]:
home_df['prediction'] = predictions

In [155]:
home_df['prediction']

884551     9.219015
884552     9.948372
884553    12.150338
884554     3.591461
884557     9.135465
            ...    
887306     3.348030
887418     4.121558
887605     3.021518
887643     5.315027
887709     3.217179
Name: prediction, Length: 239, dtype: float64

#### Away Split

In [138]:
scaler = MinMaxScaler()
model = scaler.fit(away_df[features].values)
scaled_X = model.transform(away_df[features])



In [156]:
X = away_df[features].values

In [157]:
model = joblib.load('nhl_away_fan.pkl')

In [158]:
predictions = model.predict(X)

In [159]:
away_df['prediction'] = predictions

In [160]:
away_df['prediction']

884637    5.056829
884639    3.711435
884644    3.924253
884646    5.602734
884652    4.289475
            ...   
886868    2.622660
887188    4.208523
887489    4.045339
887576    6.366531
887811    2.715968
Name: prediction, Length: 106, dtype: float64

## Combine into Final Predictions

In [161]:
all_skaters_df = pd.concat([home_df, away_df])

In [163]:
final_pred_df = all_skaters_df[['skaterFullName', 'teamAbbrev', 'positionCode1', 'prediction']].sort_values(by='prediction', ascending=False)

In [165]:
final_pred_df.to_csv(f'predictions_{today}_scaled.csv')