# PART 3: Model Development

## C Model

In [3]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)

working_directory = 'D:/machine_learning/DFS/NHL/'
os.chdir(working_directory)
data_dir = 'Data/' #Where is your data located?
etl_dir = 'Data/ETL/' #Where is your output data going?

player_stats = pd.read_csv('Data/alldata_2016-2022.csv', index_col = 0) #Read In Our Main Dataset
c_df = pd.read_csv(etl_dir + 'c_stats.csv', index_col = 0)

# ordinal encode HomeorAway column
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
c_df['HomeOrAway'] = encoder.fit_transform(c_df['HomeOrAway'].to_numpy().reshape(-1, 1))

c_df = c_df.rename(columns={'Team_x' : 'Team'})

# convert date from object dtype to datetime
c_df['Date'] = pd.to_datetime(c_df['Date'], format = '%Y%m%d')
c_df['Date'] = c_df['Date'].dt.strftime('%Y%m%d')

#C DK PTS Rank For The Given Season & Date Pair
c_df['Act_C_FPRank'] = c_df.groupby(['Season','Date'])['FantasyPointsFanDuel'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Name','Act_C_FPRank']

#Make sure we have no duplicated columns or infinity errors
c_df = c_df.loc[:,~c_df.columns.duplicated()]
c_df= c_df.replace([np.inf, -np.inf], np.nan)

#Columns We Can't Include In Our Features Datasets
dcols = ['TeamID', 'PlayerID', 'Team', 'Position', 'Games', 'Started', 
         'Goals', 'Assists', 'Points', 'PlusMinus', 'HatTricks',
       'PenaltyMinutes', 'PowerPlayGoals', 'ShortHandedGoals', 'ShotsOnGoal',
       'Blocks',
       'Month', 'Year'
]


# g_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = c_df.drop(dcols, axis = 1)
Y = c_df['FantasyPointsFanDuel']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

more_dcols = ['Season', 'Date', 'Name', 'Opponent', 'FantasyPointsFanDuel', 'Act_C_FPRank']

X_train.drop(more_dcols, axis = 1, inplace = True)
X_test.drop(more_dcols, axis = 1, inplace = True)

# dump non-scaled train df for external scaling to work
filename = 'scalers/c_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling
from sklearn.preprocessing import StandardScaler
c_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(c_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(c_scaler.transform(X_test), columns = X_test.columns)
filename = 'scalers/c_scaler.pkl'
joblib.dump(c_scaler, filename)

print('\nNum Possible Features:',len(X_train.columns.tolist()))

  exec(code_obj, self.user_global_ns, self.user_ns)


Training set size: 59307
Testing set size: 19769

Num Possible Features: 90


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [4]:
""" MODEL SELECTION """

model = lgbm_mod()

"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['Season','Date','Opponent','Name','FantasyPointsFanDuel']].copy()

pdf['Pred_FP_all'] = preds_all

# save the initial model to disk
filename = 'models/LGBM_models/C_model_allfeats.pkl'
joblib.dump(model, filename) 

# get top 50 features
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# Using Top 50 Features, Find Top 30 Features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# Using Top 30 Features, Find Top 20 Features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

#Perform RFE (recursive feature elimination) using Top 20 Features, To Find Top 15
rfe_model = RFE(model, n_features_to_select = 10)
rfe_model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols10= dset[dset['importance']==1]['attr'].tolist()

print('T50 features', attr50, '\n')
print('T30 features', attr30, '\n')
print('T20 features', attr20, '\n')
print('T10 features',cols10, '\n')

model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])
filename = 'models/LGBM_models/C_model_50feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])
filename = 'models/LGBM_models/C_model_30feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])
filename = 'models/LGBM_models/C_model_20feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[cols10], Y_train)
preds10 = model.predict(X_test[cols10])
filename = 'models/LGBM_models/C_model_10feats.pkl'
joblib.dump(model, filename) 

# pdf = pred_df[['Season','Week','Team','Defense','PlayerID','Name','Act_G_DKPtsRank','Act_G_DKPts']].copy()
pdf['Pred_FP_50'] = preds50
pdf['Pred_FP_30'] = preds30
pdf['Pred_FP_20'] = preds20
pdf['Pred_FP_10'] = preds10
pdf.to_csv(etl_dir + 'c_predictions_lgbm_50_30_20_10.csv')

feature_sets = ['all', '50', '30', '20', '10']

mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds_all)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds50)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds30)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds20)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds10))
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

results_df.style.hide_index()
display(results_df)

possible features: ['HomeOrAway', 'GM3', 'GM_pg3', 'G3', 'G_pg3', 'A3', 'A_pg3', 'PTS3', 'PTS_pg3', 'plusminus3', 'plusminus_pg3', 'HT3', 'HT_pg3', 'PIM3', 'PIM_pg3', 'PPG3', 'PPG_pg3', 'SHG3', 'SHG_pg3', 'SOG3', 'SOG_pg3', 'BLK3', 'BLK_pg3', 'FP3', 'FP_pg3', 'GM', 'G', 'G_pg', 'A', 'A_pg', 'PTS', 'PTS_pg', 'PM', 'PM_pg', 'HT', 'HT_pg', 'PIM', 'PIM_pg', 'PPG', 'PPG_pg', 'SHG_pg', 'SOG', 'SOG_pg', 'BLK', 'BLK_pg', 'FP', 'FP_pg', 'def_G3', 'def_G_pg3', 'def_A3', 'def_A_pg3', 'def_PTS3', 'def_PTS_pg3', 'def_plusminus3', 'def_plusminus_pg3', 'def_HT3', 'def_HT_pg3', 'def_PIM3', 'def_PIM_pg3', 'def_PPG3', 'def_PPG_pg3', 'def_SHG3', 'def_SHG_pg3', 'def_SOG3', 'def_SOG_pg3', 'def_BLK3', 'def_BLK_pg3', 'def_FP3', 'def_FP_pg3', 'def_G', 'def_G_pg', 'def_A', 'def_A_pg', 'def_PTS', 'def_PTS_pg', 'def_PM', 'def_PM_pg', 'def_HT', 'def_HT_pg', 'def_PIM', 'def_PIM_pg', 'def_PPG', 'def_PPG_pg', 'def_SHG_pg', 'def_SOG', 'def_SOG_pg', 'def_BLK', 'def_BLK_pg', 'def_FP', 'def_FP_pg'] 

T50 features ['SOG_

Unnamed: 0,Features,MAE
0,all,5.86
1,50,5.86
2,30,5.87
3,20,5.88
4,10,5.88


In [5]:
pdf[['Name', 'FantasyPointsFanDuel', 'Pred_FP_all', 'Pred_FP_50', 'Pred_FP_30', 'Pred_FP_20', 'Pred_FP_10']]

Unnamed: 0,Name,FantasyPointsFanDuel,FantasyPointsFanDuel.1,Pred_FP_all,Pred_FP_50,Pred_FP_30,Pred_FP_20,Pred_FP_10
0,J.T. Miller,1.6,1.6,7.688846,7.630887,7.670751,7.906294,7.290532
1,Reid Boucher,0.0,0.0,3.683440,3.764972,4.130186,4.074728,4.066100
2,Noah Gregor,13.6,13.6,3.803863,3.811769,3.554098,3.569678,3.673112
3,Sidney Crosby,3.2,3.2,21.151927,22.625053,23.908379,21.425370,22.462075
4,Patrick Marleau,3.2,3.2,7.712842,7.707133,7.388110,7.255552,6.958360
...,...,...,...,...,...,...,...,...
19764,Eric Staal,23.7,23.7,9.565264,9.236853,9.412636,9.804759,8.678860
19765,Brock Nelson,1.6,1.6,7.855819,7.850587,7.852205,7.197587,7.222626
19766,Travis Boyd,9.6,9.6,4.652252,4.475292,4.253412,4.358835,4.281073
19767,Tyler Seguin,6.4,6.4,13.021267,12.577750,13.322414,15.065061,14.942527


## W Model

In [6]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)

working_directory = 'D:/machine_learning/DFS/NHL/'
os.chdir(working_directory)
data_dir = 'Data/' #Where is your data located?
etl_dir = 'Data/ETL/' #Where is your output data going?

player_stats = pd.read_csv('Data/alldata_2016-2022.csv', index_col = 0) #Read In Our Main Dataset
w_df = pd.read_csv(etl_dir + 'w_stats.csv', index_col = 0)

# ordinal encode HomeorAway column
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
w_df['HomeOrAway'] = encoder.fit_transform(w_df['HomeOrAway'].to_numpy().reshape(-1, 1))

w_df = w_df.rename(columns={'Team_x' : 'Team'})

# convert date from object dtype to datetime
w_df['Date'] = pd.to_datetime(w_df['Date'], format = '%Y%m%d')
w_df['Date'] = w_df['Date'].dt.strftime('%Y%m%d')

#W DK PTS Rank For The Given Season & Date Pair
w_df['Act_W_FPRank'] = w_df.groupby(['Season','Date'])['FantasyPointsFanDuel'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Name','Act_W_FPRank']

#Make sure we have no duplicated columns or infinity errors
w_df = w_df.loc[:,~w_df.columns.duplicated()]
w_df= w_df.replace([np.inf, -np.inf], np.nan)

#Columns We Can't Include In Our Features Datasets
dcols = ['TeamID', 'PlayerID', 'Team', 'Position', 'Games', 'Started', 
         'Goals', 'Assists', 'Points', 'PlusMinus', 'HatTricks',
       'PenaltyMinutes', 'PowerPlayGoals', 'ShortHandedGoals', 'ShotsOnGoal',
       'Blocks',
       'Month', 'Year'
]


# g_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = w_df.drop(dcols, axis = 1)
Y = w_df['FantasyPointsFanDuel']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

more_dcols = ['Season', 'Date', 'Name', 'Opponent', 'FantasyPointsFanDuel', 'Act_W_FPRank']

X_train.drop(more_dcols, axis = 1, inplace = True)
X_test.drop(more_dcols, axis = 1, inplace = True)

# dump non-scaled train df for external scaling to work
filename = 'scalers/w_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling
from sklearn.preprocessing import StandardScaler
w_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(w_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(w_scaler.transform(X_test), columns = X_test.columns)
filename = 'scalers/w_scaler.pkl'
joblib.dump(w_scaler, filename)

print('\nNum Possible Features:',len(X_train.columns.tolist()))

  exec(code_obj, self.user_global_ns, self.user_ns)


Training set size: 60550
Testing set size: 20184

Num Possible Features: 90


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [7]:
""" MODEL SELECTION """

model = lgbm_mod()

"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['Season','Date','Opponent','Name','FantasyPointsFanDuel']].copy()

pdf['Pred_FP_all'] = preds_all

# save the initial model to disk
filename = 'models/LGBM_models/W_model_allfeats.pkl'
joblib.dump(model, filename) 

# get top 50 features
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# Using Top 50 Features, Find Top 30 Features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# Using Top 30 Features, Find Top 20 Features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

#Perform RFE (recursive feature elimination) using Top 20 Features, To Find Top 15
rfe_model = RFE(model, n_features_to_select = 10)
rfe_model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols10= dset[dset['importance']==1]['attr'].tolist()

print('T50 features', attr50, '\n')
print('T30 features', attr30, '\n')
print('T20 features', attr20, '\n')
print('T10 features',cols10, '\n')

model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])
filename = 'models/LGBM_models/W_model_50feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])
filename = 'models/LGBM_models/W_model_30feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])
filename = 'models/LGBM_models/W_model_20feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[cols10], Y_train)
preds10 = model.predict(X_test[cols10])
filename = 'models/LGBM_models/W_model_10feats.pkl'
joblib.dump(model, filename) 

# pdf = pred_df[['Season','Week','Team','Defense','PlayerID','Name','Act_W_DKPtsRank','Act_W_DKPts']].copy()
pdf['Pred_FP_50'] = preds50
pdf['Pred_FP_30'] = preds30
pdf['Pred_FP_20'] = preds20
pdf['Pred_FP_10'] = preds10
pdf.to_csv(etl_dir + 'w_predictions_lgbm_50_30_20_10.csv')

feature_sets = ['all', '50', '30', '20', '10']

mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds_all)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds50)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds30)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds20)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds10))
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

results_df.style.hide_index()
display(results_df)

possible features: ['HomeOrAway', 'GM3', 'GM_pg3', 'G3', 'G_pg3', 'A3', 'A_pg3', 'PTS3', 'PTS_pg3', 'plusminus3', 'plusminus_pg3', 'HT3', 'HT_pg3', 'PIM3', 'PIM_pg3', 'PPG3', 'PPG_pg3', 'SHG3', 'SHG_pg3', 'SOG3', 'SOG_pg3', 'BLK3', 'BLK_pg3', 'FP3', 'FP_pg3', 'GM', 'G', 'G_pg', 'A', 'A_pg', 'PTS', 'PTS_pg', 'PM', 'PM_pg', 'HT', 'HT_pg', 'PIM', 'PIM_pg', 'PPG', 'PPG_pg', 'SHG_pg', 'SOG', 'SOG_pg', 'BLK', 'BLK_pg', 'FP', 'FP_pg', 'def_G3', 'def_G_pg3', 'def_A3', 'def_A_pg3', 'def_PTS3', 'def_PTS_pg3', 'def_plusminus3', 'def_plusminus_pg3', 'def_HT3', 'def_HT_pg3', 'def_PIM3', 'def_PIM_pg3', 'def_PPG3', 'def_PPG_pg3', 'def_SHG3', 'def_SHG_pg3', 'def_SOG3', 'def_SOG_pg3', 'def_BLK3', 'def_BLK_pg3', 'def_FP3', 'def_FP_pg3', 'def_G', 'def_G_pg', 'def_A', 'def_A_pg', 'def_PTS', 'def_PTS_pg', 'def_PM', 'def_PM_pg', 'def_HT', 'def_HT_pg', 'def_PIM', 'def_PIM_pg', 'def_PPG', 'def_PPG_pg', 'def_SHG_pg', 'def_SOG', 'def_SOG_pg', 'def_BLK', 'def_BLK_pg', 'def_FP', 'def_FP_pg'] 

T50 features ['SOG_

Unnamed: 0,Features,MAE
0,all,5.81
1,50,5.81
2,30,5.82
3,20,5.83
4,10,5.83


In [8]:
pdf[['Name', 'FantasyPointsFanDuel', 'Pred_FP_all', 'Pred_FP_50', 'Pred_FP_30', 'Pred_FP_20', 'Pred_FP_10']]

Unnamed: 0,Name,FantasyPointsFanDuel,FantasyPointsFanDuel.1,Pred_FP_all,Pred_FP_50,Pred_FP_30,Pred_FP_20,Pred_FP_10
0,Timo Meier,3.2,3.2,4.589318,4.575845,4.591920,4.526959,4.153235
1,Martin Frk,4.8,4.8,3.896026,3.905810,3.799460,3.824853,3.882637
2,Kyle Palmieri,0.0,0.0,9.464828,9.618115,10.613752,10.450582,11.093447
3,Thomas Vanek,8.5,8.5,5.987462,5.989495,5.905314,6.307128,6.175032
4,Kris Versteeg,22.1,22.1,6.858372,6.798116,6.809300,6.854523,6.390102
...,...,...,...,...,...,...,...,...
20179,Taylor Leier,1.6,1.6,6.178045,6.047882,6.284507,6.235827,5.942053
20180,Joonas Donskoi,0.0,0.0,6.988086,6.787917,6.698339,6.826358,6.923340
20181,Patrick Sharp,0.0,0.0,3.755407,3.394232,2.919445,3.353604,3.599713
20182,Josh Bailey,11.7,11.7,9.769311,9.575595,9.769985,10.576703,10.189084


## D Model

In [9]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)

working_directory = 'D:/machine_learning/DFS/NHL/'
os.chdir(working_directory)
data_dir = 'Data/' #Where is your data located?
etl_dir = 'Data/ETL/' #Where is your output data going?

player_stats = pd.read_csv('Data/alldata_2016-2022.csv', index_col = 0) #Read In Our Main Dataset
d_df = pd.read_csv(etl_dir + 'd_stats.csv', index_col = 0)

# ordinal encode HomeorAway column
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
d_df['HomeOrAway'] = encoder.fit_transform(d_df['HomeOrAway'].to_numpy().reshape(-1, 1))

d_df = d_df.rename(columns={'Team_x' : 'Team'})

# convert date from object dtype to datetime
d_df['Date'] = pd.to_datetime(d_df['Date'], format = '%Y%m%d')
d_df['Date'] = d_df['Date'].dt.strftime('%Y%m%d')

#D DK PTS Rank For The Given Season & Date Pair
d_df['Act_D_FPRank'] = d_df.groupby(['Season','Date'])['FantasyPointsFanDuel'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Name','Act_D_FPRank']

#Make sure we have no duplicated columns or infinity errors
d_df = d_df.loc[:,~d_df.columns.duplicated()]
d_df= d_df.replace([np.inf, -np.inf], np.nan)

#Columns We Can't Include In Our Features Datasets
dcols = ['TeamID', 'PlayerID', 'Team', 'Position', 'Games', 'Started', 
         'Goals', 'Assists', 'Points', 'PlusMinus', 'HatTricks',
       'PenaltyMinutes', 'PowerPlayGoals', 'ShortHandedGoals', 'ShotsOnGoal',
       'Blocks',
       'Month', 'Year'
]


# g_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = d_df.drop(dcols, axis = 1)
Y = d_df['FantasyPointsFanDuel']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

more_dcols = ['Season', 'Date', 'Name', 'Opponent', 'FantasyPointsFanDuel', 'Act_D_FPRank']

X_train.drop(more_dcols, axis = 1, inplace = True)
X_test.drop(more_dcols, axis = 1, inplace = True)

# dump non-scaled train df for external scaling to work
filename = 'scalers/d_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling
from sklearn.preprocessing import StandardScaler
d_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(d_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(d_scaler.transform(X_test), columns = X_test.columns)
filename = 'scalers/d_scaler.pkl'
joblib.dump(d_scaler, filename)

print('\nNum Possible Features:',len(X_train.columns.tolist()))

  exec(code_obj, self.user_global_ns, self.user_ns)


Training set size: 65250
Testing set size: 21750

Num Possible Features: 90


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
""" MODEL SELECTION """

model = lgbm_mod()

"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['Season','Date','Opponent','Name','FantasyPointsFanDuel']].copy()

pdf['Pred_FP_all'] = preds_all

# save the initial model to disk
filename = 'models/LGBM_models/D_model_allfeats.pkl'
joblib.dump(model, filename) 

# get top 50 features
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# Using Top 50 Features, Find Top 30 Features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# Using Top 30 Features, Find Top 20 Features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

#Perform RFE (recursive feature elimination) using Top 20 Features, To Find Top 15
rfe_model = RFE(model, n_features_to_select = 10)
rfe_model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols10= dset[dset['importance']==1]['attr'].tolist()

print('T50 features', attr50, '\n')
print('T30 features', attr30, '\n')
print('T20 features', attr20, '\n')
print('T10 features',cols10, '\n')

model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])
filename = 'models/LGBM_models/D_model_50feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])
filename = 'models/LGBM_models/D_model_30feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])
filename = 'models/LGBM_models/D_model_20feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[cols10], Y_train)
preds10 = model.predict(X_test[cols10])
filename = 'models/LGBM_models/D_model_10feats.pkl'
joblib.dump(model, filename) 

# pdf = pred_df[['Season','Week','Team','Defense','PlayerID','Name','Act_D_DKPtsRank','Act_D_DKPts']].copy()
pdf['Pred_FP_50'] = preds50
pdf['Pred_FP_30'] = preds30
pdf['Pred_FP_20'] = preds20
pdf['Pred_FP_10'] = preds10
pdf.to_csv(etl_dir + 'd_predictions_lgbm_50_30_20_10.csv')

feature_sets = ['all', '50', '30', '20', '10']

mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds_all)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds50)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds30)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds20)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds10))
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

results_df.style.hide_index()
display(results_df)

possible features: ['HomeOrAway', 'GM3', 'GM_pg3', 'G3', 'G_pg3', 'A3', 'A_pg3', 'PTS3', 'PTS_pg3', 'plusminus3', 'plusminus_pg3', 'HT3', 'HT_pg3', 'PIM3', 'PIM_pg3', 'PPG3', 'PPG_pg3', 'SHG3', 'SHG_pg3', 'SOG3', 'SOG_pg3', 'BLK3', 'BLK_pg3', 'FP3', 'FP_pg3', 'GM', 'G', 'G_pg', 'A', 'A_pg', 'PTS', 'PTS_pg', 'PM', 'PM_pg', 'HT', 'HT_pg', 'PIM', 'PIM_pg', 'PPG', 'PPG_pg', 'SHG_pg', 'SOG', 'SOG_pg', 'BLK', 'BLK_pg', 'FP', 'FP_pg', 'def_G3', 'def_G_pg3', 'def_A3', 'def_A_pg3', 'def_PTS3', 'def_PTS_pg3', 'def_plusminus3', 'def_plusminus_pg3', 'def_HT3', 'def_HT_pg3', 'def_PIM3', 'def_PIM_pg3', 'def_PPG3', 'def_PPG_pg3', 'def_SHG3', 'def_SHG_pg3', 'def_SOG3', 'def_SOG_pg3', 'def_BLK3', 'def_BLK_pg3', 'def_FP3', 'def_FP_pg3', 'def_G', 'def_G_pg', 'def_A', 'def_A_pg', 'def_PTS', 'def_PTS_pg', 'def_PM', 'def_PM_pg', 'def_HT', 'def_HT_pg', 'def_PIM', 'def_PIM_pg', 'def_PPG', 'def_PPG_pg', 'def_SHG_pg', 'def_SOG', 'def_SOG_pg', 'def_BLK', 'def_BLK_pg', 'def_FP', 'def_FP_pg'] 

T50 features ['SOG_

Unnamed: 0,Features,MAE
0,all,4.35
1,50,4.35
2,30,4.37
3,20,4.37
4,10,4.37


In [21]:
pdf[['Name', 'FantasyPointsFanDuel', 'Pred_FP_all', 'Pred_FP_50', 'Pred_FP_30', 'Pred_FP_20', 'Pred_FP_10']]

Unnamed: 0,Name,FantasyPointsFanDuel,FantasyPointsFanDuel.1,Pred_FP_all,Pred_FP_50,Pred_FP_30,Pred_FP_20,Pred_FP_10
0,Zach Werenski,14.4,14.4,5.3274,4.8950,4.8258,2.4314,2.3360
1,Ian Cole,4.8,4.8,7.8586,7.7790,8.2474,8.7848,8.5032
2,Matt Irwin,4.8,4.8,4.4934,4.8826,5.2250,5.2010,4.7906
3,Deryk Engelland,1.6,1.6,5.7544,5.7594,5.9944,6.8192,7.4642
4,MacKenzie Weegar,3.2,3.2,4.8464,4.7330,5.2606,4.1384,5.1978
...,...,...,...,...,...,...,...,...
21745,Brenden Dillon,6.4,6.4,7.6408,6.9542,7.2984,7.9110,6.8510
21746,Michael Stone,9.6,9.6,4.9832,4.4548,5.3800,5.1872,7.6112
21747,Matt Dumba,11.2,11.2,9.0702,9.3572,9.3330,9.2604,9.9840
21748,Kurtis MacDermid,1.6,1.6,3.3426,3.4786,3.9302,4.0714,4.2632


## G Model

In [40]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)

working_directory = 'D:/machine_learning/DFS/NHL/'
os.chdir(working_directory)
data_dir = 'Data/' #Where is your data located?
etl_dir = 'Data/ETL/' #Where is your output data going?

player_stats = pd.read_csv('Data/alldata_2016-2022.csv', index_col = 0) #Read In Our Main Dataset
g_df = pd.read_csv(etl_dir + 'g_stats.csv', index_col = 0)

# ordinal encode HomeorAway column
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
g_df['HomeOrAway'] = encoder.fit_transform(g_df['HomeOrAway'].to_numpy().reshape(-1, 1))

g_df = g_df.rename(columns={'Team_x' : 'Team'})

# convert date from object dtype to datetime
g_df['Date'] = pd.to_datetime(g_df['Date'], format = '%Y%m%d')
g_df['Date'] = g_df['Date'].dt.strftime('%Y%m%d')

#G DK PTS Rank For The Given Season & Date Pair
g_df['Act_G_FPRank'] = g_df.groupby(['Season','Date'])['FantasyPointsFanDuel'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Name','Act_G_FPRank']

#Make sure we have no duplicated columns or infinity errors
g_df = g_df.loc[:,~g_df.columns.duplicated()]
g_df= g_df.replace([np.inf, -np.inf], np.nan)

#Columns We Can't Include In Our Features Datasets
dcols = [
     'TeamID',
     'PlayerID',
     'Team',
     'Position',
     'Games',
     'Started',
     'GoaltendingWins',
     'GoaltendingLosses',
     'GoaltendingOvertimeLosses',
     'GoaltendingShotsAgainst',
     'GoaltendingGoalsAgainst',
     'GoaltendingSaves',
     'GoaltendingShutouts',
     'GoaltendingGoalsAgainstAverage',
     'GoaltendingSavePercentage',
     'GoaltendingMinutes',
     'Month',
     'Year',
    ]


# g_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = g_df.drop(dcols, axis = 1)
Y = g_df['FantasyPointsFanDuel']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

# # get nonzero entries for GTGAA_pg3
# X_test_nz = X_test.loc[X_test['GTGAA_pg3'] != 0]

# # get indices of dropped rows
# dropped_row_idx = list(X_test.index[~X_test.index.isin(X_test_nz.index) == False])

# # drop Y_test entries that were dropped from X test
# Y_test = Y_test.iloc[dropped_row_idx]

# # reassign X_test and reset indices
# X_test = X_test_nz
# X_test.reset_index(inplace = True, drop=True)
# Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

more_dcols = ['Season', 'Date', 'Name', 'Opponent', 'FantasyPointsFanDuel', 'Act_G_FPRank']

X_train.drop(more_dcols, axis = 1, inplace = True)
X_test.drop(more_dcols, axis = 1, inplace = True)

  exec(code_obj, self.user_global_ns, self.user_ns)


Training set size: 19128
Testing set size: 6377


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [41]:
# dump non-scaled train df for external scaling to work
filename = 'scalers/g_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling
from sklearn.preprocessing import StandardScaler
g_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(g_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(g_scaler.transform(X_test), columns = X_test.columns)
filename = 'scalers/g_scaler.pkl'
joblib.dump(g_scaler, filename)

print('\nNum Possible Features:',len(X_train.columns.tolist()))


Num Possible Features: 95


In [42]:
""" MODEL SELECTION """

model = lgbm_mod()

"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['Season','Date','Opponent','Name','FantasyPointsFanDuel']].copy()

pdf['Pred_FP_all'] = preds_all

# save the initial model to disk
filename = 'models/LGBM_models/G_model_allfeats.pkl'
joblib.dump(model, filename) 

# get top 50 features
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# Using Top 50 Features, Find Top 30 Features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# Using Top 30 Features, Find Top 20 Features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

#Perform RFE (recursive feature elimination) using Top 20 Features, To Find Top 15
rfe_model = RFE(model, n_features_to_select = 10)
rfe_model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols10= dset[dset['importance']==1]['attr'].tolist()

print('T50 features', attr50, '\n')
print('T30 features', attr30, '\n')
print('T20 features', attr20, '\n')
print('T10 features',cols10, '\n')

model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])
filename = 'models/LGBM_models/G_model_50feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])
filename = 'models/LGBM_models/G_model_30feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])
filename = 'models/LGBM_models/G_model_20feats.pkl'
joblib.dump(model, filename) 

model.fit(X_train[cols10], Y_train)
preds10 = model.predict(X_test[cols10])
filename = 'models/LGBM_models/G_model_10feats.pkl'
joblib.dump(model, filename) 

# pdf = pred_df[['Season','Week','Team','Defense','PlayerID','Name','Act_G_DKPtsRank','Act_G_DKPts']].copy()
pdf['Pred_FP_50'] = preds50
pdf['Pred_FP_30'] = preds30
pdf['Pred_FP_20'] = preds20
pdf['Pred_FP_10'] = preds10
pdf.to_csv(etl_dir + 'g_predictions_lgbm_50_30_20_10.csv')

feature_sets = ['all', '50', '30', '20', '10']

mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds_all)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds50)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds30)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds20)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds10))
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

results_df.style.hide_index()
display(results_df)

possible features: ['HomeOrAway', 'GM3', 'GM_pg3', 'GTW3', 'GTW_pg3', 'GTL3', 'GTL_pg3', 'GTOTL3', 'GTOTL_pg3', 'GTSA3', 'GTSA_pg3', 'GTGA3', 'GTGA_pg3', 'GTS3', 'GTS_pg3', 'GTSO3', 'GTSO_pg3', 'GTGAA_pg3', 'GTSP_pg3', 'GTM3', 'GTM_pg3', 'FP3', 'FP_pg3', 'WP3', 'SPM3', 'GM', 'GTW', 'GTW_pg', 'GTL', 'GTL_pg', 'GTOTL', 'GTOTL_pg', 'GTSA', 'GTSA_pg', 'GTGA', 'GTGA_pg', 'GTS', 'GTS_pg', 'GTSO', 'GTSO_pg', 'GTGAA_pg', 'GTSP_pg', 'GTM', 'GTM_pg', 'FP', 'FP_pg', 'WP', 'SPM', 'def_GM3', 'def_GM_pg3', 'def_GTW3', 'def_GTW_pg3', 'def_GTL3', 'def_GTL_pg3', 'def_GTOTL3', 'def_GTOTL_pg3', 'def_GTSA3', 'def_GTSA_pg3', 'def_GTGA3', 'def_GTGA_pg3', 'def_GTS3', 'def_GTS_pg3', 'def_GTSO3', 'def_GTSO_pg3', 'def_GTGAA_pg3', 'def_GTSP_pg3', 'def_GTM3', 'def_GTM_pg3', 'def_FP3', 'def_FP_pg3', 'def_WP3', 'def_SPM3', 'def_GM', 'def_GTW', 'def_GTW_pg', 'def_GTL', 'def_GTL_pg', 'def_GTOTL', 'def_GTOTL_pg', 'def_GTSA', 'def_GTSA_pg', 'def_GTGA', 'def_GTGA_pg', 'def_GTS', 'def_GTS_pg', 'def_GTSO', 'def_GTSO_pg', 

Unnamed: 0,Features,MAE
0,all,9.67
1,50,9.72
2,30,9.74
3,20,9.78
4,10,9.8


In [43]:
pdf[['Name', 'FantasyPointsFanDuel', 'Pred_FP_all', 'Pred_FP_50', 'Pred_FP_30', 'Pred_FP_20', 'Pred_FP_10']]

Unnamed: 0,Name,FantasyPointsFanDuel,FantasyPointsFanDuel.1,Pred_FP_all,Pred_FP_50,Pred_FP_30,Pred_FP_20,Pred_FP_10
0,Frederik Andersen,27.2,27.2,14.590631,15.367259,14.448247,13.292447,15.565027
1,Michal Neuvirth,9.6,9.6,6.249915,7.124373,5.425216,5.712783,5.874368
2,Martin Jones,0.0,0.0,10.092011,8.326732,11.790089,11.055085,11.037476
3,Martin Jones,32.8,32.8,14.394197,14.734338,13.565939,14.528435,16.245499
4,Jimmy Howard,36.0,36.0,11.254156,9.778327,10.438184,11.232283,11.939451
...,...,...,...,...,...,...,...,...
6372,Anton Forsberg,0.0,0.0,2.873817,2.878365,2.798817,2.836061,2.770586
6373,Keith Kinkaid,0.0,0.0,2.785185,3.094633,3.362242,3.614268,3.102860
6374,Antti Raanta,21.6,21.6,6.710030,5.310932,5.613977,5.539519,5.591537
6375,Philipp Grubauer,0.0,0.0,5.608029,5.740670,5.357241,5.227709,5.512778
