# PART 3.1: LGBM Model Development

## Guard Model

In [135]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE


# All data paths below are relative, so switch to the project root first.
working_directory = 'D:/machine_learning/DFS/NBA/NBA_moredata'
os.chdir(working_directory)
data_dir = 'Data/'
etl_dir = 'Data/ETL/'

# Raw per-game player stats, plus the guard table read from the ETL directory
# (with its opponent/team columns renamed to 'Defense' / 'Team').
player_stats = pd.read_csv(data_dir + 'player_stats_all.csv', index_col = 0)
g_vs = pd.read_csv(etl_dir + 'g_stats.csv', index_col = 0).rename(
    columns={'Opp':'Defense', 'Team_x' : 'Team'})

print(g_vs.columns.tolist())

# Keep only rows whose position is G, F-G or G-F, and treat missing stats as 0.
g_act_stats = (
    player_stats[player_stats['Pos.'].isin(['G', 'F-G', 'G-F'])]
    .copy()
    .reset_index(drop=True)
    .fillna(0)
)

# Discard every column that the guard table does not also have.
g_act_stats = g_act_stats.drop(
    columns=list(set(g_act_stats.columns) - set(g_vs)))

# Fantasy points actually scored by each player on each date.
# NOTE(review): these weights (1.2 per rebound, 3 per steal/block, -1 per
# turnover) look like FanDuel's scoring rather than DraftKings' -- confirm
# which site's scoring is intended before trusting the 'DKPts' naming.
g_act_stats['Act_G_DKPts'] = (
    g_act_stats['3P'] * 1
    + g_act_stats['AST'] * 1.5
    + g_act_stats['BLK'] * 3
    + g_act_stats['FG'] * 2
    + g_act_stats['FT'] * 1
    + g_act_stats['TRB'] * 1.2
    + g_act_stats['STL'] * 3
    + g_act_stats['TOV'] * -1
)

# Rank within each (Season, Date) slate; rank 1 is the highest score and ties
# share the lowest rank number.
g_act_stats['Act_G_DKPtsRank'] = (
    g_act_stats.groupby(['Season','Date'])['Act_G_DKPts']
    .rank(method='min', ascending = False)
)

# Only the join keys and the two target columns get attached to the guard table.
keep_cols = ['Season','Date','Player','Act_G_DKPtsRank','Act_G_DKPts']

# Left-join the actuals, then keep only rows with a strictly positive score
# (this also removes rows that found no match, whose score is NaN).
g_vs_act = pd.merge(g_vs, g_act_stats[keep_cols], how = 'left', on = ['Season','Date','Player'])
g_vs_act = g_vs_act[g_vs_act['Act_G_DKPts'] > 0].reset_index(drop=True)

# Drop repeated column names, turn +/-inf into NaN, and persist the result.
g_vs_act = (
    g_vs_act.loc[:, ~g_vs_act.columns.duplicated()]
    .replace([np.inf, -np.inf], np.nan)
)
g_vs_act.to_csv(etl_dir + 'g_v_def_stats.csv')

#Columns We Can't Include In Our Features Datasets
# Every name listed here is removed from the feature matrix X before the
# train/test split (see the `g_vs_act.drop(dcols, axis = 1)` call below).
# The list includes the regression target itself ('Act_G_DKPts').
# NOTE(review): the remaining entries look like same-game box-score values,
# i.e. information that would not be known before the game -- confirm.
dcols = ['Age',
         'at',
         'Result',
         'GS',
         'FG',
         'FGA',
         'FG%',
         '2P',
         '2PA',
         '2P%',
         '3P',
         '3PA',
         '3P%',
         'FT',
         'FTA',
         'FT%',
         'TS%',
         'ORB',
         'DRB',
         'TRB',
         'AST',
         'STL',
         'BLK',
         'TOV',
         'PF',
         'PTS',
         'GmSc',
         'BPM',
         'Pos.',
         'Month',
         'Year',
         'Team_y',
         'Act_G_DKPts',
         'EFF',
         'MP'
]


# Identifier / label columns. They are kept in X through the split so that the
# test-set prediction frame (`pred_df`) can carry them, and are only dropped
# from X_train / X_test afterwards, before any model sees the data.
more_dcols = ['Season', 'Date', 'Team', 'Defense', 'Player', 'Act_G_DKPtsRank']

# Disabled de-duplication step, left here for reference:
# g_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

# Feature matrix (still carrying the identifier columns listed in more_dcols)
# and the regression target.
X = g_vs_act.drop(dcols, axis = 1)
Y = g_vs_act['Act_G_DKPts']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
# 75/25 random split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

# Rebind the results instead of mutating them in place: the objects returned
# by train_test_split are slices of X / Y, and in-place edits on them are what
# raised pandas' "A value is trying to be set on a copy of a slice" warning.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

# Snapshot the test rows (identifiers + features + target) before the
# identifier columns are stripped; predictions are attached to this later.
pred_df = pd.concat([X_test, Y_test], axis = 1)

# Identifier / label columns must not be used as model inputs.
X_train = X_train.drop(more_dcols, axis = 1)
X_test = X_test.drop(more_dcols, axis = 1)

['Player', 'Date', 'Age', 'Team', 'at', 'Defense', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TS%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'BPM', 'Pos.', 'EFF', 'Month', 'Year', 'Season', 'MP3', 'MP_pg3', 'FG3', 'FG_pg3', 'FGA3', 'FGA_pg3', 'FG%_pg3', '2P3', '2P_pg3', '2PA3', '2PA_pg3', '2P%_pg3', '3P3', '3P_pg3', '3PA3', '3PA_pg3', '3P%_pg3', 'FT3', 'FT_pg3', 'FTA3', 'FTA_pg3', 'FT%3', 'FT%_pg3', 'TS%_pg3', 'ORB3', 'ORB_pg3', 'DRB3', 'DRB_pg3', 'TRB3', 'TRB_pg3', 'AST3', 'AST_pg3', 'STL3', 'STL_pg3', 'BLK3', 'BLK_pg3', 'TOV3', 'TOV_pg3', 'PF3', 'PF_pg3', 'PTS3', 'PTS_pg3', 'GmSc3', 'GmSc_pg3', 'BPM_pg3', 'EFF3', 'EFF_pg3', 'g_MP3Rank3', 'g_FG3Rank3', 'g_FGARank3', 'g_FG%Rank3', 'g_2PRank3', 'g_2PARank3', 'g_2P%Rank3', 'g_3PRank3', 'g_3PARank3', 'g_3P%Rank3', 'g_FTRank3', 'g_FTARank3', 'g_FT%Rank3', 'g_TS%Rank3', 'g_ORBRank3', 'g_DRBRank3', 'g_TRBRank3', 'g_ASTRank3', 'g_STLRank3', 'g_BLKRank3', 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [136]:
# Persist the unscaled training features so scaling can be reproduced
# outside this notebook.
filename = 'scalers/g_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling
# The scaler is fitted on the training features only and saved to disk; the
# same fitted scaler is then applied to both the training and the test set.
from sklearn.preprocessing import StandardScaler
g_scaler = StandardScaler()
g_scaler.fit(X_train)
filename = 'scalers/g_scaler.pkl'
joblib.dump(g_scaler, filename)

# transform() returns a bare array, so re-wrap it with the original column names.
X_train = pd.DataFrame(g_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(g_scaler.transform(X_test), columns = X_test.columns)

print('\nNum Possible Features:', X_train.shape[1])


Num Possible Features: 241


In [137]:
# hyperparameter tuning function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

def hyperparameter_tuning(feat_set):
    """Grid-search LGBMRegressor hyperparameters on a subset of features.

    Uses the module-level X_train / Y_train for the cross-validated search
    (scored by negative mean absolute error) and the module-level
    X_test / Y_test to print a hold-out MAE for the refitted best model.

    Parameters
    ----------
    feat_set : list
        Column names of X_train / X_test to use as model inputs.

    Returns
    -------
    dict
        The parameter combination that scored best in the search, suitable
        for ``LGBMRegressor(**params)``.
    """
    # Only num_leaves, max_depth and min_child_samples actually vary; the
    # single-valued entries are fixed settings passed through the grid so
    # that they end up in the returned best-params dict as well.
    search_space = {
        'random_state' : [1],
        'n_jobs' : [-1],
        'n_estimators' : [1000],
        'learning_rate': [0.1],
        'num_leaves': [62, 93, 127],
        'max_depth' :[-1, 20, 40],
        'min_child_samples' : [20, 80],
        'max_bin' : [63],
        'device' : ['gpu']
    }

    gsearch = GridSearchCV(
        LGBMRegressor(),
        search_space,
        verbose = 10,
        scoring = 'neg_mean_absolute_error',
    )

    # fit() returns the search object itself, which afterwards predicts with
    # the best estimator refitted on the full training data.
    lgb_model = gsearch.fit(X_train[feat_set], Y_train)
    print('best params')
    print(gsearch.best_params_, gsearch.best_score_)

    # Hold-out error of the winning configuration.
    mae_lgb = mean_absolute_error(Y_test, lgb_model.predict(X_test[feat_set]))
    print(" MAE: %f" % (mae_lgb ))
    return lgb_model.best_params_

In [138]:
# ---- MODEL SELECTION (ALL FEATS) ----
# Tune hyperparameters using every available feature column.
best_params_all = hyperparameter_tuning(X_train.columns.tolist())

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# All-feature model with the tuned hyperparameters, fitted on the full
# training set (fit() returns the estimator itself).
model = LGBMRegressor(**best_params_all).fit(X_train, Y_train)

preds_all = model.predict(X_test)

# Start the prediction table from the test-set identifiers and actuals, then
# attach the predictions and their rank within each (Season, Date) slate.
pdf = pred_df[['Season','Date','Team','Defense','Player','Act_G_DKPtsRank','Act_G_DKPts']].copy()
pdf['Pred_G_DKPts_all'] = preds_all
pdf['PredictedallRank'] = (
    pdf.groupby(['Season','Date'])['Pred_G_DKPts_all']
    .rank(method='min', ascending = False)
)

# Rows predicted to finish in the top five of their slate.
temp_df_all = pdf[pdf['PredictedallRank'] <= 5]

# save the initial model to disk
filename = 'models/LGBM_models/G_model_allfeats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.520 total time=  14.8s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.570 total time=  14.8s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.353 total time=  20.0s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.374 total time=  19.5s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.123 total time=  26.1s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.583 total time=  13.8s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.610 total time=  13.8s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.630 total time=  13.6s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.118 total time=  28.5s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.087 total time=  27.8s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.101 total time=  27.0s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/G_model_allfeats.pkl']

In [139]:
# get top 50 features
# Rebuild and refit the all-feature model here instead of relying on whatever
# `model` happens to be bound to: if this cell is re-run after `model` has been
# replaced by a smaller model, its importance array no longer lines up with
# X_train.columns. With the fixed random_state this reproduces the model
# trained in the all-features step.
model = LGBMRegressor(**best_params_all)
model.fit(X_train, Y_train)
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# hyperparameter tuning for 50 features
best_params_50 = hyperparameter_tuning(attr50)

# ---- MODEL SELECTION (50 FEATS) ----
# 50-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_50)

#print top50 features
print('T50 features', attr50, '\n')

# Fit model, make predictions with t50 features
model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])

# add model predictions (and their per-slate rank) to pdf dataframe
pdf['Pred_G_DKPts_50'] = preds50
pdf['Predicted50Rank'] = pdf.groupby(['Season','Date'])['Pred_G_DKPts_50'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/G_model_50feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.881 total time=  12.1s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.878 total time=  11.7s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.656 total time=  16.3s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.736 total time=  16.4s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.469 total time=  21.7s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.907 total time=  11.6s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.924 total time=  11.6s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.970 total time=  11.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.407 total time=  22.4s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.367 total time=  22.2s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.432 total time=  22.2s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/G_model_50feats.pkl']

In [140]:
# Using Top 50 Features, Find Top 30 Features
# Build the 50-feature model explicitly from its tuned parameters before
# ranking. Refitting whatever `model` currently is would, on a re-run of this
# cell, rank the features with the 30-feature model's hyperparameters instead.
model = LGBMRegressor(**best_params_50)
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':attr50,'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# hyperparameter tuning for 30 features
best_params_30 = hyperparameter_tuning(attr30)

# ---- MODEL SELECTION (30 FEATS) ----
# 30-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_30)

#print top30 features
print('T30 features', attr30, '\n')

# Fit model, make predictions with t30 features
model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])

# add model predictions (and their per-slate rank) to pdf dataframe
pdf['Pred_G_DKPts_30'] = preds30
pdf['Predicted30Rank'] = pdf.groupby(['Season','Date'])['Pred_G_DKPts_30'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/G_model_30feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.193 total time=  11.0s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.213 total time=  11.2s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.049 total time=  15.5s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.077 total time=  15.5s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.796 total time=  20.9s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.320 total time=  10.6s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.289 total time=  10.7s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.314 total time=  10.7s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.739 total time=  21.7s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.795 total time=  21.6s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.765 total time=  21.5s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/G_model_30feats.pkl']

In [141]:
# Using Top 30 Features, Find Top 20 Features
# Build the 30-feature model explicitly from its tuned parameters before
# ranking. Refitting whatever `model` currently is would, on a re-run of this
# cell, rank the features with the 20-feature model's hyperparameters instead.
model = LGBMRegressor(**best_params_30)
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':attr30,'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

# hyperparameter tuning for 20 features
best_params_20 = hyperparameter_tuning(attr20)

# ---- MODEL SELECTION (20 FEATS) ----
# 20-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_20)

#print top20 features
print('T20 features', attr20, '\n')

# Fit model, make predictions with t20 features
model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])

# add model predictions (and their per-slate rank) to pdf dataframe
pdf['Pred_G_DKPts_20'] = preds20
pdf['Predicted20Rank'] = pdf.groupby(['Season','Date'])['Pred_G_DKPts_20'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/G_model_20feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.289 total time=  10.8s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.307 total time=  10.9s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.165 total time=  15.3s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.193 total time=  15.2s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.934 total time=  20.5s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.416 total time=  10.5s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.365 total time=  10.5s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.434 total time=  10.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.871 total time=  21.2s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.906 total time=  21.2s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.870 total time=  21.2s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/G_model_20feats.pkl']

In [142]:
# Using Top 20 Features, Find Top 10 Features
# Build the 20-feature model explicitly from its tuned parameters before
# ranking. Refitting whatever `model` currently is would, on a re-run of this
# cell, rank the features with the 10-feature model's hyperparameters instead.
model = LGBMRegressor(**best_params_20)
model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':attr20,'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr10 = dset['attr'][0:10].tolist()

# hyperparameter tuning for 10 features
best_params_10 = hyperparameter_tuning(attr10)

# ---- MODEL SELECTION (10 FEATS) ----
# 10-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_10)

#print top10 features
print('T10 features', attr10, '\n')

# Fit model, make predictions with t10 features
model.fit(X_train[attr10], Y_train)
preds10 = model.predict(X_test[attr10])

# add model predictions (and their per-slate rank) to pdf dataframe
pdf['Pred_G_DKPts_10'] = preds10
pdf['Predicted10Rank'] = pdf.groupby(['Season','Date'])['Pred_G_DKPts_10'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/G_model_10feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-8.439 total time=  10.4s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-8.350 total time=  10.3s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-8.396 total time=  15.0s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-8.363 total time=  14.9s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-8.184 total time=  20.0s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-8.612 total time=  10.2s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-8.582 total time=  10.2s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-8.537 total time=  10.2s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-7.965 total time=  20.4s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-8.071 total time=  20.4s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-8.052 total time=  20.4s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/G_model_10feats.pkl']

In [143]:
# write predictions to csv
# NOTE(review): the file name mentions a 15-feature model, but only the
# all/50/30/20/10 predictions are summarised here; the name is left unchanged
# in case other code reads this path.
pdf.to_csv(etl_dir + 'g_predictions_lgbm_50_30_20_15_10.csv')

# create df to summarize results: one row per feature set with its test-set MAE
feature_sets = ['all', '50', '30', '20', '10']

# Same order as feature_sets; values are pre-formatted to two decimals.
mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds))
    for preds in (preds_all, preds50, preds30, preds20, preds10)
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

# Styler methods return a styled object rather than changing results_df, so
# the styled object is what has to be displayed for the index to be hidden.
# Newer pandas spells this Styler.hide(axis='index'); older releases only
# provide Styler.hide_index().
results_style = results_df.style
if hasattr(results_style, 'hide'):
    results_style = results_style.hide(axis='index')
else:
    results_style = results_style.hide_index()
display(results_style)

Unnamed: 0,Features,MAE
0,all,5.92
1,50,6.24
2,30,6.61
3,20,6.72
4,10,7.89


In [144]:
# Show the guard prediction table: test-set identifiers and actuals, plus the
# predicted points and per-(Season, Date) rank from each feature-set model.
pdf

Unnamed: 0,Season,Date,Team,Defense,Player,Act_G_DKPtsRank,Act_G_DKPts,Pred_G_DKPts_all,PredictedallRank,Pred_G_DKPts_50,Predicted50Rank,Pred_G_DKPts_30,Predicted30Rank,Pred_G_DKPts_20,Predicted20Rank,Pred_G_DKPts_10,Predicted10Rank
0,2014,471,CHO,BOS,P.J. Hairston,86.0,5.6,10.995620,24.0,9.446317,22.0,10.527105,23.0,10.489193,24.0,11.759069,25.0
1,2014,358,POR,MIN,Wesley Matthews,20.0,30.2,30.514159,3.0,25.701748,9.0,26.221791,9.0,26.343586,6.0,25.294755,9.0
2,2014,450,MEM,CHI,Vince Carter,63.0,12.4,10.624259,17.0,9.813612,17.0,10.204702,17.0,11.603103,15.0,12.009417,16.0
3,2017,819,SAS,MIA,Kyle Anderson,31.0,30.0,21.847746,9.0,22.659101,9.0,16.887598,14.0,15.570963,20.0,16.284435,16.0
4,2020,1328,MIN,NOP,Anthony Edwards,40.0,24.7,27.698950,7.0,24.707482,10.0,14.942860,21.0,16.926019,17.0,27.987111,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40170,2019,1244,IND,CHI,Jeremy Lamb,33.0,21.0,22.411779,9.0,20.095369,13.0,20.134261,13.0,22.467683,8.0,18.303028,11.0
40171,2016,777,BRK,DAL,Caris LeVert,101.0,8.6,14.249328,26.0,11.808886,33.0,12.108423,33.0,11.603869,32.0,12.167967,33.0
40172,2016,685,ORL,MEM,Evan Fournier,15.0,31.0,32.140805,4.0,33.642789,4.0,31.384204,3.0,31.289650,3.0,32.353490,4.0
40173,2015,520,CHO,MIL,Jeremy Lin,63.0,16.6,20.525212,12.0,27.653902,5.0,28.652697,4.0,24.728580,7.0,35.232854,3.0


## Forward Model

In [145]:
"""
"""

import os
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

working_directory = 'D:/machine_learning/DFS/NBA/NBA_moredata'
os.chdir(working_directory)
data_dir = 'Data/'
etl_dir = 'Data/ETL/'

# Per-game player stats and the Forward table from the ETL directory.
player_stats = pd.read_csv(data_dir + 'player_stats_all.csv', index_col = 0)
f_vs = pd.read_csv(etl_dir + 'f_stats.csv', index_col = 0)

f_vs = f_vs.rename(columns={'Opp':'Defense', 'Team_x' : 'Team'})

print(f_vs.columns.tolist())

#Grab Only Necessary Columns & Filter Only To F Data
forward_positions = ['F', 'F-G', 'G-F', 'F-C']
f_act_stats = player_stats[player_stats['Pos.'].isin(forward_positions)].copy().reset_index(drop=True)

f_act_stats = f_act_stats.fillna(0)

# Keep only the columns that also exist in the Forward table.
f_act_stats = f_act_stats.drop(columns=list(set(f_act_stats.columns) - set(f_vs)))

#Calculate The Draftkings Points for each player on each date
# NOTE(review): these weights (TRB 1.2, STL/BLK 3, TOV -1) look like FanDuel
# scoring rather than DraftKings scoring -- confirm which site is intended.
f_act_stats['Act_F_DKPts'] = (
    f_act_stats['3P'] * 1 + f_act_stats['AST'] * 1.5 +
    f_act_stats['BLK'] * 3 + f_act_stats['FG'] * 2 +
    f_act_stats['FT'] * 1 + f_act_stats['TRB'] * 1.2 +
    f_act_stats['STL'] * 3 + f_act_stats['TOV'] * -1
)

#F DK PTS Rank For The Given Season & Date Pair (rank 1 = most points; ties share the lowest rank)
f_act_stats['Act_F_DKPtsRank'] = f_act_stats.groupby(['Season','Date'])['Act_F_DKPts'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Player','Act_F_DKPtsRank','Act_F_DKPts']

#Append Actual DK Pts Rank & DK Pts
f_vs_act = pd.merge(f_vs, f_act_stats[keep_cols], how = 'left', on = ['Season','Date','Player'])
# Drops rows with no matching actuals (NaN after the left merge) as well as
# rows whose actual score is zero or negative.
f_vs_act = f_vs_act[f_vs_act['Act_F_DKPts']>0].reset_index(drop=True)

#Make sure we have no duplicated columns or infinity errors
f_vs_act = f_vs_act.loc[:,~f_vs_act.columns.duplicated()]
f_vs_act = f_vs_act.replace([np.inf, -np.inf], np.nan)
# Write to the Forward file; this previously wrote to 'g_v_def_stats.csv',
# which overwrote the Guard section's output of the same name.
f_vs_act.to_csv(etl_dir + 'f_v_def_stats.csv')

#Columns We Can't Include In Our Features Datasets
# This list contains the same-game box-score columns the target is computed
# from (FG, 3P, FT, TRB, AST, STL, BLK, TOV) and the target itself.
dcols = [
    'Age', 'at', 'Result', 'GS',
    'FG', 'FGA', 'FG%',
    '2P', '2PA', '2P%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%', 'TS%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'GmSc', 'BPM', 'Pos.',
    'Month', 'Year', 'Team_y',
    'Act_F_DKPts', 'EFF', 'MP',
]

# Identifier columns: kept through the split so they can be copied into
# pred_df, then removed from the model inputs.
more_dcols = ['Season', 'Date', 'Team', 'Defense', 'Player', 'Act_F_DKPtsRank']

# NOTE(review): the Forward prediction table printed later in this notebook
# contains fully identical rows for the same player/date. De-duplicating before
# the split would stop copies of one row landing in both train and test.
# Left disabled here to preserve the existing results -- confirm and enable.
# f_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = f_vs_act.drop(dcols, axis = 1)
Y = f_vs_act['Act_F_DKPts']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

# Re-number rows from 0 so features, targets and later prediction arrays
# line up positionally.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

# Keep the identifier columns and the actual score of the test rows for reporting.
pred_df = pd.concat([X_test, Y_test], axis = 1)

# Reassign instead of dropping in place: an in-place drop on the frames
# returned by train_test_split raises pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=more_dcols)
X_test = X_test.drop(columns=more_dcols)

['Player', 'Date', 'Age', 'Team', 'at', 'Defense', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TS%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'BPM', 'Pos.', 'EFF', 'Month', 'Year', 'Season', 'MP3', 'MP_pg3', 'FG3', 'FG_pg3', 'FGA3', 'FGA_pg3', 'FG%_pg3', '2P3', '2P_pg3', '2PA3', '2PA_pg3', '2P%_pg3', '3P3', '3P_pg3', '3PA3', '3PA_pg3', '3P%_pg3', 'FT3', 'FT_pg3', 'FTA3', 'FTA_pg3', 'FT%3', 'FT%_pg3', 'TS%_pg3', 'ORB3', 'ORB_pg3', 'DRB3', 'DRB_pg3', 'TRB3', 'TRB_pg3', 'AST3', 'AST_pg3', 'STL3', 'STL_pg3', 'BLK3', 'BLK_pg3', 'TOV3', 'TOV_pg3', 'PF3', 'PF_pg3', 'PTS3', 'PTS_pg3', 'GmSc3', 'GmSc_pg3', 'BPM_pg3', 'EFF3', 'EFF_pg3', 'f_MP3Rank3', 'f_FG3Rank3', 'f_FGARank3', 'f_FG%Rank3', 'f_2PRank3', 'f_2PARank3', 'f_2P%Rank3', 'f_3PRank3', 'f_3PARank3', 'f_3P%Rank3', 'f_FTRank3', 'f_FTARank3', 'f_FT%Rank3', 'f_TS%Rank3', 'f_ORBRank3', 'f_DRBRank3', 'f_TRBRank3', 'f_ASTRank3', 'f_STLRank3', 'f_BLKRank3', 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [146]:
# joblib is imported in the Guard section but not in this section's import
# cell (which imports pickle), so import it here to keep the Forward cells
# runnable on a fresh kernel.
import joblib
from sklearn.preprocessing import StandardScaler

# dump non-scaled train df for external scaling to work
filename = 'scalers/f_X_train.pkl'
joblib.dump(X_train, filename)

# Scaling: fit on the training features only, then apply the same transform
# to both train and test, restoring the column names on the results.
f_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(f_scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(f_scaler.transform(X_test), columns = X_test.columns)

# Persist the fitted scaler next to the unscaled training frame.
filename = 'scalers/f_scaler.pkl'
joblib.dump(f_scaler, filename)

print('\nNum Possible Features:',len(X_train.columns.tolist()))


Num Possible Features: 241


In [147]:
# hyperparameter tuning function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

def hyperparameter_tuning(feat_set):
    """Grid-search LightGBM hyperparameters for one feature subset.

    Fits a ``GridSearchCV`` (scored by negative mean absolute error) over
    ``num_leaves`` x ``max_depth`` x ``min_child_samples`` on the notebook-level
    ``X_train[feat_set]`` / ``Y_train``; the remaining grid entries are
    single-valued and therefore fixed. Prints the best parameters with their
    CV score, then the MAE of the refit best estimator on
    ``X_test[feat_set]`` / ``Y_test``.

    Parameters
    ----------
    feat_set : list of str
        Column names selecting the features to tune on.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_``), suitable
        for ``LGBMRegressor(**params)``.
    """
    # Only the three multi-valued entries are actually searched (3 * 3 * 2 = 18 candidates).
    search_space = {
        'random_state' : [1],
        'n_jobs' : [-1],
        'n_estimators' : [1000],
        'learning_rate': [0.1],
        'num_leaves': [62, 93, 127],
        'max_depth' :[-1, 20, 40],
        'min_child_samples' : [20, 80],
        'max_bin' : [63],
        'device' : ['gpu']
    }

    grid_search = GridSearchCV(
        LGBMRegressor(),
        search_space,
        verbose = 10,
        scoring = 'neg_mean_absolute_error',
    )
    grid_search.fit(X_train[feat_set], Y_train)

    print('best params')
    print(grid_search.best_params_, grid_search.best_score_)

    # Report (but do not select on) the hold-out error of the refit best estimator.
    holdout_preds = grid_search.predict(X_test[feat_set])
    holdout_mae = mean_absolute_error(Y_test, holdout_preds)
    print(" MAE: %f" % (holdout_mae ))

    return grid_search.best_params_

In [148]:
# hyperparameter tuning for all features
best_params_all = hyperparameter_tuning(X_train.columns.tolist())

""" MODEL SELECTION (ALL FEATS)"""
# all feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_all)
"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

# Start the prediction table from the identifier/actual columns of the test rows,
# then add this model's predicted points and their rank within each Season/Date
# (rank 1 = highest prediction; only test-set rows are ranked).
pdf = pred_df[['Season','Date','Team','Defense','Player','Act_F_DKPtsRank','Act_F_DKPts']].copy()
pdf['Pred_F_DKPts_all'] = preds_all
pdf['PredictedallRank'] = pdf.groupby(['Season','Date'])['Pred_F_DKPts_all'].rank(method='min', ascending = False)
# Rows the all-features model ranks in the top 5 for their Season/Date.
temp_df_all = pdf[pdf['PredictedallRank']<=5]

# save the all-features model to disk
filename = 'models/LGBM_models/F_model_allfeats.pkl'
joblib.dump(model, filename) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.095 total time=  14.2s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.119 total time=  14.0s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.760 total time=  19.4s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.821 total time=  19.3s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.485 total time=  25.5s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.156 total time=  14.4s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.144 total time=  14.8s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.200 total time=  14.1s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.456 total time=  29.2s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.428 total time=  28.2s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.372 total time=  28.0s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/F_model_allfeats.pkl']

In [149]:
# get top 50 features, ranked by the feature importances of the model currently
# held in `model` (the all-features model when the cells run in order)
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# hyperparameter tuning for 50 features
best_params_50 = hyperparameter_tuning(attr50)

""" MODEL SELECTION (50 FEATS)"""
# 50-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_50)
"""                 """

#print top50 features
print('T50 features', attr50, '\n')

# Fit model, make predictions with t50 features
model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])

# add model predictions and their per-Season/Date rank to pdf dataframe
pdf['Pred_F_DKPts_50'] = preds50
pdf['Predicted50Rank'] = pdf.groupby(['Season','Date'])['Pred_F_DKPts_50'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/F_model_50feats.pkl'
joblib.dump(model, filename) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.471 total time=  12.5s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.529 total time=  12.4s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.162 total time=  16.8s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.222 total time=  16.8s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.852 total time=  22.6s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.560 total time=  11.6s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.537 total time=  11.8s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.582 total time=  11.8s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.845 total time=  24.1s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.818 total time=  23.0s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.759 total time=  23.3s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/F_model_50feats.pkl']

In [150]:
# Using Top 50 Features, Find Top 30 Features
# `model` is fit on the 50-feature columns before its importances are read
# (a repeat of the previous cell's fit when the cells run in order).
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# hyperparameter tuning for 30 features
best_params_30 = hyperparameter_tuning(attr30)

""" MODEL SELECTION (30 FEATS)"""
# 30-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_30)
"""                 """

#print top30 features
print('T30 features', attr30, '\n')

# Fit model, make predictions with t30 features
model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])

# add model predictions and their per-Season/Date rank to pdf dataframe
pdf['Pred_F_DKPts_30'] = preds30
pdf['Predicted30Rank'] = pdf.groupby(['Season','Date'])['Pred_F_DKPts_30'].rank(method='min', ascending = False)

# save the 30-feature model to disk
filename = 'models/LGBM_models/F_model_30feats.pkl'
joblib.dump(model, filename) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.881 total time=  12.1s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.918 total time=  12.0s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.542 total time=  17.0s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.592 total time=  16.9s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.251 total time=  22.2s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.963 total time=  11.7s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.918 total time=  11.6s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.983 total time=  11.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.230 total time=  22.7s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.212 total time=  23.2s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.163 total time=  24.7s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/F_model_30feats.pkl']

In [151]:
# Using Top 30 Features, Find Top 20 Features
# `model` is fit on the 30-feature columns before its importances are read
# (a repeat of the previous cell's fit when the cells run in order).
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

# hyperparameter tuning for 20 features
best_params_20 = hyperparameter_tuning(attr20)

""" MODEL SELECTION (20 FEATS)"""
# 20-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_20)
"""                 """

#print top20 features
print('T20 features', attr20, '\n')

# Fit model, make predictions with t20 features
model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])

# add model predictions and their per-Season/Date rank to pdf dataframe
pdf['Pred_F_DKPts_20'] = preds20
pdf['Predicted20Rank'] = pdf.groupby(['Season','Date'])['Pred_F_DKPts_20'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/F_model_20feats.pkl'
joblib.dump(model, filename) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.180 total time=  11.7s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.204 total time=  12.6s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.896 total time=  15.7s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.914 total time=  15.6s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.590 total time=  20.9s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.279 total time=  10.8s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.233 total time=  10.7s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.245 total time=  10.7s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.543 total time=  21.7s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.469 total time=  21.8s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.435 total time=  21.7s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/F_model_20feats.pkl']

In [152]:
# Using Top 20 Features, Find Top 10 Features
# `model` is fit on the 20-feature columns before its importances are read
# (a repeat of the previous cell's fit when the cells run in order).
model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr10 = dset['attr'][0:10].tolist()

# hyperparameter tuning for 10 features
best_params_10 = hyperparameter_tuning(attr10)

""" MODEL SELECTION (10 FEATS)"""
# 10-feature model with tuned hyperparameters
model = LGBMRegressor(**best_params_10)
"""                 """

#print top10 features
print('T10 features', attr10, '\n')

# Fit model, make predictions with t10 features
model.fit(X_train[attr10], Y_train)
preds10 = model.predict(X_test[attr10])

# add model predictions and their per-Season/Date rank to pdf dataframe
pdf['Pred_F_DKPts_10'] = preds10
pdf['Predicted10Rank'] = pdf.groupby(['Season','Date'])['Pred_F_DKPts_10'].rank(method='min', ascending = False)

# save the model to disk
filename = 'models/LGBM_models/F_model_10feats.pkl'
joblib.dump(model, filename) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.451 total time=  10.7s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.481 total time=  10.9s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.274 total time=  15.2s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.286 total time=  15.2s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-7.077 total time=  20.5s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.573 total time=  10.5s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.540 total time=  10.5s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.536 total time=  10.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.928 total time=  20.9s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.884 total time=  20.7s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.812 total time=  20.9s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/F_model_10feats.pkl']

In [153]:
# write predictions to csv
pdf.to_csv(etl_dir + 'f_predictions_lgbm_50_30_20_15_10.csv')

# Summarize the test-set MAE of each Forward model (one per feature subset).

feature_sets = ['all', '50', '30', '20', '10']

# One formatted MAE per model, in the same order as feature_sets.
mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds))
    for preds in (preds_all, preds50, preds30, preds20, preds10)
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

# results_df.style builds a separate Styler object; hiding the index on it
# does not change results_df, so the Styler itself has to be displayed.
# Styler.hide(axis='index') supersedes hide_index() from pandas 1.4 onward
# (hide_index() was removed in 2.0); fall back to it on older installs.
results_style = results_df.style
if hasattr(results_style, 'hide'):
    results_style = results_style.hide(axis='index')
else:
    results_style = results_style.hide_index()
display(results_style)

Unnamed: 0,Features,MAE
0,all,5.14
1,50,5.54
2,30,5.95
3,20,6.28
4,10,6.71


In [154]:
# Preview the first 10 rows after ordering by Date (ascending) and, within a
# date, by actual points (descending).
pdf.sort_values(by = ['Date', 'Act_F_DKPts'], ascending = [True, False]).head(n=10)

Unnamed: 0,Season,Date,Team,Defense,Player,Act_F_DKPtsRank,Act_F_DKPts,Pred_F_DKPts_all,PredictedallRank,Pred_F_DKPts_50,Predicted50Rank,Pred_F_DKPts_30,Predicted30Rank,Pred_F_DKPts_20,Predicted20Rank,Pred_F_DKPts_10,Predicted10Rank
24645,2012,0,CLE,WAS,Tristan Thompson,10.0,32.5,29.404101,3.0,27.779141,2.0,27.081601,3.0,30.290438,2.0,31.85855,1.0
2124,2012,0,DAL,LAL,Elton Brand,13.0,30.2,13.57762,11.0,17.665108,8.0,13.984526,11.0,15.180621,10.0,16.691359,8.0
27911,2012,0,DAL,LAL,Elton Brand,13.0,30.2,13.57762,11.0,17.665108,8.0,13.984526,11.0,15.180621,10.0,16.691359,8.0
2955,2012,0,BOS,MIA,Brandon Bass,15.0,28.7,30.727116,2.0,25.401082,4.0,27.26807,2.0,28.254452,3.0,25.225692,2.0
21755,2012,0,DAL,LAL,Brandan Wright,18.0,26.0,24.341066,5.0,25.610543,3.0,24.996318,4.0,21.623814,5.0,14.921484,10.0
40267,2012,0,LAL,DAL,Kobe Bryant,21.0,24.2,21.289287,7.0,23.001088,6.0,21.497278,7.0,23.043488,4.0,17.791553,7.0
9370,2012,0,MIA,BOS,Rashard Lewis,25.0,20.5,24.817483,4.0,18.478969,7.0,23.07053,6.0,20.201218,7.0,14.426824,12.0
22673,2012,0,DAL,LAL,Jae Crowder,28.0,17.6,20.470319,9.0,17.147059,10.0,18.22558,8.0,20.202886,6.0,19.496967,5.0
29657,2012,0,LAL,DAL,Antawn Jamison,29.0,17.0,35.932099,1.0,29.266876,1.0,32.762857,1.0,33.664212,1.0,18.862694,6.0
37554,2012,0,CLE,WAS,Tyler Zeller,32.0,13.4,17.235267,10.0,16.501248,12.0,14.467916,10.0,17.908913,8.0,19.973734,4.0


## Center Model

In [116]:
"""
"""

import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

# ---------------------------------------------------------------------------
# Center model: load data, build the DraftKings target, split train/test.
# ---------------------------------------------------------------------------
working_directory = 'D:/machine_learning/DFS/NBA/NBA_moredata'
os.chdir(working_directory)
data_dir = 'Data/'
etl_dir = 'Data/ETL/'

player_stats = pd.read_csv(data_dir + 'player_stats_all.csv', index_col = 0)
c_vs = pd.read_csv(etl_dir + 'c_stats.csv', index_col = 0)

c_vs = c_vs.rename(columns={'Opp':'Defense', 'Team_x' : 'Team'})

print(c_vs.columns.tolist())

#Grab Only Necessary Columns & Filter Only To C Data
center_positions = ['C', 'C-F', 'F-C']
c_act_stats = player_stats[player_stats['Pos.'].isin(center_positions)].copy().reset_index(drop=True)

c_act_stats = c_act_stats.fillna(0)

# Keep only the columns that also exist in the ETL frame
c_act_stats = c_act_stats.drop(columns=list(set(c_act_stats.columns) - set(c_vs)))

#Calculate The Draftkings Points for each player on each date
c_act_stats['Act_C_DKPts'] = (c_act_stats['3P'] * 1 + c_act_stats['AST'] * 1.5 +
                              c_act_stats['BLK'] * 3 + c_act_stats['FG'] * 2 +
                              c_act_stats['FT'] * 1 + c_act_stats['TRB'] * 1.2 +
                              c_act_stats['STL'] * 3 + c_act_stats['TOV'] * -1)

#C DK PTS Rank For The Given Season & Date Pair (rank 1 = highest points)
c_act_stats['Act_C_DKPtsRank'] = c_act_stats.groupby(['Season','Date'])['Act_C_DKPts'].rank(method='min', ascending = False)

#Columns We Want To Add To Dataset
keep_cols = ['Season','Date','Player','Act_C_DKPtsRank','Act_C_DKPts']

#Append Actual DK Pts Rank & DK Pts
c_vs_act = pd.merge(c_vs, c_act_stats[keep_cols], how = 'left', on = ['Season','Date','Player'])
# Rows with no match carry NaN here and fail the > 0 test, so they are dropped too
c_vs_act = c_vs_act[c_vs_act['Act_C_DKPts']>0].reset_index(drop=True)

#Make sure we have no duplicated columns or infinity errors
c_vs_act = c_vs_act.loc[:,~c_vs_act.columns.duplicated()]
c_vs_act = c_vs_act.replace([np.inf, -np.inf], np.nan)
# Write to a Center-specific file: the Guard cell already saves its dataset as
# 'g_v_def_stats.csv', and reusing that name here would overwrite it.
c_vs_act.to_csv(etl_dir + 'c_v_def_stats.csv')

#Columns We Can't Include In Our Features Datasets
dcols = ['Age',
         'at',
         'Result',
         'GS',
         'FG',
         'FGA',
         'FG%',
         '2P',
         '2PA',
         '2P%',
         '3P',
         '3PA',
         '3P%',
         'FT',
         'FTA',
         'FT%',
         'TS%',
         'ORB',
         'DRB',
         'TRB',
         'AST',
         'STL',
         'BLK',
         'TOV',
         'PF',
         'PTS',
         'GmSc',
         'BPM',
         'Pos.',
         'Month',
         'Year',
         'Team_y',
         'Act_C_DKPts',
         'EFF',
         'MP'
]


# Identifier columns: kept through the split so predictions can be labelled,
# then removed from the feature matrices below.
more_dcols = ['Season', 'Date', 'Team', 'Defense', 'Player', 'Act_C_DKPtsRank']

# c_vs_act.drop_duplicates(subset=['Player', 'Date'], keep='first', inplace = True, ignore_index = True)

X = c_vs_act.drop(dcols, axis = 1)
Y = c_vs_act['Act_C_DKPts']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

# Reassign instead of mutating in place: the split outputs are slices of X/Y,
# and in-place edits on them raise pandas' SettingWithCopyWarning.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

# Snapshot test rows (identifiers + target) before the identifiers are dropped
pred_df = pd.concat([X_test, Y_test], axis = 1)

X_train = X_train.drop(columns=more_dcols)
X_test = X_test.drop(columns=more_dcols)

['Player', 'Date', 'Age', 'Team', 'at', 'Defense', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TS%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'BPM', 'Pos.', 'EFF', 'Month', 'Year', 'Season', 'MP3', 'MP_pg3', 'FG3', 'FG_pg3', 'FGA3', 'FGA_pg3', 'FG%_pg3', '2P3', '2P_pg3', '2PA3', '2PA_pg3', '2P%_pg3', '3P3', '3P_pg3', '3PA3', '3PA_pg3', '3P%_pg3', 'FT3', 'FT_pg3', 'FTA3', 'FTA_pg3', 'FT%3', 'FT%_pg3', 'TS%_pg3', 'ORB3', 'ORB_pg3', 'DRB3', 'DRB_pg3', 'TRB3', 'TRB_pg3', 'AST3', 'AST_pg3', 'STL3', 'STL_pg3', 'BLK3', 'BLK_pg3', 'TOV3', 'TOV_pg3', 'PF3', 'PF_pg3', 'PTS3', 'PTS_pg3', 'GmSc3', 'GmSc_pg3', 'BPM_pg3', 'EFF3', 'EFF_pg3', 'c_MP3Rank3', 'c_FG3Rank3', 'c_FGARank3', 'c_FG%Rank3', 'c_2PRank3', 'c_2PARank3', 'c_2P%Rank3', 'c_3PRank3', 'c_3PARank3', 'c_3P%Rank3', 'c_FTRank3', 'c_FTARank3', 'c_FT%Rank3', 'c_TS%Rank3', 'c_ORBRank3', 'c_DRBRank3', 'c_TRBRank3', 'c_ASTRank3', 'c_STLRank3', 'c_BLKRank3', 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [117]:
# Save the unscaled training features so scaling can be redone externally
filename = 'scalers/c_X_train.pkl'
joblib.dump(X_train, filename)

# Standardise features; the scaler is fitted on the training split only
from sklearn.preprocessing import StandardScaler
c_scaler = StandardScaler()
c_scaler.fit(X_train)
X_train = pd.DataFrame(data=c_scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(data=c_scaler.transform(X_test), columns=X_test.columns)

# Persist the fitted scaler next to the training snapshot
filename = 'scalers/c_scaler.pkl'
joblib.dump(c_scaler, filename)

print('\nNum Possible Features:', X_train.shape[1])


Num Possible Features: 241


In [126]:
# hyperparameter tuning function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

def hyperparameter_tuning(feat_set):
    """Grid-search LightGBM hyperparameters on a subset of feature columns.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``. Prints the winning parameters with their cross-validated
    score, then the mean absolute error on the test split.

    Parameters
    ----------
    feat_set : list
        Column names selected from ``X_train`` / ``X_test``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    # Only num_leaves, max_depth and min_child_samples take several values;
    # every other entry is a single fixed setting.
    search_space = {
        'random_state' : [1],
        'n_jobs' : [-1],
        'n_estimators' : [1000],
        'learning_rate': [0.1],
        'num_leaves': [62, 93, 127],
        'max_depth' :[-1, 20, 40],
        'min_child_samples' : [20, 80],
        'max_bin' : [63],
        'device' : ['gpu']
    }

    gsearch = GridSearchCV(
        LGBMRegressor(),
        search_space,
        verbose = 10,
        scoring = 'neg_mean_absolute_error',
    )
    gsearch.fit(X_train[feat_set], Y_train)

    print('best params')
    print(gsearch.best_params_, gsearch.best_score_)

    # Report error on the test split for the same feature subset
    holdout_preds = gsearch.predict(X_test[feat_set])
    holdout_mae = mean_absolute_error(Y_test, holdout_preds)
    print(" MAE: %f" % (holdout_mae ))
    return gsearch.best_params_

In [125]:
# Tune hyperparameters with every available feature column
best_params_all = hyperparameter_tuning(list(X_train.columns))

""" MODEL SELECTION (ALL FEATS)"""
# LightGBM regressor configured with the grid-search winners
model = LGBMRegressor(**best_params_all)
"""                 """

# Show the full list of candidate features
print('possible features:', list(X_train.columns), '\n')

# Train on all features, then predict the test split
model.fit(X_train, Y_train)
preds_all = model.predict(X_test)

# Prediction report: identifier and actual-result columns plus the new predictions
pdf = pred_df.loc[:, ['Season','Date','Team','Defense','Player','Act_C_DKPtsRank','Act_C_DKPts']].copy()
pdf['Pred_C_DKPts_all'] = preds_all
# Rank predictions within each (Season, Date) group; highest prediction gets rank 1
pdf['PredictedallRank'] = (
    pdf.groupby(['Season','Date'])['Pred_C_DKPts_all']
       .rank(method='min', ascending = False)
)
# Rows predicted to finish in the top five of their group
temp_df_all = pdf[pdf['PredictedallRank'].le(5)]

# Persist the all-feature model
filename = 'models/LGBM_models/C_model_allfeats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.223 total time=  16.3s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.131 total time=  13.3s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_

[CV 3/5; 4/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.285 total time=  12.3s
[CV 4/5; 4/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 4/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.232 total time=  12.7s
[CV 5/5; 4/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 4/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.143 total time=  12.3s
[CV 1/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_data_in_leaf=80, n_estimato

[CV 1/5; 8/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.966 total time=  18.8s
[CV 2/5; 8/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 2/5; 8/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.801 total time=  19.6s
[CV 3/5; 8/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 3/5; 8/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.901 total time=  22.6s
[CV 4/5; 8/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=20, n_estimato

[CV 4/5; 11/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.841 total time=  17.5s
[CV 5/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 11/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-5.814 total time=  17.8s
[CV 1/5; 12/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 12/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.779 total time=  23.2s
[CV 2/5; 12/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_data_in_leaf=80, n_

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.606 total time=  24.8s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.672 total time=  25.1s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.604 total time=  25.0s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=20,

[CV 5/5; 18/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_data_in_leaf=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.586 total time=  24.1s
best params
{'device': 'gpu', 'learning_rate': 0.1, 'max_bin': 63, 'max_depth': 20, 'min_data_in_leaf': 20, 'n_estimators': 1000, 'n_jobs': -1, 'num_leaves': 127, 'random_state': 1} -5.6187139917891304
 MAE: 5.263495
possible features: ['MP3', 'MP_pg3', 'FG3', 'FG_pg3', 'FGA3', 'FGA_pg3', 'FG%_pg3', '2P3', '2P_pg3', '2PA3', '2PA_pg3', '2P%_pg3', '3P3', '3P_pg3', '3PA3', '3PA_pg3', '3P%_pg3', 'FT3', 'FT_pg3', 'FTA3', 'FTA_pg3', 'FT%3', 'FT%_pg3', 'TS%_pg3', 'ORB3', 'ORB_pg3', 'DRB3', 'DRB_pg3', 'TRB3', 'TRB_pg3', 'AST3', 'AST_pg3', 'STL3', 'STL_pg3', 'BLK3', 'BLK_pg3', 'TOV3', 'TOV_pg3', 'PF3', 'PF_pg3', 'PTS3', 'PTS_pg3', 'GmSc3', 'GmSc_pg3', 'BPM_pg3', 'EFF3', 'EFF_pg3', 'c_MP3Rank3', 'c_FG3Rank3', 'c_FGARank3', 'c_FG%Rank3', 'c_2PRank3', 'c_2PARank3', 'c_2P%Rank3', 'c_3PRank3', 'c_3PARank3', 'c

['models/LGBM_models/C_model_allfeats.pkl']

In [127]:
# Rank every feature by the current model's importances and keep the best 50
dset = pd.DataFrame({'attr': list(X_train.columns), 'importance': model.feature_importances_})
dset = dset.sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'].iloc[:50].tolist()

# Tune hyperparameters on the 50-feature subset
best_params_50 = hyperparameter_tuning(attr50)

""" MODEL SELECTION (50 FEATS)"""
# 50-feature model built from the tuned hyperparameters
model = LGBMRegressor(**best_params_50)
"""                 """

# Show which 50 features were selected
print('T50 features', attr50, '\n')

# Train on the 50 features, then predict the test split
model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])

# Record predictions and their rank within each (Season, Date) group
pdf['Pred_C_DKPts_50'] = preds50
pdf['Predicted50Rank'] = (
    pdf.groupby(['Season','Date'])['Pred_C_DKPts_50']
       .rank(method='min', ascending = False)
)

# Persist the 50-feature model
filename = 'models/LGBM_models/C_model_50feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.609 total time=  11.4s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.565 total time=  11.2s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.209 total time=  15.2s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.147 total time=  15.4s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.106 total time=  20.8s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.711 total time=  11.0s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.592 total time=  11.0s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.503 total time=  10.9s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.978 total time=  27.8s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.997 total time=  22.5s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-5.897 total time=  22.3s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/C_model_50feats.pkl']

In [128]:
# Refit on the 50-feature subset, then keep its 30 most important features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr': list(X_train[attr50].columns), 'importance': model.feature_importances_})
dset = dset.sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'].iloc[:30].tolist()

# Tune hyperparameters on the 30-feature subset
best_params_30 = hyperparameter_tuning(attr30)

""" MODEL SELECTION (30 FEATS)"""
# 30-feature model built from the tuned hyperparameters
model = LGBMRegressor(**best_params_30)
"""                 """

# Show which 30 features were selected
print('T30 features', attr30, '\n')

# Train on the 30 features, then predict the test split
model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])

# Record predictions and their rank within each (Season, Date) group
pdf['Pred_C_DKPts_30'] = preds30
pdf['Predicted30Rank'] = (
    pdf.groupby(['Season','Date'])['Pred_C_DKPts_30']
       .rank(method='min', ascending = False)
)

# Persist the 30-feature model
filename = 'models/LGBM_models/C_model_30feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.890 total time=  10.7s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.802 total time=  11.3s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.498 total time=  15.6s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.417 total time=  15.3s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.373 total time=  20.4s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.927 total time=  11.0s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.860 total time=  10.4s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-6.779 total time=  10.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.149 total time=  22.9s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.188 total time=  22.1s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.093 total time=  21.9s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/C_model_30feats.pkl']

In [129]:
# Refit on the 30-feature subset, then keep its 20 most important features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr': list(X_train[attr30].columns), 'importance': model.feature_importances_})
dset = dset.sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'].iloc[:20].tolist()

# Tune hyperparameters on the 20-feature subset
best_params_20 = hyperparameter_tuning(attr20)

""" MODEL SELECTION (20 FEATS)"""
# 20-feature model built from the tuned hyperparameters
model = LGBMRegressor(**best_params_20)
"""                 """

# Show which 20 features were selected
print('T20 features', attr20, '\n')

# Train on the 20 features, then predict the test split
model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])

# Record predictions and their rank within each (Season, Date) group
pdf['Pred_C_DKPts_20'] = preds20
pdf['Predicted20Rank'] = (
    pdf.groupby(['Season','Date'])['Pred_C_DKPts_20']
       .rank(method='min', ascending = False)
)

# Persist the 20-feature model
filename = 'models/LGBM_models/C_model_20feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.095 total time=  12.1s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.020 total time=  12.0s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.782 total time=  15.5s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-6.766 total time=  15.4s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.632 total time=  21.4s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.215 total time=  10.2s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.137 total time=  10.3s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.130 total time=  10.5s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.358 total time=  21.6s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.433 total time=  21.6s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.345 total time=  21.7s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/C_model_20feats.pkl']

In [130]:
# Refit on the 20-feature subset, then keep its 10 most important features
model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr': list(X_train[attr20].columns), 'importance': model.feature_importances_})
dset = dset.sort_values(by='importance', ascending=False).reset_index(drop=True)
attr10 = dset['attr'].iloc[:10].tolist()

# Tune hyperparameters on the 10-feature subset
best_params_10 = hyperparameter_tuning(attr10)

""" MODEL SELECTION (10 FEATS)"""
# 10-feature model built from the tuned hyperparameters
model = LGBMRegressor(**best_params_10)
"""                 """

# Show which 10 features were selected
print('T10 features', attr10, '\n')

# Train on the 10 features, then predict the test split
model.fit(X_train[attr10], Y_train)
preds10 = model.predict(X_test[attr10])

# Record predictions and their rank within each (Season, Date) group
pdf['Pred_C_DKPts_10'] = preds10
pdf['Predicted10Rank'] = (
    pdf.groupby(['Season','Date'])['Pred_C_DKPts_10']
       .rank(method='min', ascending = False)
)

# Persist the 10-feature model
filename = 'models/LGBM_models/C_model_10feats.pkl'
joblib.dump(model, filename)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 1/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.441 total time=  10.1s
[CV 2/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 2/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.448 total time=  10.1s
[CV 3/5; 1/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 3/5; 1/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1,

[CV 4/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.322 total time=  15.1s
[CV 5/5; 5/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1
[CV 5/5; 5/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=93, random_state=1;, score=-7.264 total time=  15.0s
[CV 1/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 1/5; 6/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-7.064 total time=  20.6s
[CV 2/5; 6/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=-1, min_child_samples=80, n_

[CV 3/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.677 total time=  10.3s
[CV 4/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 4/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.627 total time=  10.6s
[CV 5/5; 10/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1
[CV 5/5; 10/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80, n_estimators=1000, n_jobs=-1, num_leaves=62, random_state=1;, score=-7.575 total time=  10.6s
[CV 1/5; 11/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=20, min_child_samples=80

[CV 2/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.829 total time=  20.6s
[CV 3/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 3/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.849 total time=  20.9s
[CV 4/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1
[CV 4/5; 15/18] END device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_samples=20, n_estimators=1000, n_jobs=-1, num_leaves=127, random_state=1;, score=-6.826 total time=  20.7s
[CV 5/5; 15/18] START device=gpu, learning_rate=0.1, max_bin=63, max_depth=40, min_child_sampl

['models/LGBM_models/C_model_10feats.pkl']

In [133]:
# write predictions to csv
pdf.to_csv(etl_dir + 'c_predictions_lgbm_50_30_20_15_10.csv')

# create df to summarize results: one row per feature set, with the mean
# absolute error of that model's predictions against Y_test.
predictions_by_feature_set = {
    'all': preds_all,
    '50': preds50,
    '30': preds30,
    '20': preds20,
    '10': preds10,
}

feature_sets = list(predictions_by_feature_set)

# MAE is stored as a 2-decimal string so the table shows fixed precision.
mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds))
    for preds in predictions_by_feature_set.values()
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

# `results_df.style` builds a separate Styler object; hiding the index on it
# does not change how `results_df` itself is displayed. The Styler must be
# the object passed to display() for the index column to disappear.
results_styler = results_df.style
if hasattr(results_styler, 'hide'):
    # pandas >= 1.4: hide_index() is deprecated (removed in 2.0) in favour
    # of hide(axis='index').
    results_styler = results_styler.hide(axis='index')
else:
    # Older pandas only provides hide_index().
    results_styler = results_styler.hide_index()
display(results_styler)

Unnamed: 0,Features,MAE
0,all,5.26
1,50,5.59
2,30,5.81
3,20,6.14
4,10,6.61


In [134]:
# Preview the first 20 rows of pdf ordered by Date (ascending) and, within
# each Date, by actual DraftKings points (descending).
pdf_by_date_and_pts = pdf.sort_values(
    by=['Date', 'Act_C_DKPts'],
    ascending=[True, False],
)
pdf_by_date_and_pts.head(20)

Unnamed: 0,Season,Date,Team,Defense,Player,Act_C_DKPtsRank,Act_C_DKPts,Pred_C_DKPts_all,PredictedallRank,Pred_C_DKPts_50,Predicted50Rank,Pred_C_DKPts_30,Predicted30Rank,Pred_C_DKPts_20,Predicted20Rank,Pred_C_DKPts_10,Predicted10Rank
7237,2012,0,CLE,WAS,Anderson Varejão,1.0,55.1,53.685842,1.0,53.431763,1.0,51.620945,1.0,48.961527,1.0,46.134551,1.0
12620,2012,0,MIA,BOS,Chris Bosh,4.0,40.5,37.982321,2.0,37.166619,2.0,37.122666,2.0,37.427944,2.0,32.526078,3.0
5872,2012,0,LAL,DAL,Dwight Howard,6.0,34.0,23.503385,7.0,19.091871,7.0,19.490786,7.0,16.651493,7.0,32.141061,4.0
2108,2012,0,CLE,WAS,Tristan Thompson,7.0,32.5,31.360322,3.0,29.78476,3.0,32.670381,3.0,30.742896,3.0,36.684435,2.0
3036,2012,0,BOS,MIA,Kevin Garnett,12.0,27.4,27.205698,4.0,26.484996,4.0,27.208818,4.0,25.697883,4.0,31.677585,5.0
9714,2012,0,DAL,LAL,Brandan Wright,14.0,26.0,25.661536,5.0,25.013578,5.0,25.944744,5.0,24.240652,5.0,29.652342,6.0
4850,2012,0,WAS,CLE,Earl Barron,16.0,23.6,24.136016,6.0,21.427778,6.0,22.530366,6.0,19.452227,6.0,19.805073,7.0
10426,2012,1,SAS,NOH,Tim Duncan,5.0,45.7,43.058412,1.0,43.110357,1.0,42.971058,1.0,43.520786,1.0,39.329407,1.0
14431,2012,1,IND,TOR,David West,19.0,31.9,29.90208,2.0,30.344086,3.0,28.544788,6.0,29.49191,2.0,28.799825,4.0
15103,2012,1,SAC,CHI,DeMarcus Cousins,19.0,31.9,23.201327,8.0,19.990157,11.0,18.387854,10.0,22.975164,8.0,28.19768,5.0
