In [114]:
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf

from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Dense 

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import tree, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV   
import pandas as pd
import numpy as np

In [115]:
def myround(x, prec, base):
    """Bucket a rate into a percentage category by flooring to a multiple of ``base``.

    The rate is first pulled inside the supported window: anything below 0.75
    is treated as 0.751 and anything above 0.95 as 0.951. It is then floored
    to the nearest lower multiple of ``base`` and returned as a percentage.
    With ``base=0.05`` the possible results are 75.0, 80.0, 85.0, 90.0, 95.0.

    Parameters
    ----------
    x : float
        Rate expressed as a fraction (e.g. 0.83).
    prec : int
        Unused; kept only so existing callers keep working.
    base : float
        Bucket width expressed as a fraction (e.g. 0.05 for 5-point buckets).

    Returns
    -------
    float
        Lower edge of the bucket containing ``x``, multiplied by 100.
        NaN input propagates as NaN.
    """
    if x < 0.75:
        x = 0.751
    if x > 0.95:
        x = 0.951
    # Count the whole buckets below x. The quotient is rounded before flooring
    # because binary floats misplace values that sit exactly on a bucket edge:
    # ``0.85 % 0.05`` evaluates to ~0.05 instead of 0, which used to push
    # 0.75, 0.85, 0.90 and 0.95 one bucket too low (0.75 even came out as 70).
    whole_buckets = round(x / base, 9) // 1
    return round(whole_buckets * base * 100, 0)

In [116]:
# Load the per-event feature table, discard rows with any missing value, and
# expose the event key as `game_id` (the column the game-feature merge joins on).
added_features = (
    pd.read_csv('assets/event_features.csv')
    .dropna()
    .rename(columns={'event_id': 'game_id'})
)

In [117]:
# Game-level training rows, left-joined with the per-event features on game_id.
game_feature = pd.read_csv('training_data/game_feature_predict_nhl.csv').merge(
    added_features, how='left', on=['game_id']
)
# Dump the merged frame before incomplete rows are removed, for inspection.
game_feature.to_csv('check.csv')
game_feature = game_feature.dropna()

# Bucket every game's scan rate into its 5-point category (used as the target).
scan_cat = [myround(rate, 2, .05) for rate in game_feature['scan_rate']]
game_feature['scan_category'] = scan_cat

In [118]:
# Predictor columns handed to the models, in the order they are selected.
feature_columns = [
    'distance', 'distance_z',
    'home_max_attendance', 'home_max_attendance_z',
    'home_avg_attendance', 'home_avg_attendance_z',
    'away_max_attendance', 'away_max_attendance_z',
    'away_avg_attendance', 'away_avg_attendance_z',
    'weekend_flag', 'first_game_flag',
    'wl_ratio',
    'pct_neg_streak', 'pct_pos_streak_gt_2_days',
    'pct_c10_neg_streak_days', 'pct_c10_pos_streak_gt_2_days',
    'Friday', 'Monday', 'Saturday', 'Sunday',
    'Thursday', 'Tuesday', 'Wednesday',
    'h_1_a_1', 'h_1_a_2',
    'h_2_a_1', 'h_2_a_2', 'h_2_a_3',
    'h_3_a_2', 'h_3_a_3',
    'c_sell_through_avg_shift_fix',
    'c_scan_through_avg_shift_fix',
    'c_scan_rate_avg_shift_fix',
    'sell_through_avg', 'scan_through_avg', 'scan_rate_avg',
]

# Feature matrix: only the predictor columns.
X = game_feature[feature_columns]

# Response: the bucketed scan-rate category, kept as a one-column DataFrame.
y = game_feature[['scan_category']]

In [119]:
# Load the `_current` game-feature file and dump it as read, for inspection.
game_feature_current = pd.read_csv('training_data/game_feature_predict_nhl_current.csv')
game_feature_current.to_csv('check_curr.csv')
# NOTE(review): the added_features merge is disabled for this frame - confirm that is intended:
#   game_feature_current = pd.merge(game_feature_current, added_features, how='left', on=['game_id'])
game_feature_current = game_feature_current.dropna()

# Bucket every game's scan rate into its 5-point category (used as the target).
scan_cat = [myround(rate, 2, .05) for rate in game_feature_current['scan_rate']]
game_feature_current['scan_category'] = scan_cat


In [120]:
# Predictor columns for the `_current` frame; the same names, in the same
# order, as the ones selected from game_feature.
feature_columns = [
    'distance', 'distance_z',
    'home_max_attendance', 'home_max_attendance_z',
    'home_avg_attendance', 'home_avg_attendance_z',
    'away_max_attendance', 'away_max_attendance_z',
    'away_avg_attendance', 'away_avg_attendance_z',
    'weekend_flag', 'first_game_flag',
    'wl_ratio',
    'pct_neg_streak', 'pct_pos_streak_gt_2_days',
    'pct_c10_neg_streak_days', 'pct_c10_pos_streak_gt_2_days',
    'Friday', 'Monday', 'Saturday', 'Sunday',
    'Thursday', 'Tuesday', 'Wednesday',
    'h_1_a_1', 'h_1_a_2',
    'h_2_a_1', 'h_2_a_2', 'h_2_a_3',
    'h_3_a_2', 'h_3_a_3',
    'c_sell_through_avg_shift_fix',
    'c_scan_through_avg_shift_fix',
    'c_scan_rate_avg_shift_fix',
    'sell_through_avg', 'scan_through_avg', 'scan_rate_avg',
]

# Feature matrix: only the predictor columns.
X_current = game_feature_current[feature_columns]

# Response: the bucketed scan-rate category, kept as a one-column DataFrame.
y_current = game_feature_current[['scan_category']]

In [121]:
# Append the `_current` rows beneath the existing rows and renumber the index.
# NOTE: X and y are rebound here, so re-running only this cell appends
# X_current / y_current a second time - re-run the selection cells first.
X, y = (
    pd.concat([X, X_current], ignore_index=True),
    pd.concat([y, y_current], ignore_index=True),
)

In [122]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable. (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [123]:
# Wrap each split (features + labels) in an XGBoost DMatrix.
# NOTE(review): neither object is referenced by the cells visible below.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [124]:
# XGBoost model with a per-class probability objective over 100 class slots.
# The labels are the scan-category values themselves, so num_class has to be
# larger than the biggest category value.
# NOTE(review): this is the regressor wrapper driven with a classification
# objective; the later cells take an argmax over each prediction row.
xg_reg = xgb.XGBRegressor(
    objective='multi:softprob',
    colsample_bytree=0.9,
    learning_rate=0.1,
    max_depth=9,
    alpha=10,
    n_estimators=10,
    num_class=100,
)

In [125]:
# Fit on the training split, then predict the held-out rows in one chain
# (fit returns the fitted estimator itself).
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [126]:
# Predicted class per held-out row: the column index of the largest value in
# each row of `preds`. Vectorized along axis 1 instead of looping over rows in
# Python; this also raises if `preds` is ever not 2-D rather than silently
# producing all zeros.
best_preds_xg = np.argmax(preds, axis=1)

In [127]:
from sklearn.metrics import precision_score

# Macro-averaged precision of the XGBoost predictions on the held-out split.
print(
    precision_score(
        y_true=y_test,
        y_pred=best_preds_xg,
        average='macro',
    )
)

0.629247311827957


In [128]:
# LightGBM dataset wrappers around the same train / held-out split.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test)

In [167]:
# LightGBM multiclass configuration. As with the XGBoost model, the labels are
# the scan-category values themselves, hence 100 class slots.
parameters = {
    "objective": "multiclass",
    "num_class": 100,
    "num_leaves": 10,
    "max_depth": 8,
    "learning_rate": 0.1,
    "bagging_fraction": 0.9,   # subsample
    "feature_fraction": 0.7,   # colsample_bytree
    "bagging_freq": 3,         # subsample_freq
    "bagging_seed": 2018,
    "verbosity": 1,
}

# Train with early stopping (patience 100 rounds), logging every 200 rounds.
# NOTE(review): the early-stopping validation set is the same held-out split
# that is scored afterwards, so the reported precision is optimistic.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
# lgb.train in LightGBM 4.0 (replaced by callbacks) - pin the version or
# migrate when upgrading.
gbm_model = lgb.train(
    params=parameters,
    train_set=train_data,
    valid_sets=[test_data],
    verbose_eval=200,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[70]	valid_0's multi_logloss: 0.988661


In [168]:
# Per-class scores for the held-out rows (one row per sample).
y_out = gbm_model.predict(X_test)
# Predicted class per row: the column index of the row maximum. Vectorized
# along axis 1 instead of looping over rows in Python; this also raises if
# `y_out` is ever not 2-D rather than silently producing all zeros.
best_preds_lgb = np.argmax(y_out, axis=1)
# Macro-averaged precision of the LightGBM predictions on the held-out split.
print(precision_score(y_test, best_preds_lgb, average='macro'))

0.6965848965848965


In [169]:
# Side-by-side table of the true category and both models' predictions for
# every held-out row, written out for offline comparison.
test_scan = y_test['scan_category'].tolist()
results_df = pd.DataFrame(
    {
        'test_scan': test_scan,
        'best_preds_xg': best_preds_xg,
        'best_preds_lgb': best_preds_lgb,
    }
)
results_df.to_csv('nhl_modeling_results.csv')

In [109]:
# Persist the trained LightGBM booster.
gbm_model.save_model('model_store/nhl_scan_prediction_lgb.txt')

# Feature-importance table, most important feature first, saved next to the model.
feature_importance = pd.DataFrame(
    {
        'Feature': gbm_model.feature_name(),
        'Importance': list(gbm_model.feature_importance()),
    }
).sort_values(by='Importance', ascending=False)
feature_importance.to_csv('model_store/nhl_lgbm_feature_importance.csv')

In [110]:
# Persist the trained XGBoost model.
xgb_model_path = "model_store/nhl_scan_prediction.model"
xg_reg.save_model(xgb_model_path)