In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

In [None]:
# Load the modelling DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
mdl_df = pd.read_pickle("./data/mdl_df_121420.pkl")

In [None]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
import sys

In [None]:
##modeling
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
#from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV

from sklearn import metrics
import pickle

## simple imputation

In [None]:
# Keep the model inputs plus the target column, then run the frame through `preprocessor`
# and re-attach the original column names to the resulting array.
# NOTE(review): `feature_columns` is not defined anywhere in the visible notebook and
# `preprocessor` is first assigned in a later cell, so this cell relies on earlier kernel
# state - confirm which preprocessor/imputer was intended here.
mdl_df_reduced = mdl_df.loc[:, feature_columns + ['y_var']]
mdl_df_imputed_simple = pd.DataFrame(
    preprocessor.fit_transform(mdl_df_reduced.to_numpy()),
    columns=mdl_df_reduced.columns,
)

## Models

In [None]:
#pip install -U imbalanced-learn
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import AllKNN
#from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import TomekLinks

from collections import Counter
from sklearn.model_selection import StratifiedKFold

#classifers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

import lightgbm as lgbm
from lightgbm.sklearn import LGBMClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

In [None]:
def get_modeldt_imb_imp(imb_method, imp_method_name, imputed_dt, feature_cols=None, target_col='y_var'):
    """Resample an imputed dataset with an imbalance-handling sampler.

    Parameters
    ----------
    imb_method : object
        Sampler exposing ``fit_resample(X, y)``.
    imp_method_name : str
        Label of the imputation method that produced ``imputed_dt``; only printed.
    imputed_dt : pandas.DataFrame
        Imputed data holding the feature columns and the target column.
    feature_cols : optional
        Columns used as model inputs. When omitted, the notebook-level
        ``feature_columns`` is used, which keeps existing three-argument calls working.
    target_col : str, optional
        Name of the target column (default ``'y_var'``).

    Returns
    -------
    Whatever ``imb_method.fit_resample`` returns for the feature matrix and the
    single-column (n, 1) target array.
    """
    if feature_cols is None:
        # Backward-compatible fallback to the notebook-wide feature list.
        feature_cols = feature_columns
    print('imbalance resampling method: ', type(imb_method).__name__)
    print('imputation method: ', imp_method_name)
    features = imputed_dt[feature_cols].to_numpy()
    target = imputed_dt[[target_col]].to_numpy()
    return imb_method.fit_resample(features, target)

In [None]:
# Grid-search helper: tune one classifier, then report how it does on the test split.
def buildmodel(clsifier,param_grid,n_folds,val_metric):
    """Tune ``clsifier`` with a cross-validated grid search and print test-set results.

    The classifier is wrapped in a two-step pipeline (notebook-level ``preprocessor``
    followed by the classifier under the step name ``'classifier'``), so keys in
    ``param_grid`` are expected to carry the ``classifier__`` prefix.

    clsifier: classifier instance placed in the pipeline
    param_grid: parameter grid handed to GridSearchCV
    n_folds: value passed as ``cv``
    val_metric: value passed as ``scoring``

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the notebook namespace.
    Returns the fitted GridSearchCV object.
    """
    steps = [('preprocessor', preprocessor), ('classifier', clsifier)]
    search = GridSearchCV(Pipeline(steps=steps), param_grid=param_grid, cv=n_folds, scoring=val_metric)
    search.fit(X_train, y_train.ravel())

    test_labels = search.predict(X_test)
    test_probas = search.predict_proba(X_test)

    print(type(clsifier).__name__)
    print("tuned hpyerparameters :(best parameters) ", search.best_params_)
    print("performance metric :", search.best_score_)
    # Column 1 of predict_proba is used as the score for the ROC AUC.
    print("roc auc:", metrics.roc_auc_score(y_test, test_probas[:, 1]))
    print('classification report:', metrics.classification_report(y_test, test_labels), sep='\n')
    return search
    

In [None]:
# Feature-importance chart for pipelines whose final step exposes feature_importances_.
def featr_imprt(tree_mdl,encoded_feature_names):
    """Plot every feature importance as a horizontal bar chart and return the top ten.

    tree_mdl: fitted pipeline with a ``'classifier'`` entry in ``named_steps`` that
        exposes ``feature_importances_``
    encoded_feature_names: labels used as the index, one per importance value

    Returns a Series with the ten largest importances in descending order.
    """
    importances = pd.Series(
        tree_mdl.named_steps['classifier'].feature_importances_,
        encoded_feature_names,
    )
    # Ascending order for the chart so the largest bar ends up at the top.
    smallest_first = importances.sort_values(ascending=True)
    smallest_first.plot(kind='barh', title='Feature Importances', figsize=(10, 20))
    largest_first = importances.sort_values(ascending=False)
    return largest_first.nlargest(10)

## Binary outcome models

### pipeline

In [None]:
# Column layout by position: indices 0-13 are passed through unchanged and indices
# 14-18 are standardised. Any other column is left to the transformer's default
# remainder handling.
preprocessor = make_column_transformer(
    ("passthrough", np.arange(14)),
    (StandardScaler(), np.arange(14, 19)),
)

### split train and test dataset

In [None]:
# 70/30 train/test split with a fixed seed; the target is kept as an (n, 1) array.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ
# between the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    mdl_df_imputed_simple[feature_columns].to_numpy(),
    mdl_df_imputed_simple[['y_var']].to_numpy(),
    test_size=0.3,
    random_state=0,
)

## grid search lightgbm

In [None]:
# Disabled cell: the whole grid-search experiment below is wrapped in a string literal,
# so none of it executes. It records the search space that was explored; per its own
# inline comment, the second `param_grid` holds the best values found with the first.
# NOTE(review): if this is ever re-enabled, the second `param_grid` assignment overwrites
# the first, the search is fitted twice (directly and again inside `buildmodel`), and
# `classifier__categorical_feature = np.arange(0,13)` would presumably be treated as 13
# separate candidate values rather than one list of columns - confirm intent.
'''
#classifer estimator
lgbm_estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100, \
                   subsample_for_bin=200000, objective=None,\
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, \
                   subsample=1.0, subsample_freq=5, colsample_bytree=1.0, \
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)
#parameters
param_grid = dict(
#    classifier__boosting_type = ['dart','gbdt'],
    classifier__learning_rate = [0.001,0.1,1], 
    classifier__n_estimators = [50,150,200],
    classifier__num_leaves = [10,31,100,150],
#   classifier__max_depth = [5,7,10],
    classifier__min_child_samples = [10,20,50,100],
#   classifier__min_child_weight = [0.001,0.1,1,10],
    classifier__subsample = [0.5,0.8,1],
    classifier__colsample_bytree = [0.5,0.8,1]
)
# the parameters of the best models obtain from the above param_grid search
param_grid = dict(
#    classifier__boosting_type = ['dart','gbdt'],
    classifier__learning_rate = [0.1], 
    classifier__n_estimators = [150],
    classifier__num_leaves = [100],
#   classifier__max_depth = [5,7,10],
#    classifier__min_child_samples = [10,20,50,100],
#   classifier__min_child_weight = [0.001,0.1,1,10],
    classifier__subsample = [0.5],
    classifier__colsample_bytree = [0.5],
#    classifier__early_stopping_rounds = [10],
    classifier__categorical_feature = np.arange(0,13)
)

n_folds = 5
val_metric = "roc_auc"

estimator = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', lgbm_estimator)])
grid_classifier = GridSearchCV(estimator, param_grid = param_grid, cv = n_folds, scoring= val_metric)
grid_mdls = grid_classifier.fit(X_train, y_train.ravel())

grid_lgbm_mdl = buildmodel(lgbm_estimator,param_grid,n_folds,val_metric)
'''

### Best model: LightGBM without resampling

In [None]:
# Imbalance resampling was tried and led to overfitting, so it stays disabled:
#akn = AllKNN()
#ncr = NeighbourhoodCleaningRule()
#X_train_resampled, y_train_resampled = akn.fit_resample(X_train, y_train)
#X_train = X_train_resampled
#y_train = y_train_resampled

# Final binary classifier with the hyperparameters recorded as best in the grid-search cell
# (learning_rate=0.1, n_estimators=150, num_leaves=100, subsample=0.5, colsample_bytree=0.5).
# LightGBM only applies `subsample` when `subsample_freq` is positive; with the former value
# of 0 bagging was disabled and subsample=0.5 had no effect. 5 is the frequency used by the
# estimator in the grid-search cell that selected these values.
# NOTE(review): min_child_weight=1 differs from the 0.001 used in that search - confirm.
lgbm_classifier = LGBMClassifier(
    boosting_type='gbdt', num_leaves=100, learning_rate=0.1, n_estimators=150,
    subsample_for_bin=200000, objective=None,
    min_split_gain=0.0, min_child_weight=1, min_child_samples=20,
    subsample=0.5, subsample_freq=5, colsample_bytree=0.5,
    reg_alpha=0.0, reg_lambda=0.0, random_state=1,
)

# The pipeline fits the shared `preprocessor` object in place, so re-fitting that object
# elsewhere would change what this fitted pipeline does.
lgbm_estimator = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', lgbm_classifier)])
lgbm_mdl = lgbm_estimator.fit(X_train, y_train.ravel())

In [None]:
# Test-split outputs of the fitted pipeline: per-class probabilities and hard labels.
y_prob = lgbm_mdl.predict_proba(X=X_test)
y_pred = lgbm_mdl.predict(X=X_test)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
print("train accuracy:", metrics.accuracy_score(y_true=y_train, y_pred=lgbm_mdl.predict(X_train)))
print("test accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Test-split ROC AUC (scored on probability column 1) and the per-class report.
print("roc auc:", metrics.roc_auc_score(y_true=y_test, y_score=y_prob[:, 1]))
print('classification report:', metrics.classification_report(y_true=y_test, y_pred=y_pred), sep='\n')

In [None]:
#!pip install scikit-plot
import scikitplot as skplt

In [None]:
# Precision-recall curves on the test split, drawn from the predicted class probabilities.
skplt.metrics.plot_precision_recall(y_test, y_prob)

In [None]:
# Confusion matrix for the test split; normalize=True requests normalised values instead of raw counts.
skplt.metrics.plot_confusion_matrix(y_test,y_pred,normalize = True)

In [None]:
# Same normalised confusion matrix on the training split, for comparison with the test split.
skplt.metrics.plot_confusion_matrix(y_train,lgbm_mdl.predict(X_train),normalize = True)

In [None]:
# ROC curves on the test split from the predicted class probabilities.
skplt.metrics.plot_roc(y_test, y_prob)

In [None]:
# ROC curves on the training split (probabilities are recomputed here), to compare against the test curves.
skplt.metrics.plot_roc(y_train, lgbm_mdl.predict_proba(X_train))

### cumulative gain curve and lift chart

In [None]:
# Cumulative gain curve on the test split from the predicted class probabilities.
skplt.metrics.plot_cumulative_gain(y_test, y_prob)

In [None]:
# Lift curve on the test split from the predicted class probabilities.
skplt.metrics.plot_lift_curve(y_test, y_prob)
# Optional styling arguments tried previously: figsize=(12, 8), title_fontsize=20, text_fontsize=18

### Explain model results (SHAP)

In [None]:
import shap

In [None]:
# Build a human-readable copy of the feature table for SHAP plots.
# `preprocessing` is not imported anywhere else in this notebook, so it is imported here;
# without it the LabelEncoder call below raises NameError on a fresh kernel.
from sklearn import preprocessing

display_dt = mdl_df_imputed_simple[feature_columns].copy()

# Categorical features: re-fit a label encoder on the non-null values of the matching
# column in `mdl_df_display`, then map the integer codes back to the original labels.
# NOTE(review): `categ_columns3` and `mdl_df_display` are not defined in the visible
# notebook; this assumes the codes were produced by the same LabelEncoder fit - confirm.
for col in categ_columns3:
    lb = preprocessing.LabelEncoder()
    lb_e = lb.fit(mdl_df_display[~pd.isnull(mdl_df_display[col])][col])
    ts = (display_dt[col].astype('Int8').to_list())
    display_dt[col] = lb_e.inverse_transform(ts)

# Numeric features: undo the log transform with exp().
# NOTE(review): presumes 'num_var1' holds log-transformed values at this point - confirm.
for col in ['num_var1']:
    display_dt[col] = np.exp(display_dt[col])

# Undo the fitted transformer for 'num_var2'.
# NOTE(review): the original comment says QuantileTransformer while the variable name
# `pwtranf` suggests a power transformer; it is not defined in the visible notebook.
display_dt['num_var2'] = pwtranf.inverse_transform(display_dt['num_var2'].to_numpy().reshape(-1, 1))

In [None]:
# The fitted classifier is the last step of the pipeline (registered as 'classifier').
best_mdl = lgbm_mdl.steps[-1][1]

In [None]:
# Model-space feature matrix for every row, used as input to the SHAP explainer.
# The already-fitted preprocessing step of `lgbm_mdl` is applied with transform() only.
# Calling fit_transform() on `preprocessor` here would re-fit the very object the trained
# pipeline holds, changing the model's preprocessing and feeding SHAP values that were
# scaled differently from what the classifier was trained on.
# The input is converted to a NumPy array to match how the pipeline was fitted.
X = pd.DataFrame(
    data=lgbm_mdl.named_steps['preprocessor'].transform(mdl_df_imputed_simple[feature_columns].to_numpy()),
    columns=feature_columns,
)

In [None]:
# Tree-specific SHAP explainer for the fitted classifier, evaluated on every row of X.
explainer = shap.TreeExplainer(best_mdl)
shap_values = explainer.shap_values(X)

In [None]:
# Summary plot (global feature importance) for the SHAP values at index 1.
# NOTE(review): assumes `shap_values` is indexable per class with index 1 being the
# positive class; the return shape depends on the installed shap version - confirm.
shap.summary_plot(shap_values[1], X)

In [None]:
# Load shap's JavaScript visualisation code into the notebook; run this before the force plot below.
shap.initjs()

In [None]:
# Per-instance explanation: feature contributions to the prediction for row 100,
# labelled with the human-readable values from display_dt.
# NOTE(review): index 1 on expected_value / shap_values assumes per-class outputs - confirm
# against the installed shap version.
shap.force_plot(explainer.expected_value[1], shap_values[1][100,:], display_dt.iloc[100,:])

In [None]:
# Dependence plots: one per feature, using the SHAP values at index 1 and the decoded
# display_dt values as the feature data.
# Alternative kept for reference: plot against the model-space matrix X and use display_dt
# only for display.
#for name in feature_columns:
#    shap.dependence_plot(name, shap_values[1], X, display_features = display_dt)
for name in feature_columns:
    shap.dependence_plot(name, shap_values[1], display_dt)

In [None]:
# Interaction effect: dependence plot of 'X1' coloured by 'X2', plotted on the model-space
# matrix X with display_dt supplying the displayed values.
# NOTE(review): 'X1' and 'X2' look like placeholder names; they must be columns of X - confirm.
shap.dependence_plot('X1', shap_values[1], X, display_features = display_dt,interaction_index = 'X2')

# Multi-class (5 classes) classification: LightGBM

In [None]:
# 70/30 train/test split for the multi-class target, with a fixed seed.
# NOTE(review): `mdl_df_imputed_simple_y` is not defined in the visible notebook - confirm
# where it is built. No `stratify` argument is passed.
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(
    mdl_df_imputed_simple_y[feature_columns].to_numpy(),
    mdl_df_imputed_simple_y[['y_var']].to_numpy(),
    test_size=0.3,
    random_state=1,
)

In [None]:
# Class distribution of the multi-class training target as (label, count) pairs sorted by label.
sorted(Counter(m_y_train.ravel()).items())

In [None]:
# Base estimator for the multi-class grid search.
# LightGBM only applies `subsample` when `subsample_freq` is positive. With the former
# value of 0 every `classifier__subsample` candidate below trained an identical model,
# tripling the number of fits for no gain. 5 matches the frequency used by the estimator
# in the binary grid-search cell.
m_lgbm_estimator = LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100,
    subsample_for_bin=200000, objective=None,
    min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
    subsample=1.0, subsample_freq=5, colsample_bytree=1.0,
    reg_alpha=0.0, reg_lambda=0.0, random_state=1,
)

# Search space: 3 * 2 * 3 * 3 * 3 * 2 = 324 candidates, each fitted once per fold.
# Keys carry the 'classifier__' prefix because the estimator sits in a pipeline step
# of that name.
m_param_grid = dict(
    classifier__learning_rate=[0.001, 0.1, 1],
    classifier__n_estimators=[100, 150],
    classifier__num_leaves=[10, 31, 100],
    classifier__subsample=[0.5, 0.8, 1],
    classifier__colsample_bytree=[0.5, 0.8, 1],
    classifier__min_child_weight=[0.001, 1],
)
m_n_folds = 5
# Scoring string handed to GridSearchCV for the multi-class search.
m_val_metric = "roc_auc_ovo"

In [None]:
# Wrap preprocessing and the LightGBM estimator in one pipeline, then run the
# cross-validated grid search on the multi-class training split.
m_estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', m_lgbm_estimator),
])
m_grid_classifier = GridSearchCV(
    estimator=m_estimator,
    param_grid=m_param_grid,
    cv=m_n_folds,
    scoring=m_val_metric,
)
m_grid_mdls = m_grid_classifier.fit(m_X_train, m_y_train.ravel())

In [None]:
# Hard class labels for the multi-class test split.
m_y_pred = m_grid_mdls.predict(X=m_X_test)

In [None]:
# Per-class probabilities for the multi-class test split.
m_prob_pred = m_grid_mdls.predict_proba(X=m_X_test)

In [None]:
# Macro-averaged one-vs-one ROC AUC on the multi-class test split.
print(
    "roc auc:",
    metrics.roc_auc_score(
        y_true=m_y_test.ravel(),
        y_score=m_prob_pred,
        average='macro',
        multi_class='ovo',
    ),
)

In [None]:
# Raw confusion-matrix counts for the multi-class test split, shown as a DataFrame.
pd.DataFrame(data=metrics.confusion_matrix(y_true=m_y_test, y_pred=m_y_pred))

In [None]:
# Confusion-matrix plot for the tuned multi-class model on the test split.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2, so
# the display object is built directly from the matrix; ConfusionMatrixDisplay is available
# in both older and current releases. Passing the model's classes_ as labels keeps the
# matrix rows/columns and the tick labels aligned.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(
        m_y_test, m_grid_mdls.predict(m_X_test), labels=m_grid_mdls.classes_
    ),
    display_labels=m_grid_mdls.classes_,
).plot()
disp.figure_.suptitle("Confusion Matrix")
print(f"Confusion matrix:\n{disp.confusion_matrix}")
plt.show()

In [None]:
# Outcome of the multi-class grid search, followed by per-class test metrics.
print("tuned hpyerparameters :(best parameters) ", m_grid_mdls.best_params_)
print("performance metric :", m_grid_mdls.best_score_)
print(
    'classification report:',
    metrics.classification_report(y_true=m_y_test, y_pred=m_y_pred),
    sep='\n',
)

In [None]:
# Display the scorer object held by the fitted grid search (configured via scoring=m_val_metric).
# Uncomment to inspect the search's cv_results_ attribute instead:
#m_grid_mdls.cv_results_
m_grid_mdls.scorer_