# Sample code: Grid searching for various classifiers with model pipelines
* KNN
* SVM
* Neural Network
* Logistic Regression
* Tree-based: XGBoost, LightGBM, Random Forest, GBM, AdaBoost, Stacking


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

In [None]:
#imputation
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import make_column_transformer

In [None]:
#models
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
#from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
import pickle

In [None]:
#display setting
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', None)

In [None]:
#import model data
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# this file when it comes from a trusted source.
mdl_df = pd.read_pickle("./data/mdl_df.pkl")

In [None]:
#missing data report for each feature
# Prints, for every column in feature_columns, the number and percentage of
# rows whose value is missing.
df = mdl_df
n_rows = df.shape[0]
for crossvar in feature_columns:
    # Select the column as a Series so .sum() yields a scalar count; the
    # double-bracket form returns a one-element Series, which '%d' can only
    # format through a deprecated implicit int() conversion.
    n_miss = int(df[crossvar].isnull().sum())
    # guard the empty-frame case so the percentage never divides by zero
    perc = n_miss / n_rows * 100 if n_rows else 0.0
    print('> %s, Missing: %d (%.1f%%)' % (crossvar, n_miss, perc))

In [None]:
#one way summary frequency table
# For each categorical column, print its name followed by a table of level
# counts and percentages (missing values are counted as their own level).
dt = mdl_df
for col in categ_columns3:
    print(col)
    counts = dt[col].value_counts(dropna = False)
    # A bare expression inside a loop body is not echoed by IPython (only
    # top-level expressions are), so the table has to be printed explicitly.
    freq_table = pd.concat([counts, 100 * counts / counts.sum()],
                           axis = 1, keys = ['count', 'percent'])
    print(freq_table)

In [None]:
#encoding category variables
# Every categorical column is replaced by its integer category code. pandas
# gives missing values the code -1; those are turned back into missing values
# and the column is stored with the nullable 'Int8' dtype.
# NOTE(review): 'Int8' only holds codes up to 127 -- confirm no column has
# more levels than that.
for col in categ_columns3:
    # Assign the whole column instead of using `.loc[:, col] = ...`: a
    # full-column .loc assignment writes into the existing column and can keep
    # its old dtype, after which the `.cat` accessor is not available.
    codes = mdl_df[col].astype('category').cat.codes
    mdl_df[col] = codes.where(codes != -1).astype('Int8')

In [None]:
#categorical features name_code: 
#encode_cat_cols = []
#for col in categ_columns3:
#    encode_cat_cols = encode_cat_cols + (col + '_' + mdl_df[col].cat.categories.astype('str').values).tolist()
#encode_cat_cols

In [None]:
#transformation for numeric variables
# Natural-log transform of every numeric feature, in place.
# NOTE(review): np.log returns -inf for 0 and NaN for negative values --
# confirm these columns are strictly positive.
for col in num_columns:
    mdl_df[col] = np.log(mdl_df[col])

In [None]:
#correlation
# pairwise correlation matrices: first for the numeric columns only, then for
# every model feature
mdl_df[num_columns].corr()
mdl_df[feature_columns].corr()

#pairwise features plot
# first grid: regression fits off the diagonal and KDE curves on the diagonal;
# second grid: points coloured by the "loanclose" column
sns.pairplot(mdl_df, kind='reg', diag_kind='kde')
sns.pairplot(mdl_df,hue="loanclose")

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# Features and target are converted to NumPy arrays, so the pipeline below
# addresses columns by position; y keeps shape (n, 1) because of the
# double-bracket selection and is flattened with .ravel() when fitting.
X_train, X_test, y_train, y_test = train_test_split(mdl_df[feature_columns].to_numpy(), mdl_df[['y_var']].to_numpy(), test_size = 0.3, random_state=0)

In [None]:
#pipeline
# Shared preprocessing used by every model below:
#   * categorical columns: most-frequent imputation, then one-hot encoding
#   * numeric columns:     median imputation, then standardisation
#imputer_num = IterativeImputer(random_state = 0, estimator = impute_estimator,max_iter = 50,tol = 0.001)
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

numeric_transformer = Pipeline(
    steps=[('imputer', imputer_num),
           ('scaler', StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[('imputer', imputer_cat),
           # handle_unknown='ignore': a level that is absent from a
           # cross-validation training fold (or from the training split) is
           # encoded as all zeros instead of raising at transform time.
           ('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Columns are addressed by position because X is a NumPy array:
# positions 0-12 are treated as categorical, 13-18 as numeric.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, np.arange(13)),
                  ('num', numeric_transformer, np.arange(13,19,1))
                 ]
)

## grid search logistic regression (ridge)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_estimator = LogisticRegression(max_iter = 1000)

In [None]:
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', lr_estimator)])

In [None]:
# Number of folds used for cross-validation
n_folds = 5

# Candidate values for the C hyperparameter of the pipeline's 'classifier' step
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_grid = {'classifier__C': C_values}

# Exhaustive search over the C grid, scored by accuracy
grid_logReg = GridSearchCV(
    estimator,
    param_grid=C_grid,
    cv=n_folds,
    scoring='accuracy',
)

In [None]:
grid_lr_mdl = grid_logReg.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_lr_mdl.best_params_)
print("accuracy :",grid_lr_mdl.best_score_)

In [None]:
grid_lr_mdl.best_estimator_.named_steps['classifier'].coef_.transpose().flatten()

In [None]:
y_pred = grid_lr_mdl.predict(X_test)
#y_score = grid_lr_mdl.predict_proba(X_test)

In [None]:
# ROC AUC is a ranking metric: score it with the positive-class probability
# instead of the hard 0/1 predictions, which collapse the curve to one point.
# (assumes a binary target, as the default f1_score below also requires)
y_score = grid_lr_mdl.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test.ravel(), y_score)
metrics.f1_score(y_test,y_pred)

In [None]:
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

## grid search svm

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [None]:
estimator_svc = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', SVC())])

In [None]:
# np.logspace(-1, -1, 1) yields the single value 10**-1, so only C = 0.1 is
# searched; widen the exponent range and count to search a real grid.
C_range = np.logspace(-1, -1, 1)

In [None]:
# np.logspace(-2, -2, 1) yields the single value 10**-2, so only gamma = 0.01
# is searched; widen the exponent range and count to search a real grid.
gamma_range = np.logspace(-2, -2, 1)

In [None]:
n_folds = 5
param_grid = dict(classifier__gamma = gamma_range, classifier__C = C_range)
grid_svc = GridSearchCV(estimator_svc, param_grid = param_grid, cv = n_folds, scoring= 'roc_auc')

In [None]:
grid_svc_mdl = grid_svc.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_svc_mdl.best_params_)
print("roc_auc :",grid_svc_mdl.best_score_)

In [None]:
y_pred = grid_svc_mdl.predict(X_test)

In [None]:
# ROC AUC needs a continuous score. SVC() is built without probability
# estimates, so use the signed distance to the separating surface instead of
# the hard predictions.
metrics.roc_auc_score(y_test.ravel(), grid_svc_mdl.decision_function(X_test))
metrics.f1_score(y_test,y_pred)

In [None]:
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

### linear svm

In [None]:
estimator_linsvc = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', LinearSVC(max_iter = 5000,C = 0.1,tol = 1e-3))])

In [None]:
linsvc_mdl = estimator_linsvc.fit(X_train, y_train.ravel())

In [None]:
linsvc_mdl.score(X_test,y_test)

In [None]:
y_pred = linsvc_mdl.predict(X_test)

In [None]:
metrics.f1_score(y_test,y_pred)
# ROC AUC needs a continuous score: LinearSVC has no predict_proba, so rank
# the test rows by its decision function rather than by the hard predictions.
metrics.roc_auc_score(y_test.ravel(), linsvc_mdl.decision_function(X_test))

In [None]:
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

# grid search Tree based models

## Gradient Boosting Machine

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)
gbc_estimator = GradientBoostingClassifier(max_depth = 5, subsample=0.8, random_state=1)
estimator_gbc = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', gbc_estimator)])

In [None]:
# 5-fold grid search for the gradient-boosting pipeline, scored by accuracy.
# Keys are prefixed with 'classifier__' so they reach the pipeline's final step.
n_folds = 5
param_grid = {
    'classifier__learning_rate': [0.01,0.1],
    'classifier__n_estimators': [50,100,150],
    'classifier__subsample': [0.5,0.8,1],
    'classifier__max_depth': [3,5,8],
    'classifier__max_features': [5,7,9,11,13,15,17,19],
}
grid_gbc = GridSearchCV(
    estimator_gbc,
    param_grid=param_grid,
    cv=n_folds,
    scoring='accuracy',
)

In [None]:
grid_gbc_mdl = grid_gbc.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_gbc_mdl.best_params_)
print("accuracy :",grid_gbc_mdl.best_score_)

In [None]:
y_pred = grid_gbc_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_gbc_mdl.predict_proba(X_test)[:, 1])

In [None]:
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_gbc_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_gbc_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

#### feature importance

In [None]:
# Names of the columns produced by the preprocessor, in output order
# (one-hot categorical levels first, then the numeric columns).
# The categorical names are read from the fitted one-hot encoder so that they
# always match the length of feature_importances_; the only other definition
# of encode_cat_cols in this notebook is commented out.
# NOTE(review): assumes feature_columns lists the categorical inputs first, in
# the same order as the positional indices given to the ColumnTransformer.
_fitted_onehot = (grid_gbc_mdl.best_estimator_
                  .named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])
encode_cat_cols = [
    '%s_%s' % (name, level)
    for name, levels in zip(feature_columns, _fitted_onehot.categories_)
    for level in levels
]
encoded_feature_names = np.concatenate([encode_cat_cols, num_columns])

In [None]:
##feature importance
pd.Series(grid_gbc_mdl.best_estimator_.named_steps['classifier'].feature_importances_, encoded_feature_names)\
.sort_values(ascending=False).nlargest(10)

In [None]:
pd.Series(grid_gbc_mdl.best_estimator_.named_steps['classifier'].feature_importances_,encoded_feature_names)\
.sort_values(ascending=True)\
.plot(kind='barh', title='Feature Importances',figsize=(10, 20))

## grid search Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
rf_estimator = RandomForestClassifier()
estimator_rf = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', rf_estimator)])

In [None]:
# 5-fold grid search for the random-forest pipeline, scored by accuracy.
n_folds = 5
param_grid = dict(classifier__criterion = ['gini', 'entropy'],
                  classifier__n_estimators = [50,100,150],
                  # An integer max_samples is an absolute row count, so `1`
                  # would train every tree on a single row; None uses the
                  # full bootstrap sample instead.
                  classifier__max_samples = [0.5,0.8,None],
                  classifier__max_depth = [3,5,8,None],
                  # 'auto' is dropped: for classifiers it was an alias of
                  # 'sqrt' (a duplicate grid point) and is rejected by recent
                  # scikit-learn releases.
                  classifier__max_features = ['sqrt', 'log2']
                 )
grid_rf = GridSearchCV(estimator_rf, param_grid = param_grid, cv = n_folds, scoring= 'accuracy')

In [None]:
grid_rf_mdl = grid_rf.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_rf_mdl.best_params_)
print("accuracy :",grid_rf_mdl.best_score_)

In [None]:
y_pred = grid_rf_mdl.predict(X_test)
#grid_rf_mdl.predict_proba(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_rf_mdl.predict_proba(X_test)[:, 1])

In [None]:
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
pd.Series(grid_rf_mdl.best_estimator_.named_steps['classifier'].feature_importances_,encoded_feature_names)\
.sort_values(ascending=True)\
.plot(kind='barh', title='Feature Importances',figsize=(10, 20))

In [None]:
#save model
#mdl_pkl_name = "mdl_rf_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_rf_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#AdaBoostClassifier(base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
ada_estimator = AdaBoostClassifier()
estimator_ada = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', ada_estimator)])

In [None]:
# 5-fold grid search for the AdaBoost pipeline, scored by accuracy.
n_folds = 5
param_grid = {
    'classifier__learning_rate': [0.1,1],
    'classifier__n_estimators': [50,100],
}
grid_ada = GridSearchCV(
    estimator_ada,
    param_grid=param_grid,
    cv=n_folds,
    scoring='accuracy',
)

In [None]:
grid_ada_mdl = grid_ada.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_ada_mdl.best_params_)
print("accuracy :",grid_ada_mdl.best_score_)

In [None]:
y_pred = grid_ada_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_ada_mdl.predict_proba(X_test)[:, 1])

In [None]:
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_adaboost_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_ada_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
estimator_dst = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

In [None]:
dstree_mdl = estimator_dst.fit(X_train, y_train.ravel())

In [None]:
dstree_mdl.predict(X_test)

In [None]:
#tree.plot_tree(dstree_mdl.named_steps['classifier'], filled=True)

## grid search Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp_estimator = MLPClassifier(hidden_layer_sizes=(40,),solver = 'adam', activation='relu', alpha=0.0001, max_iter = 1000,learning_rate='constant', learning_rate_init=0.001, random_state=1)

In [None]:
estimator_mlp = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', mlp_estimator)])

In [None]:
# 5-fold grid search for the MLP pipeline, scored by accuracy.
# The learning_rate schedule ('constant'/'invscaling'/'adaptive') is not
# searched: MLPClassifier only uses it with solver='sgd', and this pipeline's
# estimator uses solver='adam', so that dimension would triple the number of
# fits without changing any result.
n_folds = 5
param_grid = dict(classifier__hidden_layer_sizes = [(40, 30), (40,20,10)],
                  classifier__alpha = [0.00001,0.0001,0.1]
                 )
grid_mlp = GridSearchCV(estimator_mlp, param_grid = param_grid, cv = n_folds, scoring= 'accuracy')

In [None]:
grid_mlp_mdl = grid_mlp.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_mlp_mdl.best_params_)
print("accuracy :",grid_mlp_mdl.best_score_)

In [None]:
y_pred = grid_mlp_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_mlp_mdl.predict_proba(X_test)[:, 1])

In [None]:
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_MLP_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_mlp_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

# XGBoost

In [None]:
#install python packages
#!pip install xgboost
#!pip install shap

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='Disbursed'):
    """Fit an XGBoost sklearn-style classifier and report training metrics.

    Parameters
    ----------
    alg : XGBClassifier-like estimator; its ``n_estimators`` is overwritten
        when ``useTrainCV`` is true.
    dtrain : DataFrame holding both the predictor columns and the target.
    predictors : list of predictor column names in ``dtrain``.
    useTrainCV : when true, run ``xgb.cv`` with early stopping first and set
        ``n_estimators`` to the number of boosting rounds it kept.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.
    target : name of the label column in ``dtrain``. The original code mixed
        an undefined global ``target`` with the literal 'Disbursed'; one
        parameter (defaulting to 'Disbursed') now drives every use.

    Returns
    -------
    None. Prints training accuracy and AUC, and draws a bar chart of the
    booster's feature scores on the current matplotlib axes.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # verbose_eval replaces the removed `show_progress` keyword
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # one row of cvresult per boosting round that survived early stopping
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    # eval_metric is set on the estimator: newer xgboost releases no longer
    # accept it as a fit() argument.
    alg.set_params(eval_metric='auc')
    alg.fit(dtrain[predictors], dtrain[target])

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # get_booster() replaces the removed booster() accessor
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
xgb_estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
              min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
              objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

In [None]:
estimator_xgb = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', xgb_estimator)])

In [None]:
# 5-fold grid search for the XGBoost pipeline, scored by accuracy.
# Keys are prefixed with 'classifier__' so they reach the pipeline's final step.
n_folds = 5
param_grid = {
    'classifier__learning_rate': [0.001,0.1,1],
    'classifier__n_estimators': [100,150],
    'classifier__max_depth': [10,15],
    'classifier__subsample': [0.5,0.8,1],
    'classifier__colsample_bytree': [0.5,0.8],
    'classifier__gamma': [0,0.1],
    'classifier__min_child_weight': [1,5],
}
grid_xgb = GridSearchCV(
    estimator_xgb,
    param_grid=param_grid,
    cv=n_folds,
    scoring='accuracy',
)

In [None]:
grid_xgb_mdl = grid_xgb.fit(X_train, y_train.ravel())

In [None]:
grid_xgb_mdl.best_params_, grid_xgb_mdl.best_score_

In [None]:
#({'classifier__learning_rate': 0.1,
#  'classifier__max_depth': 10,
#  'classifier__n_estimators': 150},
# 0.7918127123210275)

In [None]:
y_pred = grid_xgb_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_xgb_mdl.predict_proba(X_test)[:, 1])

In [None]:
#0.5165817910818531
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_xgb_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_xgb_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

In [None]:
#feature importance
pd.Series(grid_xgb_mdl.best_estimator_.named_steps['classifier'].feature_importances_, encoded_feature_names)\
.sort_values(ascending=False).nlargest(10)

In [None]:
#feature importance
pd.Series(grid_xgb_mdl.best_estimator_.named_steps['classifier'].feature_importances_, encoded_feature_names)\
.sort_values(ascending=True)\
.plot(kind='barh', title='Feature Importances',figsize=(10, 20))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs
knn_estimator = KNeighborsClassifier()
estimator_knn = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', knn_estimator)])

In [None]:
# 5-fold grid search for the KNN pipeline, scored by accuracy.
n_folds = 5
param_grid = {
    'classifier__n_neighbors': [2,5,10],
    'classifier__weights': ['uniform', 'distance'],
}
grid_knn = GridSearchCV(
    estimator_knn,
    param_grid=param_grid,
    cv=n_folds,
    scoring='accuracy',
)

In [None]:
grid_knn_mdl = grid_knn.fit(X_train, y_train.ravel())

In [None]:
print("tuned hpyerparameters :(best parameters) ",grid_knn_mdl.best_params_)
print("accuracy :",grid_knn_mdl.best_score_)

In [None]:
y_pred = grid_knn_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_knn_mdl.predict_proba(X_test)[:, 1])

In [None]:
# f1 score with best estimators is 0.43737882900348973
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_knn_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_knn_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

# LightGBM

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
#!pip install lightgbm

In [None]:
import lightgbm as lgbm
from lightgbm.sklearn import LGBMClassifier

In [None]:
lgbm_estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100, \
                   subsample_for_bin=200000, objective=None,\
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, \
                   subsample=1.0, subsample_freq=0, colsample_bytree=1.0, \
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)

In [None]:
estimator_lgbm = Pipeline(steps = [('preprocessor', preprocessor),
                      ('classifier', lgbm_estimator)])

In [None]:
# Stratified 5-fold grid search for the LightGBM pipeline, scored by accuracy.
#n_folds = 5
scv = StratifiedKFold(n_splits=5)
param_grid = dict(classifier__learning_rate = [0.001,0.1,1],
                  classifier__n_estimators = [100,150],
                  classifier__num_leaves = [10,31,100],
                  classifier__subsample = [0.5,0.8,1],
                  # The estimator is built with subsample_freq=0, which turns
                  # row bagging off so every `subsample` value would train the
                  # same model; resample at every iteration so the subsample
                  # grid actually has an effect.
                  classifier__subsample_freq = [1],
                  classifier__colsample_bytree = [0.5,0.8,1],
                  classifier__min_child_weight = [0.001,1]
                 )
grid_lgbm = GridSearchCV(estimator_lgbm, param_grid = param_grid, cv = scv, scoring= 'accuracy')

In [None]:
grid_lgbm_mdl = grid_lgbm.fit(X_train, y_train.ravel())

In [None]:
grid_lgbm.best_params_, grid_lgbm.best_score_
#({'classifier__colsample_bytree': 0.5,
#  'classifier__learning_rate': 0.1,
##  'classifier__min_child_weight': 0.001,
#  'classifier__n_estimators': 150,
#  'classifier__num_leaves': 31,
#  'classifier__subsample': 0.5},
# 0.7947761770351673)

In [None]:
y_pred = grid_lgbm.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target)
metrics.roc_auc_score(y_test.ravel(), grid_lgbm.predict_proba(X_test)[:, 1])

In [None]:
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')

In [None]:
#save model
#mdl_pkl_name = "mdl_lgbm_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_lgbm_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

In [None]:
#feature importance
pd.Series(grid_lgbm_mdl.best_estimator_.named_steps['classifier'].feature_importances_, encoded_feature_names)\
.sort_values(ascending=True)\
.plot(kind='barh', title='Feature Importances',figsize=(10, 20))

# Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Base learners for the stack: each entry is a (name, pipeline) pair in which
# the shared preprocessor is followed by one classifier with fixed
# hyperparameters.
# NOTE(review): the hyperparameters look like the winners of the grid searches
# above -- confirm they are still current after any re-run.
estimators = [
    ('knn',make_pipeline(preprocessor,KNeighborsClassifier(n_neighbors=10,weights='uniform'))),
    ('nn', make_pipeline(preprocessor,MLPClassifier(hidden_layer_sizes=(40,30),solver = 'adam', activation='relu', alpha=0.1, max_iter = 1000,learning_rate='constant', learning_rate_init=0.001, random_state=1))),
    ('xgb',make_pipeline(preprocessor,XGBClassifier(learning_rate =0.1, n_estimators=100, max_depth=10,\
              min_child_weight=5, gamma=0.1, subsample=1, colsample_bytree=0.8,\
              objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27))),
    ('rf', make_pipeline(preprocessor,RandomForestClassifier(n_estimators=150, random_state=42,max_samples=0.5))),
    ('gbm', make_pipeline(preprocessor,GradientBoostingClassifier(max_depth = 8, subsample=1, random_state=1,learning_rate=0.1, max_features=9, n_estimators=150))),
    ('lgbm',make_pipeline(preprocessor,LGBMClassifier(colsample_bytree=0.5,learning_rate=0.1,min_child_weight=0.001,n_estimators=150,num_leaves=31,subsample=0.5)))
]

# Meta-learner: a logistic regression combines the base learners' outputs.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(C=0.1, max_iter=1000))

In [None]:
stack_mdl = clf.fit(X_train, y_train.ravel())

In [None]:
#save model
#mdl_pkl_name = "mdl_stack_smplimpt_noapr.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(stack_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

In [None]:
stack_mdl.score(X_test, y_test.ravel())

In [None]:
y_pred = stack_mdl.predict(X_test)

In [None]:
# ROC AUC from the positive-class probability, not from hard 0/1 predictions
# (assumes a binary target, as the default f1_score below also requires)
metrics.roc_auc_score(y_test.ravel(), stack_mdl.predict_proba(X_test)[:, 1])
metrics.f1_score(y_test,y_pred)
print('classification report:', metrics.classification_report(y_test, y_pred), sep='\n')