* Hyperparameter tuning of All classifiers for emotional transition detection
* 6 fold cross validation with grid-search
* Binary classification

In [1]:
import pandas as pd
import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

from pprint import pprint
from sklearn.model_selection import train_test_split

from sklearn import metrics   
from sklearn.feature_selection import SelectFromModel,RFECV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, PredefinedSplit
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler


from sklearn import metrics   
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

pd.options.mode.chained_assignment = None
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#warnings.filterwarnings('always')
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from imblearn.metrics import specificity_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics   

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from catboost import CatBoostClassifier, Pool, cv
from sklearn.neural_network import MLPClassifier


#from pandas_ml import ConfusionMatrix

#import collections

Using TensorFlow backend.


In [2]:
def read_input(p):
    #Read input file of each person
    filename='data/NonOverlap_w5_emoChange_SelFeat_data_p'+str(p)+'.csv'
    
    raw_df= pd.read_csv(filename)
    print("The shape of the dataframe is ",raw_df.shape)

    return raw_df

In [3]:
# replace NANs with -999
def prep_data(data):
    return data.fillna(-999)

In [4]:
#drop columns
def drop_cols(data, col_list):
    return data.drop(col_list, axis=1)

In [5]:
# normalize data with minmax
def scale_data(trn_x, tst_x):
    
    sc= StandardScaler()
    scaled_trn_x = sc.fit_transform(trn_x)
    scaled_tst_x = sc.fit_transform(tst_x)
    
    return scaled_trn_x, scaled_tst_x

In [6]:
# oversampling with SMOTE with 'minority' and 'not majority'
def over_sample_SMOTE(X_train, y_train):
    sm=SMOTE(sampling_strategy='not majority', random_state=10) # 'minority'
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [7]:
# oversampling with SMOTENC with 'minority' and 'not majority'
def over_sample_SMOTENC(X_train, y_train):
    
    sm = SMOTENC(sampling_strategy='not majority',random_state=10)
    
    #sm = SMOTENC(sampling_strategy='minority',random_state=10)
    
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [8]:
# oversampling with SVMSMOTE 
def over_sample_SVMSMOTE(X_train, y_train):
    sm=SVMSMOTE(random_state=10)
    
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [9]:
def merge_dataframes(p_list):
    df = pd.DataFrame()
    for p in p_list:
        new_df = read_input(p)
        df=df.append(new_df,ignore_index = True)
        
    #drop all variables that contain all NANs
    df.dropna(axis=1,how='all', inplace=True)
    #reset the index
    df.reset_index(drop=True, inplace=True)
    #drop columns with all zeros in pandas dataframe
    df=df.T[(df!=0).any()].T
    
    #keep columns with missing values < 30%  
    df = df.loc[:, df.isnull().mean() < .3]
    
    print("The shape of the merged dataframe is ",df.shape)

    return df

In [10]:
#drop all columns that contain location information (if any)
def drop_location(df):
    print(df.shape)
    df = df[df.columns.drop(list(df.filter(regex='location')))]
    df = df[df.columns.drop(list(df.filter(regex='latitude')))]
    df = df[df.columns.drop(list(df.filter(regex='lonitude')))]
    print(df.shape)
   
    return df

In [11]:
def select_k_features(X_train_scaled,X_test_scaled,y_train,k):
    selection = SelectKBest(mutual_info_classif, k)
    X_train = selection.fit_transform(X_train_scaled,y_train)
    X_test = selection.transform(X_test_scaled)
    
    return X_train, X_test

In [12]:
def print_results(accu, bl_accu, prec, rec_, spec_, roc_, f1_):   
    print('.....................')
    print("Average Accuracy: %.2f%% (%.2f)" % (np.mean(accu), np.std(accu)))
    print("Average Balanced_accuracy: %.2f%% (%.2f)" % (np.mean(bl_accu),np.std(bl_accu)))
    print("Average Precision: %.2f%% (%.2f)" % (np.mean(prec),np.std(prec)))
    print("Average Recall: %.2f%% (%.2f)" % (np.mean(rec_),np.std(rec_)))
    print("Average Specificity: %.2f%% (%.2f)" % (np.mean(spec_),np.std(spec_)))
    print("Average ROC AUC: %.2f%% (%.2f)" % (np.mean(roc_),np.std(roc_)))
    print("Average F1 score: %.2f%% (%.2f)" % (np.mean(f1_),np.std(f1_)))
    print('..................................................')
    print('\n')

In [38]:
pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', LogisticRegression())])

search_space = [{'selector__k': [ 50, 70, 90]},
                
                {'classifier': [LogisticRegression(solver='lbfgs')],
                 'classifier__C': [0.01, 0.1, 1.0],
                 'classifier__penalty': ['l1', 'l2', None],
                 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 'classifier__max_iter':[100, 150, 200], 
                 'classifier__class_weight':[None, 'balanced']},
                 
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_depth': [5, 10, 30, None],
                 'classifier__criterion':['gini','entropy'], 
                 'classifier__bootstrap': [True],
                 'classifier__max_features':['log2', None],
                 'classifier__n_estimators': [50, 100, 200, 300, 400]},
                
                {'classifier': [MLPClassifier(random_state=1, early_stopping=True)],
                 'classifier__hidden_layer_sizes' : [(50, 50, 50), (50, 100, 50), (20, 20, 20), (30, ), (50,),(100,)], 
                 'classifier__activation' : ['tanh', 'relu', 'logistic'],
                 'classifier__max_iter':[50, 100, 150, 200, 300],
                 'classifier__solver': ['sgd', 'adam', 'lbfgs'],
                 'classifier__alpha': [0.0001, 0.001, 0.05]},
                
                {'classifier': [CatBoostClassifier(random_seed=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2]},
                
                {'classifier': [XGBClassifier(objective='binary:logistic', random_state=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
                 'classifier__colsample_bytree':[.5, .75, 1],
                 'classifier__max_depth': np.arange(3, 6, 10),
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]
                  

scorer = make_scorer(f1_score, average = 'binary')

In [39]:
LR_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', LogisticRegression())])

LR_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [LogisticRegression(solver='lbfgs')],
                 'classifier__C': [0.01, 0.1, 1.0],
                 'classifier__penalty': ['l1', 'l2', None],
                 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 'classifier__max_iter':[100, 150, 200], 
                 'classifier__class_weight':[None, 'balanced']}]
                 
################################################################################          

RF_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', RandomForestClassifier())])

RF_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_depth': [5, 10, 30, None],
                 'classifier__criterion':['gini','entropy'], 
                 'classifier__bootstrap': [True],
                 'classifier__max_features':['log2', None],
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]
                  
################################################################################

MLP_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', MLPClassifier(random_state=1, early_stopping=True))])

MLP_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [MLPClassifier(random_state=1, early_stopping=True)],
                 'classifier__hidden_layer_sizes' : [(50, 50, 50), (50, 100, 50), (20, 20, 20), (30, ), (50,),(100,)], 
                 'classifier__activation' : ['tanh', 'relu', 'logistic'],
                 'classifier__max_iter':[50, 100, 150, 200, 300],
                 'classifier__solver': ['sgd', 'adam', 'lbfgs'],
                 'classifier__alpha': [0.0001, 0.001, 0.05]}]

################################################################################

CB_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', CatBoostClassifier(random_seed=1))])

CB_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [CatBoostClassifier(random_seed=1, verbose=False)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2]}]
#'iterations': Integer(10, 1000),
 #                'depth': Integer(1, 8),
  #               'learning_rate': Real(0.01, 1.0, 'log-uniform'),
   #              'random_strength': Real(1e-9, 10, 'log-uniform'),
    #             'bagging_temperature': Real(0.0, 1.0),
     #            'border_count': Integer(1, 255),
      #           'l2_leaf_reg': Integer(2, 30),
       #          'scale_pos_weight':Real(0.01, 1.0, 'uniform')

################################################################################

XGB_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', XGBClassifier(objective='binary:logistic', random_state=1))])

XGB_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [XGBClassifier(objective='binary:logistic', random_state=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
                 'classifier__colsample_bytree':[.5, .75, 1],
                 'classifier__max_depth': np.arange(3, 6, 10),
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]


In [13]:
p_list=[8,10,12,13,15,20,21,25, 27, 33,35,40,46,48,49,52,54,55]
nfolds = 6

In [14]:
# make a predifined CV split (test_fold)
test_fold = []
for i in range(nfolds):
    p_test = p_list[i*3:i*3+3]
    
    df_test = merge_dataframes(p_test)
    
    tst = [i] * df_test.shape[0] 
    
    test_fold= test_fold + tst

The shape of the dataframe is  (268, 274)
The shape of the dataframe is  (1275, 274)
The shape of the dataframe is  (539, 274)
The shape of the merged dataframe is  (2082, 233)
The shape of the dataframe is  (1245, 274)
The shape of the dataframe is  (551, 274)
The shape of the dataframe is  (623, 274)
The shape of the merged dataframe is  (2419, 238)
The shape of the dataframe is  (1268, 274)
The shape of the dataframe is  (257, 274)
The shape of the dataframe is  (317, 274)
The shape of the merged dataframe is  (1842, 179)
The shape of the dataframe is  (862, 274)
The shape of the dataframe is  (911, 274)
The shape of the dataframe is  (752, 274)
The shape of the merged dataframe is  (2525, 237)
The shape of the dataframe is  (755, 274)
The shape of the dataframe is  (1164, 274)
The shape of the dataframe is  (692, 274)
The shape of the merged dataframe is  (2611, 186)
The shape of the dataframe is  (868, 274)
The shape of the dataframe is  (529, 274)
The shape of the dataframe is  (

In [15]:
ps = PredefinedSplit(test_fold)

# df contains all persons' data in one dataset
df = merge_dataframes(p_list)
df = prep_data(df)

# remove day_of_month variable if present in data
if 'day_of_month' in df.columns:
    drop_col=['day_of_month']
    df=drop_cols(df, drop_col)

#drop all columns that contain location information 
df = drop_location(df)


labels = list(df.columns)
labels.remove('emotion_change')

X = df[labels]
y = df['emotion_change']


The shape of the dataframe is  (268, 274)
The shape of the dataframe is  (1275, 274)
The shape of the dataframe is  (539, 274)
The shape of the dataframe is  (1245, 274)
The shape of the dataframe is  (551, 274)
The shape of the dataframe is  (623, 274)
The shape of the dataframe is  (1268, 274)
The shape of the dataframe is  (257, 274)
The shape of the dataframe is  (317, 274)
The shape of the dataframe is  (862, 274)
The shape of the dataframe is  (911, 274)
The shape of the dataframe is  (752, 274)
The shape of the dataframe is  (755, 274)
The shape of the dataframe is  (1164, 274)
The shape of the dataframe is  (692, 274)
The shape of the dataframe is  (868, 274)
The shape of the dataframe is  (529, 274)
The shape of the dataframe is  (707, 274)
The shape of the merged dataframe is  (13583, 204)
(13583, 204)
(13583, 191)


In [43]:
def grid_search_wrapper(pipe = pipe, search_space = search_space, verbose= False,refit_score=scorer):
    """
    fits a GridSearchCV classifiers using refit_score for optimization
    prints classifier performance metrics
    """
    #cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cross_validation = ps
    
    grid_search = GridSearchCV(pipe, search_space, cv=cross_validation, verbose=verbose,  n_jobs = -1) #scoring=scorer, refit=scorer
    
    grid_search.fit(X, y)
    
    return grid_search

In [44]:
# do gird search for best parameters
pipeline_grid_search_LR = grid_search_wrapper(pipe = LR_pipe, search_space = LR_search_space, verbose=2)

Fitting 6 folds for each of 274 candidates, totalling 1644 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 58.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 79.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 105.2min
[Parallel(n_jobs=-1)]: Done 1644 out of 1644 | elapsed: 117.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [45]:
pipeline_grid_search_RF = grid_search_wrapper(pipe = RF_pipe, search_space = RF_search_space, verbose=2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 6 folds for each of 84 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 83.1min
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed: 127.8min finished


In [46]:
pipeline_grid_search_MLP = grid_search_wrapper(pipe = MLP_pipe, search_space = MLP_search_space, verbose=2)

Fitting 6 folds for each of 814 candidates, totalling 4884 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 65.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 95.6min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 133.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 171.1min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 217.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 269.3min
[Parallel(n_jobs=-1)]: Done 4884 out of 4884 | elapsed: 322.7min finished


In [47]:
pipeline_grid_search_XGB = grid_search_wrapper(pipe = XGB_pipe, search_space = XGB_search_space, verbose=2)

Fitting 6 folds for each of 64 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 35.5min
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed: 39.1min finished


In [48]:
pipeline_grid_search_CB = grid_search_wrapper(pipe = CB_pipe, search_space = CB_search_space, verbose=False)

In [49]:
print(pipeline_grid_search_RF.best_estimator_)
print(pipeline_grid_search_RF.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a216088c0>)),
                ('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features=None,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                     

In [50]:
print(pipeline_grid_search_XGB.best_estimator_)
print(pipeline_grid_search_XGB.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a216088c0>)),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.5, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=400, n_jobs=1, nthread=None,
                               objective='binary:logistic', random_state=1,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
   

In [51]:
print(pipeline_grid_search_LR.best_estimator_)
print(pipeline_grid_search_LR.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=50,
                             score_func=<function mutual_info_classif at 0x1a216088c0>)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
0.947902068375397


In [52]:
print(pipeline_grid_search_CB.best_estimator_)
print(pipeline_grid_search_CB.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a216088c0>)),
                ('classifier',
                 <catboost.core.CatBoostClassifier object at 0x1a4c753050>)],
         verbose=False)
0.9491117417230509


In [53]:
print(pipeline_grid_search_MLP.best_estimator_)
print(pipeline_grid_search_MLP.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a216088c0>)),
                ('classifier',
                 MLPClassifier(activation='tanh', alpha=0.001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=True, epsilon=1e-08,
                               hidden_layer_sizes=(50, 50, 50),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=50, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,


In [16]:
# best models

LR_model = LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False)

RF_model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features=None,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False)

XGB_model = XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.5, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=400, n_jobs=1, nthread=None,
                               objective='binary:logistic', random_state=1,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1)

MLP_model = MLPClassifier(activation='tanh', alpha=0.001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=True, epsilon=1e-08,
                               hidden_layer_sizes=(50, 50, 50),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=50, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False)


CB_model = CatBoostClassifier(random_seed=1, verbose=False,learning_rate= 0.1)

In [17]:
best_models = {} # dictionary of best models with best parameters

best_models['Logistic Regression'] = LR_model
best_models['RandomForest Classifier'] = RF_model
best_models['MLP Classifier'] = MLP_model
best_models['XGBoost Classifier'] = XGB_model
best_models['CatBoost Classifier'] = CB_model

n_features = [50, 90, 90, 90, 90]

In [18]:
nfolds = 6
rnd_state=42


In [19]:
# this is to get all the detailed performance meterics after selecting the best model parameters
k_i = -1    
for model_name, model in best_models.items(): 
    k_i = k_i + 1
    accu = []
    prec = []
    rec_ = []
    f1_ = []
    bl_accu = []
    roc_ = []
    spec_ = []

    i = 1
    for train_index, test_index in ps.split():
        #print("fold", i)
        i+=1

        X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        #scale features
        X_train_scaled, X_test_scaled= scale_data(X_train, X_test) 
        #feature selection
        X_train, X_test = select_k_features(X_train_scaled,X_test_scaled,y_train,k=n_features[k_i])

        #oversample training data
        X_train_imb,y_train_imb=over_sample_SMOTE(X_train, y_train)
        #X_train_imb,y_train_imb=over_sample_SMOTENC(X_train, y_train)
        #X_train_imb,y_train_imb=over_sample_SVMSMOTE(X_train, y_train)


        # train model on imbalance-handled data
        model.fit(X_train_imb, y_train_imb)

        #train model on imbalance data 
        #model.fit(X_train, y_train)

        # test model, measure class label and probability score
        y_pred = model.predict(X_test)
        y_scores = model.predict_proba(X_test)[:,1]

        #calculate metrices

        accuracy = accuracy_score(y_test, y_pred)
        bl_accuracy = balanced_accuracy_score(y_test, y_pred)
        precision=precision_score(y_test, y_pred, labels=np.unique(y_pred))
        recall=recall_score(y_test, y_pred, labels=np.unique(y_pred))
        f1=f1_score(y_test, y_pred, labels=np.unique(y_pred))
        roc=roc_auc_score(y_test, y_scores, labels=np.unique(y_pred))
        spec=specificity_score(y_test, y_pred ,labels=np.unique(y_pred))
        
        ac=accuracy * 100.0
        pr=precision*100
        rc=recall*100
        f1_p=f1*100
        bl_ac=bl_accuracy*100
        roc=roc*100
        spec=spec*100
    
        accu.append(ac)
        prec.append(pr)
        rec_.append(rc)
        f1_.append(f1_p)
        bl_accu.append(bl_ac)
        roc_.append(roc)
        spec_.append(spec)
    
    print('Restuls for: ', model_name)
    print_results(accu, bl_accu, prec, rec_, spec_, roc_, f1_)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Restuls for:  Logistic Regression
.....................
Average Accuracy: 87.06% (1.46)
Average Balanced_accuracy: 82.75% (4.03)
Average Precision: 25.27% (5.79)
Average Recall: 77.83% (8.18)
Average Specificity: 87.68% (1.54)
Average ROC AUC: 90.16% (2.77)
Average F1 score: 37.51% (6.37)
..................................................


Restuls for:  RandomForest Classifier
.....................
Average Accuracy: 22.13% (22.16)
Average Balanced_accuracy: 56.41% (10.04)
Average Precision: 6.48% (3.01)
Average Recall: 94.60% (3.76)
Average Specificity: 18.22% (23.34)
Average ROC AUC: 51.68% (20.99)
Average F1 score: 11.97% (5.06)
..................................................






Restuls for:  MLP Classifier
.....................
Average Accuracy: 84.15% (3.64)
Average Balanced_accuracy: 81.96% (4.71)
Average Precision: 22.07% (5.48)
Average Recall: 79.40% (7.85)
Average Specificity: 84.51% (3.67)
Average ROC AUC: 89.52% (3.73)
Average F1 score: 34.01% (6.82)
..................................................


Restuls for:  XGBoost Classifier
.....................
Average Accuracy: 9.91% (8.65)
Average Balanced_accuracy: 51.66% (3.91)
Average Precision: 5.32% (1.47)
Average Recall: 98.30% (2.25)
Average Specificity: 5.02% (9.56)
Average ROC AUC: 66.23% (8.12)
Average F1 score: 10.06% (2.62)
..................................................


Restuls for:  CatBoost Classifier
.....................
Average Accuracy: 55.23% (27.40)
Average Balanced_accuracy: 60.06% (6.78)
Average Precision: 8.90% (3.13)
Average Recall: 65.44% (19.92)
Average Specificity: 54.67% (29.26)
Average ROC AUC: 67.69% (10.05)
Average F1 score: 15.15% (4.70)
..............................