* Hyperparameter tuning of All classifiers for emotional state detection
* 6 fold cross validation with grid-search
* Multiclass classification


In [40]:
import pandas as pd
import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

from pprint import pprint
from sklearn.model_selection import train_test_split

from sklearn import metrics   
from sklearn.feature_selection import SelectFromModel,RFECV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, PredefinedSplit
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler


from sklearn import metrics   
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

pd.options.mode.chained_assignment = None
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#warnings.filterwarnings('always')
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from imblearn.metrics import specificity_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics   

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from catboost import CatBoostClassifier, Pool, cv
from sklearn.neural_network import MLPClassifier


#from pandas_ml import ConfusionMatrix

#import collections

In [41]:
def read_input(p): # 
    #Read input file of each person
    filename='data/NOv_w5_emotionLabel_SelFeat_p'+str(p)+'.csv'
    
    raw_df= pd.read_csv(filename)
    print("The shape of the dataframe is ",raw_df.shape)

    return raw_df

In [42]:
# replace NANs with -999
def prep_data(data):
    return data.fillna(-999)

In [43]:
#drop columns
def drop_cols(data, col_list):
    return data.drop(col_list, axis=1)

In [44]:
# normalize data with minmax
def scale_data(trn_x, tst_x):
    
    sc= StandardScaler()
    scaled_trn_x = sc.fit_transform(trn_x)
    scaled_tst_x = sc.fit_transform(tst_x)
    
    return scaled_trn_x, scaled_tst_x

In [45]:
# oversampling with SMOTE with 'minority' and 'not majority'
def over_sample_SMOTE(X_train, y_train):
    sm=SMOTE(sampling_strategy='not majority', random_state=10) # 'minority'
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [46]:
# oversampling with SMOTENC with 'minority' and 'not majority'
def over_sample_SMOTENC(X_train, y_train):
    
    sm = SMOTENC(sampling_strategy='not majority',random_state=10)
    
    #sm = SMOTENC(sampling_strategy='minority',random_state=10)
    
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [47]:
# oversampling with SVMSMOTE 
def over_sample_SVMSMOTE(X_train, y_train):
    sm=SVMSMOTE(random_state=10)
    
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [48]:
def merge_dataframes(p_list):
    df = pd.DataFrame()
    for p in p_list:
        new_df = read_input(p)
        df=df.append(new_df,ignore_index = True)
        
    #drop all variables that contain all NANs
    df.dropna(axis=1,how='all', inplace=True)
    #reset the index
    df.reset_index(drop=True, inplace=True)
    #drop columns with all zeros in pandas dataframe
    df=df.T[(df!=0).any()].T
    
    #keep columns with missing values < 30%  
    df = df.loc[:, df.isnull().mean() < .3]
    
    print("The shape of the merged dataframe is ",df.shape)
    return df

In [49]:
#drop all columns that contain location information (if any)
def drop_location(df):
    print(df.shape)
    df = df[df.columns.drop(list(df.filter(regex='location')))]
    df = df[df.columns.drop(list(df.filter(regex='latitude')))]
    df = df[df.columns.drop(list(df.filter(regex='lonitude')))]
    print(df.shape)
   
    return df

In [50]:
def select_k_features(X_train_scaled,X_test_scaled,y_train,k):
    selection = SelectKBest(mutual_info_classif, k)
    X_train = selection.fit_transform(X_train_scaled,y_train)
    X_test = selection.transform(X_test_scaled)
    
    return X_train, X_test

In [51]:
def print_results(accu, bl_accu, prec, rec_, spec_, roc_, f1_):   
    print('.....................')
    print("Average Accuracy: %.2f%% (%.2f)" % (np.mean(accu), np.std(accu)))
    print("Average Balanced_accuracy: %.2f%% (%.2f)" % (np.mean(bl_accu),np.std(bl_accu)))
    print("Average Precision: %.2f%% (%.2f)" % (np.mean(prec),np.std(prec)))
    print("Average Recall: %.2f%% (%.2f)" % (np.mean(rec_),np.std(rec_)))
    print("Average Specificity: %.2f%% (%.2f)" % (np.mean(spec_),np.std(spec_)))
    print("Average ROC AUC: %.2f%% (%.2f)" % (np.mean(roc_),np.std(roc_)))
    print("Average F1 score: %.2f%% (%.2f)" % (np.mean(f1_),np.std(f1_)))
    print('..................................................')
    print('\n')

In [52]:
pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', LogisticRegression())])

search_space = [{'selector__k': [ 50, 70, 90]},
                
                {'classifier': [LogisticRegression(solver='lbfgs')],
                 'classifier__C': [0.01, 0.1, 1.0],
                 'classifier__penalty': ['l1', 'l2', None],
                 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 'classifier__max_iter':[100, 150, 200], 
                 'classifier__class_weight':[None, 'balanced']},
                 
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_depth': [5, 10, 30, None],
                 'classifier__criterion':['gini','entropy'], 
                 'classifier__bootstrap': [True],
                 'classifier__max_features':['log2', None],
                 'classifier__n_estimators': [50, 100, 200, 300, 400]},
                
                {'classifier': [MLPClassifier(random_state=1, early_stopping=True)],
                 'classifier__hidden_layer_sizes' : [(50, 50, 50), (50, 100, 50), (20, 20, 20), (30, ), (50,),(100,)], 
                 'classifier__activation' : ['tanh', 'relu', 'logistic'],
                 'classifier__max_iter':[50, 100, 150, 200, 300],
                 'classifier__solver': ['sgd', 'adam', 'lbfgs'],
                 'classifier__alpha': [0.0001, 0.001, 0.05]},
                
                {'classifier': [CatBoostClassifier(random_seed=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2]},
                
                {'classifier': [XGBClassifier(random_state=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
                 'classifier__colsample_bytree':[.5, .75, 1],
                 'classifier__max_depth': np.arange(3, 6, 10),
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]
                  
     
scorers = {
    'precision_score': make_scorer(precision_score, average='macro'),
    'recall_score': make_scorer(recall_score, average='macro'),
    'accuracy_score': make_scorer(accuracy_score, average='macro')
}

scorer = make_scorer(f1_score, average = 'micro')

In [53]:
LR_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', LogisticRegression())])

LR_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [LogisticRegression(solver='lbfgs')],
                 'classifier__C': [0.01, 0.1, 1.0],
                 'classifier__penalty': ['l1', 'l2', None],
                 'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 'classifier__max_iter':[100, 150, 200], 
                 'classifier__class_weight':[None, 'balanced']}]
                 
################################################################################          

RF_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', RandomForestClassifier())])

RF_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_depth': [5, 10, 30, None],
                 'classifier__criterion':['gini','entropy'], 
                 'classifier__bootstrap': [True],
                 'classifier__max_features':['log2', None],
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]
                  
################################################################################

MLP_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', MLPClassifier(random_state=1, early_stopping=True))])

MLP_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [MLPClassifier(random_state=1, early_stopping=True)],
                 'classifier__hidden_layer_sizes' : [(50, 50, 50), (50, 100, 50), (20, 20, 20), (30, ), (50,),(100,)], 
                 'classifier__activation' : ['tanh', 'relu', 'logistic'],
                 'classifier__max_iter':[50, 100, 150, 200, 300],
                 'classifier__solver': ['sgd', 'adam', 'lbfgs'],
                 'classifier__alpha': [0.0001, 0.001, 0.05]}]

################################################################################

CB_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', CatBoostClassifier(random_seed=1))])

CB_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [CatBoostClassifier(random_seed=1, verbose=False)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2]}]
#'iterations': Integer(10, 1000),
 #                'depth': Integer(1, 8),
  #               'learning_rate': Real(0.01, 1.0, 'log-uniform'),
   #              'random_strength': Real(1e-9, 10, 'log-uniform'),
    #             'bagging_temperature': Real(0.0, 1.0),
     #            'border_count': Integer(1, 255),
      #           'l2_leaf_reg': Integer(2, 30),
       #          'scale_pos_weight':Real(0.01, 1.0, 'uniform')

################################################################################

XGB_pipe = Pipeline([('scaler', StandardScaler()), # MinMaxScaler()
                 ('selector', SelectKBest(mutual_info_classif, k=90)), #
                 ('classifier', XGBClassifier(random_state=1))])

XGB_search_space = [{'selector__k': [ 50, 70, 90, 110]},
                
                {'classifier': [XGBClassifier(random_state=1)],
                 'classifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
                 'classifier__colsample_bytree':[.5, .75, 1],
                 'classifier__max_depth': np.arange(3, 6, 10),
                 'classifier__n_estimators': [50, 100, 200, 300, 400]}]


In [54]:
p_list=[8,10,12,13,15,20,21,25, 27, 33,35,40,46,48,49,52,54,55]

In [55]:
# make a predifined CV split (test_fold)
test_fold = []
for i in range(nfolds):
    p_test = p_list[i*3:i*3+3]
    
    df_test = merge_dataframes(p_test)
    
    tst = [i] * df_test.shape[0] 
    
    test_fold= test_fold + tst

The shape of the dataframe is  (269, 274)
The shape of the dataframe is  (1276, 274)
The shape of the dataframe is  (540, 274)
The shape of the merged dataframe is  (2085, 233)
The shape of the dataframe is  (1246, 274)
The shape of the dataframe is  (552, 274)
The shape of the dataframe is  (624, 274)
The shape of the merged dataframe is  (2422, 238)
The shape of the dataframe is  (1269, 274)
The shape of the dataframe is  (258, 274)
The shape of the dataframe is  (318, 274)
The shape of the merged dataframe is  (1845, 179)
The shape of the dataframe is  (863, 274)
The shape of the dataframe is  (912, 274)
The shape of the dataframe is  (753, 274)
The shape of the merged dataframe is  (2528, 237)
The shape of the dataframe is  (756, 274)
The shape of the dataframe is  (1165, 274)
The shape of the dataframe is  (693, 274)
The shape of the merged dataframe is  (2614, 186)
The shape of the dataframe is  (869, 274)
The shape of the dataframe is  (530, 274)
The shape of the dataframe is  (

In [56]:
ps = PredefinedSplit(test_fold)

# df contains all persons' data in one dataset
df = merge_dataframes(p_list)
df = prep_data(df)

# remove day_of_month variable if present in data
if 'day_of_month' in df.columns:
    drop_col=['day_of_month']
    df=drop_cols(df, drop_col)

#drop all columns that contain location information 
df = drop_location(df)


labels = list(df.columns)
labels.remove('emotion')

X = df[labels]
y = df['emotion']


The shape of the dataframe is  (269, 274)
The shape of the dataframe is  (1276, 274)
The shape of the dataframe is  (540, 274)
The shape of the dataframe is  (1246, 274)
The shape of the dataframe is  (552, 274)
The shape of the dataframe is  (624, 274)
The shape of the dataframe is  (1269, 274)
The shape of the dataframe is  (258, 274)
The shape of the dataframe is  (318, 274)
The shape of the dataframe is  (863, 274)
The shape of the dataframe is  (912, 274)
The shape of the dataframe is  (753, 274)
The shape of the dataframe is  (756, 274)
The shape of the dataframe is  (1165, 274)
The shape of the dataframe is  (693, 274)
The shape of the dataframe is  (869, 274)
The shape of the dataframe is  (530, 274)
The shape of the dataframe is  (708, 274)
The shape of the merged dataframe is  (13601, 204)
(13601, 204)
(13601, 191)


In [57]:
def grid_search_wrapper(pipe = pipe, search_space = search_space, verbose= False,refit_score=scorer):
    """
    fits a GridSearchCV classifiers using refit_score for optimization
    prints classifier performance metrics
    """
    #cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cross_validation = ps
    
    grid_search = GridSearchCV(pipe, search_space, cv=cross_validation, verbose=verbose,  n_jobs = -1) #scoring=scorer, refit=scorer
    
    grid_search.fit(X, y)
    
    return grid_search

In [58]:
# do gird search for best parameters
pipeline_grid_search_RF = grid_search_wrapper(pipe = RF_pipe, search_space = RF_search_space, verbose=2)

Fitting 6 folds for each of 84 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 84.0min
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed: 165.1min finished


In [59]:
pipeline_grid_search_XGB = grid_search_wrapper(pipe = XGB_pipe, search_space = XGB_search_space, verbose=2)

Fitting 6 folds for each of 64 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 106.6min
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed: 120.3min finished


In [60]:
pipeline_grid_search_LR = grid_search_wrapper(pipe = LR_pipe, search_space = LR_search_space, verbose=2)

Fitting 6 folds for each of 274 candidates, totalling 1644 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 39.6min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 63.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 100.3min
[Parallel(n_jobs=-1)]: Done 1644 out of 1644 | elapsed: 123.1min finished


In [61]:
pipeline_grid_search_MLP = grid_search_wrapper(pipe = MLP_pipe, search_space = MLP_search_space, verbose=2)

Fitting 6 folds for each of 814 candidates, totalling 4884 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 49.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 76.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 108.9min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 145.9min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 187.4min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 237.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 294.2min
[Parallel(n_jobs=-1)]: Done 4884 out of 4884 | elapsed: 353.3min finished


In [62]:
pipeline_grid_search_CB = grid_search_wrapper(pipe = CB_pipe, search_space = CB_search_space, verbose=False)

In [63]:
print(pipeline_grid_search_RF.best_estimator_)
print(pipeline_grid_search_RF.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a23e49830>)),
                ('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=10, max_features='log2',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=50, n_jobs=None,
                                   

In [64]:
print(pipeline_grid_search_XGB.best_estimator_)
print(pipeline_grid_search_XGB.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a23e49830>)),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.75, gamma=0,
                               learning_rate=0.15, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=None,
                               n_estimators=400, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=1,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
  

In [65]:
print(pipeline_grid_search_LR.best_estimator_)
print(pipeline_grid_search_LR.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a23e49830>)),
                ('classifier',
                 LogisticRegression(C=0.01, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=200,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
0.3995603428246966


In [66]:
print(pipeline_grid_search_CB.best_estimator_)
print(pipeline_grid_search_CB.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a23e49830>)),
                ('classifier',
                 <catboost.core.CatBoostClassifier object at 0x1a4f9c2110>)],
         verbose=False)
0.40175145640320165


In [67]:
print(pipeline_grid_search_MLP.best_estimator_)
print(pipeline_grid_search_MLP.best_score_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selector',
                 SelectKBest(k=90,
                             score_func=<function mutual_info_classif at 0x1a23e49830>)),
                ('classifier',
                 MLPClassifier(activation='logistic', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=True, epsilon=1e-08,
                               hidden_layer_sizes=(50, 50, 50),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=50, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=

In [68]:
# best models

LR_model = LogisticRegression(C=0.01, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=200,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False)

RF_model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=10, max_features='log2',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=50, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False)

MLP_model = MLPClassifier(activation='logistic', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=True, epsilon=1e-08,
                               hidden_layer_sizes=(50, 50, 50),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=50, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False)

XGB_model = XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.75, gamma=0,
                               learning_rate=0.15, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=None,
                               n_estimators=400, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=1,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1)


CB_model = CatBoostClassifier(random_seed=1, verbose=False,learning_rate= 0.1)

In [75]:
best_models = {} # dictionary of best models with best parameters

best_models['Logistic Regression'] = LR_model
best_models['RandomForest Classifier'] = RF_model
best_models['MLP Classifier'] = MLP_model
best_models['XGBoost Classifier'] = XGB_model
best_models['CatBoost Classifier'] = CB_model

n_features = [90, 90, 90, 90, 90]

In [76]:
nfolds = 6
rnd_state=42


In [80]:
# this is to get all the detailed performance meterics after selecting the best model parameters
k_i = -1    
for model_name, model in best_models.items(): 
    k_i = k_i + 1
    accu = []
    prec = []
    rec_ = []
    f1_ = []
    bl_accu = []
    roc_ = []
    spec_ = []

    i = 1
    for train_index, test_index in ps.split():
        #print("fold", i)
        i+=1

        X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        #scale features
        X_train_scaled, X_test_scaled= scale_data(X_train, X_test) 
        #feature selection
        X_train, X_test = select_k_features(X_train_scaled,X_test_scaled,y_train,k=n_features[k_i])

        #oversample training data
        #X_train_imb,y_train_imb=over_sample_SMOTE(X_train, y_train)
        #X_train_imb,y_train_imb=over_sample_SMOTENC(X_train, y_train)
        X_train_imb,y_train_imb=over_sample_SVMSMOTE(X_train, y_train)


        # train model on imbalance-handled data
        model.fit(X_train_imb, y_train_imb)

        #train model on imbalance data 
        #model.fit(X_train, y_train)

        # test model, measure class label and probability score
        y_pred = model.predict(X_test)
        y_scores = model.predict_proba(X_test)

        #calculate metrices
        accuracy = accuracy_score(y_test, y_pred)
        bl_accuracy = balanced_accuracy_score(y_test, y_pred)
        precision=precision_score(y_test, y_pred,  average='macro',labels=np.unique(y_pred)) #'weighted', 'micro', 'micro'
        recall=recall_score(y_test, y_pred,  average='macro',labels=np.unique(y_pred))
        #kappa=cohen_kappa_score(y_pred, y_test)
        spec=specificity_score(y_test, y_pred, average='macro',labels=np.unique(y_pred))
        #roc=roc_auc_score(y_test, y_scores, multi_class='ovr', average='macro')
        f1=f1_score(y_test, y_pred,  average='macro',labels=np.unique(y_pred))

        # sometimes not all classes are present in the test set
        not_present = list(set(model.classes_)-set(y_test.unique()))
        # get that class
        if not_present:
            not_present=not_present[0] # get the element then its index
            ind= list(model.classes_).index(not_present)
            y_scores = np.delete(y_scores,ind,1) # delete it from the scores
            y_scores = y_scores / y_scores.sum(axis=1)[:,None]  #make sure sum equals ro 0 (sum of probabilities)
        else:
            pass


        roc=roc_auc_score(y_test, y_scores, multi_class='ovr', average='macro')

        ac=accuracy * 100.0
        pr=precision*100
        rc=recall*100
        f1_p=f1*100
        bl_ac=bl_accuracy*100
        roc=roc*100
        spec=spec*100
    
        accu.append(ac)
        prec.append(pr)
        rec_.append(rc)
        f1_.append(f1_p)
        bl_accu.append(bl_ac)
        roc_.append(roc)
        spec_.append(spec)
    
    print('Restuls for: ', model_name)
    print_results(accu, bl_accu, prec, rec_, spec_, roc_, f1_)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Restuls for:  Logistic Regression
.....................
Average Accuracy: 32.61% (2.65)
Average Balanced_accuracy: 30.66% (8.82)
Average Precision: 26.48% (2.32)
Average Recall: 28.27% (7.76)
Average Specificity: 82.93% (0.76)
Average ROC AUC: 60.55% (3.41)
Average F1 score: 23.04% (2.82)
..................................................


Restuls for:  RandomForest Classifier
.....................
Average Accuracy: 38.99% (8.71)
Average Balanced_accuracy: 23.27% (2.43)
Average Precision: 25.33% (2.20)
Average Recall: 25.94% (2.08)
Average Specificity: 76.93% (2.63)
Average ROC AUC: 55.43% (4.37)
Average F1 score: 21.81% (3.24)
..................................................




  _warn_prf(average, modifier, msg_start, len(result))


Restuls for:  MLP Classifier
.....................
Average Accuracy: 34.24% (17.81)
Average Balanced_accuracy: 17.85% (4.63)
Average Precision: 29.04% (15.81)
Average Recall: 45.56% (26.76)
Average Specificity: 47.74% (22.16)
Average ROC AUC: 49.46% (7.23)
Average F1 score: 32.78% (20.85)
..................................................


Restuls for:  XGBoost Classifier
.....................
Average Accuracy: 29.63% (15.13)
Average Balanced_accuracy: 24.17% (4.63)
Average Precision: 25.15% (5.78)
Average Recall: 25.40% (6.51)
Average Specificity: 78.49% (1.52)
Average ROC AUC: 54.10% (6.63)
Average F1 score: 19.06% (6.91)
..................................................


Restuls for:  CatBoost Classifier
.....................
Average Accuracy: 39.52% (13.24)
Average Balanced_accuracy: 23.32% (2.91)
Average Precision: 27.18% (10.23)
Average Recall: 27.81% (5.77)
Average Specificity: 75.36% (4.66)
Average ROC AUC: 56.83% (6.14)
Average F1 score: 21.20% (6.95)
......................