In [75]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score, precision_score, recall_score, f1_score,cohen_kappa_score

%matplotlib inline
plt.style.use('seaborn-white')

In [76]:
dataset="bank-additional-full.csv"
bankdata=pd.read_csv(dataset, delimiter=";")
bankdata["response"] = bankdata.y.map({'no':0, 'yes':1})

bankdata.drop(["y"],axis=1, inplace=True)


In [77]:
bankdata.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,response
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [78]:
cat_bankdata = bankdata.select_dtypes(include=['object']).copy() #dataframe of columns with object type columns
int_float_bankdata= bankdata.select_dtypes(include=['int64','float64']).copy() #dataframe of columns with int64,float64 type columns
cat_bankdata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 10 columns):
job            41188 non-null object
marital        41188 non-null object
education      41188 non-null object
default        41188 non-null object
housing        41188 non-null object
loan           41188 non-null object
contact        41188 non-null object
month          41188 non-null object
day_of_week    41188 non-null object
poutcome       41188 non-null object
dtypes: object(10)
memory usage: 3.1+ MB


In [79]:
#converting categories into numeric types (numeric encoding)

cat_column=cat_bankdata.columns.tolist()

for i in cat_column:
    cat_bankdata[i] = cat_bankdata[i].astype('category')
    cat_bankdata[i] = cat_bankdata[i].cat.codes
    
cat_bankdata.head()    

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,3,1,0,0,0,0,1,6,1,1
1,7,1,3,1,0,0,1,6,1,1
2,7,1,3,0,2,0,1,6,1,1
3,0,1,1,0,0,0,1,6,1,1
4,7,1,3,0,0,2,1,6,1,1


In [80]:
print(cat_bankdata.isnull().values.sum()) #checking for the null values

0


In [81]:
#remove constant features from a dataset

from sklearn.feature_selection import VarianceThreshold

#EX/ threshold=0.1 which means that if the variance of the values in a column is less than 0.1, remove those columns.
#In other words, remove feature column where approximately 99% of the values are similar.

variance_filter = VarianceThreshold(threshold=0.1)
variance_filter.fit(int_float_bankdata)  

#Now to get all the features that are NOT CONSTANT, we can use the get_support() method of the filter
print(int_float_bankdata.columns[variance_filter.get_support()])

#Get the number of constant features with the help of the following script:
constant_columns = [column for column in int_float_bankdata.columns  
                    if column not in int_float_bankdata.columns[variance_filter.get_support()]]

int_float_bankdata_filtered=int_float_bankdata[int_float_bankdata.columns[variance_filter.get_support()]]

int_float_bankdata_filtered.head()  #not include "response"

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0


In [82]:
print(constant_columns)

['response']


In [83]:
#Removing Correlated Features
correlated_features = set()   #creating empty set
correlation_matrix = int_float_bankdata.corr()  

for i in range(len(correlation_matrix.columns)):  
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print(correlation_matrix)

int_float_bankdata.drop(columns=list(correlated_features), axis=1, inplace=True)  


                     age  duration  campaign     pdays  previous  \
age             1.000000 -0.000866  0.004594 -0.034369  0.024365   
duration       -0.000866  1.000000 -0.071699 -0.047577  0.020640   
campaign        0.004594 -0.071699  1.000000  0.052584 -0.079141   
pdays          -0.034369 -0.047577  0.052584  1.000000 -0.587514   
previous        0.024365  0.020640 -0.079141 -0.587514  1.000000   
emp.var.rate   -0.000371 -0.027968  0.150754  0.271004 -0.420489   
cons.price.idx  0.000857  0.005312  0.127836  0.078889 -0.203130   
cons.conf.idx   0.129372 -0.008173 -0.013733 -0.091342 -0.050936   
euribor3m       0.010767 -0.032897  0.135133  0.296899 -0.454494   
nr.employed    -0.017725 -0.044703  0.144095  0.372605 -0.501333   
response        0.030399  0.405274 -0.066357 -0.324914  0.230181   

                emp.var.rate  cons.price.idx  cons.conf.idx  euribor3m  \
age                -0.000371        0.000857       0.129372   0.010767   
duration           -0.027968       

In [84]:
resulting_bankdata = pd.concat([cat_bankdata, int_float_bankdata], axis=1)
resulting_feature_set=resulting_bankdata.drop(["response"],axis=1) #response variable (target variable) excluded
print(resulting_feature_set.head())

   job  marital  education  default  housing  loan  contact  month  \
0    3        1          0        0        0     0        1      6   
1    7        1          3        1        0     0        1      6   
2    7        1          3        0        2     0        1      6   
3    0        1          1        0        0     0        1      6   
4    7        1          3        0        0     2        1      6   

   day_of_week  poutcome  age  duration  campaign  pdays  previous  \
0            1         1   56       261         1    999         0   
1            1         1   57       149         1    999         0   
2            1         1   37       226         1    999         0   
3            1         1   40       151         1    999         0   
4            1         1   56       307         1    999         0   

   emp.var.rate  cons.price.idx  cons.conf.idx  
0           1.1          93.994          -36.4  
1           1.1          93.994          -36.4  
2          

In [85]:
print(correlated_features)

{'euribor3m', 'nr.employed'}


In [86]:
resulting_bankdata.describe()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,response
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,3.72458,1.172769,3.747184,0.208872,1.07172,0.327425,0.365252,4.230868,2.004613,0.930101,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,0.112654
std,3.59456,0.608902,2.136482,0.406686,0.985314,0.723616,0.481507,2.320025,1.397575,0.362886,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,0.316173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.0
25%,0.0,1.0,2.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,0.0
50%,2.0,1.0,3.0,0.0,2.0,0.0,0.0,4.0,2.0,1.0,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,0.0
75%,7.0,2.0,6.0,0.0,2.0,0.0,1.0,6.0,3.0,1.0,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,0.0
max,11.0,3.0,7.0,2.0,2.0,2.0,1.0,9.0,4.0,2.0,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,1.0


In [87]:
resulting_bankdata.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,response
0,3,1,0,0,0,0,1,6,1,1,56,261,1,999,0,1.1,93.994,-36.4,0
1,7,1,3,1,0,0,1,6,1,1,57,149,1,999,0,1.1,93.994,-36.4,0
2,7,1,3,0,2,0,1,6,1,1,37,226,1,999,0,1.1,93.994,-36.4,0
3,0,1,1,0,0,0,1,6,1,1,40,151,1,999,0,1.1,93.994,-36.4,0
4,7,1,3,0,0,2,1,6,1,1,56,307,1,999,0,1.1,93.994,-36.4,0


In [88]:
dict_values=dict(resulting_bankdata["response"].value_counts())
print(dict_values)
print(dict_values[1]/(dict_values[0]+dict_values[1])) #data is imbalanced
#Research on imbalanced classes often considers imbalanced to mean a minority class of 10% to 20%


{0: 36548, 1: 4640}
0.11265417111780131


In [89]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(resulting_feature_set, resulting_bankdata["response"], test_size=0.2, random_state=1, stratify=y)


In [97]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(random_state=0,sampling_strategy="majority",class_weight="balanced",max_depth=None)
 
parameters = {"criterion":["gini","entropy"], "n_estimators":[50,100,150],"max_depth":[10,12,14,16,18] }
classif_RF = RandomizedSearchCV(brf, parameters, cv=5,return_train_score=True, scoring="recall")

classif_RF.fit(X_train, y_train)


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=BalancedRandomForestClassifier(bootstrap=True,
                                                            class_weight='balanced',
                                                            criterion='gini',
                                                            max_depth=None,
                                                            max_features='auto',
                                                            max_leaf_nodes=None,
                                                            min_impurity_decrease=0.0,
                                                            min_samples_leaf=2,
                                                            min_samples_split=2,
                                                            min_weight_fraction_leaf=0.0,
                                                            n_estimators=100,
                                                 

In [98]:
pd.DataFrame(classif_RF.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,8.422221,2.743126,0.699528,0.455411,100,16,gini,"{'n_estimators': 100, 'max_depth': 16, 'criter...",0.966353,0.981157,...,0.971173,0.010035,7,0.999663,0.999663,0.999663,0.999663,1.0,0.999731,0.000135
1,3.067121,0.29383,0.334745,0.180443,50,18,gini,"{'n_estimators': 50, 'max_depth': 18, 'criteri...",0.963661,0.981157,...,0.970904,0.008375,8,0.998653,0.999326,0.999663,1.0,0.999663,0.999461,0.000457
2,3.445561,0.311819,0.718894,0.598763,50,14,entropy,"{'n_estimators': 50, 'max_depth': 14, 'criteri...",0.971736,0.985195,...,0.976023,0.008915,5,0.998316,0.99899,0.999327,0.998653,0.998653,0.998788,0.000343
3,8.699206,0.68705,0.617689,0.126869,150,18,entropy,"{'n_estimators': 150, 'max_depth': 18, 'criter...",0.959623,0.975774,...,0.965786,0.008759,10,0.999326,1.0,1.0,1.0,1.0,0.999865,0.000269
4,5.622032,0.58336,0.387642,0.05776,100,12,entropy,"{'n_estimators': 100, 'max_depth': 12, 'criter...",0.979812,0.987887,...,0.981141,0.007133,4,0.998316,0.998653,0.998653,0.99697,0.99798,0.998114,0.000624
5,2.706845,0.504125,0.384892,0.344234,50,10,gini,"{'n_estimators': 50, 'max_depth': 10, 'criteri...",0.982503,0.993271,...,0.985721,0.006744,2,0.997979,0.997642,0.998316,0.997643,0.997306,0.997777,0.000343
6,8.898312,0.487016,0.731366,0.404577,150,10,gini,"{'n_estimators': 150, 'max_depth': 10, 'criter...",0.981157,0.990579,...,0.984913,0.005349,3,0.998316,0.997642,0.998316,0.997306,0.997306,0.997778,0.000457
7,8.598744,0.519015,0.590647,0.049932,150,16,gini,"{'n_estimators': 150, 'max_depth': 16, 'criter...",0.965007,0.978466,...,0.971713,0.008689,6,0.999326,0.999663,1.0,1.0,1.0,0.999798,0.000269
8,2.749506,0.229087,0.210215,0.021172,50,16,entropy,"{'n_estimators': 50, 'max_depth': 16, 'criteri...",0.966353,0.982503,...,0.970095,0.010593,9,0.99899,0.999663,0.999327,0.99899,0.999663,0.999327,0.000301
9,2.532309,0.159703,0.192029,0.020553,50,10,entropy,"{'n_estimators': 50, 'max_depth': 10, 'criteri...",0.985195,0.991925,...,0.987877,0.003905,1,0.998653,0.997305,0.997643,0.99798,0.996633,0.997643,0.000673


In [100]:
print(classif_RF.best_estimator_)
best=classif_RF.best_estimator_

BalancedRandomForestClassifier(bootstrap=True, class_weight='balanced',
                               criterion='entropy', max_depth=10,
                               max_features='auto', max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_samples_leaf=2,
                               min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=50,
                               n_jobs=1, oob_score=False, random_state=0,
                               replacement=False, sampling_strategy='majority',
                               verbose=0, warm_start=False)


In [101]:
print(classif_RF.best_params_)
print(classif_RF.best_score_)

{'n_estimators': 50, 'max_depth': 10, 'criterion': 'entropy'}
0.9878767880558769


In [102]:
best.fit(X_train, y_train)
y_pred = best.predict(X_test)

In [103]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score ,confusion_matrix,cohen_kappa_score
from imblearn.metrics import geometric_mean_score
#For any classifier, there is always a trade off between true positive rate and true negative rate; and the
#same applies for recall and precision

"""
In many applications such as drug discovery and disease diagnosis, it is desirable
to have a classifier that gives high prediction accuracy over the minority class, while maintaining
reasonable accuracy for the majority class. Weighted Accuracy is often used in such situations.
Weights can be adjusted to suit the application. Here we use equal weights for both true positive rate and
true negative rate.
"""

print(confusion_matrix(y_test, y_pred))
print()
print("f1 : ", f1_score(y_test, y_pred,average="binary"))
print("recall : " ,recall_score(y_test, y_pred))
print("precision : " ,precision_score(y_test, y_pred))
print("weighted_accuracy : " ,balanced_accuracy_score(y_test, y_pred))
print("G_mean : " , geometric_mean_score(y_test, y_pred))
print("Kappa_score : " , cohen_kappa_score(y_test, y_pred))

[[5359 1951]
 [  21  907]]

f1 :  0.47913365029054406
recall :  0.9773706896551724
precision :  0.3173547935619314
weighted_accuracy :  0.8552380124062455
G_mean :  0.8464724844982233
Kappa_score :  0.3723942634661046


In [111]:
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("Area under ROC curve = {:0.2f}".format(roc_auc))

Area under ROC curve = 0.86


In [121]:
#RANDOM_FOREST

from imblearn.over_sampling import SMOTENC #dealing with mixed data type such as continuous and categorical features
from collections import Counter

smote_nc = SMOTENC(categorical_features=[0,1,2,3,4,5,6,7,8,9], random_state=0)

X_train, y_train = smote_nc.fit_resample(X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0, 29238), (1, 29238)]


In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

random_forest_model = RandomForestClassifier()
cv = StratifiedShuffleSplit(n_splits=5, random_state=1)

parameters = {"criterion":["gini","entropy"], "n_estimators":[50,100,150],"max_depth":[10,12,14,16,18] }
classif_RF = RandomizedSearchCV(random_forest_model, parameters,cv=cv, return_train_score=True, scoring="recall")
classif_RF.fit(X_train, y_train)


RandomizedSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=1, test_size=None,
            train_size=None),
                   error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samp...
                                                    min_weight_fraction_leaf=0.0,
                                

In [126]:
pd.DataFrame(classif_RF.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,28.191147,4.198108,0.3634,0.017965,100,18,gini,"{'n_estimators': 100, 'max_depth': 18, 'criter...",0.957934,0.958276,...,0.958276,0.000838,10,0.995098,0.99449,0.994642,0.994452,0.994452,0.994626,0.000246
1,11.309129,1.180561,0.155985,0.022346,50,12,gini,"{'n_estimators': 50, 'max_depth': 12, 'criteri...",0.965458,0.962722,...,0.964706,0.002189,2,0.977008,0.977959,0.979099,0.978377,0.978073,0.978103,0.000676
2,26.420608,1.788691,0.3792,0.059171,100,14,gini,"{'n_estimators': 100, 'max_depth': 14, 'criter...",0.963748,0.961354,...,0.96156,0.002767,4,0.982937,0.983013,0.983393,0.983165,0.982823,0.983066,0.000198
3,13.74602,1.812272,0.203895,0.053063,50,14,entropy,"{'n_estimators': 50, 'max_depth': 14, 'criteri...",0.961696,0.95896,...,0.960602,0.001392,7,0.981417,0.981949,0.982025,0.982975,0.981531,0.981979,0.00055
4,49.887864,7.319308,0.734036,0.163257,150,18,entropy,"{'n_estimators': 150, 'max_depth': 18, 'criter...",0.958618,0.958276,...,0.958482,0.002275,9,0.99411,0.993236,0.994072,0.993996,0.994034,0.993889,0.000329
5,41.392193,6.776052,0.671029,0.350064,150,10,entropy,"{'n_estimators': 150, 'max_depth': 10, 'criter...",0.97264,0.967852,...,0.967305,0.003505,1,0.9745,0.975754,0.975564,0.974576,0.976476,0.975374,0.000748
6,13.639902,2.463469,0.232333,0.036191,50,14,gini,"{'n_estimators': 50, 'max_depth': 14, 'criteri...",0.96409,0.961354,...,0.961286,0.002526,5,0.982975,0.981607,0.982633,0.984039,0.982139,0.982678,0.000822
7,31.404447,2.514398,0.467931,0.091505,100,18,entropy,"{'n_estimators': 100, 'max_depth': 18, 'criter...",0.95725,0.959302,...,0.958687,0.001068,8,0.993654,0.993464,0.99373,0.993958,0.993388,0.993638,0.000202
8,35.91224,1.060644,0.708334,0.14896,150,14,gini,"{'n_estimators': 150, 'max_depth': 14, 'criter...",0.961696,0.961354,...,0.961149,0.000958,6,0.983051,0.983355,0.982937,0.983583,0.984039,0.983393,0.000395
9,39.033736,1.415729,0.487422,0.07735,150,14,entropy,"{'n_estimators': 150, 'max_depth': 14, 'criter...",0.963406,0.961354,...,0.961902,0.002601,3,0.981455,0.982063,0.982633,0.982785,0.982405,0.982268,0.000474


In [127]:
print(classif_RF.best_estimator_)
best=classif_RF.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [129]:
best.fit(X_train, y_train)
y_pred = best.predict(X_test)

In [130]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score ,confusion_matrix,cohen_kappa_score
from imblearn.metrics import geometric_mean_score

print(confusion_matrix(y_test, y_pred))
print()
print("f1 : ", f1_score(y_test, y_pred,average="binary"))
print("recall : " ,recall_score(y_test, y_pred))
print("precision : " ,precision_score(y_test, y_pred))
print("weighted_accuracy : " ,balanced_accuracy_score(y_test, y_pred))
print("G_mean : " , geometric_mean_score(y_test, y_pred))
print("Kappa_score : " , cohen_kappa_score(y_test, y_pred))

[[6658  652]
 [ 228  700]]

f1 :  0.6140350877192983
recall :  0.7543103448275862
precision :  0.5177514792899408
weighted_accuracy :  0.8325587291853389
G_mean :  0.8288734691664994
Kappa_score :  0.5545201325752176
