# Author 

1. Muhammad Dhiaz Rafilianza **(First Author)**
2. Ghazali Akmal Rabbani **(Second Author)**
3. Daniel Najoan **(Third Author)**

# Import Library

In [None]:
!pip install imblearn



In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score



# Read Data

In [None]:
# Load the semicolon-separated bank marketing dataset into a DataFrame.
data = pd.read_csv("bank-full.csv", sep=";")

# Simple Data Visualization

In [None]:
# (number of rows, number of columns) of the raw dataset.
data.shape

(45211, 17)

# Model Construction

## Label Encoding For Categorical Data Train

Label encoding transforms non-numerical labels (as long as they are hashable and comparable) into numerical labels. The columns encoded here are the categorical columns: job, marital, education, default, housing, loan, contact, month and poutcome.

In [None]:
# Label-encode the categorical feature columns.
# encode_x holds every column of `data` except the last one (the target, read
# separately as data['y'] further down). The explicit .copy() gives encode_x its
# own storage, so the assignments below neither write back into `data` nor
# trigger pandas' SettingWithCopyWarning.
encode_x = data.iloc[:, :-1].copy()

categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]

le = LabelEncoder()
for column in categorical_columns:
    # fit_transform re-fits the encoder on every call, so after the loop `le`
    # only holds the classes of the last column processed ('poutcome').
    encode_x[column] = le.fit_transform(encode_x[column])

## Split Data

The original data is split into two parts: training data and testing data (a 70/30 split in the cell below). No separate validation set is carved out at this point; validation during hyperparameter tuning is done later through cross-validation on the training data. Holding data out in this way ensures that, once the model has been trained, we still have unseen data against which to validate its predicted output.

In [None]:
# Model inputs: the label-encoded feature frame.
split_input_data = encode_x
# Model target: the 'y' column of the raw frame.
split_output_data = data['y']

We divide the data into 4 parts:
- X_train: the training features (without the output column).
- X_test: the testing features (without the output column). The model uses this data to predict the output.
- y_train: the output values that correspond to X_train.
- y_test: the original output values that correspond to X_test. They are used to validate the accuracy of the predictions generated by the model.

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    split_input_data,
    split_output_data,
    test_size=0.3,
    random_state=10,
)

## Scale Data

Here we transform the features to a common scale, so that a change of "1" in any numeric feature carries the same weight. We use the StandardScaler() class from the scikit-learn library. The transformation is applied to the train and test data as follows:
- X_train is scaled using fit_transform()
- X_test is scaled using transform()

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
# Column names are carried over so the scaled frames stay readable; the row index
# is left as a fresh RangeIndex, exactly as before.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Mix Sampling using SMOTE &amp; Random Under Sampler

Here we resample the training data using SMOTE and Random Under Sampler to make the class distribution of the output more balanced. SMOTE synthesises new, varied 'yes' samples so that the model can learn more variations of the 'yes' class. Random Under Sampler deletes a random sample of the 'no' data so that the model is less biased towards the 'no' (majority) class.

In [None]:
# Rebalance the training split only (the test split is never resampled).
# SMOTE first over-samples the minority class up to minority/majority = 0.15,
# then the majority class is randomly under-sampled until minority/majority = 0.5.
RESAMPLE_SEED = 10  # fixed so that "Restart & Run All" reproduces the same resampled set

# define pipeline
over = SMOTE(sampling_strategy=0.15, random_state=RESAMPLE_SEED)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=RESAMPLE_SEED)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
mixSample_X, mixSample_Y = pipeline.fit_resample(X_train_scaled, y_train)

# Class counts after resampling.
pd.Series(mixSample_Y).value_counts()



no     8374
yes    4187
dtype: int64

In [None]:
# Stray placeholder removed: the bare name `asd` was never defined, so this cell
# raised NameError and stopped "Restart Kernel -> Run All" at this point.

In [None]:
# Display the resampled training features (pandas truncates the middle rows).
pd.DataFrame(mixSample_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.744139,-0.714920,-0.280363,1.044009,-0.13531,-0.377628,0.893093,-0.436666,-0.710219,0.266698,1.159604,-0.778758,-0.573585,-0.411514,-0.304903,0.443520
1,-0.367918,1.728822,-1.925577,1.044009,-0.13531,-0.091325,-1.119705,-0.436666,-0.710219,1.109056,-1.498418,-0.436343,0.395689,-0.411514,-0.304903,0.443520
2,1.231023,-1.325856,-1.925577,-0.298415,-0.13531,0.143449,-1.119705,-0.436666,1.521344,-0.334986,0.827351,1.022810,-0.573585,-0.411514,-0.304903,0.443520
3,-0.744139,1.423354,-0.280363,1.044009,-0.13531,0.320577,0.893093,-0.436666,1.521344,-1.658692,0.162846,-0.405215,-0.250493,-0.411514,-0.304903,0.443520
4,-0.461973,0.812419,-1.925577,-0.298415,-0.13531,-0.503228,-1.119705,-0.436666,-0.710219,1.109056,-0.169407,-0.681481,-0.250493,-0.411514,-0.304903,0.443520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12556,-1.060133,-1.088523,1.364851,1.044009,-0.13531,-0.219291,0.893093,-0.436666,-0.710219,0.584428,-1.830671,-0.281606,-0.250493,2.837236,0.769506,-1.585547
12557,-1.625343,1.117886,1.364851,-0.298415,-0.13531,-0.060418,-1.119705,-0.436666,-0.710219,-0.442449,1.491857,-0.478712,-0.573585,0.872079,0.571280,-0.571014
12558,-0.461973,1.589567,-0.280363,0.840017,-0.13531,0.739308,-1.119705,-0.436666,1.521344,0.434227,0.162846,2.902026,0.248400,-0.411514,-0.304903,0.443520
12559,3.318613,-0.165435,-0.280363,2.386433,-0.13531,0.178643,0.893093,-0.436666,-0.710219,0.904694,1.408308,-0.119465,-0.234245,1.426879,1.279694,-0.571014


In [None]:
# Display the scaled test features.
X_test_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.325079,1.423354,-1.925577,-0.298415,-0.13531,-0.440750,0.893093,-0.436666,1.521344,0.507372,0.827351,0.415802,0.395689,-0.411514,-0.304903,0.44352
1,-0.932250,0.506951,1.364851,1.044009,-0.13531,0.145382,0.893093,-0.436666,1.521344,0.507372,0.827351,-0.739847,-0.573585,-0.411514,-0.304903,0.44352
2,-1.214416,-0.103985,1.364851,1.044009,-0.13531,0.284829,0.893093,-0.436666,1.521344,0.266698,0.162846,2.520873,-0.250493,-0.411514,-0.304903,0.44352
3,-1.214416,1.423354,-0.280363,-0.298415,-0.13531,1.958202,0.893093,-0.436666,-0.710219,-1.057007,0.827351,-0.253463,-0.573585,-0.411514,-0.304903,0.44352
4,0.196415,-0.714920,-0.280363,-0.298415,-0.13531,-0.421749,-1.119705,-0.436666,1.521344,-0.334986,0.827351,0.695959,-0.250493,-0.411514,-0.304903,0.44352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559,-1.684693,0.812419,-0.280363,-0.298415,-0.13531,-0.436241,0.893093,2.290082,0.405563,-0.334986,0.827351,-0.930510,1.688054,3.175350,0.232301,-2.60008
13560,-0.932250,1.423354,-0.280363,-0.298415,-0.13531,-0.278759,-1.119705,-0.436666,-0.710219,0.507372,1.159604,-0.311829,-0.573585,-0.411514,-0.304903,0.44352
13561,-0.650084,1.423354,1.364851,-0.298415,-0.13531,1.404920,0.893093,-0.436666,-0.710219,0.627708,-0.169407,-0.611442,0.072598,-0.411514,-0.304903,0.44352
13562,-0.179807,0.812419,-0.280363,-0.298415,-0.13531,-0.424003,-1.119705,-0.436666,0.405563,-1.658692,-0.833913,4.520886,0.072598,-0.411514,-0.304903,0.44352


In [None]:
# Display the unscaled (label-encoded) test features for comparison.
X_test

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
4393,55,9,0,1,0,0,1,0,2,20,8,365,4,-1,0,3
4438,31,6,2,2,0,1820,1,0,2,20,8,68,1,-1,0,3
11108,28,4,2,2,0,2253,1,0,2,18,6,906,2,-1,0,3
35421,28,9,1,1,0,7449,1,0,0,7,8,193,1,-1,0,3
2667,43,2,1,1,0,59,0,0,2,13,8,437,2,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37075,23,7,1,1,0,14,1,1,1,13,8,19,8,358,1,0
26370,31,9,1,1,0,503,0,0,0,20,9,178,1,-1,0,3
15684,34,9,2,1,0,5731,1,0,0,21,5,101,3,-1,0,3
29313,39,7,1,1,0,52,0,0,1,2,3,1420,3,-1,0,3


# Random Forest from SKLEARN Library

## Model Fit with default Parameters

Here we use Random Forest with its default parameters (no class weight is set).

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook rebuilds the same forest and the reported metrics are reproducible.
model = RandomForestClassifier(random_state=10)

## Train the Model using Training Data and Validation Training Data

In [None]:
# Fit the forest on the resampled training features and labels.
model.fit(mixSample_X, mixSample_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Predict Model using Testing Data

In [None]:
# Predict labels for the scaled hold-out test features.
y_pred = model.predict(X_test_scaled)

### Before evaluation we need to know the value count of each class from the original output data

Here are the count of 'no' data and the count of 'yes' data from the original output data

In [None]:
# Number of rows per class in the hold-out test labels.
y_test.value_counts()

## Evaluation Metrics using Confusion Matrix

Here we build the confusion matrix with the crosstab() method from pandas. Four terms describe the result of the classification process in a confusion matrix:
- True Positive (TP): Represents positive data that is predicted to be correct.
- True Negative (TN): Represents negative data that is predicted to be correct.
- False Positive (FP) Type I Error: Represents negative data but predicted as positive data.
- False Negative (FN) Type II Error: Represents positive data but predicted as negative data.

In [None]:
# Cross-tabulate the actual test labels (rows) against the predicted labels (columns).
# The axes are named explicitly so the printed table cannot be read the wrong way round.
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)

 The conclusion from the above result: 
- True Positive (TP): **1199** 'Yes' predicted data is correct **from 1558** 'Yes' original data 
- True Negative (TN): **10732** 'No' predicted data is correct **from 12006** 'No' original data
- False Negative (FN): **359** 'Yes' predicted data falsely predicted as 'No' data
- False Positive (FP): **1274** 'No' predicted data falsely predicted as 'Yes' data

## Evaluation Metrics using Classification Report

Here we used the classification report from sklearn library in the classification report function we have precision, recall, f1-score and support for the evaluation metrics.

### Precision

Precision is the ability of a classifier not to label an instance positive that is actually negative. For each class, it is defined as the ratio of true positives to the sum of a true positive and false positive.

### Recall

Recall is the ability of a classifier to find all positive instances. For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.

### F1-Score

The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.

### Support

Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing. Support doesn’t change between models but instead diagnoses the evaluation process.

In [None]:
# Per-class precision, recall, F1-score and support on the hold-out test set.
print(classification_report(y_test,y_pred))

 The conclusion from the above result:
- Precision: **97%** of the 'No' predictions are correct &amp; **48%** of the 'Yes' predictions are correct
- Recall: **89%** of the original 'No' data has been found by the model &amp; **77%** of the original 'Yes' data has been found by the model
- F1-Score: The harmonic mean of precision and recall is **93%** for the 'No' class and **59%** for the 'Yes' class

# Model Fit with Best Parameters using Randomized Search CV

We use Randomized Search CV to find the best parameters for our Random Forest model. We then compare the model that uses the default parameters with the model that is given the best parameters found by the search.

## Define Parameters

In [None]:
# Search space for the random-forest hyperparameter search.
# scikit-learn rejects oob_score=True when bootstrap=False (out-of-bag estimates
# need bootstrap sampling), so the space is written as two sub-grids that contain
# only valid combinations: 16 with bootstrap + 8 without = 24 candidates.
# RandomizedSearchCV accepts such a list of dicts for `param_distributions`.
_shared_space = {
    'criterion': ['gini', 'entropy'],
    'warm_start': [True, False],
    'class_weight': ['balanced', 'balanced_subsample'],
}
parameters = [
    {**_shared_space, 'bootstrap': [True], 'oob_score': [True, False]},
    {**_shared_space, 'bootstrap': [False], 'oob_score': [False]},
]

## Find Best Parameter using Randomized Search CV

Here we focus on the Recall evaluation metric, because we want to know how many of the actual 'yes' cases the model manages to identify in its predicted output.

In [None]:
from sklearn.metrics import make_scorer, recall_score

# Score every candidate by recall on the positive class ('yes').
recall_scorer = make_scorer(recall_score, pos_label='yes')

# Despite its name, `grid_model` is a randomized search: it evaluates n_iter
# candidates sampled from `parameters`. Both the sampler and the forests are
# seeded so that a re-run evaluates the same candidates and picks the same winner.
SEARCH_SEED = 10
grid_model = RandomizedSearchCV(
    RandomForestClassifier(random_state=SEARCH_SEED),
    param_distributions=parameters,
    n_iter=20,
    verbose=2,
    scoring=recall_scorer,
    random_state=SEARCH_SEED,
)

grid_model.fit(mixSample_X, mixSample_Y)

## Check Best Parameter using Randomized Search CV

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the search.
grid_model.best_params_

## Input Best Parameter into the Random Forest Function

In [None]:
# Random forest configured with the parameter values reported by the search.
# They are hard-coded here; update them if grid_model.best_params_ changes on a re-run.
# random_state is fixed so the tuned model's metrics are reproducible.
model = RandomForestClassifier(
    class_weight='balanced_subsample',
    bootstrap=True,
    criterion='entropy',
    oob_score=True,
    warm_start=True,
    random_state=10,
)

## Train the Model using Training Data and Validation Training Data

In [None]:
# Fit the tuned forest on the resampled training features and labels.
model.fit(mixSample_X, mixSample_Y)

## Predict Model using Testing Data

In [None]:
# Predict labels for the scaled hold-out test features with the tuned forest.
y_pred = model.predict(X_test_scaled)

## Evaluation Metrics using Confusion Matrix

In [None]:
# Cross-tabulate the actual test labels (rows) against the predicted labels (columns).
# The axes are named explicitly so the printed table cannot be read the wrong way round.
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)

 The conclusion from the above result: 
- True Positive (TP): **1200** 'Yes' predicted data is correct **from 1558** 'Yes' original data 
- True Negative (TN): **10767** 'No' predicted data is correct **from 12006** 'No' original data
- False Negative (FN): **358** 'Yes' predicted data falsely predicted as 'No' data
- False Positive (FP): **1239** 'No' predicted data falsely predicted as 'Yes' data

## Evaluation Metrics using Classification Report

In [None]:
# Per-class precision, recall, F1-score and support on the hold-out test set.
print(classification_report(y_test,y_pred))

 The conclusion from the above result:
- Precision: **97%** of the 'No' predictions are correct &amp; **49%** of the 'Yes' predictions are correct
- Recall: **90%** of the original 'No' data has been found by the model &amp; **77%** of the original 'Yes' data has been found by the model
- F1-Score: The harmonic mean of precision and recall is **93%** for the 'No' class and **60%** for the 'Yes' class

# Evaluation Metrics using Cross Validation

In [None]:
from sklearn.metrics import make_scorer

# 5-fold cross-validation of `model` on the label-encoded data.
# cross_val_score defaults to accuracy for classifiers, but the next cell reports
# the result as "F1_Score", so F1 on the positive class ('yes') is requested
# explicitly. The labels are the strings 'no'/'yes', hence the explicit pos_label.
# random_state pins the shuffle so the folds are the same on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=10)
f1_yes_scorer = make_scorer(f1_score, pos_label='yes')
scores = cross_val_score(model, split_input_data, split_output_data, cv=cv, scoring=f1_yes_scorer)

In [None]:
# Mean and (in parentheses) standard deviation of the per-fold cross-validation scores.
# NOTE(review): the label says F1 — confirm that `scores` was computed with an F1 scorer.
print('F1_Score: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# Manual 5-fold cross-validation that repeats the whole preprocessing chain
# (scaling -> SMOTE / under-sampling -> random forest) inside every fold, so the
# scaler and the resamplers are fitted on that fold's training part only.
#
# Fold-local variable names are used on purpose: the notebook-level X_train,
# X_test, y_train and y_test from the hold-out split are left untouched, so
# earlier cells can still be re-run after this one.
CV_SEED = 10  # same seed as the hold-out split; makes the folds and models reproducible

# Shuffled folds, consistent with the cross_val_score evaluation earlier in this section.
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

feature_names = split_input_data.columns
X = split_input_data.to_numpy()
le = LabelEncoder()
# Classes are sorted alphabetically by the encoder: 'no' -> 0, 'yes' -> 1, so the
# default pos_label=1 of the metric functions below refers to 'yes'.
y = le.fit_transform(split_output_data)

F1 = []
Accuracy = []
Recall = []
Precision = []

for train_index, test_index in kf.split(X):
    y_fold_train = y[train_index]
    y_fold_test = y[test_index]

    # Standard scaling: fit on the training part of the fold, apply to both parts.
    ss = StandardScaler()
    X_fold_train = pd.DataFrame(ss.fit_transform(X[train_index]), columns=feature_names)
    X_fold_test = pd.DataFrame(ss.transform(X[test_index]), columns=feature_names)

    # Sampling: rebalance the training part only.
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.18, random_state=CV_SEED)),
        ('u', RandomUnderSampler(sampling_strategy=0.85, random_state=CV_SEED)),
    ])
    X_fold_resampled, y_fold_resampled = resampler.fit_resample(X_fold_train, y_fold_train)

    # Modelling: same configuration as the tuned model.
    model1 = RandomForestClassifier(
        class_weight='balanced_subsample',
        bootstrap=True,
        criterion='entropy',
        oob_score=True,
        warm_start=True,
        random_state=CV_SEED,
    )
    model1.fit(X_fold_resampled, y_fold_resampled)
    y_pred1 = model1.predict(X_fold_test)

    # Metrics are computed on the untouched (not resampled) test part of the fold.
    F1.append(f1_score(y_fold_test, y_pred1))
    Accuracy.append(accuracy_score(y_fold_test, y_pred1))
    Recall.append(recall_score(y_fold_test, y_pred1))
    Precision.append(precision_score(y_fold_test, y_pred1))

In [None]:
# Average each metric over the cross-validation folds and print one line per metric.
for metric_name, fold_values in (
    ('F1', F1),
    ('Accuracy', Accuracy),
    ('Recall', Recall),
    ('Precision', Precision),
):
    print(metric_name + ' : ' + str(np.mean(fold_values)))