# Import Library

In [1]:
!pip install imblearn



In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score



# Read Data

In [3]:
data = pd.read_csv("bank-full.csv", delimiter=';')

# Simple Data Visualization

In [4]:
data.shape

(45211, 17)

# Model Construction

## Label Encoding For Categorical Data Train

Label Encoding are used to transform non-numerical&nbsp;labels&nbsp;(as long as they are hashable and comparable) to numerical&nbsp;labels. Label that are being transform to numerical are categorical data column which is job, marital, education, default, housing, loan, contact, month and poutcome.

In [5]:
le = LabelEncoder()
encode_x = data.iloc[ : , :-1]
encode_x.job = le.fit_transform(encode_x.job)
encode_x.marital = le.fit_transform(encode_x.marital)
encode_x.education = le.fit_transform(encode_x.education)
encode_x.default = le.fit_transform(encode_x.default)
encode_x.housing = le.fit_transform(encode_x.housing)
encode_x.loan = le.fit_transform(encode_x.loan)
encode_x.contact = le.fit_transform(encode_x.contact)
encode_x.month = le.fit_transform(encode_x.month)
encode_x.poutcome = le.fit_transform(encode_x.poutcome)

## Split Data

Split data are used to split the original data into 2 different parts which is training data and testing data. Training data will also be divided into 2 parts training data and validation training data. This process to ensure when the model being trained we have a data to validate the predicted output of a model.

In [6]:
split_input_data = encode_x
split_output_data = data['y']

We divided the data into 4 parts:
- X_train is a training data that have no output.
- X_test is a testing data that have no output. This data will be use by the model to predict the output of the X_test data
- y_train is a training data that has the output data from X_train data.
- y_test is the original output data from X_test. This data will be used to validate the accuracy of predicted data that were generated by the model.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(split_input_data, split_output_data, test_size=0.3, random_state=10)

## Scale Data

Here we transform the data to fit within a specific scale using these algorithms a change of "1" in any numeric feature will give the same importance to each data. We used MinMaxScaler() method from Sklearn library. Define the transformation for train and test data:- X_train will be scale using fit_transform()
- X_test will be scale using transform()

In [8]:
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

## Mix Sampling using SMOTE &amp; Random Under Sampler

Here we distributed the data using SMOTE and Random Under Sampler to make the classification output data balance. We used SMOTE to duplicates and variance the 'yes' data for the model to learn more variance of yes data.  We used Random Under Sampler to delete random sample of the 'no' data to make the model not to have low bias towards 'yes' data.

In [9]:
mixSample_X = X_train_scaled
# define pipeline
over = SMOTE(sampling_strategy=0.15)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
mixSample_X, mixSample_Y = pipeline.fit_resample(mixSample_X, y_train)
pd.DataFrame(mixSample_Y).value_counts()



no     8374
yes    4187
dtype: int64

# ANN from SKLEARN Library

## Model Fit with default Parameters

Here we used MLPClassifier() without specifying the parameters 

In [10]:
model = MLPClassifier()

## Train the Model using Training Data and Validation Training Data

In [11]:
model.fit(mixSample_X, mixSample_Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

## Predict Model using Testing Data

In [12]:
y_pred = model.predict(X_test_scaled)

### Before evaluation we need to know the value count of each class from the original output data

Here are the count of 'no' data and the count of 'yes' data from the original output data

In [13]:
y_test.value_counts()

no     12006
yes     1558
Name: y, dtype: int64

## Evaluation Metrics using Confusion Matrix

Here we used confusion matrix which is crosstab() method from pandas There are 4 terms as a representation of the result of the classification process confusion matrix. The four terms: 
- True Positive (TP): Represents positive data that is predicted to be correct.
- True Negative (TN): Represents negative data that is predicted to be correct.
- False Positive (FP) Type I Error: Represents negative data but predicted as positive data.
- False Negative (FN) Type II Error: Represents positive data but predicted as negative data.

In [14]:
confusion_matrix = pd.crosstab(y_test, y_pred)
print (confusion_matrix)

col_0     no   yes
y                 
no     10480  1526
yes      380  1178


 The conclusion from the above result: 
- True Positive (TP): **1178** 'Yes' predicted data is correct **from 1558** 'Yes' original data 
- True Negative (TN): **10480** 'No' predicted data is correct **from 12006** 'No' original data
- False Negative (FN): **380** 'Yes' predicted data falsely predicted as 'No' data
- False Positive (FP): **1526** 'No' predicted data falsely predicted as 'Yes' data

## Evaluation Metrics using Classification Report

Here we used the classification report from sklearn library in the classification report function we have precision, recall, f1-score and support for the evaluation metrics.

### Precision

Precision is the ability of a classifier not to label an instance positive that is actually negative. For each class, it is defined as the ratio of true positives to the sum of a true positive and false positive.

### Recall

Recall is the ability of a classifier to find all positive instances. For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.

### F1-Score

The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.

### Support

Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing. Support doesn’t change between models but instead diagnoses the evaluation process.

In [15]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          no       0.97      0.87      0.92     12006
         yes       0.44      0.76      0.55      1558

    accuracy                           0.86     13564
   macro avg       0.70      0.81      0.73     13564
weighted avg       0.90      0.86      0.87     13564



 The conclusion from the above result:
- Precision: **97%** 'No' predicted data are correctly predict &amp; **44%** 'Yes' predicted data are correctly predict
- Recall: **87%** 'No' original data have been predicted by the model &amp; **76%** 'Yes' original data have been predicted by the model
- F1-Score: The mean from precision and recall for 'No' predicted data is **92%** and for 'Yes' predicted data is **55%**

# Model Fit with Best Parameters using Random Search CV

We used Random Search CV from to find best parameter for our ANN model here we will compare the model that use default parameter and the model that will be given the best parameter from Random Search CV

## Define Parameters

In [16]:
parameters = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu','logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive','invscaling'],
}

## Find Best Parameter using RandomSearch CV

Here we want to focus the result based on the Recall evaluation metric because we would like to know if the result of the predicted output made by the model already predict most of the correct data inside the predicted output.

In [17]:
from sklearn.metrics import precision_score, recall_score, make_scorer

recall_scorer = make_scorer(recall_score,pos_label='yes')

grid_model = RandomizedSearchCV(MLPClassifier(), param_distributions = parameters,n_iter=20,verbose=2,scoring=recall_scorer)

grid_model.fit(mixSample_X, mixSample_Y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.6s remaining:    0.0s


[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu, total=  20.6s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu, total=  18.5s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu, total=  18.5s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu, total=  18.7s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=relu, total=  18.4s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   0.9s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   1.4s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   0.8s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   1.4s
[CV] solver=sg



[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh, total=  29.0s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh, total=  26.8s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh, total=  26.7s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh, total=  26.9s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=tanh, total=  26.9s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic, total=  13.4s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic, total=  11.0s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic, total=  14.1s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50



[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  37.9s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  37.8s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  38.0s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  38.0s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  37.8s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=  36.7s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=  36.8s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=  36.5s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=  28.3s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=  27.8s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=   2.0s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 50, 50), alpha=0



[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.4s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.5s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.4s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.5s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.3s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=   2.6s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=   2.5s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh, total=   2.5s
[CV] solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=tanh 
[CV]  solver=sgd, learning_rate=invscaling, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=



[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=  10.5s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=  10.5s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=  10.6s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=  10.6s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=  10.5s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  34.0s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  34.4s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  33.8s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  34.2s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.0001, activation=tanh, total=  34.0s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh, total=  15.5s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh, total=  15.2s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh, total=  15.4s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh, total=  15.1s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(100,), alpha=0.0001, activation=tanh, total=  15.4s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   9.6s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   9.5s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   9.5s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   9.7s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(100,), alpha=0.0001, activation=relu, total=   9.6s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.9s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  16.0s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=   4.7s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 




[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=  15.9s
[CV] solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh 
[CV]  solver=adam, learning_rate=invscaling, hidden_layer_sizes=(100,), alpha=0.05, activation=tanh, total=   3.8s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic, total=  13.3s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic 




[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic, total=  25.2s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic, total=  10.8s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic, total=  13.0s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=logistic, total=   8.9s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu, total=  18.0s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu, total=  18.0s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu, total=  18.2s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu, total=  17.9s
[CV] solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu 




[CV]  solver=sgd, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=relu, total=  17.9s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=   2.1s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=   2.3s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=   2.3s
[CV] solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=sgd, learning_rate=constant, hidden_layer_sizes=(50, 100, 50), alp



[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=  29.5s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=  29.5s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=  29.6s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=  29.4s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh 




[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 50, 50), alpha=0.0001, activation=tanh, total=  29.5s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=  10.9s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=  14.9s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic, total=  26.2s
[CV] solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation=logistic 
[CV]  solver=adam, learning_rate=adaptive, hidden_layer_sizes=(50, 100,

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 26.7min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100,),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_fun=15000, max_iter=200,
                                           momentum=0.9, n_iter_no_change=10,
                                           nesterovs_momentum=True, power_t=0.5,
                                           ran...
                   param_distributions={'activation': ['tanh', 'relu',
                                                       'logistic'],
                                        

## Check Best Parameter using Random Search CV

In [18]:
grid_model.best_params_

{'activation': 'tanh',
 'alpha': 0.0001,
 'hidden_layer_sizes': (50, 50, 50),
 'learning_rate': 'adaptive',
 'solver': 'adam'}

## Input Best Parameter into the MLPClassifier Function

In [25]:
model = MLPClassifier(hidden_layer_sizes=(50,50,50),activation='tanh',alpha=0.0001,learning_rate='adaptive',solver='adam')

## Train the Model using Training Data and Validation Training Data

In [26]:
model.fit(mixSample_X, mixSample_Y)



MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 50, 50), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

## Predict Model using Testing Data

In [27]:
y_pred = model.predict(X_test_scaled)

## Evaluation Metrics using Confusion Matrix

In [28]:
confusion_matrix = pd.crosstab(y_test, y_pred)
print (confusion_matrix)

col_0     no   yes
y                 
no     10470  1536
yes      364  1194


 The conclusion from the above result: 
- True Positive (TP): **1194** 'Yes' predicted data is correct **from 1558** 'Yes' original data 
- True Negative (TN): **10470** 'No' predicted data is correct **from 12006** 'No' original data
- False Negative (FN): **364** 'Yes' predicted data falsely predicted as 'No' data
- False Positive (FP): **1536** 'No' predicted data falsely predicted as 'Yes' data

## Evaluation Metrics using Classification Report

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          no       0.97      0.87      0.92     12006
         yes       0.44      0.77      0.56      1558

    accuracy                           0.86     13564
   macro avg       0.70      0.82      0.74     13564
weighted avg       0.91      0.86      0.88     13564



 The conclusion from the above result:
- Precision: **97%** 'No' predicted data are correctly predict &amp; **44%** 'Yes' predicted data are correctly predict
- Recall: **87%** 'No' original data have been predicted by the model &amp; **77%** 'Yes' original data have been predicted by the model
- F1-Score: The mean from precision and recall for 'No' predicted data is **92%** and for 'Yes' predicted data is **56%**

# Evaluation Metrics using Cross Validation

In [30]:
cv = KFold(n_splits=5, shuffle=True)
scores = cross_val_score(model, split_input_data, split_output_data, cv=cv)

In [31]:
print('F1_Score: %.3f (%.3f)' % (mean(scores), std(scores)))

F1_Score: 0.889 (0.004)


In [32]:
kf= KFold(n_splits=5)
X = split_input_data.to_numpy()
y = split_output_data
le = LabelEncoder()
y = le.fit_transform(y)
F1=[]
Accuracy=[]
Recall=[]
Precision=[]
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train = X[train_index]
    y_train = y[train_index]  # Based on your code, you might need a ravel call here, but I would look into how you're generating your y
    X_test = X[test_index]
    y_test = y[test_index]  # See comment on ravel and  y_train
    
    #Create the Dataframe
    X_train = pd.DataFrame(X_train,columns=data.drop('y',axis=1).columns)
    X_test = pd.DataFrame(X_test,columns=data.drop('y',axis=1).columns)
    

    #Standard Scalling
    ss = MinMaxScaler()
    X_train= ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    
    X_train = pd.DataFrame(X_train, columns = split_input_data.columns)
    X_test = pd.DataFrame(X_test , columns = split_input_data.columns)
    

    # Sampling
    over = SMOTE(sampling_strategy = 0.18)
    under = RandomUnderSampler(sampling_strategy=0.85)
    steps = [('o',over),('u',under)]
    pipeline = Pipeline(steps=steps)
    
    X_train,y_train_s =pipeline.fit_resample(X_train,y_train)
    
    #Modelling
    model1 = MLPClassifier(hidden_layer_sizes=(50,100,50),activation='tanh',alpha=0.0001,learning_rate='adaptive',solver='adam')
    model1.fit(X_train,y_train_s)
    y_pred1 = model1.predict(X_test)
    
    F1.append(f1_score(y_test, y_pred1))
    Accuracy.append(accuracy_score(y_test, y_pred1))
    Recall.append(recall_score(y_test, y_pred1))
    Precision.append(precision_score(y_test, y_pred1))



In [33]:
print('F1 : '+str(np.mean(F1)))
print('Accuracy : '+str(np.mean(Accuracy)))
print('Recall : '+str(np.mean(Recall)))
print('Precision : '+str(np.mean(Precision)))

F1 : 0.28927563501399245
Accuracy : 0.6312180250748696
Recall : 0.8195134926329791
Precision : 0.2083441420521376
