In [48]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import plotly.graph_objects as go
import joblib

### Read and Clean Data

In [None]:
#read csv using pandas
df = pd.read_csv('Bank_Predictions.csv')
df.head()

Unnamed: 0,Number,Customer_ID,Last_Name,Cr_Score,Location,Gender,Age,History,Current_Balance,Num_Of_Products,Has_CrCard,IsActiveMember,Customer_Salary,Acc_Closed
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
#examine shape of data along with column types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Number           10000 non-null  int64  
 1   Customer_ID      10000 non-null  int64  
 2   Last_Name        10000 non-null  object 
 3   Cr_Score         10000 non-null  int64  
 4   Location         10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   History          10000 non-null  int64  
 8   Current_Balance  10000 non-null  float64
 9   Num_Of_Products  10000 non-null  int64  
 10  Has_CrCard       10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  Customer_Salary  10000 non-null  float64
 13  Acc_Closed       10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [None]:
#examine the predicted column to ensure data integrity
df.Acc_Closed.value_counts()

Acc_Closed
0    7963
1    2037
Name: count, dtype: int64

In [44]:
#drop irrelevant columns
df_dropped = df.drop(columns=['Number','Customer_ID','Last_Name'])


#assign variables
x = df_dropped.drop('Acc_Closed',axis=1)
y = df_dropped.Acc_Closed


#encode categorical variables and drop columns that could cause multicollinearity
x = pd.get_dummies(x, columns=['Location','Gender'], drop_first=True)
print("New X Shape: ",x.shape)
print("New column types post encoding:\n", x.dtypes)



New X Shape:  (10000, 11)
New column types post encoding:
 Cr_Score              int64
Age                   int64
History               int64
Current_Balance     float64
Num_Of_Products       int64
Has_CrCard            int64
IsActiveMember        int64
Customer_Salary     float64
Location_Germany       bool
Location_Spain         bool
Gender_Male            bool
dtype: object


In [32]:
#splitting train and test groups, stratifying to ensure even split in train and test
x_train, x_test, y_train, y_test = train_test_split(x
                                                    ,y
                                                    ,test_size=.2
                                                    ,random_state=42
                                                    ,stratify=y
                                                )

#ensuring split 
print('x_train:', x_train.shape)
print('x_test', x_test.shape)

x_train: (8000, 11)
x_test (2000, 11)


In [33]:
#scaling features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

print('x_train shape: ',x_train.shape)

x_train shape:  (8000, 11)


### Build ANN Model

In [None]:
#initializing uising a sequential model as it is the simplest and non-linear topography isnt needed
model = Sequential()
#input layer + first hidden layer, relu is applied to each nodes output
model.add(Dense(units=16, activation='relu', input_dim=x_train.shape[1]))
#second hidden layer and again relu
model.add(Dense(units=8, activation='relu'))
#output layer (binary classification -> 1 neuron, sigmoid)
model.add(Dense(units=1,activation='sigmoid'))
#compile model and loss is binary and adam adjusts weights individually
model.compile(
    optimizer='adam'
    ,loss='binary_crossentropy'
    ,metrics=['accuracy']
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Training

In [38]:
## training in 50 passes through data, batches of 32
history = model.fit(
    x_train
    ,y_train
    ,epochs=50
    ,batch_size=32
    ,validation_data=(x_test,y_test)
    ,verbose=1
)

Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7674 - loss: 0.5095 - val_accuracy: 0.7960 - val_loss: 0.4484
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8095 - loss: 0.4332 - val_accuracy: 0.8105 - val_loss: 0.4256
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8240 - loss: 0.4157 - val_accuracy: 0.8200 - val_loss: 0.4128
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8340 - loss: 0.3995 - val_accuracy: 0.8390 - val_loss: 0.3957
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8434 - loss: 0.3838 - val_accuracy: 0.8480 - val_loss: 0.3814
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8479 - loss: 0.3702 - val_accuracy: 0.8550 - val_loss: 0.3695
Epoch 7/50
[1m250/250[0m 

### Reporting

In [43]:
#predicting probabilities
y_prob = model.predict(x_test)
#convert probs to bianry using .5 threshold
y_pred = (y_prob >= .5).astype(int).ravel()

print('test accuracy:', accuracy_score(y_test,y_pred))

cm = confusion_matrix(y_test,y_pred)
print('\nConfusion Matrix:\n',cm)
print('\nClassification Report:\n',classification_report(y_test,y_pred,digits=4))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 806us/step
test accuracy: 0.86

Confusion Matrix:
 [[1531   62]
 [ 218  189]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8754    0.9611    0.9162      1593
           1     0.7530    0.4644    0.5745       407

    accuracy                         0.8600      2000
   macro avg     0.8142    0.7127    0.7453      2000
weighted avg     0.8505    0.8600    0.8467      2000



In [None]:
#model accuracy plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    y=history.history['accuracy'],
    mode='lines',
    name='Train Accuracy'
))

fig.add_trace(go.Scatter(
    y=history.history['val_accuracy'],
    mode='lines',
    name='Validation Accuracy'
))

fig.update_layout(
    title='Model Accuracy',
    xaxis_title='Epoch',
    yaxis_title='Accuracy'
)

fig.show()

In [47]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    y=history.history['loss'],
    mode='lines',
    name='Train Loss'
))

fig.add_trace(go.Scatter(
    y=history.history['val_loss'],
    mode='lines',
    name='Validation Loss'
))

fig.update_layout(
    title='Model Loss',
    xaxis_title='Epoch',
    yaxis_title='Loss'
)

fig.show()

### Saving Trained Model

In [50]:

# Save trained ANN
model.save("ann_bank_model.h5")

# Save scaler
joblib.dump(scaler, "scaler.pkl")



['scaler.pkl']