# Bank Customer Churn Model

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [2]:
import tensorflow as tf
print(tf.__version__)

2.1.0


In [0]:
# basic packages that are required for analysis and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Run this cell to upload the dataset when working in Google Colab.
from google.colab import files
import io

# files.upload() opens a file picker and returns a dict that maps each
# uploaded filename to that file's raw bytes.
uploaded = files.upload()

# Prefer the expected name, but fall back to the only uploaded file.
# Colab stores a re-uploaded file under a suffixed key such as
# "bank (1).csv", which made the hard-coded uploaded['bank.csv'] lookup
# fail with KeyError on a second run of this cell.
if 'bank.csv' in uploaded:
  csv_name = 'bank.csv'
elif len(uploaded) == 1:
  csv_name = next(iter(uploaded))
else:
  raise KeyError(
      "Expected 'bank.csv' or exactly one uploaded file, got: {}".format(
          sorted(uploaded)))

df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

Saving bank.csv to bank.csv


In [11]:
# Preview the first rows to see which columns the uploaded file contains.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [0]:
# The head() preview shows RowNumber is only a running row counter, so it
# carries no signal for the churn model and is removed here.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the column is already gone. The redundant axis=1 is
# dropped since `columns=` already names the axis.
df = df.drop(columns=['RowNumber'], errors='ignore')

In [13]:
# Report how many missing (NaN) entries each column holds.
nan_counts = df.isna().sum()
for col, n_missing in nan_counts.items():
  print("Column Name {} has {} NaN Values.".format(col, n_missing))

Column Name CustomerId has 0 NaN Values.
Column Name Surname has 0 NaN Values.
Column Name CreditScore has 0 NaN Values.
Column Name Geography has 0 NaN Values.
Column Name Gender has 0 NaN Values.
Column Name Age has 0 NaN Values.
Column Name Tenure has 0 NaN Values.
Column Name Balance has 0 NaN Values.
Column Name NumOfProducts has 0 NaN Values.
Column Name HasCrCard has 0 NaN Values.
Column Name IsActiveMember has 0 NaN Values.
Column Name EstimatedSalary has 0 NaN Values.
Column Name Exited has 0 NaN Values.


In [14]:
# Show each column's dtype and non-null count plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


*   Every column is fully populated; none of them contains a null value.
*   That is, no row has an empty cell, so the dataset is complete.

In [15]:
# Print the number of distinct values per column to separate identifiers,
# categorical fields and continuous measurements.
distinct_counts = df.nunique()
for col, n_distinct in distinct_counts.items():
  print("{} ---- nunique {}.".format(col, n_distinct))

CustomerId ---- nunique 10000.
Surname ---- nunique 2932.
CreditScore ---- nunique 460.
Geography ---- nunique 3.
Gender ---- nunique 2.
Age ---- nunique 70.
Tenure ---- nunique 11.
Balance ---- nunique 6382.
NumOfProducts ---- nunique 4.
HasCrCard ---- nunique 2.
IsActiveMember ---- nunique 2.
EstimatedSalary ---- nunique 9999.
Exited ---- nunique 2.


In [16]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row and each statistic a column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CustomerId,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48
Exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


### From the summary statistics above we can conclude the following
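* CreditScore, Balance and EstimatedSalary are interval-type (continuous) data. For CreditScore and EstimatedSalary the mean and the 50th percentile are nearly equal, which suggests these features are not distorted by extreme outliers. For Balance the median (about 97,199) is noticeably higher than the mean (about 76,486) because at least a quarter of the customers have a balance of 0.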
* HasCrCard, IsActiveMember & Exited are nominal data: they are binary 0/1 flags whose values are category labels rather than quantities. Age, by contrast, is a numeric variable (18 to 92 in this dataset) and should be treated as quantitative.
* Tenure is an ordinal feature: it takes the integer values 0 through 10.

In [17]:
# List the distinct Tenure values present in the data.
df['Tenure'].unique()

array([ 2,  1,  8,  7,  4,  6,  3, 10,  5,  9,  0])

In [18]:
# Show the column labels currently present in `df`.
df.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')

In [0]:
# Work on a copy without the CustomerId column; `df` itself is left untouched.
dfnoID = df.drop(columns=['CustomerId'])

In [0]:
# required standardizing and normalizing packages
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Normalizer

In [0]:
# Integer-encode the two text columns. Each column gets its own LabelEncoder
# so both fitted encoders remain available after this cell.
# NOTE(review): the resulting Geography codes are arbitrary integers for a
# column with three unordered categories; a one-hot encoding may suit the
# neural network better - confirm before changing, since later cells expect
# the 1-D `Geophy` / `Gen` arrays produced here.
Geography = dfnoID['Geography']
Gender = dfnoID['Gender']

enc_Geography = LabelEncoder()
enc_Gender = LabelEncoder()

Geophy = enc_Geography.fit_transform(Geography)
Gen = enc_Gender.fit_transform(Gender)

In [0]:
# Remove the raw text columns: Geography and Gender were encoded into
# `Geophy` / `Gen`, and Surname is not used as a feature.
dfnoID = dfnoID.drop(columns=['Geography', 'Gender', 'Surname'])

In [0]:
# Append the integer-encoded Geography and Gender arrays as the last two
# columns of the feature frame.
dfnoID = dfnoID.assign(Geography=Geophy, Gender=Gen)

In [24]:
# Split off the label: keep the Exited column as `target` and delete it from
# the feature frame so that `dfnoID` holds features only.
target = dfnoID['Exited']
del dfnoID['Exited']
target.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [0]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dfnoID,
    target,
    test_size=0.30,
    random_state=42,
)

In [0]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [0]:
# Build a tf.data.Dataset of (features, label) pairs from the full feature
# frame and the Exited labels.
#
# `dfnoID_norm` was referenced here without being defined anywhere in the
# visible notebook, so the cell failed on a fresh kernel. It is rebuilt with
# the Normalizer imported earlier, which scales every row to unit norm.
# NOTE(review): Normalizer is inferred from the sample rows printed in this
# notebook, whose values match row-wise unit-norm scaling - confirm this is
# the transformation originally intended.
dfnoID_norm = Normalizer().fit_transform(dfnoID)

# The array below is a stale output that had been pasted into the cell as
# bare code; `array` is not imported, so it raised NameError. It is kept as
# a comment for reference only.
# array([5.65716810e-03, 3.03740569e-04, 2.84756784e-05, 9.17952124e-01,
#        1.89837856e-05, 0.00000000e+00, 0.00000000e+00, 3.96650728e-01,
#        9.49189279e-06, 9.49189279e-06])

dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(dfnoID_norm, tf.float32),
            tf.cast(target.values, tf.int32)
        )
    ))

In [0]:
# Show the feature vector and label of the first element in the dataset.
# take(1) limits the iteration to a single element, so no explicit break
# is needed.
for features_tensor, target_tensor in dataset.take(1):
  print(f'features:{features_tensor} target:{target_tensor}')

features:[6.1075012e-03 4.1440234e-04 1.9733445e-05 0.0000000e+00 9.8667224e-06
 9.8667224e-06 9.8667224e-06 9.9998128e-01 0.0000000e+00 0.0000000e+00] target:1


In [0]:

# Small fully connected binary classifier: three ReLU hidden layers that
# narrow from 10 to 5 to 2 units, followed by a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output and the 0/1
# labels; accuracy is tracked during training.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [91]:
# Train on the standardised training split for 50 epochs using mini-batches
# of 8 rows; the labels are passed as a plain NumPy array.
model.fit(X_train, y_train.values, epochs=50, batch_size=8)

Train on 7000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7ff296d78828>

7. Predicting the results using 0.5 as a threshold value

In [0]:
# The sigmoid output is compared against the 0.5 threshold, which turns the
# predicted scores into boolean class predictions for the test split.
prediction = model.predict(X_test) > 0.5

8. Printing the Accuracy Score and Confusion Matrix

In [93]:
# Fraction of test rows whose thresholded prediction matches the true label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Accuracy Score of the model :{} ".format(accuracy))

Accuracy Score of the model :0.8686666666666667 


In [94]:
# Confusion matrix for the test split: rows are the true classes, columns
# the predicted classes.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=prediction)

array([[2351,   65],
       [ 329,  255]])