In [1]:
# imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load data
data = pd.read_csv(r"..\data\Churn_Modelling.csv")

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df = data.copy()

In [5]:
# drop unnecessary columns
cols_drop = [
    "RowNumber",
    "CustomerId",
    "Surname",
]

df = df.drop(columns=cols_drop)

In [6]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# Encode categorical variables
def encode_categorical(data, columns):
    encoders = {}
    for col in columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        encoders[col] = le
    return data, encoders

In [8]:
# NOTE: Only encode Gender col. since its binary
data_cat_encoded, gender_encoder = encode_categorical(df, ["Gender"])

In [9]:
data_cat_encoded.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [10]:
for i, val in enumerate(gender_encoder["Gender"].classes_):
    print(f"{i}: {val}")

0: Female
1: Male


In [11]:
onehot_encoder_geo = OneHotEncoder(sparse_output=False)
geo_encoder = onehot_encoder_geo.fit_transform(data_cat_encoded[["Geography"]])
onehot_encoder_geo.get_feature_names_out(["Geography"])
geo_encoded_df = pd.DataFrame(geo_encoder, columns=onehot_encoder_geo.get_feature_names_out(["Geography"]))



In [12]:
data_cat_encoded.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [13]:
df_2 = pd.concat([data_cat_encoded, geo_encoded_df], axis=1)
df_2 = df_2.drop(columns=["Geography"])

In [14]:
df_2.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [15]:
# Save the encoders
with open(r"../models/geo_encoder.pkl", "wb") as f:
    pickle.dump(onehot_encoder_geo, f)

with open(r"../models/gender_encoder.pkl", "wb") as f:
    pickle.dump(gender_encoder, f)



In [16]:
# Split data into dependents and target features
X = df_2.drop(columns=["Exited"])
y = df_2["Exited"]

In [17]:
# Split data into train and test sets
# reserve 20% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Save the scaler
with open(r"../models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

## ANN Building & Training

### Aspects of the Artificial Neural Network
1. Create Sequential model
2. Input layer - Dimention (n x m)
   1. n: Number of sample datapoints
   2. m: Number of independent features
3. Dense layer (64 Neurons) - Hidden layer 1
   1. Activation - ReLu
4. Optimizer - Adam (Backpropagation & weight updation)
5. Loss function - 
6. Output layer (1 Neuron)
   1. Activation - Sigmoid (Binary classification)
7. Metrics - To quantify loss
   1. Accuracy (for classification)
8. Logs - Storing training info. & vizualizing with `Tensorboard`

In [24]:
# imports

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime


In [25]:
X_train_scaled.shape[1]

12

#### Note
Not using Functional API style of Keras for model building since it is not a very complicated network.

For reference:

```py
# Input layer
inputs = Input(shape=(X_train_scaled.shape[1],))

# Hidden layers
x1 = Dense(64, activation="relu)(inputs) ## layer 1
x2 = Dense(32, activation="relu)(x1) ## layer 2

# Output layer
outputs = Dense(1, activation="sigmoid)(x2)

# Build the model
model = Model(inputs=inputs, outputs=outputs)
```

In [26]:
# Build the model
model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1],))) ## Input layer with the number of features
model.add(Dense(64, activation="relu")) ## Hidden layer 1 with 64 neurons and ReLU activation
model.add(Dense(32, activation="relu")) ## Hidden layer 2 with 32 neurons and ReLU activation
model.add(Dense(1, activation="sigmoid")) ## Output layer with 1 neuron and sigmoid activation for binary classification


In [27]:
# model summary
model.summary()

In [29]:
# load optimizer
lr = 0.01 ## learning rate
opt_adam = tf.keras.optimizers.Adam(learning_rate=lr)

# load loss function for binary classification
loss_accuracy = tf.keras.losses.BinaryCrossentropy()

In [30]:
# compile the model
model.compile(
    optimizer=opt_adam,
    loss=loss_accuracy,
    metrics=["accuracy"]
)

In [31]:
# setup tensorboard to view training progress
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y-%m-%d (%H-%M-%S)")

# log & store training progress — including metrics, graphs, and more
# histogram_freq=1 (True) will log the weights and biases histograms after every single epoch
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [34]:
# set up early stopping
# Monitor validation loss
# patience=5 - Stop after 5 epochs with no improvement
# Restore the best weights after stopping
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=10,          
    restore_best_weights=True  
)

### Note: Epoch & Bacthing
1. Epoch = Iteration over entire training set i.e. NN goes through entire training set once in a single epoch
2. Batch size = No. of examples considered for each step of updating weights (One epoch has multiple steps of gradient updates since NN needs to go through entire training set).

Example,
Training Dataset has 1000 samples
Batch size = 50 samples
Epochs = 100

Since batch size is 50 and there are 1000 samples,
model will take 1000/50 steps to go through entire dataset once.
i.e. For one epoch, weights are updated 20 times by considering the
dataset of 1000 examples in batches/chunks of 50 examples.

Steps per epoch = 1000 / 50 = 20
$\rightarrow$ So each epoch has 20 gradient updates

Total weight updates across training = 100 epochs × 20 steps/epoch = 2000 updates

In [35]:
# train the model
# Use test dataset for validation
history = model.fit(
    X_train_scaled, 
    y_train, 
    epochs=100, 
    batch_size=50, 
    validation_data=(X_test_scaled, y_test), 
    callbacks=[tensorboard_callback, early_stopping]
)

Epoch 1/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8685 - loss: 0.3309 - val_accuracy: 0.8660 - val_loss: 0.3425
Epoch 2/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8664 - loss: 0.3229 - val_accuracy: 0.8660 - val_loss: 0.3554
Epoch 3/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8701 - loss: 0.3146 - val_accuracy: 0.8610 - val_loss: 0.3496
Epoch 4/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8730 - loss: 0.3086 - val_accuracy: 0.8565 - val_loss: 0.3557
Epoch 5/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8681 - loss: 0.3092 - val_accuracy: 0.8620 - val_loss: 0.3458
Epoch 6/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8704 - loss: 0.3055 - val_accuracy: 0.8610 - val_loss: 0.3534
Epoch 7/100
[1m160/16

In [36]:
# save the model as an HDF5 file
model.save(r"../models/churn_model.h5")



In [37]:
# load Tensorboard extension in Jupyter Notebook
%load_ext tensorboard

In [41]:
# log directory
%tensorboard --logdir logs/fit --port 6012

Reusing TensorBoard on port 6012 (pid 16388), started 0:02:16 ago. (Use '!kill 16388' to kill it.)