In [3]:
# Keras and TensorFlow
# Keras is a high-level, user-friendly API (wrapper) that runs on top of TensorFlow.
# It abstracts away low-level details and gives a clean, Pythonic way to work with neural networks.

In [4]:
# Dropout: randomly drop a few neurons during training so that the model doesn't overfit

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [6]:
# Load the churn dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# Identifier-style columns that are not used as model features.
drop_cols = ['RowNumber', 'CustomerId', 'Surname']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=drop_cols, errors='ignore')
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# Encode the Gender column as integer codes. The fitted encoder stays in
# `lable_gender` so it can be pickled and reused later.
lable_gender = LabelEncoder()
gender_codes = lable_gender.fit_transform(df['Gender'])
df['Gender'] = gender_codes
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# One-hot encoding (OHE) of the Geography column
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
# Double brackets keep Geography as a one-column DataFrame (2-D input).
ohe.fit(df[['Geography']])
geo_encoder = ohe.transform(df[['Geography']])

In [10]:
# Name the one-hot columns from the categories the encoder actually learned
# (ohe.categories_[0] lists the Geography values in output-column order)
# instead of hard-coding them, so labels cannot drift out of sync with the data.
geo_columns = [f"Geo_{category}" for category in ohe.categories_[0]]
# Reuse df's index so a later column-wise concat lines up row-for-row.
ohe_df = pd.DataFrame(geo_encoder.toarray(), columns=geo_columns, index=df.index)

In [11]:
# Replace the raw Geography column with its one-hot columns (joined on the index).
df = pd.concat(
    objs=[df.drop(columns="Geography"), ohe_df],
    axis="columns",
)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_France,Geo_Germany,Geo_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [12]:
# Save the fitted encoders (the scaler is pickled separately once it has been fitted)
for pkl_path, fitted_encoder in (
    ('label_encoder.pkl', lable_gender),
    ('ohe_geo.pkl', ohe),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(fitted_encoder, file)

In [13]:
# Separate the target column (Exited) from the feature columns
y = df['Exited']
X = df.drop(columns="Exited")

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling: fit on the training split only, then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Save the fitted scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

ANN implementation

In [20]:
import tensorflow as tf

In [15]:
# Re-check the fully encoded frame before building the network.
df.head(n=5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_France,Geo_Germany,Geo_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [16]:
# Framework:
# 1. Sequential network
# 2. Dense (for hidden neurons)
# 3. Activation function (sigmoid, tanh, ReLU, Leaky ReLU)
# 4. Optimizer (Back propagation, updating the weights)
# 5. Loss function
# 6. Metrics [accuracy, MSE]
# 7. Training -> Logs folder -> Tensorboard

In [None]:
# Common Activation Functions Used in Hidden Layers (and Why)
# 1. ReLU (Rectified Linear Unit)
# Formula: f(x) = max(0, x)

# Why it's used:
# Simple and fast to compute
# Helps avoid the vanishing gradient problem
# Works well in most deep networks

# 2. Leaky ReLU / Parametric ReLU
# Formula (Leaky ReLU):
# f(x)={ x  if x>0
#      {αx  if x≤0
# Where α is a small constant like 0.01
# Why it's used:
# Solves the "dying ReLU" problem (when neurons get stuck and never activate)
# Allows a small gradient even when x < 0

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [29]:
# Per-sample input shape for the first layer: everything after the row axis.
X_train.shape[1:]

(12,)

In [31]:
## Build ANN model
# The input is declared with an explicit Input object rather than by passing
# `input_shape` to the first Dense layer; recent Keras versions warn against
# the latter for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  ## input layer: one value per feature column
    Dense(64, activation='relu'),               ## HL1: connected with input layer
    Dense(32, activation='relu'),               ## HL2
    Dense(1, activation='sigmoid'),             ## output layer
])

In [None]:
# Trainable parameters (weights + biases)
# Layers: Input-12, HL1-64, HL2-32, output-1
# Weights: 12*64 + 64*32 + 32*1; biases: 64 + 32 + 1
# Total: 12*64 + 64*32 + 32*1 + 64 + 32 + 1

In [33]:
# Each Dense layer has (inputs + 1 bias) * units parameters.
(12 + 1) * 64 + (64 + 1) * 32 + (32 + 1) * 1

2945

In [34]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

In [None]:
import tensorflow

# Adam with an explicit learning rate.
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.01)
# Use the BinaryCrossentropy loss *class*. The lowercase
# `binary_crossentropy` is a function of (y_true, y_pred), so calling it with
# no arguments raises a TypeError.
loss = tensorflow.keras.losses.BinaryCrossentropy()

In [39]:
# Compiling attaches the optimizer, loss and metrics needed for forward and
# backward propagation during training.
# (Passing optimizer='adam' instead would select Adam by name; note that the
# keyword is `metrics`, not `metric`.)
model.compile(
    loss="binary_crossentropy",
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Here’s a concise explanation of the important optimizers in TensorFlow Keras, 
# including how they work and when to use them:

# 1. SGD (Stochastic Gradient Descent)
# How it works: Updates model weights based on the gradient of the loss function with respect to each weight.
# Variants:
# Momentum: Helps accelerate SGD by dampening oscillations.
# Nesterov Momentum: Looks ahead at the next position before computing gradient.
# When to use: Simple and effective for convex problems or fine-tuning pre-trained models.

# 2. Adam (Adaptive Moment Estimation)
# How it works: Combines ideas from Momentum and RMSprop. Maintains moving averages of both gradients and their squares.
# Pros: Fast convergence, good for most problems.
# When to use: Default optimizer for most deep learning tasks (CNNs, Transformers, etc.).

# 3. RMSprop
# How it works: Keeps a moving average of squared gradients to adjust the learning rate dynamically.
# Pros: Works well for recurrent neural networks (RNNs) and non-stationary objectives.
# When to use: Time series, sequences, and RNN-based models.

# 4. Adagrad
# How it works: Adapts the learning rate to each parameter by scaling inversely with the square root of all past squared gradients.
# Pros: Good for sparse data like text (NLP).
# Cons: Learning rate may become too small over time.

# 5. AdamW
# How it works: A variant of Adam that decouples weight decay (L2 regularization) from the gradient update.
# Pros: Helps reduce overfitting, performs better with large-scale models.
# When to use: Transformers, BERT, or other models where regularization is important.

In [40]:
## Setup the tensorboard
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Tensorboard is used to visualize all the logs while training the model.
# Each run gets its own timestamped sub-directory under logs/fit/. The name
# contains no spaces, so it can be passed to `--logdir` without quoting.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# histogram_freq=1 asks the callback to log histograms every epoch.
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [41]:
# setup the early stopping
# Stop once val_loss has gone 5 epochs without improving, then restore the
# best weights seen during training.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [42]:
# Train the model for up to 100 epochs, logging to TensorBoard and stopping early.
# NOTE(review): the test split doubles as validation data here, so early
# stopping selects weights using the test set; consider a separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tensorflow_callback, early_stopping_callback],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 48ms/step - accuracy: 0.8138 - loss: 0.4397 - val_accuracy: 0.8545 - val_loss: 0.3610
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8495 - loss: 0.3608 - val_accuracy: 0.8620 - val_loss: 0.3419
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8488 - loss: 0.3598 - val_accuracy: 0.8555 - val_loss: 0.3701
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8541 - loss: 0.3507 - val_accuracy: 0.8555 - val_loss: 0.3460
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8630 - loss: 0.3404 - val_accuracy: 0.8560 - val_loss: 0.3492
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8685 - loss: 0.3267 - val_accuracy: 0.8675 - val_loss: 0.3439
Epoch 7/100
[1m250

In [43]:
# Save the trained model to disk under the file name 'model.h5'.
model.save('model.h5')
# The .h5 (HDF5) file is compatible with Keras.
# NOTE(review): newer Keras releases treat HDF5 as a legacy format and recommend
# '.keras'; confirm what downstream loaders expect before changing the name.



In [44]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs\fit20250406 - 205945
# tensorboard shows a dashboard which shows the accuracy and loss function wrt epochs

ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
2025-04-06 21:16:42.071143: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 21:16:43.267948: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]