In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [184]:
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense,InputLayer,Input
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard,ModelCheckpoint


In [53]:
# Load the churn dataset from a path relative to the working directory
# and preview its first ten rows.
data = pd.read_csv('Data/Churn_Modelling.csv')
data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [186]:
# Print per-column non-null counts and dtypes (quick check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
# Keep every column except RowNumber, CustomerId and Surname.
useful_features = [
    name
    for name in data.columns
    if name not in {'RowNumber', 'CustomerId', 'Surname'}
]
useful_features

['CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [188]:
# Summary statistics for all retained columns (numeric and non-numeric),
# transposed so each column becomes one row.
data[useful_features].describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
CreditScore,10000,,,,650.529,96.6533,350.0,584.0,652.0,718.0,850.0
Geography,10000,3.0,France,5014.0,,,,,,,
Gender,10000,2.0,Male,5457.0,,,,,,,
Age,10000,,,,38.9218,10.4878,18.0,32.0,37.0,44.0,92.0
Tenure,10000,,,,5.0128,2.89217,0.0,3.0,5.0,7.0,10.0
Balance,10000,,,,76485.9,62397.4,0.0,0.0,97198.5,127644.0,250898.0
NumOfProducts,10000,,,,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000,,,,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000,,,,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000,,,,100090.0,57510.5,11.58,51002.1,100194.0,149388.0,199992.0


In [189]:
# Split the retained columns into the feature matrix and the 'Exited' target.
X = data[useful_features].drop(columns=['Exited'])
y = data[useful_features].loc[:, 'Exited']

In [190]:
# Names of the feature columns that contain at least one null value.
missing_cols = X.columns[X.isnull().any()].tolist()
missing_cols

[]

In [191]:
# Class balance of the target: number of rows per 'Exited' value.
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [192]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in X.dtypes.items() if dtype in ('int64', 'float64')]

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

In [194]:
# Numerical columns: impute with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [195]:
# Sanity check: (rows, feature columns) before splitting.
X.shape

(10000, 10)

In [196]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [197]:
# Sanity check: size of the training split.
X_train.shape

(8000, 10)

In [198]:
# Peek at the raw (not yet preprocessed) test rows.
X_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
6252,596,Germany,Male,32,3,96709.07,2,0,0,41788.37
4684,623,France,Male,43,1,0.0,2,1,1,146379.3
1731,601,Spain,Female,44,4,0.0,2,1,0,58561.31
4742,506,Germany,Male,59,8,119152.1,2,1,1,170679.74
4521,560,Spain,Female,27,7,124995.98,1,1,1,114669.79


In [199]:
# Fit the preprocessor on the training split only, then apply the already-fitted
# transform to the test split, so no statistics are learned from test rows.
# NOTE(review): X_train/X_test are overwritten with DataFrames whose columns are
# plain integers, while the preprocessor selects columns by name; re-running this
# cell without first re-running the train/test split will therefore fail.
X_train = pd.DataFrame(preprocessor.fit_transform(X_train))
X_test = pd.DataFrame(preprocessor.transform(X_test))

In [200]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.3565,-0.655786,0.34568,-1.218471,0.808436,0.649203,0.974817,1.36767,1.0,0.0,0.0,0.0,1.0
1,-0.203898,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,1.661254,0.0,1.0,0.0,0.0,1.0
2,-0.961472,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,-0.252807,0.0,0.0,1.0,0.0,1.0
3,-0.940717,-1.131148,1.386753,0.953212,-0.916688,0.649203,-1.025834,0.915393,1.0,0.0,0.0,1.0,0.0
4,-1.397337,1.625953,1.386753,1.057449,-0.916688,-1.540351,-1.025834,-1.0596,1.0,0.0,0.0,0.0,1.0


In [216]:
from tensorflow.keras import backend as K

# Drop any previously built Keras state before defining the model.
K.clear_session()

# Feed-forward binary classifier: 64 -> 32 ReLU units, one sigmoid output unit.
inputs = Input(shape=(X_train.shape[1],))
hidden_1 = Dense(64, activation='relu')(inputs)
hidden_2 = Dense(32, activation='relu')(hidden_1)
outputs = Dense(1, activation='sigmoid')(hidden_2)

model = Model(inputs=inputs, outputs=outputs)
model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 13)]              0         
_________________________________________________________________
dense (Dense)                (None, 64)                896       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 3,009
Trainable params: 3,009
Non-trainable params: 0
_________________________________________________________________


In [217]:
# model arguments
# Loss, optimiser, metric and callbacks consumed by the compile()/fit() cells.
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
metrics = tf.keras.metrics.BinaryAccuracy()
# Stop once val_loss has not improved for 10 epochs and restore the best weights.
earlystopping = EarlyStopping(patience=10,monitor='val_loss',restore_best_weights=True)
# Artefact paths are relative to the working directory rather than one user's
# home directory, so they match the later `load_model(filepath='best_model.h5')`
# and `%tensorboard --logdir log_dir` cells and work on any machine.
modelcheckpoint = ModelCheckpoint(monitor='val_loss',save_best_only=True,filepath='best_model.h5')
tensorboard = TensorBoard(log_dir='log_dir',histogram_freq=1)

In [218]:
# Attach the optimiser, loss and metric configured above to the model.
model.compile(optimizer=opt, loss=loss, metrics=[metrics])

In [219]:
# Train for up to 100 epochs; the callbacks handle early stopping, TensorBoard
# logging and best-model checkpointing.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping and checkpoint selection both see the test data; consider carving a
# separate validation split out of the training data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[earlystopping, tensorboard, modelcheckpoint],
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


In [223]:
import pickle

# Persist the fitted preprocessor so inference can reuse the exact same
# scaling/encoding without refitting.
with open('preprocessor.pkl', mode='wb') as pkl_out:
    pickle.dump(preprocessor, pkl_out)
    

In [222]:
%tensorboard --logdir log_dir

UsageError: Line magic function `%tensorboard` not found.


In [78]:
# Take the single row at position 5 as a one-row DataFrame for a prediction demo.
sample = data.iloc[5:6]

In [79]:
# Show the raw sample row, including its true 'Exited' label.
sample

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1


In [80]:
# Reduce the sample to the same columns used to build X: the retained
# features without the 'Exited' target.
sample = sample[useful_features].drop(columns=['Exited'])
sample

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
5,645,Spain,Male,44,8,113755.78,2,1,0,149756.71


In [60]:
import pickle

# Restore the preprocessor pickled earlier in this notebook.
# NOTE: pickle.load executes arbitrary code from the file; only load files you trust.
with open('preprocessor.pkl', mode='rb') as pkl_in:
    preprocessor = pickle.load(pkl_in)


In [81]:
# Apply the fitted preprocessing to the sample (transform only, no refit).
# NOTE(review): this reassigns `sample`, so the cell cannot be re-run without
# rebuilding the raw sample first.
sample = preprocessor.transform(sample)

In [82]:
# Sanity check: the column count must equal the model's input width.
sample.shape

(1, 13)

In [63]:
from tensorflow.keras.models import load_model

# Load the saved model from the working directory (presumably the checkpoint
# written during training; confirm the file location).
model = load_model('best_model.h5')

In [83]:
# Run the loaded model on the preprocessed one-row sample.
prediction = model.predict(sample)

In [84]:
# Extract the single sigmoid output for the first (and only) sample.
probability = prediction[0][0]

In [85]:
# Display the predicted churn probability.
probability

0.19092612

In [86]:
# Classify at the 0.50 probability cut-off and report the outcome.
print('Customer will Churn' if probability > 0.50 else 'Customer will not churn')

Customer will not churn
