In [1]:
%matplotlib inline 
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import numpy as np 

In [2]:
# Read the raw churn CSV; the trailing expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer="../Datasets/Churn_Modelling_21_14.csv")
df.shape

(10000, 14)

In [3]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Drop the customer id and surname columns before modelling.
# NOTE(review): 'RowNumber' is kept and therefore ends up as a model feature; it looks
# like a plain row index - confirm whether it should be dropped as well. Removing it
# changes the number of feature columns, so the network's input size must match.
df_clean = df.drop(columns=['CustomerId', 'Surname'])
df_clean.columns

Index(['RowNumber', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')

In [6]:
df_clean.dtypes

RowNumber            int64
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [7]:
# One-hot encode the two categorical columns, dropping the first level of each.
# `columns=` is passed by keyword because the second positional parameter of
# get_dummies is `prefix`, not `columns`.
# dtype='uint8' keeps the indicator columns numeric (newer pandas defaults to bool,
# which turns the mixed frame into an object array when handed to Keras).
df_clean = pd.get_dummies(
    df_clean,
    columns=['Geography', 'Gender'],
    drop_first=True,
    dtype='uint8',
)
df_clean.columns

Index(['RowNumber', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited',
       'Geography_Germany', 'Geography_Spain', 'Gender_Male'],
      dtype='object')

In [8]:
df_clean.head()

Unnamed: 0,RowNumber,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,1,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,2,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,3,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,4,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,5,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [9]:
# Continuous columns rescaled with MinMaxScaler; the 0/1 indicator columns and
# RowNumber are not in this list and stay untouched.
scaled_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scalar = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down, so test-set min/max values influence the training features.
df_clean[scaled_cols] = scalar.fit_transform(df_clean[scaled_cols])

In [10]:
df_clean.head()

Unnamed: 0,RowNumber,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,1,0.538,0.324324,0.2,0.0,0.0,1,1,0.506735,1,0,0,0
1,2,0.516,0.310811,0.1,0.334031,0.0,0,1,0.562709,0,0,1,0
2,3,0.304,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1,0,0,0
3,4,0.698,0.283784,0.1,0.0,0.333333,0,0,0.46912,0,0,0,0
4,5,1.0,0.337838,0.2,0.500246,0.0,1,1,0.3954,0,0,1,0


In [11]:
def ANN(X_train,y_train,X_test,y_test,loss,weights, epochs):
    """Train a small dense binary classifier and report its test-set performance.

    The network is Dense(12, relu) -> Dense(8, relu) -> Dense(1, sigmoid), compiled
    with the Adam optimizer and an accuracy metric.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for evaluation.
    loss : loss passed to ``model.compile`` (e.g. 'binary_crossentropy').
    weights : class-weight mapping forwarded to ``model.fit`` as ``class_weight``;
        pass -1 (or None) to train without class weights.
    epochs : number of training epochs.

    Returns
    -------
    The rounded sigmoid outputs of ``model.predict(X_test)`` (values 0.0 / 1.0),
    in the same shape that ``predict`` returns.

    Side effects: prints the result of ``model.evaluate`` and a classification report.
    """
    # Size the input layer from the data instead of hard-coding 12 features, so the
    # function keeps working if a column is added or removed upstream.
    n_features = np.shape(X_train)[1]

    model = keras.Sequential([
        keras.Input(shape=(n_features,)),
        keras.layers.Dense(12, activation='relu'),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    # -1 is the notebook's "no class weights" sentinel; None is accepted as well.
    if weights is None or weights == -1:
        class_weight = None
    else:
        class_weight = weights
    model.fit(X_train, y_train, class_weight=class_weight, epochs=epochs)

    print(model.evaluate(X_test, y_test))

    # Round the sigmoid outputs to hard 0/1 predictions.
    y_preds = np.round(model.predict(X_test))

    print('classification report: \n ', classification_report(y_test, y_preds))

    return y_preds

# Step 1: Create an input dataset to be used later for undersampling, oversampling, etc.

In [12]:
# Store the number of rows per Exited class in two variables.
# The counts are looked up by label: value_counts() orders by frequency, so positional
# unpacking would silently swap the two if class 1 ever outnumbered class 0.
exited_counts = df_clean.Exited.value_counts()
count_exited_0 = int(exited_counts.loc[0])
count_exited_1 = int(exited_counts.loc[1])

count_exited_0 , count_exited_1

(7963, 2037)

In [13]:
# Partition the encoded frame by the Exited label and show both shapes.
df_exited_0 = df_clean.loc[df_clean['Exited'] == 0]
df_exited_1 = df_clean.loc[df_clean['Exited'] == 1]

df_exited_0.shape, df_exited_1.shape

((7963, 13), (2037, 13))

In [14]:
# Feature matrix = every column except the target; target = the Exited column.
X = df_clean.drop(columns=['Exited'])
y = df_clean['Exited']
X.shape, y.shape

((10000, 12), (10000,))

# Create DS for standard model

In [15]:
# 80/20 split, stratified on the target so both parts keep the same class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=15,
)
X_train.shape, X_test.shape

((8000, 12), (2000, 12))

In [16]:
y_train.value_counts() , y_test.value_counts()

(0    6370
 1    1630
 Name: Exited, dtype: int64,
 0    1593
 1     407
 Name: Exited, dtype: int64)

In [17]:
# Baseline: train on the imbalanced training split, without class weights.
y_pred = ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.47357210516929626, 0.7879999876022339]
classification report: 
                precision    recall  f1-score   support

           0       0.81      0.96      0.88      1593
           1       0.42      0.12      0.18       407

    accuracy                           0.79      2000
   macro avg       0.62      0.54      0.53      2000
weighted avg       0.73      0.79      0.74      2000



# Method 1: Model for undersampling

## Exited customers are far fewer in the dataset than customers who are still active, so we undersample the majority class

In [18]:
# Work on an independent copy so the training features themselves are not modified.
df_consol = X_train.copy(deep=True)
df_consol.shape

(8000, 12)

In [19]:
# Re-attach the training labels (aligned on the index) as the 'Exited' column.
df_consol = df_consol.assign(Exited=y_train)
df_consol.shape

(8000, 13)

In [20]:
# Training rows split by class. Despite the "count_" prefix these are DataFrames,
# and they rebind the names that held integer counts earlier in the notebook.
count_exited_0 = df_consol.loc[df_consol['Exited'] == 0]
count_exited_1 = df_consol.loc[df_consol['Exited'] == 1]
count_exited_0['Exited'].value_counts(), count_exited_1['Exited'].value_counts()

(0    6370
 Name: Exited, dtype: int64,
 1    1630
 Name: Exited, dtype: int64)

In [21]:
# Randomly keep as many majority-class rows as there are minority-class rows.
# The sample size is derived from the data rather than hard-coded, and the draw is
# seeded so the undersampled set is the same on every run.
count_exited_0_under = count_exited_0.sample(n=len(count_exited_1), random_state=15)
count_exited_0_under.shape

(1630, 13)

In [22]:
# Stack the undersampled majority rows on top of all minority rows.
df_under_input = pd.concat(
    [count_exited_0_under, count_exited_1],
    axis='index',
)
df_under_input.shape

(3260, 13)

In [23]:
# Features / target for the undersampled training set (rebinds X and y).
X = df_under_input.drop(columns=['Exited'])
y = df_under_input['Exited']
X.shape, y.shape

((3260, 12), (3260,))

In [24]:
# Train on the undersampled set; evaluate on the untouched test split.
y_pred_under = ANN(X, y, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.5451475381851196, 0.7630000114440918]
classification report: 
                precision    recall  f1-score   support

           0       0.81      0.93      0.86      1593
           1       0.30      0.12      0.17       407

    accuracy                           0.76      2000
   macro avg       0.55      0.52      0.52      2000
weighted avg       0.70      0.76      0.72      2000



# Method 2: Model for Over Sampling

## Bump up the exited count to 6370

In [25]:
# Sample the minority class with replacement until it matches the majority class size.
# The target size comes from the data rather than a hard-coded 6370, and the draw is
# seeded so the oversampled set is the same on every run.
count_exited_1_over = count_exited_1.sample(
    n=len(count_exited_0),
    replace=True,
    random_state=15,
)
count_exited_1_over.shape

(6370, 13)

In [26]:
count_exited_1_over.Exited.value_counts()

1    6370
Name: Exited, dtype: int64

In [27]:
# Stack the oversampled minority rows on top of all majority rows.
df_over_input = pd.concat(
    [count_exited_1_over, count_exited_0],
    axis='index',
)
df_over_input.shape

(12740, 13)

In [28]:
# Features / target for the oversampled training set (rebinds X and y).
X = df_over_input.drop(columns=['Exited'])
y = df_over_input['Exited']
X.shape, y.shape

((12740, 12), (12740,))

In [29]:
# Train on the oversampled set; evaluate on the untouched test split.
y_pred_over = ANN(X, y, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.5200784206390381, 0.7739999890327454]
classification report: 
                precision    recall  f1-score   support

           0       0.82      0.92      0.87      1593
           1       0.40      0.22      0.28       407

    accuracy                           0.77      2000
   macro avg       0.61      0.57      0.58      2000
weighted avg       0.74      0.77      0.75      2000



# Method 3: Using SMOTE

## For this we use the train/test split created for the standard model

In [30]:
from imblearn.over_sampling import SMOTE

# Resample only the minority class. `n_jobs` is no longer passed: it is deprecated on
# SMOTE in recent imbalanced-learn releases. A fixed random_state makes the synthetic
# samples repeatable between runs.
smote = SMOTE(sampling_strategy='minority', random_state=15)

In [31]:
# Generate synthetic minority rows from the training split only, then check the balance.
X_sm, y_sm = smote.fit_resample(X=X_train, y=y_train)
y_sm.value_counts()

0    6370
1    6370
Name: Exited, dtype: int64

In [32]:
# Train on the SMOTE-balanced set; evaluate on the untouched test split.
y_pred_sm = ANN(X_sm, y_sm, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.7499382495880127, 0.5669999718666077]
classification report: 
                precision    recall  f1-score   support

           0       0.87      0.54      0.67      1593
           1       0.27      0.67      0.39       407

    accuracy                           0.57      2000
   macro avg       0.57      0.61      0.53      2000
weighted avg       0.75      0.57      0.61      2000



# Method 4: Ensemble with undersampling, using the per-class DataFrames from Method 1

In [33]:
X_train.shape , y_train.shape ,X_test.shape ,y_test.shape

((8000, 12), (8000,), (2000, 12), (2000,))

In [34]:
y_train.value_counts(),  y_test.value_counts()

(0    6370
 1    1630
 Name: Exited, dtype: int64,
 0    1593
 1     407
 Name: Exited, dtype: int64)

In [35]:
count_exited_0.shape, count_exited_1.shape

((6370, 13), (1630, 13))

In [36]:
count_exited_0.Exited.value_counts(), count_exited_1.Exited.value_counts()

(0    6370
 Name: Exited, dtype: int64,
 1    1630
 Name: Exited, dtype: int64)

In [37]:
6370/1630

3.9079754601226995

In [38]:
def get_train_batch(df_majority,df_minority,start,end):
    """Build one training batch from a positional slice of the majority frame plus all minority rows.

    Parameters
    ----------
    df_majority : DataFrame of majority-class rows; rows ``start:end`` (by position) are used.
    df_minority : DataFrame of minority-class rows; used in full.
    start, end : slice bounds into ``df_majority``.

    Returns
    -------
    (features, labels) : the concatenated rows without the 'Exited' column, and the
    'Exited' column itself.
    """
    majority_slice = df_majority.iloc[start:end]
    batch = pd.concat([majority_slice, df_minority], axis='index')

    features = batch.drop(columns=['Exited'])
    labels = batch['Exited']
    return features, labels
    

In [39]:
# Batch 1: majority rows 0-1629 plus every minority row.
# NOTE(review): this rebinds X_train / y_train, so earlier cells that read the original
# training split (e.g. the SMOTE resampling cell) would pick up this batch if re-run.
X_train, y_train = get_train_batch(count_exited_0, count_exited_1, start=0, end=1630)

X_train.shape, y_train.shape

((3260, 12), (3260,))

In [40]:
# Ensemble member 1, trained on the current batch.
y_pred_ens_1 = ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.7139105200767517, 0.5120000243186951]
classification report: 
                precision    recall  f1-score   support

           0       0.87      0.45      0.60      1593
           1       0.26      0.74      0.38       407

    accuracy                           0.51      2000
   macro avg       0.57      0.60      0.49      2000
weighted avg       0.75      0.51      0.55      2000



In [41]:
# Batch 2: majority rows 1630-3259 plus every minority row.
X_train, y_train = get_train_batch(count_exited_0, count_exited_1, start=1630, end=3260)

X_train.shape, y_train.shape

((3260, 12), (3260,))

In [42]:
# Ensemble member 2, trained on the current batch.
y_pred_ens_2 = ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.717944860458374, 0.28049999475479126]
classification report: 
                precision    recall  f1-score   support

           0       0.92      0.11      0.19      1593
           1       0.22      0.96      0.35       407

    accuracy                           0.28      2000
   macro avg       0.57      0.53      0.27      2000
weighted avg       0.78      0.28      0.22      2000



In [43]:
# Batch 3: majority rows 3260-4889 plus every minority row.
X_train, y_train = get_train_batch(count_exited_0, count_exited_1, start=3260, end=4890)

X_train.shape, y_train.shape

((3260, 12), (3260,))

In [44]:
# Ensemble member 3, trained on the current batch.
y_pred_ens_3 = ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.991060733795166, 0.3709999918937683]
classification report: 
                precision    recall  f1-score   support

           0       0.91      0.23      0.37      1593
           1       0.23      0.91      0.37       407

    accuracy                           0.37      2000
   macro avg       0.57      0.57      0.37      2000
weighted avg       0.77      0.37      0.37      2000



In [45]:
# Batch 4: the remaining majority rows (from position 4890 to the end) plus every
# minority row. The end bound is taken from the frame itself instead of the literal
# 6371, which pointed one past the last row and only worked because slicing clamps.
# This last batch has fewer majority rows than the other three.
X_train, y_train = get_train_batch(count_exited_0, count_exited_1, 4890, len(count_exited_0))

X_train.shape, y_train.shape

((3110, 12), (3110,))

In [46]:
# Ensemble member 4, trained on the current batch.
y_pred_ens_4 = ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy', weights=-1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.6084357500076294, 0.7889999747276306]
classification report: 
                precision    recall  f1-score   support

           0       0.80      0.97      0.88      1593
           1       0.40      0.07      0.12       407

    accuracy                           0.79      2000
   macro avg       0.60      0.52      0.50      2000
weighted avg       0.72      0.79      0.73      2000



In [47]:
# Combine the four ensemble members by voting. Each y_pred_ens_* holds 0/1 predictions,
# so their element-wise sum is the number of models that voted 1 for a sample.
# A sample is labelled 1 when more than one model votes 1 (a 2-2 tie counts as 1).
vote_total = y_pred_ens_1 + y_pred_ens_2 + y_pred_ens_3 + y_pred_ens_4
y_pred_final = np.where(vote_total > 1, 1, 0).astype(y_pred_ens_1.dtype)

In [48]:
print(classification_report(y_test,y_pred_final))

              precision    recall  f1-score   support

           0       0.91      0.26      0.40      1593
           1       0.24      0.90      0.37       407

    accuracy                           0.39      2000
   macro avg       0.57      0.58      0.39      2000
weighted avg       0.77      0.39      0.40      2000

