## Import Common Package

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, PowerTransformer, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 999
pd.set_option('display.float_format', lambda x: '%.2f' % x) #Change Decimal

### Import Data

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
creditDesc = []

for i in df.columns:
    creditDesc.append([
        i,
        df[i].dtypes,
        df[i].isna().sum(),
        (((df[i].isna().sum()) / len(df)) * 100).round(2),
        df[i].nunique(),
        df[i].drop_duplicates().sample(2).values
    ])

pd.DataFrame(data = creditDesc, columns= [
    'Data Feature',
    'Data Types',
    'Null',
    'Null Pct',
    'Unique',
    'Unique Sample'
] )

Unnamed: 0,Data Feature,Data Types,Null,Null Pct,Unique,Unique Sample
0,RowNumber,int64,0,0.0,10000,"[8355, 8416]"
1,CustomerId,int64,0,0.0,10000,"[15775803, 15694821]"
2,Surname,object,0,0.0,2932,"[Sharp, Ugochukwu]"
3,CreditScore,int64,0,0.0,460,"[455, 832]"
4,Geography,object,0,0.0,3,"[Germany, France]"
5,Gender,object,0,0.0,2,"[Male, Female]"
6,Age,int64,0,0.0,70,"[72, 18]"
7,Tenure,int64,0,0.0,11,"[2, 9]"
8,Balance,float64,0,0.0,6382,"[115095.88, 169902.92]"
9,NumOfProducts,int64,0,0.0,4,"[2, 4]"


Detail EDA please refer to Tugas_20210425_EDA_FransiskusAlvin.ipynb

### Data Preparation

In [4]:
df_1 = df.copy()

In [5]:
# Remove RowNumber and Surname and CustomerId
df_1.drop(columns = ['RowNumber','Surname', 'CustomerId'], inplace = True)

In [6]:
df_1.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
df_1.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

## Splitting Data

In [8]:
X = df_1.drop(columns='Exited')
y = df_1['Exited']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size =.20, random_state = 42)

In [10]:
X_train.shape

(8000, 10)

In [11]:
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2151,753,France,Male,57,7,0.0,1,1,0,159475.08
8392,739,Germany,Male,32,3,102128.27,1,1,0,63981.37
5006,755,Germany,Female,37,0,113865.23,2,1,1,117396.25
4117,561,France,Male,37,5,0.0,2,1,0,83093.25
7182,692,Germany,Male,49,6,110540.43,2,0,1,107472.99


In [12]:
X_test.shape

(2000, 10)

# Pipeline

In [13]:
num_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']


cat_columns = ['Geography', 'Gender', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

numeric_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=1, include_bias=False))
#     ('power', PowerTransformer(method='yeo-johnson'))
])

categoric_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns), 
    ('categorical', categoric_pipeline, cat_columns)
])


### 1. Pipeline Random Forest

In [14]:
pipe_RF = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier())
])

In [15]:
pipe_RF.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('poly',
                                                                   PolynomialFeatures(degree=1,
                                                                                      include_bias=False))]),
                                                  ['CreditScore', 'Age',
                                                   'Tenure', 'Balance',
                                                   'NumOfProducts',
                                                   'EstimatedSalary']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['Geography', 'Gender',
                                            

In [16]:
y_rf1 = pipe_RF.predict(X_test)

In [17]:
print(classification_report(y_test, y_rf1))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.75      0.47      0.57       407

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



In [18]:
cm_RF1 = confusion_matrix(y_test, y_rf1, labels=[1,0])

In [19]:
df_RF1 = pd.DataFrame(cm_RF1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF1

Unnamed: 0,Pred 1,Pred 0
Akt 1,190,217
Akt 0,64,1529


### Pipeline Random Forest - Hyper Parameter Tuning

In [20]:
param_RF = {
    'algo__n_estimators' : [700, 750, 800],
    'algo__max_depth': [3,4],
#     'algo__max_features': ['auto', 0.1, 0.2, 0.3],
    'algo__min_samples_leaf': [20, 25,30],
    'algo__class_weight' : [{0:x, 1: 1 - x} for x in [.15, .20]]
}

In [21]:
skf = StratifiedKFold(n_splits=3, random_state = 42)

In [22]:
RF_GS = GridSearchCV(pipe_RF, param_RF, cv = skf, scoring='recall', n_jobs = -1, verbose=1)

In [23]:
RF_GS.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  1.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('poly',
                                                                                          PolynomialFeatures(degree=1,
                                                                                                             include_bias=False))]),
                                                                         ['CreditScore',
                                                                          'Age',
                                                                          'Tenure',
                                                                          'Balance',
                                                                          'NumOfProducts',
           

In [24]:
RF_Tuned = RF_GS.best_estimator_

In [25]:
y_ts_RFTuned = RF_Tuned.predict(X_test)

In [26]:
print(classification_report(y_test, y_ts_RFTuned))

              precision    recall  f1-score   support

           0       0.93      0.55      0.69      1593
           1       0.32      0.85      0.47       407

    accuracy                           0.61      2000
   macro avg       0.63      0.70      0.58      2000
weighted avg       0.81      0.61      0.65      2000



In [27]:
cm_RF2 = confusion_matrix(y_test, y_ts_RFTuned, labels=[1,0])

In [28]:
df_RF2 = pd.DataFrame(cm_RF2, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF2

Unnamed: 0,Pred 1,Pred 0
Akt 1,346,61
Akt 0,720,873


### 2. Pipeline Decision Tree

In [29]:
pipe_DT = Pipeline([
    ("prep", preprocessor),
    ("algo", DecisionTreeClassifier())
])

In [30]:
pipe_DT.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('poly',
                                                                   PolynomialFeatures(degree=1,
                                                                                      include_bias=False))]),
                                                  ['CreditScore', 'Age',
                                                   'Tenure', 'Balance',
                                                   'NumOfProducts',
                                                   'EstimatedSalary']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['Geography', 'Gender',
                                            

In [31]:
y_DT1 = pipe_DT.predict(X_test)

In [32]:
print(classification_report(y_test, y_DT1))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1593
           1       0.47      0.50      0.48       407

    accuracy                           0.78      2000
   macro avg       0.67      0.68      0.67      2000
weighted avg       0.79      0.78      0.79      2000



In [33]:
cm_DT1 = confusion_matrix(y_test, y_DT1, labels=[1,0])

In [34]:
df_DT1 = pd.DataFrame(cm_DT1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_DT1

Unnamed: 0,Pred 1,Pred 0
Akt 1,202,205
Akt 0,229,1364


### Pipeline Decision Tree - Hyper Parameter Tuning

In [35]:
pipe_DT.get_params()

{'memory': None,
 'steps': [('prep', ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('poly',
                                                     PolynomialFeatures(degree=1,
                                                                        include_bias=False))]),
                                    ['CreditScore', 'Age', 'Tenure', 'Balance',
                                     'NumOfProducts', 'EstimatedSalary']),
                                   ('categorical',
                                    Pipeline(steps=[('encoder', OneHotEncoder())]),
                                    ['Geography', 'Gender', 'NumOfProducts',
                                     'HasCrCard', 'IsActiveMember'])])),
  ('algo', DecisionTreeClassifier())],
 'verbose': False,
 'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('poly',
                                                   PolynomialFeatures(

In [36]:
param_DT = {
    'algo__max_depth': [20,25,30],
    'algo__max_features': ['auto'],
    'algo__min_samples_leaf': [68],
    'algo__class_weight' : [{0:x, 1: 1 - x} for x in [.15, .175]]
}

In [37]:
skf = StratifiedKFold(n_splits=3, random_state = 42)

In [38]:
DT_GS = GridSearchCV(pipe_DT, param_DT, cv = skf, scoring='recall', n_jobs = -1, verbose=1)

In [39]:
DT_GS.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('poly',
                                                                                          PolynomialFeatures(degree=1,
                                                                                                             include_bias=False))]),
                                                                         ['CreditScore',
                                                                          'Age',
                                                                          'Tenure',
                                                                          'Balance',
                                                                          'NumOfProducts',
           

In [40]:
DT_Tuned = DT_GS.best_estimator_

In [41]:
y_ts_DTTuned = DT_Tuned.predict(X_test)

In [42]:
print(classification_report(y_test, y_ts_DTTuned))

              precision    recall  f1-score   support

           0       0.94      0.68      0.79      1593
           1       0.40      0.82      0.53       407

    accuracy                           0.71      2000
   macro avg       0.67      0.75      0.66      2000
weighted avg       0.83      0.71      0.74      2000



In [43]:
cm_DT2 = confusion_matrix(y_test, y_ts_DTTuned, labels=[1,0])

In [44]:
df_DT2 = pd.DataFrame(cm_DT2, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_DT2

Unnamed: 0,Pred 1,Pred 0
Akt 1,332,75
Akt 0,507,1086


### 3. Pipeline LogReg

In [45]:
pipe_LogReg = Pipeline([
    ("prep", preprocessor),
    ("algo", LogisticRegression())
])

In [46]:
pipe_LogReg.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('poly',
                                                                   PolynomialFeatures(degree=1,
                                                                                      include_bias=False))]),
                                                  ['CreditScore', 'Age',
                                                   'Tenure', 'Balance',
                                                   'NumOfProducts',
                                                   'EstimatedSalary']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['Geography', 'Gender',
                                            

In [47]:
y_LogReg1 = pipe_LogReg.predict(X_test)

In [48]:
print(classification_report(y_test, y_LogReg1))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1593
           1       0.00      0.00      0.00       407

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000



In [49]:
cm_LogReg1 = confusion_matrix(y_test, y_LogReg1, labels=[1,0])

In [50]:
df_LogReg1 = pd.DataFrame(cm_LogReg1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_LogReg1

Unnamed: 0,Pred 1,Pred 0
Akt 1,0,407
Akt 0,0,1593


### Pipeline Decision Tree - Hyper Parameter Tuning

In [51]:
pipe_LogReg.get_params()

{'memory': None,
 'steps': [('prep', ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('poly',
                                                     PolynomialFeatures(degree=1,
                                                                        include_bias=False))]),
                                    ['CreditScore', 'Age', 'Tenure', 'Balance',
                                     'NumOfProducts', 'EstimatedSalary']),
                                   ('categorical',
                                    Pipeline(steps=[('encoder', OneHotEncoder())]),
                                    ['Geography', 'Gender', 'NumOfProducts',
                                     'HasCrCard', 'IsActiveMember'])])),
  ('algo', LogisticRegression())],
 'verbose': False,
 'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('poly',
                                                   PolynomialFeatures(degr

In [52]:
param_LogReg = {
    'algo__penalty': ['none', 'l1', 'l2'],
    'algo__C': np.logspace(-4 , 4, 14),
    'algo__class_weight' : [{0:x, 1: 1 - x} for x in [.05,.1,.15, .2,]]
}

In [53]:
skf = StratifiedKFold(n_splits=3, random_state = 42)

In [54]:
LogReg_GS = GridSearchCV(pipe_LogReg, param_LogReg, cv = skf, scoring='recall', n_jobs = -1, verbose=1)

In [55]:
LogReg_GS.fit(X_train, y_train)

Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed:    7.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('poly',
                                                                                          PolynomialFeatures(degree=1,
                                                                                                             include_bias=False))]),
                                                                         ['CreditScore',
                                                                          'Age',
                                                                          'Tenure',
                                                                          'Balance',
                                                                          'NumOfProducts',
           

In [56]:
LogReg_Tuned = LogReg_GS.best_estimator_

In [57]:
y_ts_LogRegTuned = LogReg_Tuned.predict(X_test)

In [58]:
print(classification_report(y_test, y_ts_LogRegTuned))

              precision    recall  f1-score   support

           0       0.90      0.01      0.01      1593
           1       0.20      1.00      0.34       407

    accuracy                           0.21      2000
   macro avg       0.55      0.50      0.17      2000
weighted avg       0.76      0.21      0.08      2000



In [59]:
cm_LogReg2 = confusion_matrix(y_test, y_ts_LogRegTuned, labels=[1,0])

In [60]:
df_LogReg2 = pd.DataFrame(cm_LogReg2, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_LogReg2

Unnamed: 0,Pred 1,Pred 0
Akt 1,406,1
Akt 0,1584,9


# Recommendation

- Create customer retention program, for customers who are predicted will Exited bank
    - Give 1 Honda Vario (with value IDR15.000.000) for customers who are predicted will Exited bank with 3 years contract agreement
    - The cost of acquiring new customers is estimated at five times the rate of retaining existing ones
        - Source : https://www.fpsc.com/the_cost_of_customer_churn.pdf
        - With asumption retain existing customer cost is IDR 15.000.000 => lose 1 customers are equal to 75000000

# Summary

- We suggest to use Model Random Forest with Hyper Parameter Tuning 2, because after rough calculation, it has the lowest loss of money