In [214]:
import sys
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn import over_sampling

IN_COLAB = 'google.colab' in sys.modules
warnings.filterwarnings('ignore')


if IN_COLAB:
  df = pd.read_csv('https://raw.githubusercontent.com/chakraskun/churn-modelling/main/Churn_Modelling.csv')
else:
  df = pd.read_csv('Churn_Modelling.csv')

In [215]:
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4784,4785,15811588,Eluemuno,664,Spain,Female,53,7,187602.18,1,1,0,186392.99,1
5559,5560,15729557,Olisaemeka,850,Germany,Male,36,5,119984.07,1,1,0,191535.11,1
4724,4725,15578648,Marino,543,Germany,Male,49,6,59532.18,1,1,0,104253.56,0
568,569,15795564,Moretti,737,Germany,Male,31,5,121192.22,2,1,1,74890.58,0
8726,8727,15570289,Benson,697,Germany,Male,43,8,103409.16,1,1,0,66893.28,1


In [216]:
numerical = [
  'CreditScore',
  'Age',
  'Balance',
  'EstimatedSalary',
  'Tenure',
  'NumOfProducts',
]
categorical = [
  'Geography',
  'Gender',
  'HasCrCard',
  'IsActiveMember',
]

# Data Preprocessing

- Dari hasil EDA didapat bahwa tidak ada value yang null
- drop CustomerId, Surname dan RowNumber
- tidak ada duplikat

In [217]:
df.duplicated().sum()

0

In [218]:
try:
  df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
except:
  pass

df.sample(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9314,613,Germany,Female,51,7,147262.11,1,1,1,53630.9,1
1985,719,Spain,Female,47,9,116393.59,1,1,0,63051.32,1
9102,445,Germany,Female,61,2,137655.31,1,0,1,29909.84,0
1431,667,France,Male,48,2,0.0,1,1,0,43229.2,0
9352,667,France,Male,40,8,72945.29,2,1,0,98931.5,0


# Outliers Handling

Outlier untuk sementara tidak dihandle, dikarenakan outlier bersifat statistikal saja,

# Split Dataset

In [219]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [220]:
# try:
#   df_test.drop(columns=['Exited'], inplace=True)
# except:
#   print(df_test.columns)

In [221]:
df_test.to_csv('test_raw.csv', index=False)
df_train.to_csv('train_raw.csv', index=False)

# Transformation Train & Test Dataset

## Re-read raw dataset

In [222]:
test = pd.read_csv('test_raw.csv')
train = pd.read_csv('train_raw.csv')

In [223]:
handled_dataset = [train, test]

## Handle Age

In [224]:
for i in handled_dataset:
  i['LogAge'] = np.log(i['Age'])

## Handle Balance, EstimatedSalary, Tenure, NumOfProducts

In [225]:
for i in handled_dataset:
  i['LogAgeNorm'] = MinMaxScaler().fit_transform(i['LogAge'].values.reshape(-1, 1))
  i['BalanceStd'] = StandardScaler().fit_transform(i['Balance'].values.reshape(len(i), 1))
  i['BalanceNorm'] = MinMaxScaler().fit_transform(i['BalanceStd'].values.reshape(len(i), 1))
  i['EstimatedSalaryStd'] = StandardScaler().fit_transform(i['EstimatedSalary'].values.reshape(len(i), 1))
  i['EstimatedSalaryNorm'] = MinMaxScaler().fit_transform(i['EstimatedSalaryStd'].values.reshape(len(i), 1))
  i['TenureNorm'] = MinMaxScaler().fit_transform(i['Tenure'].values.reshape(len(i), 1))
  i['NumOfProductsNorm'] = MinMaxScaler().fit_transform(i['NumOfProducts'].values.reshape(len(i), 1))
  i['CreditScoreNorm'] = MinMaxScaler().fit_transform(i['CreditScore'].values.reshape(len(i), 1))

# Feature Encoding

In [226]:
mapping_gender = {
    'Female' : 0,
    'Male' : 1
  }
for i in handled_dataset:
  i['Gender'] = i['Gender'].map(mapping_gender)

In [227]:
for i in handled_dataset:
  one_hot_encoding = pd.get_dummies(i['Geography'], prefix='geo')
  if i.shape[0] == 8000:
    train = train.join(one_hot_encoding)
  if i.shape[0] == 2000:
    test = test.join(one_hot_encoding)

# Drop Unused Columns

In [228]:
handled_dataset = [train, test]
for i in handled_dataset:
  i.drop(columns=['Geography','Age', 'Balance', 'EstimatedSalary', 'EstimatedSalaryStd', 'BalanceStd', 'Tenure', 'NumOfProducts', 'LogAge', 'CreditScore'], inplace=True)

In [229]:
train.sample(5)

Unnamed: 0,Gender,HasCrCard,IsActiveMember,Exited,LogAgeNorm,BalanceNorm,EstimatedSalaryNorm,TenureNorm,NumOfProductsNorm,CreditScoreNorm,geo_France,geo_Germany,geo_Spain
3270,0,0,0,0,0.441669,0.541762,0.904541,0.4,0.0,0.514,0,1,0
214,1,0,0,0,0.313118,0.536063,0.215931,0.8,0.0,0.584,0,1,0
5525,1,1,0,0,0.533787,0.84774,0.576238,0.3,0.0,0.568,0,1,0
5682,0,1,0,0,0.601213,0.509865,0.160659,0.2,0.333333,0.516,0,1,0
4104,0,1,1,0,0.458016,0.0,0.971283,0.7,0.333333,0.79,0,0,1


In [230]:
test.sample(5)

Unnamed: 0,Gender,HasCrCard,IsActiveMember,Exited,LogAgeNorm,BalanceNorm,EstimatedSalaryNorm,TenureNorm,NumOfProductsNorm,CreditScoreNorm,geo_France,geo_Germany,geo_Spain
1100,1,1,1,0,0.72769,0.481222,0.515787,0.2,0.0,0.566,1,0,0
909,0,1,0,1,0.575126,0.216882,0.156318,0.1,1.0,0.556,1,0,0
1730,0,0,1,1,0.575126,0.287873,0.858433,0.6,0.0,0.332,1,0,0
960,0,1,0,0,0.441669,0.517574,0.052503,0.7,0.333333,0.61,1,0,0
135,0,1,0,0,0.37154,0.0,0.289849,0.6,0.333333,0.454,0,0,1


# Handle imbalance class on train df

In [231]:
X = train[[col for col in train.columns if (str(train[col].dtype) != 'object') and col != 'Exited']]
y = train['Exited'].values
X_over_SMOTE, y_over_SMOTE = over_sampling.SMOTE(sampling_strategy=0.5).fit_resample(X, y)

print('BEFORE')
print(pd.Series(y).value_counts())
print('----------------------')
print('AFTER')
print(pd.Series(y_over_SMOTE).value_counts())

BEFORE
0    6356
1    1644
dtype: int64
----------------------
AFTER
0    6356
1    3178
dtype: int64


In [232]:
X_over_SMOTE
X_over_SMOTE.to_csv('churn_train.csv', index=False)

In [233]:
test.to_csv('churn_test.csv', index=False)

# Kesimpulan

- Outlier pada feature `Age` dan `CreditScore` tidak dihandle, karena bersifat statistical (to be reviewed step selanjutnya)
- Pada feature `Age` dilakukan log transformation agar mendapatkan hasil mendekati distribusi normal
- Feature `Balance` dan `EstimatedSalary` dilakukan standarizarion (to be reviewed di step selanjutnya)
- Feature `Tenure` dan `NumOfProducts` dilakukan normalization (to be reviewed di step selanjutnya)
- Dataset di split menjadi 80% train dan 20% test (`random_state=42`)
- Data imbalance pada dataset train di handle dengan menggunakan oversampling SMOTE

# Feature Engineering

- tidak ada fitur yang dibuang
- tidak ada fitur yang dibuat baru
- fitur yang bisa ditambahkan:
  - number of transaction made per month
  - occupation
  - total branch offices in his/her country (bisa relate ke customer satisfaction, tidak susah untuk mencari kantor bank XYZ di daerahnya)
  - education
  - rates


In [259]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))
    
    fpr, tpr, thresholds = roc_curve(ytest, pred, pos_label=1) # pos_label: label yang kita anggap positive
    print("AUC: %.2f" % auc(fpr, tpr))

In [235]:
X_train = X_over_SMOTE
y_train = y_over_SMOTE
X_test = test[[col for col in test.columns if (str(test[col].dtype) != 'object') and col != 'Exited']]
y_test = test['Exited'].values

# Logistic Regression

In [236]:
model = LogisticRegression(random_state=42)
model.fit(X_over_SMOTE, y_over_SMOTE)

LogisticRegression(random_state=42)

In [237]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 1, 0, 0])

In [238]:
eval_classification(model, y_pred, X_over_SMOTE, y_over_SMOTE, X_test, y_test)

Accuracy (Test Set): 0.81
Precision (Test Set): 0.51
Recall (Test Set): 0.51
F1-Score (Test Set): 0.51
AUC: 0.70


In [239]:
print('Train score: ' + str(model.score(X_train, y_train))) #accuracy
print('Test score:' + str(model.score(X_test, y_test))) #accuracy

Train score: 0.7442836165303126
Test score:0.8085


In [240]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# List Hyperparameters yang akan diuji
penalty = ['l2','l1','elasticnet']
C = [0.0001, 0.001, 0.002] # Inverse of regularization strength; smaller values specify stronger regularization.
hyperparameters = dict(penalty=penalty, C=C)

# Inisiasi model
logres = LogisticRegression(random_state=42) # Init Logres dengan Gridsearch, cross validation = 5
model = RandomizedSearchCV(logres, hyperparameters, cv=5, random_state=42, scoring='precision')

# Fitting Model & Evaluation
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

Accuracy (Test Set): 0.81
Precision (Test Set): 0.59
Recall (Test Set): 0.12
F1-Score (Test Set): 0.20
AUC: 0.55


In [241]:
print('Best algorithm:', model.best_estimator_.get_params()['penalty'])
print('Best C:', model.best_estimator_.get_params()['C'])

Best algorithm: l2
Best C: 0.002


In [242]:
print('Train score: ' + str(model.score(X_train, y_train))) #accuracy
print('Test score:' + str(model.score(X_test, y_test))) #accuracy

Train score: 0.7318840579710145
Test score:0.5875


# Decision Tree

In [243]:
from sklearn.tree import DecisionTreeClassifier
model_decision_tree = DecisionTreeClassifier(random_state=42)
model_decision_tree.fit(X_train,y_train)

y_pred = model_decision_tree.predict(X_test)
eval_classification(model_decision_tree, y_pred, X_train, y_train, X_test, y_test)

Accuracy (Test Set): 0.77
Precision (Test Set): 0.43
Recall (Test Set): 0.52
F1-Score (Test Set): 0.47
AUC: 0.68


In [244]:
print('Train score: ' + str(model_decision_tree.score(X_train, y_train))) #accuracy
print('Test score:' + str(model_decision_tree.score(X_test, y_test))) #accuracy

Train score: 1.0
Test score:0.771


In [245]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

# List of hyperparameter
max_depth = [int(x) for x in np.linspace(1, 110, num = 30)] # Maximum number of levels in tree
min_samples_split = [2, 5, 10, 100] # Minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4, 10, 20, 50] # Minimum number of samples required at each leaf node
max_features = ['auto', 'sqrt'] # Number of features to consider at every split

hyperparameters = dict(max_depth=max_depth, 
                       min_samples_split=min_samples_split, 
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features
                      )

# Inisialisasi Model
dt = DecisionTreeClassifier(random_state=42)
model = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='precision')
model.fit(X_train, y_train)

# Predict & Evaluation
y_pred = model.predict(X_test)#Check performa dari model
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

Accuracy (Test Set): 0.83
Precision (Test Set): 0.56
Recall (Test Set): 0.59
F1-Score (Test Set): 0.58
AUC: 0.74


In [246]:
print('Best max_depth:', model.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split:', model.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf:', model.best_estimator_.get_params()['min_samples_leaf'])
print('Best max_features:', model.best_estimator_.get_params()['max_features'])

Best max_depth: 87
Best min_samples_split: 10
Best min_samples_leaf: 50
Best max_features: auto


In [247]:
print('Train score: ' + str(model.score(X_train, y_train)))
print('Test score:' + str(model.score(X_test, y_test)))

Train score: 0.7699739873652918
Test score:0.5587529976019184


# Loop for each model

In [255]:
models = []
models.append(('Logistic Regression', LogisticRegression(random_state = 42)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier(random_state = 42)))
models.append(('Random Forest', RandomForestClassifier(random_state = 42)))
models.append(('SVM', SVC(gamma='auto', random_state = 42)))
models.append(('Gradient Boosting', GradientBoostingClassifier(random_state = 42)))
models.append(("LightGBM", LGBMClassifier(random_state = 42)))
models.append(("XGBoost", xgb.XGBClassifier()))
# models.append(("CatBoost", CatBoostClassifier(random_state = 12345, verbose = False)))

# evaluate each model in turn
results = []
names = []

In [264]:
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    msg = "%s: (%f)" % (name, score)
    print(msg)

Logistic Regression: (0.808500)
KNN: (0.777000)
Decision Tree: (0.771000)
Random Forest: (0.860000)
SVM: (0.828500)
Gradient Boosting: (0.866500)
LightGBM: (0.858500)
XGBoost: (0.851000)
