# 0.0. Imports 

In [67]:
import pandas as pd

from sklearn import preprocessing as pp
from sklearn import model_selection as ms
from sklearn import dummy
from sklearn import metrics
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import linear_model

## 0.1. Helper Functions

## 0.2. Load Data

In [2]:
# Load the raw churn dataset from its relative location under ../data/raw.
df_raw = pd.read_csv(filepath_or_buffer='../data/raw/churn.csv')

# 1.0. Data Description

In [3]:
# Work on an independent copy so the raw frame is left untouched.
df1 = df_raw.copy(deep=True)

## 1.1. Rename Columns

In [4]:
# Assign snake_case column names; they are matched to the existing columns by position.
df1.columns = [
    'row_number',
    'customer_id',
    'surname',
    'credit_score',
    'geography',
    'gender',
    'age',
    'tenure',
    'balance',
    'num_of_products',
    'has_cr_card',
    'is_active_member',
    'estimated_salary',
    'exited',
]

## 1.2. Data Dimensions

In [25]:
# (number of rows, number of columns) of the renamed frame
df1.shape

(10000, 14)

## 1.3. Check NA

In [26]:
# Count of missing values in each column
df1.isna().sum()

row_number          0
customer_id         0
surname             0
credit_score        0
geography           0
gender              0
age                 0
tenure              0
balance             0
num_of_products     0
has_cr_card         0
is_active_member    0
estimated_salary    0
exited              0
dtype: int64

# 2.0. Feature Engineering

In [5]:
# Stage snapshot for feature engineering (this section currently only copies the frame).
df2 = df1.copy(deep=True)

# 3.0. Data Filtering

In [8]:
# Stage snapshot for data filtering.
df3 = df2.copy(deep=True)

In [9]:
# Columns to remove from the working frame.
drop_cols = ['row_number', 'customer_id', 'surname']

# Build df3 from df2 instead of from itself: `DataFrame.drop` returns a new
# frame, so this cell can be re-run without raising a KeyError on columns
# that a previous run already removed.
df3 = df2.drop(columns=drop_cols)

# 4.0. EDA

In [10]:
# Stage snapshot for exploratory analysis.
df4 = df3.copy(deep=True)

# 5.0. Data Preprocessing

In [19]:
# Stage snapshot for preprocessing.
df5 = df4.copy(deep=True)

In [24]:
# Encode the two categorical columns as integer codes.
# The same encoder object is refit for each column, so after this cell `le`
# only holds the classes of the last column it was fitted on ('gender').
le = pp.LabelEncoder()

df5['geography'] = le.fit_transform(df5['geography'].values)
df5['gender'] = le.fit_transform(df5['gender'].values)

# 6.0. Feature Selection

In [28]:
# Stage snapshot for feature selection (all columns are kept for now).
df6 = df5.copy(deep=True)

# 7.0. Model Training

In [32]:
# Features: every column except the target.
X = df6.drop(columns=['exited'])

# Target: the 'exited' column as a NumPy array.
y = df6['exited'].to_numpy()

In [33]:
# 80/20 shuffled hold-out split with a fixed seed.
# No stratification is requested, so class proportions may differ slightly
# between the two partitions.
X_train, X_val, y_train, y_val = ms.train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

## 7.1. DummyClassifier

In [80]:
# model definition
# Baseline using the 'prior' strategy, which predicts from the training class
# distribution and does not look at the features.
model_baseline = dummy.DummyClassifier(strategy='prior', random_state=42)

# model fit
model_baseline.fit(X_train, y_train)

# model predict
# Use the estimator fitted above. The original cell called `model_dummy`,
# a name that is never defined in this notebook (NameError on a fresh kernel).
yhat_baseline = model_baseline.predict(X_val)

# model performance
# zero_division=0 reports 0.0 instead of warning when a class is never predicted.
print(metrics.classification_report(y_val, yhat_baseline, zero_division=0))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1607
           1       0.00      0.00      0.00       393

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.80      0.72      2000



## 7.2. KNeighborsClassifier

In [74]:
# model definition and fit (fit returns the estimator itself)
# NOTE(review): no feature scaling is applied before this distance-based
# model - confirm whether that is intended.
model_knn = neighbors.KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,
).fit(X_train, y_train)

# model predict
yhat_knn = model_knn.predict(X_val)

# model performance
knn_report = metrics.classification_report(y_val, yhat_knn, zero_division=0)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.93      0.86      1607
           1       0.24      0.09      0.14       393

    accuracy                           0.76      2000
   macro avg       0.52      0.51      0.50      2000
weighted avg       0.70      0.76      0.72      2000



## 7.3. SVM

In [73]:
# model definition
model_svm = svm.SVC(C=1.0, 
                    kernel='rbf', 
                    degree=3, 
                    gamma='scale', 
                    coef0=0.0, 
                    shrinking=True, 
                    probability=False, 
                    tol=0.001, 
                    cache_size=200, 
                    class_weight=None, 
                    verbose=False, 
                    max_iter=-1, 
                    decision_function_shape='ovr', 
                    break_ties=False, 
                    random_state=42)
# model fit
model_svm.fit( X_train, y_train )

# model predict
yhat_svm = model_svm.predict( X_val )

# model performance
print(metrics.classification_report( y_val, yhat_svm,
                                    labels=None, 
                                    target_names=None, 
                                    sample_weight=None, 
                                    digits=2, 
                                    output_dict=False, 
                                    zero_division=0))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1607
           1       0.00      0.00      0.00       393

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.80      0.72      2000



## 7.4. RandomForestClassifier

In [72]:
# model definition
model_rf = ensemble.RandomForestClassifier(n_estimators=100, 
                                           criterion='gini', 
                                           max_depth=None, 
                                           min_samples_split=2, 
                                           min_samples_leaf=1, 
                                           min_weight_fraction_leaf=0.0, 
                                           max_features='auto', 
                                           max_leaf_nodes=None, 
                                           min_impurity_decrease=0.0, 
                                           min_impurity_split=None, 
                                           bootstrap=True, 
                                           oob_score=False, 
                                           n_jobs=None, 
                                           random_state=42, 
                                           verbose=0, 
                                           warm_start=False, 
                                           class_weight=None, 
                                           ccp_alpha=0.0, 
                                           max_samples=None)
# model fit
model_rf.fit( X_train, y_train )

# model predict
yhat_rf = model_rf.predict( X_val )

# model performance
print(metrics.classification_report( y_val, yhat_rf,
                                    labels=None, 
                                    target_names=None, 
                                    sample_weight=None, 
                                    digits=2, 
                                    output_dict=False, 
                                    zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



## 7.5. LogisticRegression

In [71]:
# model definition
model_lr = linear_model.LogisticRegression(penalty='l2', 
                                           dual=False, 
                                           tol=0.0001, 
                                           C=1.0, 
                                           fit_intercept=True, 
                                           intercept_scaling=1, 
                                           class_weight=None, 
                                           random_state=42, 
                                           solver='lbfgs', 
                                           max_iter=100, 
                                           multi_class='auto', 
                                           verbose=0, 
                                           warm_start=False, 
                                           n_jobs=None, 
                                           l1_ratio=None)
# model fit
model_lr.fit( X_train, y_train )

# model predict
yhat_lr = model_lr.predict( X_val )

# model performance
print(metrics.classification_report( y_val, yhat_lr,
                                    labels=None, 
                                    target_names=None, 
                                    sample_weight=None, 
                                    digits=2, 
                                    output_dict=False, 
                                    zero_division=0))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1607
           1       0.45      0.07      0.12       393

    accuracy                           0.80      2000
   macro avg       0.63      0.53      0.51      2000
weighted avg       0.74      0.80      0.74      2000



# 8.0. Hyperparameter Fine-Tuning

# 9.0. Model Performance

# 10.0. Deploy to Production