### Libraries

In [1]:
# DataFrame
import pandas as pd

import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline

# Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.model_selection import GridSearchCV, cross_val_score

# Model Evaluate
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

# Model Saving
import json
import pickle

# Warning Ignore
import warnings
warnings.filterwarnings("ignore")

### Data Loading

In [35]:
# Load data
data = pd.read_csv('botswana_bank_customer_churn.csv')

In [5]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,First Name,Date of Birth,Gender,Marital Status,Number of Dependents,Occupation,Income,Education Level,Address,Contact Information,Customer Tenure,Customer Segment,Preferred Communication Channel,Credit Score,Credit History Length,Outstanding Loans,Churn Flag,Churn Reason,Churn Date,Balance,NumOfProducts,NumComplaints
0,1,83ef0b54-35f6-4f84-af58-5653ac0c0dc4,Smith,Troy,1987-08-29,Male,Divorced,3,Information systems manager,77710.14,High School,"26644 Diaz Viaduct\nNorth Jessicaville, VI 14852",001-301-934-3058x0825,30,Retail,Phone,397,24,41959.74,0,,,211359.05,1,0
1,2,009f115a-e5ca-4cf4-97d6-530140545e4e,Sullivan,Katrina,2000-02-07,Female,Married,1,Charity fundraiser,58209.87,High School,"5642 Rachel Pass Suite 320\nPort Peggy, RI 75805",+1-581-683-4267,27,SME,Email,665,10,8916.67,0,,,30624.76,4,1
2,3,66309fd3-5009-44d3-a3f7-1657c869d573,Fuller,Henry,1954-02-03,Female,Single,1,Television production assistant,9794.01,High School,00025 Brittany Flats Apt. 543\nNew Amandaborou...,9753900004,14,Retail,Email,715,21,43270.54,0,,,111956.61,2,6
3,4,b02a30df-1a5f-4087-8075-2a35432da641,Young,Antonio,1991-01-15,Female,Divorced,5,Agricultural engineer,15088.98,High School,Unit 3386 Box 0088\nDPO AE 71940,+1-928-477-2856x660,23,Corporate,Phone,747,17,17887.65,0,,,201187.61,1,0
4,5,0d932e5b-bb3a-4104-8c83-f84270f7f2ea,Andersen,John,1992-04-08,Female,Divorced,2,"Teacher, early years/pre",60726.56,Master's,"0120 Leslie River\nWest Lindseychester, MN 98290",204.515.2388,22,Corporate,Email,549,25,32686.84,0,,,60391.24,5,6


In [9]:
# Check Information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115640 entries, 0 to 115639
Data columns (total 25 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   RowNumber                        115640 non-null  int64  
 1   CustomerId                       115640 non-null  object 
 2   Surname                          115640 non-null  object 
 3   First Name                       115640 non-null  object 
 4   Date of Birth                    115640 non-null  object 
 5   Gender                           115640 non-null  object 
 6   Marital Status                   115640 non-null  object 
 7   Number of Dependents             115640 non-null  int64  
 8   Occupation                       115640 non-null  object 
 9   Income                           115640 non-null  float64
 10  Education Level                  115640 non-null  object 
 11  Address                          115640 non-null  object 
 12  Co

In [87]:
# Check duplicate and missing value
print(f'Total duplicate data : {data.duplicated().sum()}\n')

data.isnull().sum()

Total duplicate data : 0



RowNumber                               0
CustomerId                              0
Surname                                 0
First Name                              0
Date of Birth                           0
Gender                                  0
Marital Status                          0
Number of Dependents                    0
Occupation                              0
Income                                  0
Education Level                         0
Address                                 0
Contact Information                     0
Customer Tenure                         0
Customer Segment                        0
Preferred Communication Channel         0
Credit Score                            0
Credit History Length                   0
Outstanding Loans                       0
Churn Flag                              0
Churn Reason                       101546
Churn Date                         101546
Balance                                 0
NumOfProducts                     

After conducting a quick exploration, all columns were found to have no missing values except for "Churn Reason" and "Churn Date." There are no duplicate records, and all data types are correct based on the values within each column.

In [3]:
# Separate categorical and numerical column
cat_cols = ['Gender', 'Marital Status', 'Occupation', 'Education Level', 'Customer Segment', 'Preferred Communication Channel', 'Churn Flag', 'Churn Reason']
num_cols = ['Number of Dependents', 'Income', 'Customer Tenure', 'Credit Score', 'Credit History Length', 'Outstanding Loans', 'Balance', 'NumOfProducts', 'NumComplaints']

The purpose of separation is to distinguish between categorical and numerical columns that are potentially useful as feature columns.

### Feature Engineering

#### Spliting

In [93]:
# Copy the raw dataset and dropping RowNumber column

fe = data.copy()
fe = fe.drop(columns='RowNumber', axis=1)

In [94]:
# Slicing between feature and target

X = fe.drop(columns='Churn Flag', axis=1)
y = fe['Churn Flag']

In [157]:
# Spliting

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(f'Train size : {X_train.shape}')
print(f'Test size : {X_test.shape}')

Train size : (92512, 23)
Test size : (23128, 23)


#### Handling Duplicates

In [96]:
# Check duplicates

X_train.duplicated().sum()

0

Since there are no duplicate values, no action is needed to address duplicated data.

#### Handling Outliers

In [97]:
# Check outliers

out = []

for col in X_train[num_cols]:
    lower_boundary = X_train[col].mean() - 3* X_train[col].std()
    upper_boundary = X_train[col].mean() + 3* X_train[col].std()
    totout = ((len(X_train[X_train[col] > upper_boundary]) / len(X_train) * 100) + (len(X_train[X_train[col] < lower_boundary]) / len(X_train) * 100))
    out.append([col, totout])

out_df = pd.DataFrame(columns=['Column Name', 'Outliers Percentage'],data=out)
out_df

Unnamed: 0,Column Name,Outliers Percentage
0,Number of Dependents,0.0
1,Income,0.0
2,Customer Tenure,0.0
3,Credit Score,0.0
4,Credit History Length,0.0
5,Outstanding Loans,0.0
6,Balance,0.0
7,NumOfProducts,0.0
8,NumComplaints,0.0


Since the percentage of outliers in the numerical columns is 0, no action is needed to address them.

#### Handling Missing Value

In [98]:
# Check missing value

X_train.isnull().sum()

CustomerId                             0
Surname                                0
First Name                             0
Date of Birth                          0
Gender                                 0
Marital Status                         0
Number of Dependents                   0
Occupation                             0
Income                                 0
Education Level                        0
Address                                0
Contact Information                    0
Customer Tenure                        0
Customer Segment                       0
Preferred Communication Channel        0
Credit Score                           0
Credit History Length                  0
Outstanding Loans                      0
Churn Reason                       81240
Churn Date                         81240
Balance                                0
NumOfProducts                          0
NumComplaints                          0
dtype: int64

In [99]:
# Fill missing value

X_train['Churn Reason'] = X_train['Churn Reason'].fillna('Not Churn')
X_test['Churn Reason'] = X_test['Churn Reason'].fillna('Not Churn')

There are missing values in the "Churn Reason" and "Churn Date" columns. The type of missing values is classified as missing at random (MAR); these columns depend on the "Churn Flag" column. When the "Churn Flag" is 0, both the "Churn Reason" and "Churn Date" are automatically missing because the customer has not churned. To handle the missing values, I have chosen to fill the "Churn Reason" column with "Not Churn." However, I will not fill in the "Churn Date" column since it is not part of the feature set.

#### Feature Selection

In [100]:
# Assign function
def cramers_v(x, y):
    '''
    This function will calculate the correlation value with chi-squared method
    '''
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(r - 1, k - 1))))

In [101]:
# Check correlation between categorical feature and target

cat_corr = []

for col in cat_cols:
    if col == 'Churn Flag':
        continue
    else:
        cat_corr.append([col, cramers_v(X_train[col], y_train)])

cat_corr_df = pd.DataFrame(columns=['Column Name', 'Correlation'], data=cat_corr)
cat_corr_df

Unnamed: 0,Column Name,Correlation
0,Gender,0.002391
1,Marital Status,0.005378
2,Occupation,0.083444
3,Education Level,0.005778
4,Customer Segment,0.000667
5,Preferred Communication Channel,0.001745
6,Churn Reason,1.0


After calculating the correlation between categorical features and the target using Cramers_v function, I found that some columns have low correlation with the target. However, I will still use these features because they may interact with other variables in ways that could influence the target variable. Their combination with other features might reveal patterns that are not immediately apparent.

In [103]:
num_corr = []

for col in num_cols:
    cat_list = y_train.unique()
    anova_data = [X_train[col][y_train == cat] for cat in cat_list]

    f_value, p_value = f_oneway(*anova_data)
    num_corr.append([col, p_value])

num_corr_df = pd.DataFrame(columns=['Column Name', 'P-value'], data=num_corr)
num_corr_df

Unnamed: 0,Column Name,P-value
0,Number of Dependents,0.699299
1,Income,0.827022
2,Customer Tenure,0.717998
3,Credit Score,0.0
4,Credit History Length,0.738889
5,Outstanding Loans,0.613582
6,Balance,0.0
7,NumOfProducts,0.0
8,NumComplaints,0.0


I calculated the p-value using ANOVA, as the features are numerical and the target is categorical. A p-value under 0.05 indicates a significant correlation between the feature and the target. However, I will still use these features because they may interact with other variables in ways that could influence the target variable. Their combination with other features might reveal patterns that are not immediately apparent.

### Model Training

In [104]:
# Create column transformer

cat_col = ['Gender', 'Marital Status', 'Preferred Communication Channel', 'Churn Reason']
num_col = num_cols
ord_col = ['Education Level', 'Customer Segment']
bi_col = ['Occupation']

cat_pipeline = OneHotEncoder(handle_unknown='ignore')
num_pipeline = MinMaxScaler()
ord_pipeline = OrdinalEncoder(categories=[["High School", "Diploma", "Bachelor's", "Master\'s"], ["SME", "Corporate", "Retail"]])
bi_pipeline = ce.BinaryEncoder()

transformer = ColumnTransformer([
    ('pipe_num', num_pipeline, num_col),
    ('pipe_cat', cat_pipeline, cat_col),
    ('pipe_bi', bi_pipeline, bi_col)
])

<ul>
    <li>OneHotEncoder:<br>Converts categorical variables into binary columns (one column for each category) where each category is represented as a 0 or 1.</li>
    <li>MinMaxScaler:<br>Scales numerical data by transforming the values to a fixed range, typically between 0 and 1, to ensure all features have the same scale.</li>
    <li>OrdinalEncoder:<br>Encodes categorical variables with an inherent order (e.g., low, medium, high) by assigning integers to each category based on their rank or order.</li>
    <li>BinaryEncoder:<br>Converts high-cardinality categorical variables into binary format by encoding each category as a series of 0s and 1s, reducing the dimensionality compared to one-hot encoding.</li>
</ul>

In [105]:
# Making pipeline for every algorithm

pipe_logreg = Pipeline([
('transformer', transformer),
('classifier', LogisticRegression())
])

pipe_dt = Pipeline([
('transformer', transformer),
('classifier', DecisionTreeClassifier())
])

pipe_rf = Pipeline([
('transformer', transformer),
('classifier', RandomForestClassifier())
])

pipe_svc = Pipeline([
('transformer', transformer),
('classifier', SVC())
])

pipe_knc = Pipeline([
('transformer', transformer),
('classifier', KNeighborsClassifier())
])

pipe_xgb = Pipeline([
('transformer', transformer),
('classifier', XGBClassifier())
])

These pipelines can serve as a baseline for evaluating model performance. By running each algorithm with the same data processing steps, we can compare the models' performance under similar conditions. This helps identify which models perform best without complex hyperparameter tuning.

### Model Evaluation

#### Logistic Regression

<ul>
    <li>Combines features linearly to create a score (z).</li>
    <li>Applies the sigmoid function to convert the score into a probability.</li>
    <li>Uses a threshold (e.g., 0.5) to classify the output as 0 or 1.</li>
    <li>Trains the model by minimizing the log-loss through optimization techniques.</li>
</ul>

In [106]:
# Training

pipe_logreg.fit(X_train, y_train)
print('Training Score: ',round(pipe_logreg.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_logreg.score(X_test, y_test),4))

Training Score:  1.0
Testing Score:  1.0


In [107]:
# Prediction

y_logreg_pred = pipe_logreg.predict(X_test)
y_logreg_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [108]:
# Evaluate

print(classification_report(y_test, y_logreg_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      1.00      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [109]:
# Cross Validation

f1_train_cross_val = cross_val_score(pipe_logreg,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [1. 1. 1.]
F1 Score - Mean - Cross Validation :  1.0
F1 Score - Std - Cross Validation  :  0.0
F1 Score - Range of Test-Set       :  1.0 - 1.0


Using the logistic regression model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### Decision Tree

<ul>
    <li>Decision trees are intuitive models that work by splitting data based on feature values to make predictions.</li>
    <li>They are easy to interpret and visualize but can easily overfit the training data if not managed properly</li>
    <li>Decision trees can be combined in ensemble methods like Random Forests to improve their predictive power and robustness.</li>
</ul>

In [110]:
# Training

pipe_dt.fit(X_train, y_train)
print('Training Score: ',round(pipe_dt.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_dt.score(X_test, y_test),4))

Training Score:  1.0
Testing Score:  1.0


In [111]:
# Prediction

y_dt_pred = pipe_dt.predict(X_test)
y_dt_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [112]:
# Evaluate

print(classification_report(y_test, y_dt_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      1.00      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [113]:
#Cross Validation

f1_train_cross_val = cross_val_score(pipe_dt,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [1. 1. 1.]
F1 Score - Mean - Cross Validation :  1.0
F1 Score - Std - Cross Validation  :  0.0
F1 Score - Range of Test-Set       :  1.0 - 1.0


Using the decision tree model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### Random Forest

<ul>
    <li>Random Forest enhances the accuracy and stability of decision trees by using bootstrapping and feature randomness.</li>
    <li>It reduces overfitting and improves generalization by combining the predictions from multiple trees.</li>
    <li>Random Forest is widely used in practice due to its robustness, interpretability (through feature importance), and ability to handle both classification and regression tasks effectively.</li>
</ul>

In [114]:
# Training

pipe_rf.fit(X_train, y_train)
print('Training Score: ',round(pipe_rf.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_rf.score(X_test, y_test),4))

Training Score:  1.0
Testing Score:  1.0


In [115]:
# Prediction

y_rf_pred = pipe_rf.predict(X_test)
y_rf_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [116]:
# Evaluate

print(classification_report(y_test, y_rf_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      1.00      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [117]:
#Cross Validation

f1_train_cross_val = cross_val_score(pipe_rf,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [1. 1. 1.]
F1 Score - Mean - Cross Validation :  1.0
F1 Score - Std - Cross Validation  :  0.0
F1 Score - Range of Test-Set       :  1.0 - 1.0


Using the random forest model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### SVC

<ul>
    <li>SVC is a powerful classification algorithm that finds the optimal hyperplane to separate classes in the feature space.</li>
    <li>It utilizes support vectors to determine the margin and employs the kernel trick to handle non-linearly separable data.</li>
    <li>The algorithm is robust and effective for various types of classification tasks, especially in high-dimensional spaces.</li>
</ul>

In [118]:
# Training

pipe_svc.fit(X_train, y_train)
print('Training Score: ',round(pipe_svc.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_svc.score(X_test, y_test),4))

Training Score:  1.0
Testing Score:  1.0


In [119]:
# Prediction

y_svc_pred = pipe_svc.predict(X_test)
y_svc_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [120]:
# Evaluate

print(classification_report(y_test, y_svc_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      1.00      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [121]:
#Cross Validation

f1_train_cross_val = cross_val_score(pipe_svc,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [1. 1. 1.]
F1 Score - Mean - Cross Validation :  1.0
F1 Score - Std - Cross Validation  :  0.0
F1 Score - Range of Test-Set       :  1.0 - 1.0


Using the support vector classifier model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### K Neighbors Classifier

KNN is a straightforward yet powerful classification algorithm that operates by identifying the closest labeled instances to a new data point and predicting its class based on those neighbors. Its effectiveness hinges on the choice of distance metric, the value of K, and proper feature scaling.

In [122]:
# Training

pipe_knc.fit(X_train, y_train)
print('Training Score: ',round(pipe_knc.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_knc.score(X_test, y_test),4))

Training Score:  0.9995
Testing Score:  0.9982


In [123]:
# Prediction

y_knc_pred = pipe_knc.predict(X_test)
y_knc_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [124]:
# Evaluate

print(classification_report(y_test, y_knc_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      0.99      0.99      2822

    accuracy                           1.00     23128
   macro avg       1.00      0.99      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [125]:
#Cross Validation

f1_train_cross_val = cross_val_score(pipe_knc,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [0.98692546 0.98582805 0.98828598]
F1 Score - Mean - Cross Validation :  0.9870131640652878
F1 Score - Std - Cross Validation  :  0.0010053629101535208
F1 Score - Range of Test-Set       :  0.9860078011551343 - 0.9880185269754413


Using the k neighbors classifier model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### XGBClassifier

XGBoost is a robust and efficient gradient boosting algorithm that excels at classification and regression tasks. By building an ensemble of decision trees through a sequential process that minimizes errors, it achieves high accuracy while incorporating regularization and optimizations for performance.

In [126]:
# Training

pipe_xgb.fit(X_train, y_train)
print('Training Score: ',round(pipe_xgb.score(X_train, y_train),4))
print('Testing Score: ', round(pipe_xgb.score(X_test, y_test),4))

Training Score:  1.0
Testing Score:  1.0


In [127]:
# Prediction

y_xgb_pred = pipe_xgb.predict(X_test)
y_xgb_pred

array([0, 0, 0, ..., 0, 1, 0])

In [128]:
# Evaluate

print(classification_report(y_test, y_xgb_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      1.00      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [129]:
#Cross Validation

f1_train_cross_val = cross_val_score(pipe_xgb,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [1. 1. 1.]
F1 Score - Mean - Cross Validation :  1.0
F1 Score - Std - Cross Validation  :  0.0
F1 Score - Range of Test-Set       :  1.0 - 1.0


Using the XGBClassifier model, the F1-score on the test set indicates a good fit, with no significant difference in performance on the training set. The results indicate that the model not only fits the training data exceptionally well but also generalizes perfectly to new, unseen data.

#### Base Model Comparison

In [150]:
# Making comparison

models = {
    'Logistic Regression' : pipe_logreg,
    'Decision Tree' : pipe_dt,
    'Random Forest' : pipe_rf,
    'SVC' : pipe_svc,
    'K Neighbors Classifier' : pipe_knc,
    'XGBoost' : pipe_xgb
}

metrics = {
    'F1-Score' : f1_score,
}

df_model = pd.DataFrame(columns = models.keys(), index=['F1-Score'])

for metric in metrics.keys():
  for model in models.keys():
    df_model.loc[metric, model] = metrics[metric](y_test, models[model].predict(X_test))

In [131]:
df_model

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,SCV,K Neighbors Classifier,XGBoost
F1-Score,1.0,1.0,1.0,1.0,0.992682,1.0


In [132]:
# Cross validation on each model

models = {
    'Logistic Regression' : pipe_logreg,
    'Decision Tree' : pipe_dt,
    'Random Forest' : pipe_rf,
    'SCV' : pipe_svc,
    'K Neighbors Classifier' : pipe_knc,
    'XGBoost' : pipe_xgb
}

metrics = {
    'Mean' : cross_val_score(models[model], X_train, y_train, cv = 3, scoring='f1').mean(),
    'Std' : cross_val_score(models[model], X_train, y_train, cv = 3, scoring='f1').std()
}

df_cross_val = pd.DataFrame(columns = models.keys(), index = ['Mean', 'Std'])

for i in metrics.keys():
  for j in models.keys():
    df_cross_val.loc[i, j] = metrics[i]

In [133]:
df_cross_val

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,SCV,K Neighbors Classifier,XGBoost
Mean,1.0,1.0,1.0,1.0,1.0,1.0
Std,0.0,0.0,0.0,0.0,0.0,0.0


The comparison between the test set and the train set shows no significant differences, indicating that all the models I selected are well-fitted. Given the goal of developing a machine learning model that accurately predicts customer churn while prioritizing low computational costs and shorter processing times, I chose KNN due to its simplicity in implementation and ease of understanding compared to other models.

#### Tuning

In [134]:
# Tuning

param_distributions = {
    'classifier__n_neighbors': list(range(1, 11)),  
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

grid_search = GridSearchCV(
    estimator=pipe_knc,
    param_grid=param_distributions,
    cv=3,
    n_jobs=-1,
    scoring='f1',
    verbose=2
)


grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [135]:
# Output

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 10, 'classifier__weights': 'distance'}
Best Score: 0.9924467866539546


In [151]:
# best_model

best_model = grid_search.best_estimator_
best_model

In [158]:
# Prediction train set

y_bm_pred = best_model.predict(X_train)
print(classification_report(y_train, y_bm_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81240
           1       1.00      1.00      1.00     11272

    accuracy                           1.00     92512
   macro avg       1.00      1.00      1.00     92512
weighted avg       1.00      1.00      1.00     92512



In [152]:
# Prediction test set

y_bm_pred = best_model.predict(X_test)
print(classification_report(y_test, y_bm_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20306
           1       1.00      0.99      1.00      2822

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [153]:
#Cross Validation

f1_train_cross_val = cross_val_score(best_model,
                                     X_train,
                                     y_train,
                                     cv=3,
                                     scoring="f1")

print('F1 Score - All - Cross Validation  : ', f1_train_cross_val)
print('F1 Score - Mean - Cross Validation : ', f1_train_cross_val.mean())
print('F1 Score - Std - Cross Validation  : ', f1_train_cross_val.std())
print('F1 Score - Range of Test-Set       : ', (f1_train_cross_val.mean()-f1_train_cross_val.std()) , '-', (f1_train_cross_val.mean()+f1_train_cross_val.std()))

F1 Score - All - Cross Validation  :  [0.99208797 0.99222103 0.99303136]
F1 Score - Mean - Cross Validation :  0.9924467866539546
F1 Score - Std - Cross Validation  :  0.00041690901882439867
F1 Score - Range of Test-Set       :  0.9920298776351302 - 0.992863695672779


The comparison of the F1 scores between the train set and test set for the best model, KNN, after tuning shows a good fit, as there is no significant difference between them. To further validate the model, I performed cross-validation on the train set, which revealed a small difference, with a mean F1 score of 0.99.

#### Model After Tuning Comparison

In [154]:
# Comparison

models = {
    'K Neighbors Classifier' : pipe_knc,
    'KNC after tuning' : best_model
}

metrics = {
    'F1-Score' : f1_score,
}

model_tuning = pd.DataFrame(columns = models.keys(), index=['F1-Score'])

for metric in metrics.keys():
  for model in models.keys():
    model_tuning.loc[metric, model] = metrics[metric](y_test, models[model].predict(X_test))

In [148]:
model_tuning

Unnamed: 0,K Neighbors Classifier,KNC after tuning
F1-Score,0.992682,0.996801


I want to check the difference between before and after tuning by predicting on the test set using the F1 score, which shows an improvement, even if it is a small one.

### Model Saving

In [155]:
# Saving best_model

with open('best_model.pkl', 'wb') as file:
  pickle.dump(best_model, file)