In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply matplotlib's "dark_background" style sheet to every figure drawn after this point.
plt.style.use("dark_background")

In [4]:
# Read the churn dataset from a CSV located relative to the notebook's working
# directory, then print column names, non-null counts and dtypes as a sanity check.
data = pd.read_csv('churn_prediction_simple.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22067 entries, 0 to 22066
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   customer_id                     22067 non-null  int64  
 1   vintage                         22067 non-null  int64  
 2   age                             22067 non-null  int64  
 3   gender                          22067 non-null  int64  
 4   dependents                      22067 non-null  float64
 5   occupation                      22067 non-null  int64  
 6   city                            22067 non-null  float64
 7   customer_nw_category            22067 non-null  int64  
 8   branch_code                     22067 non-null  int64  
 9   days_since_last_transaction     22067 non-null  float64
 10  current_balance                 22067 non-null  float64
 11  previous_month_end_balance      22067 non-null  float64
 12  average_monthly_balance_prevQ   

In [8]:
# Dependent variable: the churn label column.
Y = data['churn']
# Independent variables: every other column except the customer_id identifier.
X = data.drop(columns=['churn', 'customer_id'])

In [9]:
# Standardise every feature column of X with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, including rows that end
# up in the test split later on, so scaled_x carries test-set statistics.
# Values used for model evaluation should come from a scaler fitted on the
# training rows only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x = scaler.fit_transform(X)

In [11]:
# Splitting the dataset (80 % train / 20 % test, stratified on the churn label).
# The raw features are split first and the scaler is then fitted on the training
# rows only, so no statistic from the held-out rows leaks into the transform.
# scaled_x from the scaling cell was fitted on every row and is therefore not
# used here. The selected rows are the same for a given random_state.
from sklearn.model_selection import train_test_split as tts
x_train_raw, x_test_raw, y_train, y_test = tts(X, Y, train_size = 0.80, stratify = Y, random_state = 101)
x_train = scaler.fit_transform(x_train_raw)  # learn mean / scale from the training rows
x_test = scaler.transform(x_test_raw)        # reuse the training statistics unchanged
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((17653, 19), (4414, 19), (17653,), (4414,))

# Model Building, Bagging Logistic Regression

In [12]:
# Baseline: a bagging ensemble built entirely from the library defaults.
# NOTE(review): no cell evaluates this fit before `classifier` is rebound by
# the next model cell, so this training run only produces the repr shown below.
from sklearn.ensemble import BaggingClassifier as BC
classifier = BC()
classifier.fit(x_train, y_train)

BaggingClassifier()

In [14]:
from sklearn.linear_model import LogisticRegression as LR

# Bagging ensemble of 150 logistic-regression models, trained on all cores.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling was
# later removed, whereas the first positional slot is the base model under
# both spellings.
classifier = BC(LR(),
               n_estimators = 150,
               n_jobs = -1,
               random_state = 42)
classifier.fit(x_train, y_train)
# In-sample predictions; the next cell reports on them against y_train.
predicted_values = classifier.predict(x_train)

In [19]:
# Per-class precision / recall / F1 for the predictions made on the training rows.
from sklearn.metrics import classification_report
print(classification_report(y_train, predicted_values))

              precision    recall  f1-score   support

           0       0.82      0.99      0.90     14234
           1       0.75      0.08      0.15      3419

    accuracy                           0.82     17653
   macro avg       0.78      0.54      0.52     17653
weighted avg       0.80      0.82      0.75     17653



In [20]:
# Rebind predicted_values to the predictions for the held-out rows and report
# the same per-class metrics against y_test.
predicted_values = classifier.predict(x_test)
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92      3559
           1       0.71      0.46      0.56       855

    accuracy                           0.86      4414
   macro avg       0.79      0.71      0.74      4414
weighted avg       0.85      0.86      0.85      4414



In [23]:
# Switch model family: rebind `classifier` to a random forest with default settings.
from sklearn.ensemble import RandomForestClassifier as RFC
classifier= RFC()

In [24]:
# Fit the current `classifier` on the training split.
classifier.fit(x_train, y_train)

RandomForestClassifier()

In [29]:
from sklearn.linear_model import LogisticRegression as LR

# Random forest with 100 trees.
# random_state is pinned (it was None, which grows a different forest on every
# run) so the reports that follow are reproducible after a kernel restart.
# n_jobs = -1 matches the other ensembles in this notebook and only affects
# training speed, not the fitted model.
classifier = RFC(
               n_estimators = 100,
               n_jobs = -1,
               random_state = 101)
classifier.fit(x_train,y_train)
# In-sample predictions; the next cell reports on them against y_train.
predicted_values = classifier.predict(x_train)

In [30]:
# Per-class precision / recall / F1 for the forest's predictions on the training rows.
from sklearn.metrics import classification_report
print(classification_report(y_train, predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14234
           1       1.00      1.00      1.00      3419

    accuracy                           1.00     17653
   macro avg       1.00      1.00      1.00     17653
weighted avg       1.00      1.00      1.00     17653



In [31]:
# Rebind predicted_values to the predictions for the held-out rows and report
# the same per-class metrics against y_test.
predicted_values = classifier.predict(x_test)
print(classification_report(y_test, predicted_values))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      3559
           1       0.72      0.45      0.55       855

    accuracy                           0.86      4414
   macro avg       0.80      0.70      0.74      4414
weighted avg       0.85      0.86      0.85      4414



In [32]:
from sklearn.metrics import f1_score
def calc_score(model, x1, y1, x2, y2):
    '''Fit `model` on the training data and score it on both splits.

    x1, y1: training features and labels; `model` is fitted on these in place
    x2, y2: test features and labels

    Returns a tuple (F1 on the training set, F1 on the test set).'''
    model.fit(x1, y1)

    # Score the freshly fitted model in-sample first, then on the test data.
    train_f1 = f1_score(y1, model.predict(x1))
    test_f1 = f1_score(y2, model.predict(x2))

    return train_f1, test_f1

In [33]:
def effect(train_score, test_score, x_axis, title):
    '''Plot training and test F1 scores against a swept parameter.

    train_score: scores obtained when predicting on the training set
    test_score: scores obtained when predicting on the test set
    x_axis: parameter values that produced the scores, in the same order
    title: title of the plot'''
    plt.figure(figsize=(5, 5), dpi=120)

    # One line per split: (scores, line colour, legend label).
    curves = (
        (train_score, 'red', 'train_score'),
        (test_score, 'blue', 'test_score'),
    )
    for scores, colour, name in curves:
        plt.plot(x_axis, scores, color=colour, label=name)

    plt.title(title)
    plt.legend()
    plt.xlabel('parameter_value')
    plt.ylabel('f1_score')
    plt.show()

# Hyperparameter Tuning

In [34]:
# Rebind `classifier` to a fresh default random forest and fit it on the training split.
classifier = RFC()
classifier.fit(x_train, y_train)

RandomForestClassifier()

In [None]:
# Sweep n_estimators over 1, 11, 21, ..., 591 and record the train / test F1
# of a depth-limited, class-balanced forest at each setting.
# NOTE(review): the second score is computed on x_test / y_test, so picking
# n_estimators from this curve amounts to tuning on the test split.
estimators = list(range(1, 600, 10))
train, test = [], []

# Settings held constant across the sweep; only n_estimators varies.
fixed_params = dict(class_weight='balanced_subsample',
                    n_jobs=-1,
                    max_depth=7,
                    random_state=101)

for i in estimators:
    model = RFC(n_estimators=i, **fixed_params)
    f1, f2 = calc_score(model, x_train, y_train, x_test, y_test)
    train.append(f1)
    test.append(f2)

In [None]:
# Plot against the exact `estimators` list the sweep iterated over, rather
# than re-spelling the range, so the x-axis cannot drift out of step with the
# recorded scores if the sweep is edited.
effect(train, test, estimators, 'n_estimators')