Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC

In [None]:
# Show every column when a frame is printed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Load the customer-churn CSV and preview its first rows.
# NOTE(review): absolute path (looks like a Colab Drive mount) - adjust it when
# running the notebook anywhere else.
dataset = pd.read_csv("/content/drive/MyDrive/DBS/CustomerChurn.csv")
print(dataset.head())

      Attrition_Flag  Customer_Age Gender  Dependent_count Education_Level  \
0  Existing Customer            45      M                3     High School   
1  Existing Customer            49      F                5        Graduate   
2  Existing Customer            51      M                3        Graduate   
3  Existing Customer            40      M                3      Uneducated   
4  Existing Customer            44      M                2        Graduate   

  Marital_Status Income_Category Card_Category  Months_on_book  \
0        Married     $60K - $80K          Blue              39   
1         Single  Less than $40K          Blue              44   
2        Married    $80K - $120K          Blue              36   
3        Married     $60K - $80K          Blue              21   
4        Married     $40K - $60K          Blue              36   

   Total_Relationship_Count  Months_Inactive  Contacts_Count  Credit_Limit  \
0                         5                1            

In [None]:
# Print the frame's dimensions as a (rows, columns) tuple.
print(dataset.shape)

(6237, 16)


In [None]:
# Summarize column names, non-null counts and dtypes of the raw data.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare: wrapping it in print() would append a stray "None" line.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            6237 non-null   object 
 1   Customer_Age              6237 non-null   int64  
 2   Gender                    6237 non-null   object 
 3   Dependent_count           6237 non-null   int64  
 4   Education_Level           6237 non-null   object 
 5   Marital_Status            6237 non-null   object 
 6   Income_Category           6237 non-null   object 
 7   Card_Category             6237 non-null   object 
 8   Months_on_book            6237 non-null   int64  
 9   Total_Relationship_Count  6237 non-null   int64  
 10  Months_Inactive           6237 non-null   int64  
 11  Contacts_Count            6237 non-null   int64  
 12  Credit_Limit              6237 non-null   float64
 13  Total_Revolving_Bal       6237 non-null   int64  
 14  Total_Tr

In [None]:
# Print summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
print(dataset.describe())

       Customer_Age  Dependent_count  Months_on_book  \
count   6237.000000      6237.000000     6237.000000   
mean      46.380952         2.331409       36.000962   
std        8.047893         1.297106        7.980412   
min       26.000000         0.000000       13.000000   
25%       41.000000         1.000000       31.000000   
50%       46.000000         2.000000       36.000000   
75%       52.000000         3.000000       40.000000   
max       73.000000         5.000000       56.000000   

       Total_Relationship_Count  Months_Inactive  Contacts_Count  \
count               6237.000000      6237.000000     6237.000000   
mean                   3.827641         2.342312        2.462402   
std                    1.546049         0.999853        1.109643   
min                    1.000000         0.000000        0.000000   
25%                    3.000000         2.000000        2.000000   
50%                    4.000000         2.000000        2.000000   
75%                

Encoding Categorical Variables and Creating Dummy Variables

In [None]:
#dataset.drop(['Education_Level'], axis = 1)

In [None]:
# List the distinct labels found in Education_Level; as the last expression of
# the cell, the resulting array is shown as the cell output.
dataset.Education_Level.unique()

array(['High School', 'Graduate', 'Uneducated', 'Post-Graduate',
       'Doctorate'], dtype=object)

In [None]:
# Encode the target and the categorical predictors as integers.
#
# Series.map() turns any label that is missing from its dictionary into NaN, so
# every label present in the data must appear in the mapping below.
# NOTE: this cell overwrites columns of `dataset` in place; re-running it
# without reloading the CSV would map the already-encoded integers to NaN.

# Binary columns: 1 marks an attrited customer / a male customer.
dataset['Attrition_Flag'] = dataset['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})
dataset['Gender'] = dataset['Gender'].map({'M': 1, 'F': 0})

# Marital_Status has no natural order, so it is not mapped here; it is one-hot
# encoded later with pd.get_dummies.

# Ordinal columns: codes must be monotone in the category order, otherwise the
# integer distances carry no meaning. All three columns use the same direction
# (lowest tier -> highest code, highest tier -> 0).
dataset['Education_Level'] = dataset['Education_Level'].map(
    {'Uneducated': 4, 'High School': 3, 'Graduate': 2, 'Post-Graduate': 1, 'Doctorate': 0}
)
dataset['Income_Category'] = dataset['Income_Category'].map(
    {'Less than $40K': 4, '$40K - $60K': 3, '$60K - $80K': 2, '$80K - $120K': 1, '$120K +': 0}
)
dataset['Card_Category'] = dataset['Card_Category'].map(
    {'Blue': 3, 'Silver': 2, 'Gold': 1, 'Platinum': 0}
)

In [None]:
# Re-inspect column dtypes and non-null counts after the integer encoding.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            6237 non-null   int64  
 1   Customer_Age              6237 non-null   int64  
 2   Gender                    6237 non-null   int64  
 3   Dependent_count           6237 non-null   int64  
 4   Education_Level           6237 non-null   int64  
 5   Marital_Status            6237 non-null   object 
 6   Income_Category           6237 non-null   int64  
 7   Card_Category             6237 non-null   int64  
 8   Months_on_book            6237 non-null   int64  
 9   Total_Relationship_Count  6237 non-null   int64  
 10  Months_Inactive           6237 non-null   int64  
 11  Contacts_Count            6237 non-null   int64  
 12  Credit_Limit              6237 non-null   float64
 13  Total_Revolving_Bal       6237 non-null   int64  
 14  Total_Tr

In [None]:
# One-hot encode the remaining nominal column. pd.get_dummies returns a new
# frame, so `dataset` itself keeps its original Marital_Status column.
categorical_features = ['Marital_Status']
final_data = pd.get_dummies(dataset, columns=categorical_features)

# Inspect the encoded frame (not `dataset`) so the new Marital_Status_* columns
# are actually visible. info() prints by itself and returns None, so it is not
# wrapped in print().
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            6237 non-null   int64  
 1   Customer_Age              6237 non-null   int64  
 2   Gender                    6237 non-null   int64  
 3   Dependent_count           6237 non-null   int64  
 4   Education_Level           6237 non-null   int64  
 5   Marital_Status            6237 non-null   object 
 6   Income_Category           6237 non-null   int64  
 7   Card_Category             6237 non-null   int64  
 8   Months_on_book            6237 non-null   int64  
 9   Total_Relationship_Count  6237 non-null   int64  
 10  Months_Inactive           6237 non-null   int64  
 11  Contacts_Count            6237 non-null   int64  
 12  Credit_Limit              6237 non-null   float64
 13  Total_Revolving_Bal       6237 non-null   int64  
 14  Total_Tr

In [None]:
# Feature matrix: every column of the encoded frame except the target.
X = final_data.drop(columns='Attrition_Flag')

In [None]:
Y = final_data['Attrition_Flag'] # taking the dependent/target variable in Y

In [None]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the cross-validated
# grid search below, so each validation fold contributes to the scaling
# statistics; moving the scaler into the Pipeline would avoid that.
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

Training the Random Forest Classifier (RFC) Model

In [None]:
# Random Forest classifier on all features.
# 'n_estimators' is tuned with a 5-fold cross-validated grid search. SMOTE is a
# step of the imblearn Pipeline, so oversampling happens when the pipeline is
# fitted on a training split rather than on the whole dataset up front.
model = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    # 'sqrt' is what the former 'auto' alias meant for classifiers; 'auto' was
    # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])

# 50 is part of the grid because the recorded run of the following cell reports
# n_estimators=50 as the best value, which a grid stopping at 40 cannot yield.
grid_param = {'classification__n_estimators': [10, 20, 30, 40, 50]}

# scoring='recall' selects the candidate with the best mean recall across folds.
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

In [None]:
# Run the grid search on the scaled feature matrix and the target.
gd_sr.fit(X_scaled, Y)

# Winning hyper-parameters and the mean cross-validated score they achieved.
best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_
print(best_parameters)
print(best_result)

# Feature importances of the refitted best forest, labelled with the column
# names of X and ordered from most to least important.
featimp = pd.Series(
    gd_sr.best_estimator_.named_steps["classification"].feature_importances_,
    index=X.columns.tolist(),
).sort_values(ascending=False)
print(featimp)

{'classification__n_estimators': 50}
0.6501657515798198
Total_Trans_Ct              0.258847
Total_Trans_Amt             0.181411
Total_Revolving_Bal         0.149749
Total_Relationship_Count    0.079236
Months_Inactive             0.068103
Contacts_Count              0.054320
Credit_Limit                0.043135
Customer_Age                0.038171
Months_on_book              0.027682
Dependent_count             0.026581
Education_Level             0.024716
Income_Category             0.022902
Gender                      0.009353
Marital_Status_Married      0.006082
Marital_Status_Single       0.005195
Card_Category               0.002915
Marital_Status_Divorced     0.001602
dtype: float64


In [None]:
# Feature selection: keep the nine columns that rank highest in the
# random-forest feature importances printed above.
X_ = dataset.loc[:, [
    'Total_Trans_Ct',
    'Total_Trans_Amt',
    'Total_Revolving_Bal',
    'Total_Relationship_Count',
    'Months_Inactive',
    'Contacts_Count',
    'Credit_Limit',
    'Customer_Age',
    'Months_on_book',
]]

In [None]:
# Standardize the reduced feature set with a fresh scaler (this rebinds
# `feature_scaler`, replacing the one fitted on all features).
feature_scaler = StandardScaler().fit(X_)
X_scaled_ = feature_scaler.transform(X_)

In [None]:
# Re-tune the random forest on the reduced feature set: search larger values of
# 'n_estimators' with a 5-fold cross-validated grid search.
model = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    # 'sqrt' is what the former 'auto' alias meant for classifiers; 'auto' was
    # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
grid_param = {'classification__n_estimators': [50, 70, 90, 110, 130, 150, 170, 190]}

# Choosing the `scoring` argument of GridSearchCV():
#   scoring = 'accuracy'  when you want to maximize prediction accuracy
#   scoring = 'recall'    when you want to minimize false negatives
#   scoring = 'precision' when you want to minimize false positives
#   scoring = 'f1'        when you want to balance false positives and false
#                         negatives (equal emphasis on minimizing both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

# Fit on the scaled subset of selected features.
gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_  # mean cross-validated score of the best estimator
print(best_result)

{'classification__n_estimators': 90}
0.6735937014399668


Training and Fitting the Support Vector Classification (SVC) Model

In [None]:
# Support Vector Classifier: SMOTE oversampling followed by SVC, tuned over the
# kernel type and the regularization strength C with 5-fold grid search.
model = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    ('classification', SVC(random_state=1)),
])
grid_param = {
    'classification__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classification__C': [.001, .01, .1, 1, 10, 100],
}

# Choosing the `scoring` argument of GridSearchCV():
#   scoring = 'accuracy'  when you want to maximize prediction accuracy
#   scoring = 'recall'    when you want to minimize false negatives
#   scoring = 'precision' when you want to minimize false positives
#   scoring = 'f1'        when you want to balance false positives and false
#                         negatives (equal emphasis on minimizing both)
# NOTE(review): recall alone can be pushed up by labelling almost every row as
# positive; verify precision / F1 of the selected model before comparing it
# with the random forest.
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

# The SVC is fitted on the full scaled feature matrix (X_scaled), not on the
# reduced feature subset used for the second random forest.
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_  # mean cross-validated score of the best estimator
print(best_result)

{'classification__C': 0.001, 'classification__kernel': 'sigmoid'}
0.8449290376048897
