In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Load the CSV into a DataFrame and preview its first rows
DATA_PATH = './BankChurners.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [3]:
# Show the column labels of the loaded frame
df.columns


Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

In [16]:
# Distinct labels present in the Marital_Status column
df['Marital_Status'].unique()

array(['Married', 'Single', 'Unknown', 'Divorced'], dtype=object)

In [18]:
#Preparing data
#
# The label columns are recoded as integers and written back with plain column
# assignment. Calling `df[col].mask(..., inplace=True)` modifies a temporary
# Series: pandas reports it as chained assignment, and with Copy-on-Write
# enabled `df` is not updated at all.

def encode_labels(series, codes):
    """Return `series` with every label replaced by its integer code.

    Values that are not keys of `codes` pass through unchanged, so running the
    cell again on already-encoded data is a no-op. The final cast raises if a
    value that cannot be converted to int (an unexpected label, NaN) remains.
    """
    return series.map(lambda value: codes.get(value, value)).astype(int)


attrition_codes = {'Existing Customer': 1, 'Attrited Customer': 0}
gender_codes = {'M': 1, 'F': 0}

#Sort education from the lowest level to the highest
# ('Unknown' is given the middle code 0)
education_codes = {
    'Unknown': 0,
    'Uneducated': -3,
    'High School': -2,
    'College': -1,
    'Graduate': 1,
    'Post-Graduate': 2,
    'Doctorate': 3,
}

df['Attrition_Flag'] = encode_labels(df['Attrition_Flag'], attrition_codes)
df['Gender'] = encode_labels(df['Gender'], gender_codes)
df['Education_Level'] = encode_labels(df['Education_Level'], education_codes)


In [None]:
#marital status (decided to add binary variables)

# One 0/1 indicator column per known status: Marital_Status_Married,
# Marital_Status_Single, Marital_Status_Divorced.
for status in ('Married', 'Single', 'Divorced'):
    df[f'Marital_Status_{status}'] = (df['Marital_Status'] == status).astype(int)

# Drop the rows whose status is 'Unknown'. `.copy()` makes the result an
# independent frame, so columns added to `df` afterwards are not written to a
# slice of the old frame (which raises SettingWithCopyWarning).
df = df[df['Marital_Status'] != 'Unknown'].copy()


In [22]:
# Distinct labels present in the Income_Category column
df['Income_Category'].unique()

array(['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K',
       '$120K +', 'Unknown'], dtype=object)

In [23]:
# Distinct labels present in the Card_Category column
df['Card_Category'].unique()

array(['Blue', 'Gold', 'Silver', 'Platinum'], dtype=object)

In [24]:
#card category values can be sorted from the worst to the best

card_codes = {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}

# `assign` returns a new frame instead of mutating a temporary Series
# (`df[col].mask(..., inplace=True)` is chained assignment and stops updating
# `df` under Copy-on-Write). `card_codes.get(tier, tier)` leaves values that
# are already encoded untouched, so the cell can be re-run; the cast to int
# raises if an unexpected label is still present.
df = df.assign(
    Card_Category=df['Card_Category']
    .map(lambda tier: card_codes.get(tier, tier))
    .astype(int)
)

In [None]:
#binary variables for income level

# New indicator column -> Income_Category label it flags with 1
income_indicators = {
    'Income_Category_less40': 'Less than $40K',
    'Income_Category_40-60': '$40K - $60K',
    'Income_Category_60-80': '$60K - $80K',
    'Income_Category_80-120': '$80K - $120K',
    'Income_Category_120+': '$120K +',
}

# `assign` builds a new frame, so nothing is written into a filtered slice of
# an earlier frame (avoids SettingWithCopyWarning). The columns are appended in
# the order listed above.
df = df.assign(**{
    column: (df['Income_Category'] == label).astype(int)
    for column, label in income_indicators.items()
})

# Drop the rows whose income is 'Unknown' and keep an independent copy.
df = df[df['Income_Category'] != 'Unknown'].copy()


In [27]:
# Remove the cap on displayed columns so the preview shows every column
pd.options.display.max_columns = None
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2,Marital_Status_Married,Marital_Status_Single,Marital_Status_Divorced,Income_Category_less40,Income_Category_40-60,Income_Category_60-80,Income_Category_80-120,Income_Category_120+
0,768805383,1,45,1,3,-2,Married,$60K - $80K,0,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991,1,0,0,0,0,1,0,0
1,818770008,1,49,0,5,1,Single,Less than $40K,0,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994,0,1,0,1,0,0,0,0
2,713982108,1,51,1,3,1,Married,$80K - $120K,0,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998,1,0,0,0,0,0,1,0
4,709106358,1,40,1,3,-3,Married,$60K - $80K,0,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998,1,0,0,0,0,1,0,0
5,713061558,1,44,1,2,1,Married,$40K - $60K,0,36,3,1,2,4010.0,1247,2763.0,1.376,1088,24,0.846,0.311,5.5e-05,0.99994,1,0,0,0,1,0,0,0


In [56]:
#defining variables

feature_columns = [
    'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
    'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit',
    'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
    'Marital_Status_Married', 'Marital_Status_Single', 'Marital_Status_Divorced',
    'Income_Category_less40', 'Income_Category_40-60', 'Income_Category_60-80',
    'Income_Category_80-120', 'Income_Category_120+',
]

# Selecting a list of columns already yields a 2-D (n_rows, n_features) array,
# so no reshape with a hard-coded column count is needed. The explicit float
# cast raises here if any selected column still holds a non-numeric label.
X = df[feature_columns].to_numpy(dtype=float)
# 1-D integer target vector
y = df['Attrition_Flag'].astype(int).to_numpy()

#dividing into train and test

random_state = 999

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# The scaling statistics are learned from the training split only and then
# applied to both splits. `scaler.fit(X_test)` would refit the scaler on the
# test data and return the scaler object itself, not the transformed array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [58]:
# Logistic regression fitted on the training split and scored on the test split.
# NOTE(review): the model is fit on the raw X_train / X_test, not on the
# standardized X_train_scaled / X_test_scaled arrays - confirm this is intended.
lr = LogisticRegression(random_state=random_state, max_iter=5000)
lr.fit(X_train, y_train)
# Keep the predictions 1-D so they have the same shape as y_test
y_pred = lr.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order
print(accuracy_score(y_test, y_pred))

0.9095808383233533


In [59]:
# Gaussian naive Bayes fitted on the training split and scored on the test split.
# NOTE(review): the model is fit on the raw X_train / X_test, not on the
# standardized X_train_scaled / X_test_scaled arrays - confirm this is intended.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Keep the predictions 1-D so they have the same shape as y_test
y_pred = gnb.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order
print(accuracy_score(y_test, y_pred))

0.9041916167664671
