In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [10]:
df = pd.read_csv("bank_customer_churn.csv")

In [11]:
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
7973,7974,15656005,Millar,592,Germany,Male,31,7,124593.23,1,1,0,86079.67,0
8385,8386,15649297,T'ang,605,France,Female,62,4,111065.93,2,0,1,125660.99,0
5584,5585,15768270,DeRose,579,Spain,Female,31,9,0.0,2,1,0,112395.98,0
5576,5577,15635964,Eve,566,Germany,Male,65,4,120100.41,1,1,0,107563.16,1
5051,5052,15572728,Ross,704,Spain,Male,36,8,127397.34,1,1,0,151335.24,0


In [12]:
useful_columns = df[['CreditScore', 'Geography', 'Gender',
                        'Age', 'Tenure', 'Balance', 'NumOfProducts',
                        'HasCrCard','IsActiveMember', 'EstimatedSalary',
                        'Exited']]
#useful_columns

In [13]:
df = useful_columns

In [14]:
df['isMale'] = df['Gender']=='Male'
df = df.drop('Gender', axis=1)
df

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale
0,619,France,42,2,0.00,1,1,1,101348.88,1,False
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0,False
2,502,France,42,8,159660.80,3,1,0,113931.57,1,False
3,699,France,39,1,0.00,2,0,0,93826.63,0,False
4,850,Spain,43,2,125510.82,1,1,1,79084.10,0,False
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,39,5,0.00,2,1,0,96270.64,0,True
9996,516,France,35,10,57369.61,1,1,1,101699.77,0,True
9997,709,France,36,7,0.00,1,0,1,42085.58,1,False
9998,772,Germany,42,3,75075.31,2,1,0,92888.52,1,True


In [15]:
df['Geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [16]:
X = pd.get_dummies(df.drop('Exited', axis=1))
y = df['Exited']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, test_size=0.2, random_state=42)

8000 training rows and 2000 test rows

### Logistic Regression using SKLearn

In [19]:
lr = LogisticRegression()

In [20]:
lr.fit(X_train, y_train)

LogisticRegression()

In [21]:
y_pred = lr.predict(X_test); y_pred

array([0, 0, 0, ..., 0, 0, 0])

### Finding Accuracy

In [22]:
(y_pred == y_test).astype(int).sum()

NumExpr defaulting to 8 threads.


1601

In [27]:
(y_pred != y_test).astype(int).sum()

399

In [28]:
len(y_test) - (y_pred == y_test).astype(int).sum()

399

In [29]:
len(y_test)

2000

In [30]:
accuracy = (y_pred == y_test).astype(int).sum()/len(y_test)
print(f'This model is {100*accuracy}% accurate')

This model is 80.05% accurate


### Finding Precision

In [31]:
precision =  y_test[y_pred == True].sum()/(y_pred==True).sum()
print(f'This model is ≈ {100*precision.round(4)}% precise')

This model is ≈ 45.16% precise


### Finding Recall

In [32]:
recall =  y_pred[y_test == True].sum()/(y_test==True).sum(); recall

0.07124681933842239

Finding the F1

In [33]:
f1 = 2 * precision * recall / (precision + recall)
print(f'the F1 Score is ≈ {f1.round(5)}')

the F1 Score is ≈ 0.12308


## Using Decision Trees to Classify Churning

In [34]:
clf = DecisionTreeClassifier()

In [35]:
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [36]:
y_pred = clf.predict(X_test); y_pred

array([0, 0, 0, ..., 0, 0, 1])

### Accuracy

In [37]:
DTaccuracy = (y_pred == y_test).astype(int).sum()/len(y_test)
print(f'This model is {100*DTaccuracy}% accurate')

This model is 77.7% accurate


In [38]:
DTprecision =  y_test[y_pred == True].sum()/(y_pred==True).sum()
print(f'This model is ≈ {(100*DTprecision).round(3)}% precise')

This model is ≈ 43.936% precise


In [39]:
DTrecall =  y_pred[y_test == True].sum()/(y_test==True).sum(); DTrecall

0.48854961832061067

In [40]:
DTf1 = 2 * DTprecision * DTrecall / (DTprecision + DTrecall)
print(f'the F1 Score is ≈ {DTf1.round(5)}')

the F1 Score is ≈ 0.46265


### Comparing the Decision Tree with the Logistic Regression Model

In [41]:
print('ACCURACY -','Decision Tree:', DTaccuracy.round(3),'---- '
      'Regression:', accuracy.round(3)),'\n'
print('PRECISION -','Decision Tree:',DTprecision.round(3),'--- '
      'Regression:',precision.round(3)),'\n'
print('RECALL -','Decision Tree:', DTrecall.round(3),'------ '
      'Regression:',recall.round(3)),'\n'
print('F1 SCORES -','Decision Tree:', DTf1.round(3),'--- '
      'Regression:',f1.round(3))

ACCURACY - Decision Tree: 0.777 ---- Regression: 0.8
PRECISION - Decision Tree: 0.439 --- Regression: 0.452
RECALL - Decision Tree: 0.489 ------ Regression: 0.071
F1 SCORES - Decision Tree: 0.463 --- Regression: 0.123


#### The Decision Tree Model is the better model in this instance since the F1 Score and the Recall are higher. Even though the Accuracy and Precision is lower than that of the Logistic Regression, the F1 Score and Recall demonstrate that the Decision Tree as a more useful tool for predicting chrun.