# Decision Trees: Random Forest for Customer Churn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the churn dataset; the relative path means the CSV must be in the
# notebook's working directory.
data = pd.read_csv("Churn_Modelling.csv")

# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Drop the row / customer identifiers and the surname so that they are not
# picked up as model features further down.
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
from sklearn.preprocessing import LabelEncoder

# Snapshot of column name -> dtype, taken before any column is re-assigned.
d_types = dict(data.dtypes)

# Replace every object-dtype (string) column with integer codes, using a fresh
# encoder per column.
# NOTE: `Le` is overwritten on each pass, so after the loop only the encoder of
# the last encoded column is still available.
for name, type_ in d_types.items():
    if str(type_) != 'object':
        continue  # non-object columns are left untouched
    Le = LabelEncoder()
    data[name] = Le.fit_transform(data[name])

In [5]:
# Confirm that the former string columns now hold integer codes.
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Feature columns = every column of `data` except the target, "Exited".
remaining_columns = data.columns.tolist()
remaining_columns.remove("Exited")

In [7]:
# Feature matrix (one column per entry of remaining_columns) and the target
# vector, stored as unsigned 8-bit integers.
X = data[remaining_columns].to_numpy()
Y = data["Exited"].to_numpy().astype(np.uint8)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the forest
# is randomised; without a seed every run of the notebook yields different
# scores and a different confusion matrix.
model = RandomForestClassifier(random_state = 4)
model.fit(Xtrain, Ytrain)

RandomForestClassifier()

In [10]:
# Score the baseline model on the held-out test split and report it.
test_accuracy = model.score(Xtest, Ytest)
print("Testing Accuracy : ", test_accuracy)

Testing Accuracy :  0.8745


In [11]:
# Predicted class labels for the test split (input to the confusion matrix).
predictions = model.predict(Xtest)

In [12]:
from sklearn.metrics import confusion_matrix

# scikit-learn puts the true labels on the rows and the predicted labels on the
# columns, so with 0/1 labels the layout is [[TN, FP], [FN, TP]].
matrix = confusion_matrix(Ytest, predictions)
matrix

# Second row of the matrix: false negatives / true positives

array([[1551,   53],
       [ 198,  198]], dtype=int64)

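Improve: re-train the forest with class weights to offset the class imbalance, and check it on a small validation split together with the out-of-bag score.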

In [13]:
# Number of rows per value of the target column (class label -> count).
value_counts = dict(data["Exited"].value_counts())

In [14]:
# Convert the per-class counts into fractions of the whole dataset.
# The fractions are rebuilt from `data` instead of dividing `value_counts` in
# place, so re-running this cell cannot divide already-normalised values by
# len(data) a second time.
value_counts = {
    key: count / len(data)
    for key, count in dict(data["Exited"].value_counts()).items()
}

In [15]:
# Weight each class by the share of the data that belongs to the other class,
# so the rarer class receives the larger weight.
# The subtraction must be parenthesised: `total - value / total` is parsed as
# `total - (value / total)`, which is only correct when total happens to be 1.
total = sum(value_counts.values())
class_weights = {
    key: (total - value) / total
    for key, value in value_counts.items()
}

class_weights

{0: 0.2037, 1: 0.7963}

In [16]:
from sklearn.model_selection import train_test_split

# Keep only 5% of the rows for validation (on a very large dataset even a
# 99% / 1% split can be enough).
# NOTE: Xtrain / Ytrain are rebound here. The earlier Xtest / Ytest rows overlap
# this larger training set, so models fitted below must not be scored on them.
Xtrain, Xval, Ytrain, Yval = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=4,
)

In [17]:
def getScores(model, Xtrain, Ytrain, Xval, Yval):
    """Collect the training, validation and out-of-bag scores of a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score(X, y)``.
    Xtrain, Ytrain
        Data passed to ``model.score`` to obtain the training score.
    Xval, Yval
        Data passed to ``model.score`` to obtain the validation score.

    Returns
    -------
    dict
        Keys ``"training acc"``, ``"validation acc"`` and ``"oob_score"``.
        ``"oob_score"`` is ``None`` when the model has no ``oob_score_``
        attribute (e.g. a forest fitted without ``oob_score=True``).
    """
    return {
        "training acc" : model.score(Xtrain, Ytrain),
        "validation acc" : model.score(Xval, Yval),
        # Not every model has this attribute; report None instead of raising
        # AttributeError so that such models can still be scored.
        "oob_score" : getattr(model, "oob_score_", None),
    }

In [18]:
# oob = out-of-bag. With oob_score=True the fitted forest exposes oob_score_
# (a score, not an error), which getScores reports next to the training and
# validation accuracy.
# random_state is fixed (same seed as the data split) because the forest is
# randomised; without a seed the scores change on every run.
model = RandomForestClassifier(n_estimators = 200,
                               min_samples_leaf = 5,
                               class_weight = class_weights,
                               max_features = 0.5,
                               n_jobs = -1,
                               oob_score = True,
                               random_state = 4)

model.fit(Xtrain, Ytrain)

RandomForestClassifier(class_weight={0: 0.2037, 1: 0.7963}, max_features=0.5,
                       min_samples_leaf=5, n_estimators=200, n_jobs=-1,
                       oob_score=True)

In [19]:
# Training vs. validation vs. out-of-bag score of the class-weighted forest.
getScores(model, Xtrain, Ytrain, Xval, Yval)

{'training acc': 0.9389473684210526,
 'validation acc': 0.864,
 'oob_score': 0.8432631578947368}

In [20]:
# Confusion matrix of the class-weighted forest on the validation split
# (scikit-learn layout: rows are true labels, columns are predicted labels).
predictions = model.predict(Xval)

matrix = confusion_matrix(Yval, predictions)
matrix

array([[358,  38],
       [ 30,  74]], dtype=int64)