In [1]:
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing dataset
dataset = pd.read_csv('Churn_Modelling.csv')
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [2]:
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:] # Avoiding dummy variable trap!

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [3]:
# Fitting the classifier to the Training set
# Create your classifier here (Ex: Logistic Regression, SVM ...)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set
y_pred = classifier.predict(X_test)

In [4]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
X_set, y_set = X_train, y_train
y_hat = classifier.predict(X_set)
y_hat = np.reshape(y_hat, -1)
y_hat[y_hat >= 0.5] = 1
y_hat[y_hat < 0.5] = 0

cm = confusion_matrix(y_set, y_hat)
accuracy = (cm[0, 0] + cm[1, 1])/cm.sum()
TPR = cm[0, 0]/cm[:, 0].sum() # Sensitivitive, Recall
TNR = cm[1, 1]/cm[:, 1].sum() # Specificitive
PPV = cm[0, 0]/cm[0, :].sum() # Positive Predictive Value, Precision
NPV = cm[1, 1]/cm[1, :].sum() # Negative Predictive Value,  
F1_score = 2/(1/PPV + 1/TPR)
summary = {'Accuracy': accuracy, 
           'Positive_Predictive_Value': PPV, 
           'Negative_Predictive_Value': NPV,            
           'Sensitivitive': TPR, 
           'Specificitive': TNR,            
           'F1_score': F1_score}
summary

{'Accuracy': 0.823375,
 'Positive_Predictive_Value': 0.9462939698492462,
 'Negative_Predictive_Value': 0.34375,
 'Sensitivitive': 0.8490911652811047,
 'Specificitive': 0.6212624584717608,
 'F1_score': 0.8950612699591534}

In [5]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
X_set, y_set = X_test, y_test
y_hat = classifier.predict(X_set)
y_hat = np.reshape(y_hat, -1)
y_hat[y_hat >= 0.5] = 1
y_hat[y_hat < 0.5] = 0

cm = confusion_matrix(y_set, y_hat)
accuracy = (cm[0, 0] + cm[1, 1])/cm.sum()
TPR = cm[0, 0]/cm[:, 0].sum() # Sensitivitive, Recall
TNR = cm[1, 1]/cm[:, 1].sum() # Specificitive
PPV = cm[0, 0]/cm[0, :].sum() # Positive Predictive Value, Precision
NPV = cm[1, 1]/cm[1, :].sum() # Negative Predictive Value,  
F1_score = 2/(1/PPV + 1/TPR)
summary = {'Accuracy': accuracy, 
           'Positive_Predictive_Value': PPV, 
           'Negative_Predictive_Value': NPV,            
           'Sensitivitive': TPR, 
           'Specificitive': TNR,            
           'F1_score': F1_score}
summary

{'Accuracy': 0.8295,
 'Positive_Predictive_Value': 0.9398119122257054,
 'Negative_Predictive_Value': 0.3950617283950617,
 'Sensitivitive': 0.8595183486238532,
 'Specificitive': 0.625,
 'F1_score': 0.897873614854747}

In [6]:
# Predicting a new data
RowNumber = [0]
CustomerId = [0]
Surname = ["name"]
CreditScore = [600]
Geography = ["France"]
Gender = ["Male"]
Age = [40]
Tenure = [3]
Balance = [60000]
NumOfProducts = [2]
HasCrCard = [1]
IsActiveMember = [1]
EstimatedSalary = [50000]

X_new_dict = {
        'RowNumber': RowNumber,
        'CustomerId': CustomerId,
        'Surname': Surname,
        'CreditScore': CreditScore,
        'Geography': Geography,
        'Gender': Gender,
        'Age': Age,
        'Tenure': Tenure,
        'Balance': Balance,
        'NumOfProducts': NumOfProducts,
        'HasCrCard': HasCrCard,
        'IsActiveMember': IsActiveMember,
        'EstimatedSalary': EstimatedSalary   
}
X_new_df = pd.DataFrame(X_new_dict)
X_new = X_new_df.iloc[:, 3:13].values

X_new[:, 1] = labelencoder_X_1.transform(X_new[:, 1])
X_new[:, 2] = labelencoder_X_2.transform(X_new[:, 2])
X_new = onehotencoder.transform(X_new).toarray()
X_new = X_new[:, 1:] # Avoiding dummy variable trap!
X_new = sc.transform(X_new)

y_new_pred = classifier.predict(X_new)
y_new_pred = (y_new_pred > 0.5)
y_new_pred

array([False])

In [7]:
# Example
a = np.array([[0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])
a = sc.transform(a)
a_pred = classifier.predict(a)
a_pred = (a_pred > 0.5)
a_pred

array([False])