In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the bank-customer churn data.
dataset = pd.read_csv('Churn_Modelling.csv')

# Columns 0-2 (row number, customer id, surname) are identifiers we do not
# model on, so the features are columns 3-12 (CreditScore .. EstimatedSalary).
feature_columns = slice(3, 13)
# The label is column 13, Exited (0 or 1).
target_column = 13

X = dataset.iloc[:, feature_columns].values
y = dataset.iloc[:, target_column].values

In [3]:
# Preview the raw data frame (pandas abbreviates long frames when displaying them).
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [4]:
# Human-readable view of the feature matrix X (the independent variables).
feature_names = [
    "Credit Score", "Country", "Gender", "Age", "Tenure",
    "Balance", "# of Products", "Has Credit Card", "Is Active", "Salary",
]
df = pd.DataFrame(X, columns=feature_names)
df.head()

Unnamed: 0,Credit Score,Country,Gender,Age,Tenure,Balance,# of Products,Has Credit Card,Is Active,Salary
0,619,France,Female,42,2,0.0,1,1,1,101349.0
1,608,Spain,Female,41,1,83807.9,1,0,1,112543.0
2,502,France,Female,42,8,159661.0,3,1,0,113932.0
3,699,France,Female,39,1,0.0,2,0,0,93826.6
4,850,Spain,Female,43,2,125511.0,1,1,1,79084.1


In [5]:
# Target vector: the Exited flag for each customer, i.e. whether (1) or not (0)
# the customer left the bank. It is a 1-D array with one entry per row.
y

array([1, 0, 1, ..., 1, 1, 0])

In [6]:
# Encode the categorical (string) feature columns as integers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column 1 of X is Geography. LabelEncoder numbers the classes in sorted
# order, i.e. France = 0, Germany = 1, Spain = 2.
labelencoder_X_1 = LabelEncoder()
geography_codes = labelencoder_X_1.fit_transform(X[:, 1])
X[:, 1] = geography_codes

# Column 2 of X is Gender; sorted order gives Female = 0, Male = 1.
labelencoder_X_2 = LabelEncoder()
gender_codes = labelencoder_X_2.fit_transform(X[:, 2])
X[:, 2] = gender_codes

# Geography has no natural order, so it is one-hot encoded next
# (each category gets its own column).

In [7]:
# One-hot encode Geography (column 1 of X).
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported way to encode only
# a subset of the columns.
from sklearn.compose import ColumnTransformer

# Guard against re-running this cell: X is rebound below, so a second run
# would encode the wrong column.
assert X.shape[1] == 10, "X is already encoded - re-run from the dataset-loading cell"

onehotencoder = ColumnTransformer(
    transformers = [("geography", OneHotEncoder(), [1])],
    remainder = "passthrough",  # keep all other columns, placed after the dummy columns
    sparse_threshold = 0,       # always return a dense array
)
# X is expected to hold mixed Python objects at this point, so cast to float
# now that every column is numeric (same result type as the old `.toarray()`).
X = np.asarray(onehotencoder.fit_transform(X), dtype = float)

# Drop the first dummy column (index 0) to avoid the dummy variable trap;
# the two remaining dummy columns still identify the country.
X = X[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
# Preview frame for the encoded feature matrix: the two leading columns are
# the Geography dummy columns that remain after one was dropped.
encoded_feature_names = [
    "Country Code 1", "Country Code 2", "Credit Score", "Gender", "Age", "Tenure",
    "Balance", "# of Products", "Has Credit Card", "Is Active", "Salary",
]
pd.set_option('display.max_rows', 10)
df = pd.DataFrame(X, columns=encoded_feature_names)
df

Unnamed: 0,Country Code 1,Country Code 2,Credit Score,Gender,Age,Tenure,Balance,# of Products,Has Credit Card,Is Active,Salary
0,0.0,0.0,619.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88
1,0.0,1.0,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58
2,0.0,0.0,502.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57
3,0.0,0.0,699.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63
4,0.0,1.0,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10
...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,771.0,1.0,39.0,5.0,0.00,2.0,1.0,0.0,96270.64
9996,0.0,0.0,516.0,1.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.77
9997,0.0,0.0,709.0,0.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.58
9998,1.0,0.0,772.0,1.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.52


In [9]:
# Hold out 20% of the rows as a test set (fixed random_state for a repeatable split).
from sklearn.model_selection import train_test_split
split = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test, y_train, y_test = split

# Feature scaling: the scaler is fitted on the training rows only and then
# applied unchanged to both the training and the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - building the ANN: Keras pieces used by the cells below.
import keras
from keras.models import Sequential
from keras.layers import Dense


Using TensorFlow backend.


In [10]:
## Evaluating and improving the model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

def buildClassifier():
    """Build and compile the churn ANN (11 inputs -> 8 -> 8 -> 1).

    Takes no arguments, so it can be passed directly as ``build_fn`` to
    ``KerasClassifier``.

    Returns:
        A compiled ``Sequential`` model with two 8-unit ReLU hidden layers and
        one sigmoid output unit, using the 'adam' optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    layers = [
        Dense(units = 8, kernel_initializer = 'RandomNormal', activation = 'relu', input_dim = 11),
        Dense(units = 8, kernel_initializer = 'RandomNormal', activation = 'relu'),
        Dense(units = 1, kernel_initializer = 'RandomNormal', activation = 'sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [11]:
# Wrap the Keras model so scikit-learn can cross-validate it.
classifier = KerasClassifier(build_fn = buildClassifier, batch_size = 50, epochs = 100)

# 10-fold cross-validation on the training set; n_jobs = -1 asks for all CPUs in parallel.
accuracies = cross_val_score(classifier, X_train, y_train, cv = 10, n_jobs = -1)
print(accuracies)

# NOTE: despite its name, `variance` holds the standard deviation of the fold accuracies.
mean, variance = accuracies.mean(), accuracies.std()

[0.84125    0.85124999 0.82999998 0.84500003 0.8725     0.85374999
 0.84375    0.83125001 0.81999999 0.83749998]


In [12]:
# Mean accuracy over the cross-validation folds.
mean

0.8426249980926513

In [13]:
# Spread of the fold accuracies. NOTE: despite the name, this value was
# computed with accuracies.std(), so it is a standard deviation.
variance

0.01387950613120285

In [None]:
## In the event of overfitting (i.e., the variance is high), apply dropout,
## which disables a fraction of the neurons
from keras.layers import Dropout

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 8, kernel_initializer = 'RandomNormal', activation = 'relu', input_dim = 11))

# ***************
# rate is the fraction of neurons to disable (the Keras 2 name for the old `p`
# argument). Good practice is to start with 0.1 and increase until the
# variance is lower. Generally not advisable to go above 0.5 because then you
# risk underfitting.
classifier.add(Dropout(rate = 0.1))
# ***************

# Adding the second hidden layer
classifier.add(Dense(units = 8, kernel_initializer = 'RandomNormal', activation = 'relu'))
# ***************
# Add where needed
classifier.add(Dropout(rate = 0.1))
# ***************

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'RandomNormal', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 50, epochs = 100)

# Predicting the Test set results: the sigmoid output is a probability, so
# threshold it at 0.5 to get a True/False churn prediction.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Accuracy = correct predictions (the diagonal) / total number of test samples.
acc = (cm[0][0] + cm[1][1]) / cm.sum()
