# Artificial Neural Networks

In [1]:
# Installing Theano
# pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git

# Installing Tensorflow
# Install Tensorflow from the website: https://www.tensorflow.org/versions/r0.12/get_started/os_setup.html

# Installing Keras
# pip install --upgrade keras

## PART 1 - DATA PREPROCESSING

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')

In [4]:
print("number of rows: " + str(len(dataset)) + "\nnumber of columns: " + str(len(dataset.columns)))

number of rows: 10000
number of columns: 14


In [5]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### the goal here is to predict if a customer/client of a bank will leave (exited column) or not
#### [3:13[  independent variables (first two columns aren't important), variable to predict column Exited [, 13]

In [6]:
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

In [7]:
X[0:5]

array([[619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
       [608, 'Spain', 'Female', 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [502, 'France', 'Female', 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [699, 'France', 'Female', 39, 1, 0.0, 2, 0, 0, 93826.63],
       [850, 'Spain', 'Female', 43, 2, 125510.82, 1, 1, 1, 79084.1]],
      dtype=object)

In [11]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 1])

onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()


In [13]:
X[0:3]

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
        0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 6.0800000e+02,
        2.0000000e+00, 4.1000000e+01, 1.0000000e+00, 8.3807860e+04,
        1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.0200000e+02,
        0.0000000e+00, 4.2000000e+01, 8.0000000e+00, 1.5966080e+05,
        3.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1393157e+05]])

### avoid the dummy variable trap, we need to remove one dummy variable (the 1st for example)

In [14]:
X = X[:, 1:]

In [16]:
X[0:2]

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
        4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, 2.0000000e+00,
        4.1000000e+01, 1.0000000e+00, 8.3807860e+04, 1.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 1.1254258e+05]])

In [26]:
print( "number of independent variables (NN's nodes): " + str(X.shape[1]))

number of independent variables (NN's nodes): 11


### Splitting the dataset into the Training set and Test set

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Feature Scaling  ( we gonna apply a lot of computations, and we don't want a variable dominating others)

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Part 2 - Now let's make the ANN!

### Importing the Keras libraries and packages

In [19]:
import keras
from keras.models import Sequential ## required to initialize our neural network
from keras.layers import Dense  ## required to build layers of our NN

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Initialising the ANN: defining the sequence of layers (another way is defining a graph)
https://keras.io/models/sequential/

In [20]:
classifier = Sequential()
## we won't put any arguments here because we will define the layers step by step afterwards

### Adding the input layer and the first hidden layer

###### we will initialize the weights with values close to zero, using the Dense module
https://keras.io/layers/core/

In [28]:
###### Arguments
###### units: Positive integer, dimensionality of the output space.
###### activation: Activation function to use (see activations). If you don't specify anything, no activation is applied
####   (ie. "linear" activation: a(x) = x).
###### use_bias: Boolean, whether the layer uses a bias vector.
###### kernel_initializer: Initializer for the kernel weights matrix (see initializers).
###### bias_initializer: Initializer for the bias vector (see initializers).
###### kernel_regularizer: Regularizer function applied to the kernel weights matrix (see regularizer).
###### bias_regularizer: Regularizer function applied to the bias vector (see regularizer).
###### activity_regularizer: Regularizer function applied to the output of the layer (its "activation"). (see regularizer).
###### kernel_constraint: Constraint function applied to the kernel weights matrix (see constraints).
###### bias_constraint: Constraint function applied to the bias vector (see constraints).

In [30]:
## we choose the number of nodes in the hidden layer as the average of the number of nodes in the input layer and the number of 
## nodes in the output layer (a+b)/2. if we want to be artists what we have to do is experimenting with
## a technique called parameter tuning. 
## here we have 11 nodes in input layer (independent variables) and one node (one output y), so (11+1)/2= 6 
## the activation function for the hidden layer will be "rectifier" function called "relu" in keras and the sigmoid activation 
## function for the output layer.

In [33]:
## example of dense: keras.layers.Dense(units, activation=None, use_bias=True, kernel_initializer='glorot_uniform', 
## bias_initializer='zeros', kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, 
## bias_constraint=None)
## input_dim must be added too

In [32]:
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))

### Adding the second hidden layer

In [34]:
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

### Adding the output layer

In [35]:
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
## if we re dealing with a problem with an output of 3 categories so the units = 3, activation = 'soft max = sigmoid(>2 class)'

### Compiling the ANN

###### compile the whole artificial neural network  is basically applying stochastic gradient descent on the whole ANN.
###### if the dependent variable has a binary outcome then loss function is called binary_crossentropy. And if the dependent variable has more than two outcomes like three categories then th loss function is called categorical_crossentropy.

In [36]:
classifier.compile(optimizer = 'adam',  loss = 'binary_crossentropy', metrics = ['accuracy'])
## optimizer name of the gradient descent algorithm
## loss the loss function that will be used to optimize the weights 
## metrics: List of metrics to be evaluated by the model during training and testing

### Fitting the ANN to the Training set

###### fit(x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None)

###### batch_size: is the number of observations after which we want to update the weights.
###### epochs: Number of epochs to train the model. An epoch is an iteration over the entire x and y data provided

###### to choose we need to experiment but here we gonna choose some significant values

In [37]:
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2349cd04ac8>

Accuracy of 83%

### Part 3 - Making the predictions and evaluating the model

In [39]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred

array([[0.13963003],
       [0.2863777 ],
       [0.10518845],
       ...,
       [0.2673705 ],
       [0.15926984],
       [0.15023522]], dtype=float32)

###### it as an array of probabilities, whether the customer will leave or not the  bank.

###### in order to use this confusion matrix we don't need the probabilities but we need the predicted results in the form: True or false. So we just need to do a little something to convert these probabilities into the predicted results in the form (1,0). And to do this we need to choose a threshold to decide when the predicted result is one and when the predicted result is zero. So we predict one over the threshold and we predict zero below the threshold. And of course a natural threshold to take is 0.5 50%.

In [40]:
y_pred = (y_pred > 0.5)
y_pred

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [41]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1550,   45],
       [ 270,  135]], dtype=int64)

In [44]:
print("accuracy: " + str(((1550+135)/2000)*100)+"%")

accuracy: 84.25%


well 84.25% of accuracy on new data, compared to 83% on the training set. an excellent result without being an artist, means without any paramter tunning