# Artifical Neural Networks example
In this example we predict churn for the imaginary bank in Europe. Training set has 10k clients which are monitored for a period of 6 months.

## Data preparation

In [2]:
import pandas as pd
import numpy as np

# import data
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# extract values into arrays
x = df.iloc[:,3:13].values
y = df.iloc[:,13].values

In [15]:
# encode text categories
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# encode objects
encode_country = LabelEncoder()
encode_gender = LabelEncoder()
# encode country
x[:,1] = encode_country.fit_transform(x[:,1])
x[:,2] = encode_gender .fit_transform(x[:,2])

# create boolean array from country values
array_encoder = OneHotEncoder(categorical_features=[1])
x = array_encoder.fit_transform(x).toarray()
df_x = pd.DataFrame(x)
df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,1.0,1.0,1.0,0.0,619.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88
1,1.0,0.0,1.0,1.0,0.0,1.0,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58
2,1.0,0.0,1.0,1.0,1.0,0.0,502.0,0.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57
3,1.0,0.0,1.0,1.0,1.0,0.0,699.0,0.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63
4,1.0,0.0,1.0,1.0,0.0,1.0,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1


In [17]:
# split data into training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [19]:
# feature scaling - normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# examine train set
df_x = pd.DataFrame(x_train)
df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.569844,-0.569844,0.569844,0.569844,-1.014607,1.74309,0.169582,-1.091687,-0.464608,0.006661,-1.215717,0.809503,0.642595,-1.03227,1.106432
1,-1.754865,1.754865,-1.754865,-1.754865,-1.014607,-0.573694,-2.304559,0.916013,0.301026,-1.37744,-0.006312,-0.921591,0.642595,0.968738,-0.748664
2,0.569844,-0.569844,0.569844,0.569844,0.985604,-0.573694,-1.191196,-1.091687,-0.943129,-1.031415,0.579935,-0.921591,0.642595,-1.03227,1.485335
3,0.569844,-0.569844,0.569844,0.569844,-1.014607,1.74309,0.035566,0.916013,0.109617,0.006661,0.473128,-0.921591,0.642595,-1.03227,1.276528
4,0.569844,-0.569844,0.569844,0.569844,-1.014607,1.74309,2.056114,-1.091687,1.736588,1.044737,0.810193,0.809503,0.642595,0.968738,0.558378


## ANN - Artificial Neural Network

Building a neural network. For ANN we use keras and tensorflow backend. Sequential model to init network and Dense defines the layers. In the hidden layers we use 'ractifier' and for the output layer we use sigmoid activation function. Training the ANN with Stochastic Gradient Descent consist of 7 steps:

- radomly initialise weights with number close to 0, but not 0
- input first observarion of your dataset in the input layer, each variable (feature) is one input node
- forward-propagation: in hidden layers preferably use 'racitifier' and end layer signmoid
- compare predicted with actual results 
- back-propagation: update weights based on calculated error, learning rate defines by how much we update the weights
- repeat steps and update weights after each observation (refinforcment learning), or update after a batch of observations (Batch learning)
- when whole training set is used it makes one epoch. Redo more epochs.


In [32]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# intitalize ANN
classifier = Sequential()

# add layers - input & hidden layer
# create hidden layer using Dense with 7 nodes, intialize weights randomly (uniform),
# use ractifier activation function (relu) and 14 input variables
hidden_layer_1 = Dense(output_dim=7, init='uniform', activation='relu', input_dim=15)
classifier.add(hidden_layer_1)

# add second hidden layer
hidden_layer_2 = Dense(output_dim=7, init='uniform', activation='relu')
classifier.add(hidden_layer_2)

# add output layer - predictor is 1 node, use sigmoid
# note! when more categories use softmax as activation method ()
output_layer = Dense(output_dim=1, init='uniform', activation='sigmoid')
classifier.add(output_layer)

# compile ANN
# use adam optimizer (gradient descent algo),
# loss func (logaritmic loss func) binary_crossentropy  
# note for categorical use categorical_crossentropy
# and we use accuracy metric
classifier.compile(optimizer='adam', loss='binary_crossentropy', metric=['accuracy'] )



  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


In [33]:
# fit / train ANN to the training set
# use training set, update weights by batch of 10, and train for 100 epochs
classifier.fit(x_train, y_train, batch_size=10, epochs=10)

ValueError: ('Some keys in session_kwargs are not supported at this time: %s', dict_keys(['metric']))