# **Churn modelling using ANN**

## Setting up the development environment by importing required libraries and modules.

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score

#### Numpy: It will provide the support for efficient numerical computation.
#### Pandas: It is convenient library that supports dataframes. Working with pandas will bring ease in many crucial data operations.
### sklearn.preprocessing:
####  * LabelEncoder:LabelEncoder encode labels with a value between 0 and n_classes-1 where n is the number of distinct labels.
####  * OneHotEncoder:One-hot encoding is used in machine learning as a method to quantify categorical data. In short, this method produces a vector with length equal to the number of categories in the data set.
#### * StandardScaler:Standardize features by removing the mean and scaling to unit variance.
### sklearn.compose:
#### * This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.
### sklearn.model_selection:
#### * train_test_split: It splits data in training and test set as x_train,x_test,y_train,y_test. 
### sklearn.metrics:
#### * confusion_matrix:Compute confusion matrix to evaluate the accuracy of a classification.
#### * accuracy_score:In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.

## importing dataset


In [2]:
data=pd.read_csv("../input/churnmodelling/Churn_Modelling.csv")

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
data.shape

(10000, 14)

## About dataset 
 ### * This data set contains details of a bank's customers and the target variable is a binary variable reflecting the fact whether the customer left the bank (closed his account) or he continues to be a customer.
 ### * Dataset consist of 14 columns out of which 13 are independent variable and 14th column is dependent variabll.
 ### * It consist of 10000 rows

# Data Preprocessing

## seprating independent variable (x) and dependent variable(y) .

In [5]:
x=data.iloc[:,3:-1]
y=data.iloc[:,-1]
print("x:")
print(x.head())
print("  y:")
print(y.head())

x:
   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               1        101348.88  
1          0               1        112542.58  
2          1               0        113931.57  
3          0               0         93826.63  
4          1               1         79084.10  
  y:
0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64


### As we can see we have two categorical features 1:Geography ,2:Gender
### checking there unique names so to encode them.

In [6]:
print(x.iloc[:,1].unique())
print(x.iloc[:,2].unique())

['France' 'Spain' 'Germany']
['Female' 'Male']


### Coverting the data in numpy-array by taking only the values of each column

In [7]:
x=data.iloc[:,3:-1].values
y=data.iloc[:,-1].values
x

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

### From the above we can see that gender has two values(male and females) so we will do label encoding for gender column and Geography column have 3 values (France,Spain,Germany) so we will do OneHotEndcoding for that.

In [8]:
Le=LabelEncoder()
x[:,2]=Le.fit_transform(x[:,2])
print(Le.classes_)


['Female' 'Male']


In [9]:
ct=ColumnTransformer(transformers=[( 'OneHotEncoder',OneHotEncoder(), [1])], remainder='passthrough')
x=np.array(ct.fit_transform(x))
x

array([[1.0, 0.0, 0.0, ..., 1, 1, 101348.88],
       [0.0, 0.0, 1.0, ..., 0, 1, 112542.58],
       [1.0, 0.0, 0.0, ..., 1, 0, 113931.57],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 1, 42085.58],
       [0.0, 1.0, 0.0, ..., 1, 0, 92888.52],
       [1.0, 0.0, 0.0, ..., 1, 0, 38190.78]], dtype=object)

### So now after encoding we get values as follow:
 ### female:0 , Male:1
 ### France: 1 0 0,  Spain: 0 0 1  , Germany: 0 1 0

### Spliting data in train test split.

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size = 0.2, random_state = 0)

## Scaling the training data so that it can be input in neural networks

In [11]:
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.fit_transform(x_test)

# Building Artificial Neural Network

## Initializing ANN

In [12]:
ann=tf.keras.models.Sequential()

##  hidden Layers

In [13]:
ann.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann.add(tf.keras.layers.Dense(units=6,activation='relu'))
ann.add(tf.keras.layers.Dense(units=6,activation='relu'))

## Output Layer

In [14]:
ann.add(tf.keras.layers.Dense(units=1,activation='sigmoid'))


# Training ANN

## Compiling ANN

In [15]:
ann.compile(optimizer='adam',loss='binary_crossentropy' ,metrics=['accuracy'])

## Fitting ANN

In [16]:
ann.fit(x_train,y_train,batch_size=64,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc19c639a10>

## As we can see that it is giving 86% accuracy

## Making prediction and evaluating results.

In [17]:
y_pred=ann.predict(x_test)
y_pred=(y_pred>0.5)
y_pred


array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

### calculating accuracy and confusion matrix

In [18]:
cm=confusion_matrix(y_test,y_pred)
print('confusion_matrix:')
print(cm)
accuracy=accuracy_score(y_test,y_pred)
print("acuuracy:",accuracy)

confusion_matrix:
[[1519   76]
 [ 213  192]]
acuuracy: 0.8555


### Geography: spain
### Credit Score : 600
### Gender: Male
### Age: 40 years old
### Tenure: 3 years
### Balance: 60000 usd
### Number of Products: 2
### Does this customer have a credit card ? Yes
### Is this customer an Active Member: Yes
### Estimated Salary: 50000 usd

In [19]:
new_pred=ann.predict(sc.transform(np.array([[1,0,0,600,1,40,3,60000,2,1,1,50000]])))
new_pred=(new_pred>0.5)
print(new_pred)

[[False]]
