# Artificial Neural Network

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
tf.__version__

'2.13.0'

## Part 1 - Data Preprocessing

### Importing the dataset

In [3]:
dataset = pd.read_csv('Churn_Modelling.csv')

In [4]:
# we see that the dataset has some useless features that won't help our prediction
# i.e. row number, customerID, surname, etc.
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


### Check for Missing Values

In [5]:
print(dataset.isnull())

      RowNumber  CustomerId  Surname  CreditScore  Geography  Gender    Age  \
0         False       False    False        False      False   False  False   
1         False       False    False        False      False   False  False   
2         False       False    False        False      False   False  False   
3         False       False    False        False      False   False  False   
4         False       False    False        False      False   False  False   
...         ...         ...      ...          ...        ...     ...    ...   
9995      False       False    False        False      False   False  False   
9996      False       False    False        False      False   False  False   
9997      False       False    False        False      False   False  False   
9998      False       False    False        False      False   False  False   
9999      False       False    False        False      False   False  False   

      Tenure  Balance  NumOfProducts  HasCrCard  Is

In [6]:
# there are no missing data
print(dataset.isnull().sum())  # This will print the total number of missing values per column

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [7]:
# iloc: This is an indexer in Pandas used for integer-location based indexing/selection by position.
# [:, :-1]: This is the indexing operation being performed by iloc.
# The first parameter with a single colon ":" specifies that all rows should be selected.
# The second parameter with "3:-1" specifies that all columns from the 4th until except the last one should be selected.
# we skip the first 3 columns because they are useless for our prediction - i.e. customerId, rowNumber, surname, etc.
# .values: This converts the selected data into a NumPy array.
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values


In [8]:
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [9]:
# see how this data ignores the first 3 column which have useless data like row number, customerID, surname, etc.
X

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [10]:
y

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

### Encoding categorical data
- we realize our dataset has two categorical data, namely country and gender
- so we need to label them
  (transform a list of labels into encoded numeric values. This is often necessary because machine learning algorithms generally work with numeric data, so labels like strings or other types need to be converted into numbers.)

#### Label Encoding Example

In [11]:
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder

# Sample data: a list of labels (could be strings or integers)
# For example, these could be categories like 'cat', 'dog', and 'bird'
example_y = ['cat', 'dog', 'bird', 'cat', 'bird', 'dog']

# Print original labels
print("Original labels:")
print(example_y)

# Create a label encoder object
le = LabelEncoder()

# Fit the encoder to the labels and transform the labels into encoded integers
# The transformation is based on alphabetical order, so 'bird' becomes 0, 'cat' becomes 1, and 'dog' becomes 2
example_y_encoded = le.fit_transform(example_y)

# Print encoded labels
print("\nEncoded labels:")
print(example_y_encoded)

# If you want to transform the encoded labels back to the original labels
y_original = le.inverse_transform(example_y_encoded)

# Print the transformed back labels
print("\nTransformed back labels:")
print(y_original)

Original labels:
['cat', 'dog', 'bird', 'cat', 'bird', 'dog']

Encoded labels:
[1 2 0 1 0 2]

Transformed back labels:
['cat' 'dog' 'bird' 'cat' 'bird' 'dog']


In [12]:
# output: (6,) => A shape of (6,) means the array is one-dimensional, containing 6 elements. For example:
# A shape of (6, 1) indicates a two-dimensional array with 6 rows and 1 column. It's like a column vector in linear algebra. Example:
# b = np.array([[1], [2], [0], [1], [0], [2]])
# print(b.shape)  # Output: (6, 1)
print(example_y_encoded.shape)

(6,)


#### Label Encoding the "Gender" column

In [13]:
# before encoding gender
# X is a two-diemnsionary array or DataFrame
# [:,2] is an example of slicing. The colon : means "select all rows", and 2 after the comma refers to the third column (since Python indexing starts at 0).
# So X[:, 2] means "select all rows from the third column of X"
X[:,2]

array(['Female', 'Female', 'Female', ..., 'Female', 'Male', 'Female'],
      dtype=object)

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# See example above to understand how LabelEncoder really works - TLDR; it convert some string into numbers, i.e. all "Male" become 1 and "Female" becomes 0, etc.
# Note that Label encoding will treat different capitalizations as distinct categories. The LabelEncoder in scikit-learn does not automatically lowercase or uppercase the data, so "Male" and "male" would be encoded as two different categories.

# what this is doing is taking all the rows in X, and JUST the 3rd column (which is the gender) and running fit_transform through it, and reassigning back to all the rows.
X[:, 2] = le.fit_transform(X[:, 2])
# after encoding gender
X[:, 2]

array([0, 0, 0, ..., 0, 1, 0], dtype=object)

#### One Hot Encoding the "Geography" column

##### One Hot Encoding vs Label Encoding (why we can't use label encoding for the geograph variable)

Firstly, label encoding would turn `["Red", "Green", "Red", "Blue", "Blue"]` into `array([2, 1, 2, 0, 0])` (integer encoding are assigned alphabetically by default)

While, one hot encoding output may be
```
array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])
```

Why the difference?

Issues with Label Encoding:

- **Ordinality**: Label encoding assigns each unique category in a feature to a number. For example, if you have a feature "color" with values "red", "green", and "blue", these might be encoded as 0, 1, and 2 respectively. The problem is that many machine learning algorithms (especially those that are mathematically based) may interpret these values as ordinal data. So, they might consider "blue" (2) as having a higher value than "green" (1) and "red" (0). In reality, these are just distinct categories with no intrinsic order.

- **Bias**: Since machine learning models might consider these values as ordinal, there’s a risk of introducing a bias. The model might incorrectly learn patterns that are influenced by the ordering of categories, even though such an order does not exist. This can lead to incorrect or suboptimal predictions.

When One-Hot Encoding is Preferable:

- One-hot encoding, on the other hand, mitigates these issues by creating a binary column for each category/label in the feature. It does not introduce ordinality or a bias because each category is represented equally. In the one-hot encoded form, each category is equally distant from the others, so the machine learning model won’t misinterpret the categorical data as ordinal.



In [15]:
# before one hot encoding
X[:,1]

array(['France', 'Spain', 'France', ..., 'France', 'Germany', 'France'],
      dtype=object)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The column indices is set to [1] which means to apply the transformer to column at index 1 of the dataset (i.e. column 2, which represents geography, since python index starts from 0)
column_transformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')

X = np.array(column_transformer.fit_transform(X))

# after one hot encoding - france, germany, spain etc. are converted to some multi-dimensional array
print(X[:,1])

[0.0 0.0 0.0 ... 0.0 1.0 0.0]


In [17]:
X

array([[1.0, 0.0, 0.0, ..., 1, 1, 101348.88],
       [0.0, 0.0, 1.0, ..., 0, 1, 112542.58],
       [1.0, 0.0, 0.0, ..., 1, 0, 113931.57],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 1, 42085.58],
       [0.0, 1.0, 0.0, ..., 1, 0, 92888.52],
       [1.0, 0.0, 0.0, ..., 1, 0, 38190.78]], dtype=object)

### Splitting the dataset into the Training set and Test set

In [18]:
from sklearn.model_selection import train_test_split
# see your template if needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=0)

### Feature Scaling

In [19]:
# feature scaling is extremely important for deep learning; 
from sklearn.preprocessing import StandardScaler

# instansitate a scaler object
sc = StandardScaler()
# we will do feature scaling all for columns and rows, even for those we already done label encoding on

# Feature scaling (like min-max scaling, z-score normalization, etc.) should be fitted only on the training data, but it should be applied to both the training and testing datasets. Here’s the reasoning:
# Fitting to the Training Set Only
# Avoid Data Leakage: Fitting the scaler to the whole dataset, including the test set, means that you are using information from the test set to scale the training set. This is a type of data leakage, where information from the testing set leaks into the training process, potentially leading to overly optimistic performance estimates.

# Simulation of Real-World Scenario: In a real-world scenario, you train your model on the available data (training set), and you evaluate its performance on new, unseen data (testing set). If you fit the scaler to the whole dataset, you’re violating this principle because you’re using information from the future (i.e., the test set) that wouldn’t be available at the time of training.

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Part 2 - Building the ANN

### Initializing the ANN

In [20]:
ann = tf.keras.models.Sequential()

### Adding the input layer and the first hidden layer

In [21]:
# units stands for number of neurons
# in deep learning, there's no rule of thumb for how many neurons - it is based on experimentation
# we need to experiment based on hyperparameters - hyperparameters referring to parameters that won't be trained during the training process
# maybe a decent number is between 5 and 30
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

### Adding the second hidden layer

In [22]:
# we could also add dropout layer to prevent overfitting
# or in Convultional Neural Network, we could add conv2D layer, which is a convolution layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

### Adding the output layer

In [23]:
# we use units = 1 for output because we are predicting a binary value - i.e. 1 or 0 for classification
# if you are doing multi-class classification, i.e. Chocolate, Vanilla, or Strawberry, then you need 3 output neuron - and you'll then do one-hot-encoding like [0 0 1], [0 1 0], etc. to represent the classified output
# we are also using the sigmoid activation function because that gives us the probability of how confident we are with the final output
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [24]:
# if we're doing multi-class classification, we would use softmax activation function instead of sigmoid above

## Part 3 - Training the ANN

### Compiling the ANN

#### About `optimizer='adam'`

Role of the Optimizer: The optimizer is an algorithm or method used to change the attributes of the neural network, such as weights and learning rate, to reduce the losses. Optimizers help to minimize (or maximize) an objective function (the loss function) that is calculated as a function of the network's weights.

How it Works: During training, the optimizer adjusts the weights based on the gradients of the loss function with respect to the weights. This process is iteratively done to find the best values of the weights to minimize the loss function, and hence improve the accuracy of the model.

Common Types of Optimizers:

1. Stochastic Gradient Descent (SGD): Uses a fixed learning rate and updates weights iteratively based on the training data.

2. Momentum: Similar to SGD but takes into account the previous updates to smooth out the optimization landscape.

3. Adagrad: Adapts the learning rate to the parameters, performing larger updates for infrequent and smaller updates for frequent parameters.

4. RMSprop: Resolves Adagrad's radically diminishing learning rates by using a moving average of squared gradients.

5. Adam (Adaptive Moment Estimation): Combines the ideas of Momentum and RMSprop, keeping track of an exponentially decaying average of past gradients and squared gradients.

Choosing an Optimizer: The choice of an optimizer can significantly affect the performance of a neural network. It depends on the specific problem, the nature of the dataset, and the architecture of the network. Adam is a popular choice due to its effectiveness in a wide range of problems.

#### Code

In [25]:
# we need to compile the ANN with an optimizer, a loss function, and a metric (i.e. accuracy)

# TLDR; why we choose adam, why we choose binary_crossentropy, and why we choose accuracy is based on best practices

# Adam is the most popular choice for optimizer
# loss function: the way to compute the difference between the predicted value and the actual value
# accuracy is your final metric
# when you're doing binary classifiacation/i.e. predict a, binary output, you have to use binary_crossentropy
# for binary classification, you have to use binary_crossentropy
# for multiclass classification, you need to use categorical_crossentropy)

# the metrics is purely for us to see during the training below; even if we omit it, there's no issue to the training. Of course, we add it so we can monitor the performance and finetune accordingly
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### Training the ANN on the Training set

In [26]:
# .fit is to train the model
# you start with epochs=100 as a general guide
# we usually use batch_size = 32 as a general guide - i.e. mini batch gradient descent is a more precise term than stochastic gradient descent. (SGD uses a batch size of 1, while mini-batch gradient descent uses a batch size greater than 1.)
# batch_size =32 means we are comparing 32 sets of predictions and 32 actual values, and then we update the weights
# if u run this, you'll realize at around epoch=50, the loss will start to plateau/converge, so you can stop the training at around epoch=50
ann.fit(X_train, y_train, batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100

Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

<keras.src.callbacks.History at 0x18582790d90>

## Some notes about ML Model in Notebooks and Computer Memory

TLDR; i trained the ANN and came back the next day and needed to retrain the whole thing; this is because the ANN exists only in memory while the notebook is running.

As long as the Jupyter notebook kernel is running and has not been restarted or interrupted, the ANN remains in memory. This means you can train your ANN, leave the notebook open, and still access the ANN even after several hours, as long as the kernel is active.

If I restart my computer or do anything that kestarts the kernel, then all variables (including my ANN) will be cleared from memory.

Also, the amount of memory an ANN occupies depends on its size (number of layers and neurons) and the data types of its parameters. Larger networks with more parameters will naturally use more memory.

therefore you need to save your ANN to a file if you want to keep it


## Part 4 - Making the predictions and evaluating the model

### Predicting the result of a single observation

**Homework**

Use our ANN model to predict if the customer with the following informations will leave the bank: 

Geography: France

Credit Score: 600

Gender: Male

Age: 40 years old

Tenure: 3 years

Balance: \$ 60000

Number of Products: 2

Does this customer have a credit card ? Yes

Is this customer an Active Member: Yes

Estimated Salary: \$ 50000

So, should we say goodbye to that customer ?

### **The need to reapply preprocessing step is SUPER interesting (check this out with gpt answer)
https://chat.openai.com/c/0092e96d-3e68-47cc-9013-9311fdfe01d8

In [27]:
dataset.iloc[0]

RowNumber                  1
CustomerId          15634602
Surname             Hargrave
CreditScore              619
Geography             France
Gender                Female
Age                       42
Tenure                     2
Balance                  0.0
NumOfProducts              1
HasCrCard                  1
IsActiveMember             1
EstimatedSalary    101348.88
Exited                     1
Name: 0, dtype: object

In [28]:
# the ann.predict method always expects a 2D array

# Please check out this chatgpt convo out: https://chat.openai.com/c/0092e96d-3e68-47cc-9013-9311fdfe01d8
# TLDR; when you predict a new data point, you NEED to perform all the same preprocessing step you did with your training data
# This includes 1. One Hot Encoding some variables like Geography, 2. Label Encoding some variables like Gender, and 3. Perform Feature Scaling on the New Single Data point. Here, you NEED to use the same encoder and scaler in order to transform the new data point in the same way as the training data. (i.e. your training data's standard scaler was scaled according to the mean/median of the training data set, so you need to apply that to this new dataset)
# **very important to APPLY and not fit_transform; fit_transform would recalculate the mean/median which is not what we want. we want to reuse the same stats from the training data
# currently, this tutor just eyeball the encoding and replace "France" with 1, 0, 0 (which is correct) and "Male" with 1 which is also correct. But ideally you can check out the chatgpt convo above for the right way to do it.
y_pred = ann.predict(sc.transform([[1, 0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]]))

# because we indicated a sigmoid function in our output layer earlier, what we will get is the probability that the customer would leave the bank
# churn baby: https://chat.openai.com/c/937de16f-6973-424e-818e-c9a24fab2e5f

# this represnts the probabiltiy the customer would leave the bank
y_pred



array([[0.04205877]], dtype=float32)

In [29]:
# you can, to some degree, use this metric to determine if the customer would leave the bank or not
# this is just an abitrary threshold that you set (i.e. if your confidence rate >50% means he will leave)
# in this case, the y_pred rate is 0.03 so the prediction is that the customer likely won't leave the bank
y_pred > 0.5

array([[False]])

**Solution**

Therefore, our ANN model predicts that this customer stays in the bank!

**Important note 1:** Notice that the values of the features were all input in a double pair of square brackets. That's because the "predict" method always expects a 2D array as the format of its inputs. And putting our values into a double pair of square brackets makes the input exactly a 2D array.

**Important note 2:** Notice also that the "France" country was not input as a string in the last column but as "1, 0, 0" in the first three columns. That's because of course the predict method expects the one-hot-encoded values of the state, and as we see in the first row of the matrix of features X, "France" was encoded as "1, 0, 0". And be careful to include these values in the first three columns, because the dummy variables are always created in the first columns.

### Predicting the Test set results

In [31]:
# again, using 0.5 as our threshold, we check if the customer is predicted to leave the bank
y_pred = ann.predict(X_test) > 0.5
# y_pred on the left; y_test on the right
print('[Prediction | Actual]')
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[Prediction | Actual]
[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making the Confusion Matrix

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

# accuracy rate of 86%
accuracy_score(y_test, y_pred)

[[1534   61]
 [ 209  196]]


0.865