### 1. Import Dependencies

In [16]:
import os
import pandas as pd #alias
import numpy as np #alias
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### 2. Important Concepts

### 2.1 Normalization vs Standardization

#### 2.1.1 What is Normalization?
    - Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as `Min-Max Scaling`.

#### 2.1.2 What is Standardization?
    - Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero (0) and the resultant distribution has a unit standard deviation. 

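mean = 0, standard deviation = 1

One standard deviation on either side of the mean: 0 - 1, 0 + 1 => -1, 1 (standardized values are not confined to this interval)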

In [12]:
# Load the encoded churn dataset produced by the previous preprocessing step.
# A forward slash is used as the path separator: it works on every OS, whereas
# the backslash form contains the invalid escape sequence "\C" and only
# resolves on Windows. This also matches the path style used when saving.
df = pd.read_csv('processed/ChurnModelling_Encoded.csv')
df.head(10)

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.0,2,0.0,1,1,1,101348.88,1,1,True,False,False,True,False
1,41.0,1,83807.86,1,0,1,112542.58,0,1,False,False,True,True,False
2,42.0,8,159660.8,3,1,0,113931.57,1,0,True,False,False,True,False
3,38.91,1,0.0,2,0,0,93826.63,0,2,True,False,False,True,False
4,43.0,2,125510.82,1,1,1,79084.1,0,4,False,False,True,True,False
5,44.0,8,113755.78,2,1,0,149756.71,1,1,False,False,True,False,True
6,50.0,7,0.0,2,1,1,10062.8,0,4,True,False,False,False,True
7,29.0,4,115046.74,4,1,0,119346.88,1,0,False,True,False,True,False
8,44.0,4,142051.07,2,0,1,74940.5,0,0,True,False,False,False,True
9,27.0,2,134603.88,1,1,1,71725.73,0,2,True,False,False,False,True


| Condition                                        | Min-Max Scaling                      | Standardization(Z-Score)                  |
|--------------------------------------------------|--------------------------------------|-------------------------------------------|
| Data has a known, fixed range                    | ✅ Yes                               | ❌ Not ideal                             |                                 
| Data contains outliers                           | ❌ Sensitive to outliers             | ✅ More robust to outliers               |                                                        
| Data is normally distributed                     | ❌ Not necessary                     | ✅ Preferred                             |
| Data is not normally distributed (eg: skewed..)  | ✅ If shape needs to be preserved    | ✅ Often works well after log-transform  |
| Model is distance-based (KNN, SVM)               | ✅ Recommended                       | ✅ Also acceptable                       |                                                                
| Model is neural network                          | ✅ Strongly recommended              | ❌ May slow training                     |                                                       
| Model is linear or uses regularization           | ❌ Not ideal                         | ✅ Helps with convergence                |                                              
| Input features need bounded values (0-1)         | ✅ Required                          | ❌ Not bounded                           |                              
| Applying PCA or LDA                              | ❌ May distort variance              | ✅ Required (centering needed)           |                                                       
| Want to preserve original distribution shape     | ✅ Maintains feature shape           | ✅ Maintains shape but centers data      |                                         
| Working with tree-based models                   | ❌ Not needed                        | ❌ Not needed                            |            

In [18]:
# Continuous features that get rescaled; the remaining columns are left as-is.
columns_need_to_be_scaled = ['Age', 'Tenure', 'Balance', 'EstimatedSalary']

for col in columns_need_to_be_scaled:
    # A fresh scaler per column, fitted on that column alone.
    scaler = StandardScaler()
    # scaler = MinMaxScaler()   # you can see there are no negative values when Min-Max used

    # scikit-learn expects a 2-D (n_samples, n_features) input. reshape(-1, 1)
    # lets NumPy infer the row count instead of hard-coding 10000, so the cell
    # works for a dataset of any length.
    column_2d = df[col].to_numpy().reshape(-1, 1)

    # fit_transform returns an (n_samples, 1) array; flatten it back to 1-D
    # before assigning it to the single DataFrame column.
    df[col] = scaler.fit_transform(column_2d).ravel()

df

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.302983,-1.041760,-1.225848,1,1,1,0.021886,1,1,True,False,False,True,False
1,0.204867,-1.387538,0.117350,1,0,1,0.216534,0,1,False,False,True,True,False
2,0.302983,1.032908,1.333053,3,1,0,0.240687,1,0,True,False,False,True,False
3,-0.000196,-1.387538,-1.225848,2,0,0,-0.108918,0,2,True,False,False,True,False
4,0.401100,-1.041760,0.785728,1,1,1,-0.365276,0,4,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.008634,-0.004426,-1.225848,2,1,0,-0.066419,0,3,True,False,False,False,True
9996,-0.383831,1.724464,-0.306379,1,1,1,0.027988,0,0,True,False,False,False,True
9997,-0.285715,0.687130,-1.225848,1,0,1,-1.008643,1,2,True,False,False,True,False
9998,0.302983,-0.695982,-0.022608,2,1,0,-0.125231,1,3,False,True,False,False,True


In [20]:
# Write the scaled frame to disk; index=False keeps the row index out of the file.
final_csv_path = 'processed/ChurnModelling_Final.csv'
df.to_csv(final_csv_path, index=False)

### This is the final dataset, ready to be fed into the models.