### 1. Import Dependencies

In [33]:
import os
import pandas as pd #alias
import numpy as np #alias
import seaborn as sns
from matplotlib import pyplot as plt

| Variable     |   Preferred Encoding   | Why?                                                                 |
|--------------|------------------------|----------------------------------------------------------------------|
| Nominal      | One-Hot Encoding       | No inherent order -> Avoids implying false ordinal relationships     |
| Ordinal      | Label Encoding         | Preserves order -> Small integers represent increasing levels        |

- Nominal : the categories have no inherent order. (ex: Gender, Eye Color, Type of Car, Hair Color, Political Party)
- Ordinal : the categories follow a meaningful order. (ex: Educational Level, Income Level, Customer Satisfaction, Age Range)

`Gender <Male, Female>`   // we will create 'n' columns, one for each distinct category in the variable (here n = 2).

|          |  Gender_Male | Gender_Female  |
|----------|--------------|----------------|
| Male     |       1      |       0        |
| Female   |       0      |       1        |

This is how one-hot encoding usually represents the data.

`(Assume Gender is Ordinal);` label encoding would instead map each category to a single integer:

- Male -> 1
- Female -> 0


In this dataset, the categorical variables are classified as follows:

- Gender -> Nominal
- Geography -> Nominal
- CreditScoreBins -> Ordinal

In [34]:
# Load the binned churn dataset produced by the previous processing step.
# The path is assembled with os.path.join instead of a backslash literal:
# '\C' is an invalid escape sequence in a normal string, and a backslash
# separator only resolves on Windows.
df = pd.read_csv(os.path.join('processed', 'ChrunModelling_Binning_Applied.csv'))
df.head()

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,Fair
1,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,Fair
2,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,Poor
3,France,Female,38.91,1,0.0,2,0,0,93826.63,0,Good
4,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,Excellent


### 2. Encode Nominal Variables

In [35]:
nominal_variables = ['Geography', 'Gender']   # columns with no inherent order

# One-hot encode every nominal column in a single call.
# With `columns=...`, get_dummies removes each listed column and appends its
# indicator columns (named "<column>_<category>") after the remaining columns,
# in the order the columns are listed. It returns a new frame, so `df` keeps
# its original categorical columns and this cell can be re-run safely.
df_encoded = pd.get_dummies(df, columns=nominal_variables)

df_encoded


Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.00,2,0.00,1,1,1,101348.88,1,Fair,True,False,False,True,False
1,41.00,1,83807.86,1,0,1,112542.58,0,Fair,False,False,True,True,False
2,42.00,8,159660.80,3,1,0,113931.57,1,Poor,True,False,False,True,False
3,38.91,1,0.00,2,0,0,93826.63,0,Good,True,False,False,True,False
4,43.00,2,125510.82,1,1,1,79084.10,0,Excellent,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39.00,5,0.00,2,1,0,96270.64,0,Very Good,True,False,False,False,True
9996,35.00,10,57369.61,1,1,1,101699.77,0,Poor,True,False,False,False,True
9997,36.00,7,0.00,1,0,1,42085.58,1,Good,True,False,False,True,False
9998,42.00,3,75075.31,2,1,0,92888.52,1,Very Good,False,True,False,False,True


### 3. Encode Ordinal Variables

In [36]:
# Ordered mapping from credit-score band to integer rank (lowest band -> 0).
encode_dict_creditscore = {
    'Poor'      : 0,
    'Fair'      : 1,
    'Good'      : 2,
    'Very Good' : 3,
    'Excellent' : 4,
}

# Series.map turns any label missing from the dict into NaN without warning,
# so fail loudly if the data contains a band this mapping does not cover.
unmapped_bins = set(df['CreditScoreBins'].dropna().unique()) - set(encode_dict_creditscore)
assert not unmapped_bins, f"CreditScoreBins labels without a code: {sorted(unmapped_bins)}"

# Map from the untouched `df` column rather than from df_encoded itself:
# mapping the already-encoded integers through the dict a second time would
# turn the whole column into NaN, so the original cell could not be re-run.
df_encoded['CreditScoreBins'] = df['CreditScoreBins'].map(encode_dict_creditscore)
df_encoded

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.00,2,0.00,1,1,1,101348.88,1,1,True,False,False,True,False
1,41.00,1,83807.86,1,0,1,112542.58,0,1,False,False,True,True,False
2,42.00,8,159660.80,3,1,0,113931.57,1,0,True,False,False,True,False
3,38.91,1,0.00,2,0,0,93826.63,0,2,True,False,False,True,False
4,43.00,2,125510.82,1,1,1,79084.10,0,4,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39.00,5,0.00,2,1,0,96270.64,0,3,True,False,False,False,True
9996,35.00,10,57369.61,1,1,1,101699.77,0,0,True,False,False,False,True
9997,36.00,7,0.00,1,0,1,42085.58,1,2,True,False,False,True,False
9998,42.00,3,75075.31,2,1,0,92888.52,1,3,False,True,False,False,True


- Now all of them are in numerical formats, but in different ranges.
- We need to fix that.

### You can also do this with Scikit-Learn.

In [37]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [38]:
# Re-read the binned dataset so the scikit-learn walkthrough starts from the
# original categorical columns. os.path.join replaces the backslash literal:
# '\C' is an invalid escape sequence in a normal string, and a backslash
# separator only resolves on Windows.
df = pd.read_csv(os.path.join('processed', 'ChrunModelling_Binning_Applied.csv'))
df

# One independent one-hot encoder per nominal column, so each can be fitted
# on its own column.
ohe_geography, ohe_gender = OneHotEncoder(), OneHotEncoder()

# NOTE(review): LabelEncoder assigns codes in sorted label order, which would
# not follow the Poor < Fair < Good < Very Good < Excellent ranking; confirm
# how this encoder is used before relying on it for CreditScoreBins.
le_credit_score = LabelEncoder()

# Fit each encoder on its column as a 2-D, single-feature array.
# reshape(-1, 1) infers the row count from the data; the previous hard-coded
# reshape(10000, 1) raises ValueError as soon as the dataset has any other
# number of rows.
ohe_geography.fit(df['Geography'].values.reshape(-1, 1))
ohe_gender.fit(df['Gender'].values.reshape(-1, 1))

In [42]:
# One-hot encode Geography with the fitted encoder and convert the result to
# a dense NumPy array via .toarray().
# reshape(-1, 1) infers the row count instead of hard-coding 10000, so the
# cell keeps working when the dataset size changes.
geography_ohe = ohe_geography.transform(df['Geography'].values.reshape(-1, 1)).toarray()
geography_ohe

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [43]:
# One-hot encode Gender with the fitted encoder and convert the result to a
# dense NumPy array via .toarray().
# reshape(-1, 1) infers the row count instead of hard-coding 10000, so the
# cell keeps working when the dataset size changes.
gender_ohe = ohe_gender.transform(df['Gender'].values.reshape(-1, 1)).toarray()
gender_ohe

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])