# Categorical encoding and numerical scaling/normalization

Author: Filipe Lauar

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.datasets import load_boston

In [21]:
# Load the Boston housing dataset and assemble one frame: the feature columns
# followed by a final 'target' column.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell only runs against an older scikit-learn release.
boston = load_boston()
data = pd.DataFrame(
    np.column_stack((boston['data'], boston['target'])),
    columns=[*boston['feature_names'], 'target'],
)
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [22]:
# Print the dtype of each column of `data`, one per line, in column order.
for column_dtype in data.dtypes:
    print(column_dtype)

float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64


# Categorical Feature Encoding

***Label Encoder***

In [23]:
# Label encoding: each distinct RAD value is replaced by an integer code.
# `le` keeps the fitted encoder so the mapping can be reused or inverted later.
le = LabelEncoder()
le.fit(data['RAD'])
le_RAD = le.transform(data['RAD'])
le_RAD

array([0, 1, 1, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 4, 1, 4, 7, 7, 7, 7, 7, 7, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2,
       1, 1, 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 0, 0, 3, 1, 1,
       1, 2, 2, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5,
       5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 2, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 0, 0, 0,
       0, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,

***One Hot Encoder***

In [24]:
# One-hot encoding: one indicator column per distinct RAD value.
# The encoder expects a 2-D input, so the single column is reshaped to (n_rows, 1).
ohe = OneHotEncoder()
ohe_RAD = ohe.fit_transform(data['RAD'].to_numpy().reshape(-1, 1))
ohe_RAD

<506x9 sparse matrix of type '<class 'numpy.float64'>'
	with 506 stored elements in Compressed Sparse Row format>

***Target Encoder***

In [25]:
# Target encoding: replace each RAD value with the mean of `target`
# over all rows that share that RAD value.
te = dict(data.groupby('RAD')['target'].mean())
te_RAD = data['RAD'].map(te)
te_RAD

0      24.365000
1      26.833333
2      26.833333
3      27.928947
4      27.928947
         ...    
501    24.365000
502    24.365000
503    24.365000
504    24.365000
505    24.365000
Name: RAD, Length: 506, dtype: float64

***Frequency Encoder***

In [26]:
# Frequency encoding: replace each RAD value with the fraction of rows that carry it.
# `fe` is a Series indexed by RAD value (per-group row count / total number of rows).
fe = data.groupby('RAD')['target'].count() / data.shape[0]
fe_RAD = data['RAD'].map(fe)
fe_RAD

0      0.039526
1      0.047431
2      0.047431
3      0.075099
4      0.075099
         ...   
501    0.039526
502    0.039526
503    0.039526
504    0.039526
505    0.039526
Name: RAD, Length: 506, dtype: float64

# Numerical feature scaling

***Standard Scaling***

$x_i = \frac{x_i - mean(X)}{std(X)}$

In [31]:
# Standardize RM (subtract the mean, divide by the standard deviation).
# The scaler expects a 2-D input, so RM is passed as an (n_rows, 1) array.
standard = StandardScaler()
standard_RM = standard.fit_transform(data[['RM']].to_numpy())

***Min Max Normalization***

$x_i = \frac{x_i - min(X)}{max(X) - min(X)}$

In [32]:
# Min-max normalization of RM: rescale the column linearly using its min and max.
# The scaler expects a 2-D input, so RM is reshaped to (n_rows, 1).
minMax = MinMaxScaler()
minMax_RM = minMax.fit_transform(data['RM'].to_numpy().reshape(-1, 1))