In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv("./data/house_price.csv")
df

Unnamed: 0,LotArea,MSSubClass
0,8450,60
1,9600,20
2,11250,60
3,9550,70
4,14260,60
...,...,...
1455,7917,60
1456,13175,20
1457,9042,70
1458,9717,20


## Absolute Maximum Scaling
- Every value scaled to between -1 and 1
- Very prone to outliers

In [15]:
max_vals = np.max(np.abs(df), axis=0)
max_vals

LotArea       215245
MSSubClass       190
dtype: int64

In [16]:
(df - max_vals) / max_vals

Unnamed: 0,LotArea,MSSubClass
0,-0.960742,-0.684211
1,-0.955400,-0.894737
2,-0.947734,-0.684211
3,-0.955632,-0.631579
4,-0.933750,-0.684211
...,...,...
1455,-0.963219,-0.684211
1456,-0.938791,-0.894737
1457,-0.957992,-0.631579
1458,-0.954856,-0.894737


## Min-Max Scaling
- Every value scaled to between 0 and 1
- Very prone to outliers

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
scaler = MinMaxScaler()
scaler

In [20]:
scaled_data = scaler.fit_transform(df)
scaled_data

array([[0.0334198 , 0.23529412],
       [0.03879502, 0.        ],
       [0.04650728, 0.23529412],
       ...,
       [0.03618687, 0.29411765],
       [0.03934189, 0.        ],
       [0.04037019, 0.        ]], shape=(1460, 2))

In [21]:
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,LotArea,MSSubClass
0,0.033420,0.235294
1,0.038795,0.000000
2,0.046507,0.235294
3,0.038561,0.294118
4,0.060576,0.235294
...,...,...
1455,0.030929,0.235294
1456,0.055505,0.000000
1457,0.036187,0.294118
1458,0.039342,0.000000


## Normalization

In [22]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
scaler

In [23]:
scaled_data = scaler.fit_transform(df)
scaled_data

array([[0.99997479, 0.00710041],
       [0.99999783, 0.00208333],
       [0.99998578, 0.00533326],
       ...,
       [0.99997003, 0.00774142],
       [0.99999788, 0.00205824],
       [0.99999797, 0.00201268]], shape=(1460, 2))

In [25]:
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,LotArea,MSSubClass
0,0.999975,0.007100
1,0.999998,0.002083
2,0.999986,0.005333
3,0.999973,0.007330
4,0.999991,0.004208
...,...,...
1455,0.999971,0.007578
1456,0.999999,0.001518
1457,0.999970,0.007741
1458,0.999998,0.002058


## Standardization
- Creates a normal distribution

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler

In [27]:
scaled_data = scaler.fit_transform(df)
scaled_data

array([[-0.20714171,  0.07337496],
       [-0.09188637, -0.87256276],
       [ 0.07347998,  0.07337496],
       ...,
       [-0.14781027,  0.30985939],
       [-0.08016039, -0.87256276],
       [-0.05811155, -0.87256276]], shape=(1460, 2))

In [28]:
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,LotArea,MSSubClass
0,-0.207142,0.073375
1,-0.091886,-0.872563
2,0.073480,0.073375
3,-0.096897,0.309859
4,0.375148,0.073375
...,...,...
1455,-0.260560,0.073375
1456,0.266407,-0.872563
1457,-0.147810,0.309859
1458,-0.080160,-0.872563


## Robust Scaling
- Based off IQR

In [30]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler

In [31]:
scaled_data = scaler.fit_transform(df)
scaled_data

array([[-0.25407609,  0.2       ],
       [ 0.03001482, -0.6       ],
       [ 0.43762352,  0.2       ],
       ...,
       [-0.10783103,  0.4       ],
       [ 0.05891798, -0.6       ],
       [ 0.11326581, -0.6       ]], shape=(1460, 2))

In [32]:
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,LotArea,MSSubClass
0,-0.254076,0.2
1,0.030015,-0.6
2,0.437624,0.2
3,0.017663,0.4
4,1.181201,0.2
...,...,...
1455,-0.385746,0.2
1456,0.913167,-0.6
1457,-0.107831,0.4
1458,0.058918,-0.6
