In [None]:
### Preprocessing Data
# 1. data formatting
# 2. data cleaning
# 3. data sampling
### Begin the feature engineering
## transform the preprocessed data
# 1. Scaling
# 2. Decomposition
# 3. Aggregation

In [1]:
from sklearn import preprocessing as pp
import numpy as np
# Small 3x3 float matrix used as the running example
# (rows are samples, columns are features).
x = np.array(
    [
        [1.0, -2.0, 2.0],
        [3.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)
x

array([[ 1., -2.,  2.],
       [ 3.,  0.,  0.],
       [ 0.,  1., -1.]])

In [2]:
# Standardize each feature (column): subtract the column mean and divide by
# the column standard deviation. axis=0 is scale()'s default, spelled out here.
x_scaled = pp.scale(x, axis=0)
x_scaled

array([[-0.26726124, -1.33630621,  1.33630621],
       [ 1.33630621,  0.26726124, -0.26726124],
       [-1.06904497,  1.06904497, -1.06904497]])

In [3]:
# Sanity check: after standardization every column mean should be ~0
# (up to floating-point round-off).
np.mean(x_scaled, axis=0)

array([7.40148683e-17, 0.00000000e+00, 0.00000000e+00])

In [4]:
# Sanity check: after standardization every column has unit standard deviation.
np.std(x_scaled, axis=0)

array([1., 1., 1.])

In [3]:
# L1-normalize each sample (row): divide the row by the sum of its absolute
# values so that the absolute values of every row add up to 1.
# axis=1 is normalize()'s default, spelled out here.
pp.normalize(x, norm='l1', axis=1)

array([[ 0.2, -0.4,  0.4],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  0.5, -0.5]])

In [6]:
# L2-normalize each sample (row): every row is rescaled to unit Euclidean length.
res = pp.normalize(x, norm='l2', axis=1)
res

array([[ 0.33333333, -0.66666667,  0.66666667],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [8]:
# axis=0 -> reduce down the rows, giving one value per column
# axis=1 -> reduce across the columns, giving one value per row
np.mean(res, axis=1)

array([0.11111111, 0.33333333, 0.        ])

In [11]:
# Rescale every feature (column) linearly into a target interval.

# Default target interval is (0, 1); passed explicitly for clarity.
min_max_scaler = pp.MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(x)
x_min_max = min_max_scaler.transform(x)
print('min_max_scaler with (0,1) range =', x_min_max)

# Same data, custom target interval (-10, 10).
min_max_scaler = pp.MinMaxScaler(feature_range=(-10, 10))
min_max_scaler.fit(x)
x_min_max = min_max_scaler.transform(x)
print('min_max_scaler with (-10,10) range =', x_min_max)

min_max_scaler with (0,1) range = [[0.33333333 0.         1.        ]
 [1.         0.66666667 0.33333333]
 [0.         1.         0.        ]]
min_max_scaler with (-10,10) range = [[ -3.33333333 -10.          10.        ]
 [ 10.           3.33333333  -3.33333333]
 [-10.          10.         -10.        ]]


In [12]:
# Binarize features: entries greater than the threshold become 1, all others 0.
# Binarizer is stateless, so transform() can be called without a prior fit().
binarizer = pp.Binarizer(threshold=0.5)
x_binary = binarizer.transform(x)
x_binary

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [17]:
# handle the missing data
from numpy import nan
from sklearn.impute import SimpleImputer
# Example matrix with two missing entries (row 0 / column 0 and row 2 / column 1).
# NOTE: this rebinds `x`, replacing the 3x3 example used by the earlier cells.
x = np.array(
    [[nan, 0.0, 3.0],
     [2.0, 9.0, -8.0],
     [1.0, nan, 1.0],
     [5.0, 2.0, 4.0],
     [7.0, 6.0, -3.0]]
)
# SimpleImputer fills missing entries feature by feature (column-wise); it has
# no `axis` argument. Available strategies: 'mean', 'median', 'most_frequent'
# and 'constant'. Here each NaN is replaced by the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x).transform(x)

array([[ 3.75,  0.  ,  3.  ],
       [ 2.  ,  9.  , -8.  ],
       [ 1.  ,  4.25,  1.  ],
       [ 5.  ,  2.  ,  4.  ],
       [ 7.  ,  6.  , -3.  ]])

In [18]:
# Cross-check the 'mean' imputation for column 0. np.nanmean ignores NaN
# entries wherever they occur, so the check does not depend on the missing
# value happening to sit in row 0 (as the slice x[1:, 0] would).
np.nanmean(x[:, 0])

3.75

In [20]:
# Same imputation, but each NaN is replaced by the median of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(x).transform(x)

array([[ 3.5,  0. ,  3. ],
       [ 2. ,  9. , -8. ],
       [ 1. ,  4. ,  1. ],
       [ 5. ,  2. ,  4. ],
       [ 7. ,  6. , -3. ]])

In [21]:
# Cross-check the 'median' imputation for column 0. np.nanmedian ignores NaN
# entries wherever they occur, so the check does not depend on the missing
# value happening to sit in row 0 (as the slice x[1:, 0] would).
np.nanmedian(x[:, 0])

3.5