## Data preprocessing with scikit-learn

In [2]:
import numpy as np
from sklearn import preprocessing

# Toy data set used by every cell below: 4 rows x 3 columns of floats,
# deliberately mixing positive and negative values.
sample_data = np.array(
    [
        [2.5, 4.1, -2.1],
        [-1.9, 3.4, 2.0],
        [-0.5, 5.1, 1.8],
        [0.5, -3.1, 2.9],
    ]
)
sample_data

array([[ 2.5,  4.1, -2.1],
       [-1.9,  3.4,  2. ],
       [-0.5,  5.1,  1.8],
       [ 0.5, -3.1,  2.9]])

In [3]:
# Dimensions of the array as (rows, columns).
sample_data.shape

(4, 3)

In [11]:
# Binarisation: replace every value by 0 or 1 according to a threshold.
# Values strictly greater than the threshold become 1; values less than or
# equal to the threshold become 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
binarised_data = binarizer.transform(sample_data)
binarised_data

array([[1., 1., 0.],
       [0., 1., 1.],
       [0., 1., 1.],
       [0., 0., 1.]])

In [21]:
# Scaling: linearly rescale each column so that its smallest value maps to 0
# and its largest value maps to 1.
# fit() learns the per-column minimum and maximum and returns the scaler
# itself, so construction and fitting are chained; transform() then applies
# the learned rescaling.
scaling = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(sample_data)
scaled_data = scaling.transform(sample_data)
scaled_data

array([[1.        , 0.87804878, 0.        ],
       [0.        , 0.79268293, 0.82      ],
       [0.31818182, 1.        , 0.78      ],
       [0.54545455, 0.        , 1.        ]])

In [22]:
# fit_transform() fits the scaler and transforms the data in a single call;
# the result is the same as calling fit() and then transform() separately.
scaled_data = scaling.fit_transform(sample_data)
scaled_data

array([[1.        , 0.87804878, 0.        ],
       [0.        , 0.79268293, 0.82      ],
       [0.31818182, 1.        , 0.78      ],
       [0.54545455, 0.        , 1.        ]])

In [13]:
# Print the built-in help for the sklearn.preprocessing package (long output;
# useful for browsing the available transformers interactively).
help(preprocessing)

Help on package sklearn.preprocessing in sklearn:

NAME
    sklearn.preprocessing

DESCRIPTION
    The :mod:`sklearn.preprocessing` module includes scaling, centering,
    normalization, binarization methods.

PACKAGE CONTENTS
    _csr_polynomial_expansion
    _data
    _discretization
    _encoders
    _function_transformer
    _label
    setup
    tests (package)

CLASSES
    sklearn.base.BaseEstimator(builtins.object)
        sklearn.preprocessing._data.Binarizer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
        sklearn.preprocessing._data.KernelCenterer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
        sklearn.preprocessing._data.MaxAbsScaler(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
        sklearn.preprocessing._data.MinMaxScaler(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
        sklearn.preprocessing._data.Normalizer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
        sklearn.preprocessing._data.Po

In [25]:
# Normalisation with the L1 norm: rescale each row (sample) independently so
# that the absolute values of its entries sum to 1.
# axis=1 (the default) selects this row-wise scaling.
# This is a per-sample rescaling, not a least-absolute-deviations regression fit.
normalized_data_l1 = preprocessing.normalize(sample_data, norm='l1', axis=1)
normalized_data_l1

array([[ 0.28735632,  0.47126437, -0.24137931],
       [-0.26027397,  0.46575342,  0.2739726 ],
       [-0.06756757,  0.68918919,  0.24324324],
       [ 0.07692308, -0.47692308,  0.44615385]])

In [26]:
# Normalisation with the L2 norm: rescale each row (sample) independently to
# unit Euclidean length, i.e. the squares of its entries sum to 1.
# axis=1 (the default) selects this row-wise scaling.
# This is a per-sample rescaling, not a least-squares regression fit.
normalized_data = preprocessing.normalize(sample_data, norm='l2', axis=1)
normalized_data

array([[ 0.47699154,  0.78226613, -0.4006729 ],
       [-0.43395285,  0.7765472 ,  0.45679247],
       [-0.09205746,  0.93898611,  0.33140686],
       [ 0.11697707, -0.72525782,  0.67846699]])