In [4]:
import numpy as np
from sklearn import preprocessing
x = np.array([[ 33., 4., 5.],
             [ 23., 65., 9.],
             [ 2., 23., 56.]])

In [7]:
# standardinzing the data
# data should have mean = 0 anf standard deviation = 1
x_scale = preprocessing.scale(x)
x_scale

array([[ 1.05791041, -1.04631049, -0.79171529],
       [ 0.28382962,  1.34712476, -0.61897741],
       [-1.34174003, -0.30081427,  1.41069269]])

In [8]:
# mean should be close to 0 after scaling
x_scale.mean(axis = 0)

array([ 7.40148683e-17, -1.29526020e-16,  7.40148683e-17])

In [9]:
# standard deviation should be 1 after scaling
x_scale.std(axis = 0)

array([1., 1., 1.])

In [12]:
# normalizing the data
# l1 norm -> manhattan distance
x_norm_l1 = preprocessing.normalize(x, norm = 'l1')
x_norm_l1

array([[0.78571429, 0.0952381 , 0.11904762],
       [0.2371134 , 0.67010309, 0.09278351],
       [0.02469136, 0.28395062, 0.69135802]])

In [13]:
# l2 norm -> euclideandistance
x_norm_l2 = preprocessing.normalize(x, norm = 'l2')
x_norm_l2

array([[0.98169079, 0.11899282, 0.14874103],
       [0.33077265, 0.93479228, 0.12943278],
       [0.03301841, 0.3797117 , 0.92451545]])

In [26]:
# scaling the feature to a range
# scaling the data within 0-1
min_max_scaler = preprocessing.MinMaxScaler()
x_min_mac = min_max_scaler.fit_transform(x)
x_min_max

array([[1.        , 0.        , 0.        ],
       [0.67741935, 1.        , 0.07843137],
       [0.        , 0.31147541, 1.        ]])

In [25]:
# scaling the data within data range of -10-10
min_max_scaler_wfr = preprocessing.MinMaxScaler(feature_range = (-10, 10))
x_min_max_wfr = min_max_scaler_wfr.fit_transform(x)
x_min_max

array([[1.        , 0.        , 0.        ],
       [0.67741935, 1.        , 0.07843137],
       [0.        , 0.31147541, 1.        ]])

In [29]:
# binaring features -> preasent(1) or absent(0) based on a threshold
binarizer = preprocessing.Binarizer(threshold = (10))
x_binarized = binarizer.transform(x)
x_binarized

array([[1., 0., 0.],
       [1., 1., 0.],
       [0., 1., 1.]])

In [38]:
# handling missing data
from numpy import nan
x2 = np.array([[2., 4., 8., nan],
               [3., nan, 5., 1],
               [9., 3., 11., nan],
               [14., nan, 7., 6]])

In [36]:
from sklearn.preprocessing import Imputer
# median stategy of imputing the data
# replace every nan with the mean of the row (default axis:0)
imp_mean = preprocessing.Imputer(strategy = "mean")
x2_imputed_mean = imp_mean.fit_transform(x2)
x2_imputed_mean

array([[ 2. ,  4. ,  8. ,  3.5],
       [ 3. ,  3.5,  5. ,  1. ],
       [ 9. ,  3. , 11. ,  3.5],
       [14. ,  3.5,  7. ,  6. ]])

In [45]:
# median strategy
imp_median = preprocessing.Imputer(strategy = "median")
x2_imputed_median = imp_median.fit_transform(x2)
x2_imputed_median

array([[ 2. ,  4. ,  8. ,  3.5],
       [ 3. ,  3.5,  5. ,  1. ],
       [ 9. ,  3. , 11. ,  3.5],
       [14. ,  3.5,  7. ,  6. ]])

In [47]:
# most frequent stategy
imp_mf = preprocessing.Imputer(strategy = "most_frequent")
x2_imputed_mf = imp_mf.fit_transform(x2)
x2_imputed_mf

array([[ 2.,  4.,  8.,  1.],
       [ 3.,  3.,  5.,  1.],
       [ 9.,  3., 11.,  1.],
       [14.,  3.,  7.,  6.]])