# Set up Environment

In [1]:
import numpy as np
from sklearn import preprocessing

In [65]:
head = lambda x, n = 6: x[:n]
tail = lambda x, n = 6: x[-n:]

In [None]:
def apply()

In [79]:
d = dict(p1=1, p2=2)
def f2(p1,p2):
    print(p1, p2)
f2(**d)

1 2


# 1. Standardization, or mean removal and variance scaling

`X -> Z = (X - mu) / sig` which has zero mean and unit variance

Write it by myself in numpy

In [25]:
X = np.array([
    [1, 2, 3],
    [2, 4, 6]
], dtype = np.float32)

In [26]:
mu_col = np.mean(X, axis = 0)
sig_col = np.std(X, axis = 0)

print("Mean of each column              ", mu_col)
print("Standard deviation of each column", sig_col)
print("-----------------------")
print("After standardization: ")
print((X - mu_col) / sig_col)

Mean of each column               [1.5 3.  4.5]
Standard deviation of each column [0.5 1.  1.5]
-----------------------
After standardization: 
[[-1. -1. -1.]
 [ 1.  1.  1.]]


In [27]:
mu_row = np.mean(X, axis = 1)
sig_row = np.std(X, axis = 1)

print("Mean of each column              ", mu_row)
print("Standard deviation of each column", sig_row)
print("-----------------------")
print("After standardization: ")
print((X - mu_row) / sig_row) # => create error

Mean of each column               [2. 4.]
Standard deviation of each column [0.8164966 1.6329932]
-----------------------
After standardization: 


ValueError: operands could not be broadcast together with shapes (2,3) (2,) 

Using `scale` function in sklearn preprocessing module

In [28]:
X_scaled = preprocessing.scale(X)
print(X_scaled)                                          

[[-1. -1. -1.]
 [ 1.  1.  1.]]


In [29]:
print("mean of each column")
print(X_scaled.mean(axis=0))
print("-------------------")
print("std  of each column")
print(X_scaled.std(axis=0))

mean of each column
[0. 0. 0.]
-------------------
std  of each column
[1. 1. 1.]


In [30]:
X_scaled = preprocessing.scale(X, axis = 1)
print(X_scaled)                             

[[-1.2247448  0.         1.2247448]
 [-1.2247448  0.         1.2247448]]


In [31]:
print("mean of each row")
print(X_scaled.mean(axis = 1))
print("-------------------")
print("std  of each row")
print(X_scaled.std(axis = 1))

mean of each row
[0. 0.]
-------------------
std  of each row
[0.9999999 0.9999999]


## 1.1 Scaling features to a range

An alternative standardization is scaling features to lie between a given minimum and maximum value, often between zero and one, or so that the maximum absolute value of each feature is scaled to unit size
- MinMaxScaler
- MaxAbsScaler

In [33]:
X = np.array([
    [1, 2, 3],
    [2, 4, 6]
], dtype = np.float32)

In [34]:
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)

In [35]:
X_minmax

array([[0., 0., 0.],
       [1., 1., 1.]], dtype=float32)

In [36]:
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [37]:
X_test = np.array([[ -3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [38]:
min_max_scaler.scale_ 

array([0.5       , 0.5       , 0.33333333])

In [39]:
min_max_scaler.min_

array([0.        , 0.5       , 0.33333333])

Question: does min_max_scaler memorize the last scaling

**MaxAbsScaler** => -1, 1

In [40]:
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)

In [41]:
X_train_maxabs                # doctest +NORMALIZE_WHITESPACE^

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [45]:
max_abs_scaler.scale_

array([2., 1., 2.])

In [48]:
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.fit_transform(X_test)

In [50]:
X_test_maxabs                 

array([[-1., -1.,  1.]])

In [51]:
max_abs_scaler.scale_         

array([3., 1., 4.])

## 1.2. Scaling sparse data

## 1.3. Scaling data with outliers

### Scaling vs Whitening

### Scaling target variables in regression

## 1.4. Centering kernel matrices

# 2. Non-linear transformation

- QuantileTransformer
- quantile_transform

In [53]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [54]:
iris = load_iris()

In [59]:
X, y = iris.data, iris.target

In [67]:
head(X)

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4]])

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [69]:
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)

In [70]:
X_train_trans = quantile_transformer.fit_transform(X_train)

In [71]:
X_test_trans = quantile_transformer.transform(X_test)

In [72]:
np.percentile(X_train[:, 0], [0, 25, 50, 75, 100])

array([4.3, 5.1, 5.8, 6.5, 7.9])

In [73]:
np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100])

array([9.99999998e-08, 2.38738739e-01, 5.09009009e-01, 7.43243243e-01,
       9.99999900e-01])

In [74]:
np.percentile(X_test[:, 0], [0, 25, 50, 75, 100])

array([4.4  , 5.125, 5.75 , 6.175, 7.3  ])

In [75]:
np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100])

array([0.01351351, 0.25012513, 0.47972973, 0.6021021 , 0.94144144])

In [76]:
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)

In [77]:
X_trans = quantile_transformer.fit_transform(X)

# Normalization

scaling individual samples to have unit norm

# 4. Binarization

## 4.1. Feature binarization

In [80]:
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]

In [81]:
binarizer = preprocessing.Binarizer().fit(X)  # fit does nothing

In [82]:
binarizer

Binarizer(copy=True, threshold=0.0)

In [83]:
preprocessing.Binarizer()

Binarizer(copy=True, threshold=0.0)

In [84]:
binarizer.transform(X)

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [85]:
binarizer = preprocessing.Binarizer(threshold=1.1)

In [88]:
binarizer.transform(X)

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 0.]])

# 5. Encoding categorical features

In [89]:
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) 

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [98]:
enc.transform([[0, 1, 3]]).toarray()

array([[1., 0., 0., 1., 0., 0., 0., 0., 1.]])

# 6. Imputation of missing values

In [99]:
import numpy as np
from sklearn.preprocessing import Imputer

In [100]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [101]:
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))  

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


In [None]:
np.apply_along_axis()

In [102]:
b = np.array([[8,1,7], [4,3,9], [5,2,6]])

In [104]:
np.apply_along_axis(sorted, 0, b)

array([[4, 1, 6],
       [5, 2, 7],
       [8, 3, 9]])

In [105]:
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit(X)

X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test))                      

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


In [115]:
print(X_test.toarray())
print(imp.transform(X_test))  

[[0 2]
 [6 0]
 [7 6]]
[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


In [108]:
X.toarray()

array([[1, 2],
       [0, 3],
       [7, 6]], dtype=int64)

In [113]:
np.apply_along_axis(np.mean, 0, X.toarray())

array([2.66666667, 3.66666667])

# 7. Generating polynomial features

In [116]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [118]:
X = np.arange(6).reshape(3, 2)
print(X)

[[0 1]
 [2 3]
 [4 5]]


[PolynomialFeatures](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures)

`class sklearn.preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)`

In [120]:
poly = PolynomialFeatures(2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

# 8. Custom transformers

In [122]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])

In [124]:
X

array([[0, 1],
       [2, 3]])

In [123]:
transformer.transform(X)

array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])

Note: np.log1p calculate log(1 + x)

In [126]:
print(np.log1p(np.arange(6)))

[0.         0.69314718 1.09861229 1.38629436 1.60943791 1.79175947]


In [127]:
print(np.log1p(np.arange(6)))

<ufunc 'log'>