In [1]:
from sklearn import datasets

#### Generator For Regression  
`make_regression` - produces regression targets as an optionally-sparse random linear combination of random features, with noise.  


In [2]:
datasets.make_regression

<function sklearn.datasets.samples_generator.make_regression(n_samples=100, n_features=100, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)>

`make_sparse_uncorrelated` - produces a target as a linear combination of four features with fixed coefficients  

In [3]:
datasets.make_sparse_uncorrelated

<function sklearn.datasets.samples_generator.make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None)>

`make_friedman1` - related by polynomial and sine transforms

In [4]:
datasets.make_friedman1

<function sklearn.datasets.samples_generator.make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None)>

`make_friedman2` -  includes feature multiplication and reciprocation

In [5]:
datasets.make_friedman2

<function sklearn.datasets.samples_generator.make_friedman2(n_samples=100, noise=0.0, random_state=None)>

`make_friedman3` - similar with an arctan transformation on the target

In [6]:
datasets.make_friedman3

<function sklearn.datasets.samples_generator.make_friedman3(n_samples=100, noise=0.0, random_state=None)>

#### Generator For Manifold Learning  
`make_s_curve` - Generate an S curve dataset

In [7]:
datasets.make_s_curve

<function sklearn.datasets.samples_generator.make_s_curve(n_samples=100, noise=0.0, random_state=None)>

`make_swiss_roll` - Generate a swiss roll dataset 

In [8]:
datasets.make_swiss_roll

<function sklearn.datasets.samples_generator.make_swiss_roll(n_samples=100, noise=0.0, random_state=None)>

#### Generator For Decomposition  
`make_low_rank_matrix` - Generate a mostly low rank matrix with bell-shaped singular values  

In [9]:
datasets.make_low_rank_matrix

<function sklearn.datasets.samples_generator.make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, tail_strength=0.5, random_state=None)>

`make_sparse_coded_signal` - Generate a signal sas a sparse combination of dictionary elements

In [10]:
datasets.make_sparse_coded_signal

<function sklearn.datasets.samples_generator.make_sparse_coded_signal(n_samples, n_components, n_features, n_nonzero_coefs, random_state=None)>

`make_spd_matrix` - Generate a random symmetric, positive-definite matrix 

In [11]:
datasets.make_spd_matrix

<function sklearn.datasets.samples_generator.make_spd_matrix(n_dim, random_state=None)>

`make_sparse_spd_matrix` - Generate a sparse symmetric definite positive matrix

In [12]:
datasets.make_sparse_spd_matrix

<function sklearn.datasets.samples_generator.make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, smallest_coef=0.1, largest_coef=0.9, random_state=None)>

### Datasets in svmlight / libsvm format 
svmlight/libsvm format: \<label\>  \<feature-id\>:\<feature-value\> \<feature-id\>: \<feature-value\> per line  

In [13]:
datasets.load_svmlight_file

<function sklearn.datasets.svmlight_format.load_svmlight_file(f, n_features=None, dtype=<class 'numpy.float64'>, multilabel=False, zero_based='auto', query_id=False, offset=0, length=-1)>

### Loading From Extrenal Datasets  
1. pandas.io
2. scipy.io  
3. numpy/routine.io  
4. skimage.io / Imageio  
5. scipy.misc.imread  
6. scipy.io.wavfile.read  

### Downloading datasets from the mldata.org repository  
`fetch_mldata`  

In [14]:
datasets.fetch_mldata

<function sklearn.datasets.mldata.fetch_mldata(dataname, target_name='label', data_name='data', transpose_data=True, data_home=None)>

## Dataset Transformations  
clean, reduce, expand, generate feature representations  

In [15]:
from sklearn.pipeline import Pipeline

### Pipeline And FeatureUnion: Cobining Estimators  
Pipeline can use to chain estimators into one, useful when there is ofen a fixed sequence of steps in processing the data

#### How to use  
`Pipeline(estimators)`, estimators is a list of (key, value) tuple, key is the name of estimator, value is the estimator object

In [16]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA  
estimators = [('reduce_im', PCA()), ('clf', SVC())]  
pipe = Pipeline(estimators)
pipe

Pipeline(memory=None,
     steps=[('reduce_im', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

`make_pipeline(estimator1, estimator, ...)` shorthand for creating pipeline, name was autofilled  

In [17]:
from sklearn.pipeline import make_pipeline  
pipe = make_pipeline(PCA(), PCA(), SVC())
pipe

Pipeline(memory=None,
     steps=[('pca-1', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('pca-2', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

**steps/named_steps** attribute store the estimators

In [18]:
pipe.steps

[('pca-1',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('pca-2',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [19]:
pipe.named_steps

{'pca-1': PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False),
 'pca-2': PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False),
 'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)}

use **estimator__parameter** to access estimator's parameter

In [20]:
pipe.set_params(svc__C=0)

Pipeline(memory=None,
     steps=[('pca-1', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('pca-2', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=0, cache_size=2...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

Following shows how to use GridSearchCV  

In [21]:
from sklearn.model_selection import GridSearchCV
param_grid = {'pca-1__n_components': [2, 5, 10], 'svc__C': [0.1, 10, 100]}
grid_search = GridSearchCV(pipe, param_grid=param_grid)

#### Perfermance  
pipeline will cache each transform after calling fit, so if parameters and input data are identical the tranformation wont running  
`Pipeline(..., memory=dirname_or_jobmemoryobject)`

In [22]:
from tempfile import mkdtemp
from shutil import rmtree
estimators = [('reduce_dim', PCA()), ('clf', SVC())]
cachedir = mkdtemp()
pipe = Pipeline(estimators, memory=cachedir)
rmtree(cachedir)

### FeatureUnion: Composite Feature Spaces  
FeatureUnion combine several transformer objects into a new transformer that combines their output  
while fitting, each estimator fit data indenpendently, the output are concatenated end-to-end into larger vectors  

In [23]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import KernelPCA
estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
combined = FeatureUnion(estimators)
combined 

FeatureUnion(n_jobs=1,
       transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
     random_state=None, remove_zero_eig=False, tol=0))],
       transformer_weights=None)

### Feature Extraction  
The sklearn.feature_extraction module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.
**Difference with Feature Selection**: the former consists in transforming arbitrary data, such as text or images, into numerical features usable for machine learning. The latter is a machine learning technique applied on these features  

#### Loading Features From Dicts  
`DictVectorizer` - convert feature arrays represented as liss of standard python dict object to numpy/scipy representation, it implement the `one-hot` coding for categorical features

In [24]:
>>> measurements = [
...     {'city': 'Dubai', 'temperature': 33.},
...     {'city': 'London', 'temperature': 12.},
...     {'city': 'San Francisco', 'temperature': 18.},
... ]

>>> from sklearn.feature_extraction import DictVectorizer
>>> vec = DictVectorizer()

>>> print(vec.fit_transform(measurements).toarray())

>>> print(vec.get_feature_names())

[[ 1.  0.  0. 33.]
 [ 0.  1.  0. 12.]
 [ 0.  0.  1. 18.]]
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']


Also is a useful representation transformation for training sequence classifiers in NLP model

In [25]:
>>> pos_window = [
...     {
...         'word-2': 'the',
...         'pos-2': 'DT',
...         'word-1': 'cat',
...         'pos-1': 'NN',
...         'word+1': 'on',
...         'pos+1': 'PP',
...     },
...     # in a real application one would extract many such dictionaries
... ]
>>> vec = DictVectorizer()
>>> pos_vectorized = vec.fit_transform(pos_window)
>>> print(pos_vectorized)
>>> print(pos_vectorized.toarray())
>>> print(vec.get_feature_names())

  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
[[1. 1. 1. 1. 1. 1.]]
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']


#### Feature Hashing  
An implementation of Feature hashing, apply a hash function to features to determine their column index in sample matrices directly  
About collisions: use a signed hash function   
Accept mappings or (feature, value) pair or strings  
Output scipy.sparse  

In [26]:
from sklearn.feature_extraction import FeatureHasher
def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)
raw_X = (token_features(tok, pos) for tok, pos in [('A', 5)])
hasher = FeatureHasher(input_type='string')
X = hasher.transform(raw_X)
print(X)

  (0, 729803)	-1.0
  (0, 740061)	1.0
  (0, 892359)	-1.0
  (0, 950346)	-1.0
  (0, 1002789)	-1.0


### Text Feature Extraction  
Extract numerical features from text content - **Vercorization**   
Bag of words/Bag of n-grams Representation  
1. tokenizing  
2. counting  
3. normalizing  

#### Sparsity  
Words in documents is a very small subset, use sparse to store it in order to save memory and fasten 

#### CountVectorizer  
Implement both tokenization and occurrence counting  

In [27]:
>>> from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
>>> corpus = [
...     'This is the first document.',
...     'This is the second second document.',
...     'And the third one.',
...     'Is this the first document?',
... ]
>>> X = vectorizer.fit_transform(corpus)
print(X.shape)  
>>> analyze = vectorizer.build_analyzer()
>>> analyze("This is a text document to analyze.") == (
...     ['this', 'is', 'text', 'document', 'to', 'analyze'])
>>> vectorizer.get_feature_names() == (
...     ['and', 'document', 'first', 'is', 'one',
...      'second', 'the', 'third', 'this'])
>>> print(vectorizer.vocabulary_.get('document'))
>>> vectorizer.transform(['Something completely new.']).toarray()

(4, 9)
1


array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

We can extract 2-grams of words in order to preserve some of the local ordering infomation  

In [28]:
>>> bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
...                                     token_pattern=r'\b\w+\b', min_df=1)
>>> analyze = bigram_vectorizer.build_analyzer()
>>> analyze('Bi-grams are cool!') == (
...     ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
>>> X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
>>> X_2

array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]],
      dtype=int64)

#### TF-IDF Term Weighting  
Why? some words will be very present (e.g. “the”, “a”, “is” in English) hence carrying very little meaningful information about the actual contents of the document  
What? 
> tf-idf(t, d) = tf(t, d) x idf(t)  
> tf means term-frequency, idf means term-frequency times inverse document-frequency  
> idf(t) = log((1+n<d\>) / (1+df(d, t)) + 1  

In [29]:
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> transformer = TfidfTransformer(smooth_idf=False)
>>> tfidf = transformer.fit_transform(X_2)
>>> print(transformer.idf_)
>>> print(tfidf.toarray())

[2.38629436 2.38629436 1.28768207 1.69314718 1.69314718 1.28768207
 1.69314718 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436
 1.         1.69314718 2.38629436 2.38629436 2.38629436 2.38629436
 1.28768207 1.69314718 2.38629436]
[[0.         0.         0.28574186 0.37571621 0.37571621 0.28574186
  0.37571621 0.         0.         0.         0.         0.
  0.22190405 0.37571621 0.         0.         0.         0.
  0.28574186 0.37571621 0.        ]
 [0.         0.         0.1793146  0.         0.         0.1793146
  0.23577716 0.         0.         0.66460105 0.33230052 0.33230052
  0.13925379 0.         0.33230052 0.         0.         0.
  0.1793146  0.23577716 0.        ]
 [0.40240191 0.40240191 0.         0.         0.         0.
  0.         0.         0.40240191 0.         0.         0.
  0.16863046 0.         0.         0.40240191 0.40240191 0.40240191
  0.         0.         0.        ]
 [0.         0.         0.25271307 0.33228732 0.33228732 0.25271307
  0.         0.46

#### Decoding Text Files  
`CountVectorizer`take the encoding params(default is utf-8)  

In [30]:
>>> import chardet    
>>> text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
>>> text2 = b"holdselig sind deine Ger\xfcche"
>>> text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
>>> decoded = [x.decode(chardet.detect(x)['encoding'])
...            for x in (text1, text2, text3)]        
>>> v = CountVectorizer().fit(decoded).vocabulary_    
>>> for term in v: print(v)       

{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut

#### More  
1. [Applications And Examples](http://scikit-learn.org/stable/modules/feature_extraction.html#applications-and-examples)  
2. [Limitations of the Bag of Words Representation](http://scikit-learn.org/stable/modules/feature_extraction.html#limitations-of-the-bag-of-words-representation)  
3. [Vectorizing a large text corpus with hashing trick](http://scikit-learn.org/stable/modules/feature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick)  
4. [Performing out-of-core scaling with HashingVectorizer](http://scikit-learn.org/stable/modules/feature_extraction.html#performing-out-of-core-scaling-with-hashingvectorizer)  
5. [Customizing the vectorizer classes](http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes)  

### Image Feature Extract  
#### Patch Extraction  
`extract_patches_2d` - extract patches from image stored as a 2d array/3d(color) array  
`reconstruct_from_patches_2d` - reconstruct image  

In [31]:
>>> import numpy as np
>>> from sklearn.feature_extraction import image
>>> one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
>>> print(one_image[:, :, 0])  # R channel of a fake RGB picture
>>> patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)
>>> print(patches.shape)
>>> print(patches[:, :, :, 0])
>>> patches = image.extract_patches_2d(one_image, (2, 2))
>>> print(patches.shape)
>>> print(patches[4, :, :, 0])
>>> reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
>>> np.testing.assert_array_equal(one_image, reconstructed)

[[ 0  3  6  9]
 [12 15 18 21]
 [24 27 30 33]
 [36 39 42 45]]
(2, 2, 2, 3)
[[[ 0  3]
  [12 15]]

 [[15 18]
  [27 30]]]
(9, 2, 2, 3)
[[15 18]
 [27 30]]


`PatchExtractor` - like patch extractor, but support mult-image, is an estimator, can be use in pipeline

In [32]:
>>> five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)
>>> patches = image.PatchExtractor((2, 2)).transform(five_images)
>>> patches.shape

(45, 2, 2, 3)

#### Connectivity Graph of an Image  
[Detail](http://scikit-learn.org/stable/modules/feature_extraction.html#connectivity-graph-of-an-image)  

### Preprocessing Data  
standardize data
#### Standardization or mean removal and variance scaling  
In practice we often ignore the shape of the distribution and just **transform the data to center it by removing the mean value of each feature**, then **scale it by dividing non-constant features by their standard deviation**.  

`scale` - a quick and easy way to perform standardization, scaled data has zero mean and unit variance 
`StandardScaler` - kind of estimator that implement scale  

In [33]:
from sklearn.preprocessing import scale as sk_scale, normalize as sk_normalize, StandardScaler,MinMaxScaler,MaxAbsScaler,QuantileTransformer,Normalizer,Binarizer, OneHotEncoder
import numpy as np
X_train = np.array([[ 1., -1.,  2.], [ 2.,  0.,  0.], [ 0.,  1., -1.]])
X_scaled = sk_scale(X_train)
print(X_scaled)  
scaler = StandardScaler().fit(X_train)
print(scaler.mean_)
X_test = [[-1., 1., 0.]]
print(scaler.transform(X_test))

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]
[1.         0.         0.33333333]
[[-2.44948974  1.22474487 -0.26726124]]


`MinMaxScaler/MaxAbsScaler` - scale features to lie between a given minimum and maximum value  
For `MinMaxScaler`  
> X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
> X_scaled = X_std * (max - min) + min  

In [34]:
>>> X_train = np.array([[ 1., -1.,  2.],[ 2.,  0.,  0.],[ 0.,  1., -1.]])
>>> min_max_scaler = MinMaxScaler(feature_range=(0, 1))
>>> X_train_minmax = min_max_scaler.fit_transform(X_train)
>>> X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [35]:
>>> X_train = np.array([[ 1., -1.,  2.],[ 2.,  0.,  0.],[ 0.,  1., -1.]])
>>> max_abs_scaler = MaxAbsScaler()
>>> X_train_maxabs = max_abs_scaler.fit_transform(X_train)
>>> print(X_train_maxabs)
>>> X_test = np.array([[ -3., -1.,  4.]])
>>> X_test_maxabs = max_abs_scaler.transform(X_test)
>>> print(X_test_maxabs)
>>> max_abs_scaler.scale_         

[[ 0.5 -1.   1. ]
 [ 1.   0.   0. ]
 [ 0.   1.  -0.5]]
[[-1.5 -1.   2. ]]


array([2., 1., 2.])

`robust_scale/RobustScaler`- data contains many outliers, scaling using the mean and variance of the data is likely to not work very well, They use more robust estimates for the center and range of your data.

#### Non-Linear Transformation  
`QuantileTransformer/quantile_transform` provide a non-parametric transformation based on the quantile function to map the data to a uniform distribution with values between 0 and 1  
It is also possible to map the transformed data to a normal distribution by setting output_distribution='normal'

In [36]:
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
>>> quantile_transformer = QuantileTransformer(random_state=0)
>>> X_train_trans = quantile_transformer.fit_transform(X_train)
>>> X_test_trans = quantile_transformer.transform(X_test)
>>> print(np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]))
>>> quantile_transformer = QuantileTransformer(
...     output_distribution='normal', random_state=0)
>>> X_trans = quantile_transformer.fit_transform(X)
>>> print(quantile_transformer.quantiles_)

[4.3 5.1 5.8 6.5 7.9]
[[4.3        2.         1.         0.1       ]
 [4.31491491 2.02982983 1.01491491 0.1       ]
 [4.32982983 2.05965966 1.02982983 0.1       ]
 ...
 [7.84034034 4.34034034 6.84034034 2.5       ]
 [7.87017017 4.37017017 6.87017017 2.5       ]
 [7.9        4.4        6.9        2.5       ]]


#### Normalization  
Normalization is the process of scaling individual samples to have unit norm  
This process can be useful if you plan to use a quadratic form such as the dot-product or any other kernel to quantify the similarity of any pair of samples  
`normalize/Normalizer(scioy.sparse-liked-input)` - provides a quick and easy way to perform this operation on a single array-like dataset, either using the l1 or l2 norms

In [37]:
>>> X = [[ 1., -1.,  2.],[ 2.,  0.,  0.],[ 0.,  1., -1.]]
>>> X_normalized = sk_normalize(X, norm='l2')
>>> print(X_normalized)
>>> normalizer = Normalizer().fit(X)  # fit does nothing
>>> print(normalizer.transform(X))
>>> print(normalizer.transform([[-1.,  1., 0.]]))

[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]
[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]
[[-0.70710678  0.70710678  0.        ]]


#### Binarization  
Feature binarization is the process of thresholding numerical features to get boolean values  
This can be useful for downstream probabilistic estimators that make assumption that the input data is distributed according to a multi-variate Bernoulli distribution  
It is also common among the text processing community to use binary feature values (probably to simplify the probabilistic reasoning) even if normalized counts (a.k.a. term frequencies) or TF-IDF valued features often perform slightly better in practice.  
`Binarizer(threshold)` 

In [38]:
>>> X = [[ 1., -1.,  2.],
...      [ 2.,  0.,  0.],
...      [ 0.,  1., -1.]]

>>> binarizer = Binarizer().fit(X)  # fit does nothing
>>> print(binarizer.transform(X))
>>> binarizer = Binarizer(threshold=1.1)
>>> print(binarizer.transform(X))

[[1. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 0.]]


#### Encoding Categorical Features  
Convert categorical features to features that can be used with scikit-learn estimators
 `OneHotEncoder` - implemented the `one-of-K/one-hot` encoding,  transforms each categorical feature with m possible values into m binary features, with only one active( if there is a possibility that the training data might have missing categorical features, one has to explicitly set n_values)

In [39]:
>>> enc = OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
>>> print(enc.transform([[0, 1, 3]]).toarray())
>>> enc = OneHotEncoder(n_values=[2, 3, 4])
>>> # Note that there are missing categorical values for the 2nd and 3rd
>>> # features
>>> print(enc.fit([[1, 2, 3], [0, 2, 0]]))

[[1. 0. 0. 1. 0. 0. 0. 0. 1.]]
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values=[2, 3, 4], sparse=True)


#### Imputation(填充)  
How to handle missing value in dataset?  

1. Discard entire rows and/or columns containing missing values  
2. Impute the missing values(i.e., to infer them from the known part of the data)  

`Imputer` - provides basic strategies for imputing missing values, either using the mean, the median or the most frequent value of the row or column in which the missing values are located, this class also allows for different missing values encodings, support sparse matrices    

In [40]:
>>> import numpy as np
>>> from sklearn.preprocessing import Imputer
>>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
>>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])
>>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
>>> print(imp.transform(X))  
>>> import scipy.sparse as sp
>>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
>>> imp = Imputer(missing_values=0, strategy='mean', axis=0)
>>> imp.fit(X)
>>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
>>> print(imp.transform(X_test))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]
[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


#### Generating Polynomial Features  
Often it’s useful to add complexity to the model by considering nonlinear features of the input data. A simple and common method to use is polynomial features, which can get features’ high-order and interaction terms  
`PolynomialFeatures`  


In [41]:
>>> import numpy as np
>>> from sklearn.preprocessing import PolynomialFeatures
>>> X = np.arange(6).reshape(3, 2)
>>> print(X)
>>> poly = PolynomialFeatures(2)
>>> print(poly.fit_transform(X))
>>> X = np.arange(9).reshape(3, 3)
>>> print(X)
>>> poly = PolynomialFeatures(degree=3, interaction_only=True)
>>> print(poly.fit_transform(X))

[[0 1]
 [2 3]
 [4 5]]
[[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[  1.   0.   1.   2.   0.   0.   2.   0.]
 [  1.   3.   4.   5.  12.  15.  20.  60.]
 [  1.   6.   7.   8.  42.  48.  56. 336.]]


#### Custom Transformer  
`FunctionTransformer` - Convert an existing Python function into a transformer to assist in data cleaning or processing

In [42]:
>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> print(transformer.transform(X))

[[0.         0.69314718]
 [1.09861229 1.38629436]]


### UnSupervised Dimensionality Reduction  
[Detail](http://scikit-learn.org/stable/modules/unsupervised_reduction.html)

### Random Projection  
Reduce the dimensionality of the data by trading a controlled amount of accuracy, for faster processing times and smaller model sizes.

#### The Johnson-Lindenstrauss Lemma  
The main theoretical result behind the efficiency of random projection is the Johnson-Lindenstrauss lemma (quoting Wikipedia):
> In mathematics, the Johnson-Lindenstrauss lemma is a result concerning low-distortion embeddings of points from high-dimensional into low-dimensional Euclidean space. The lemma states that a small set of points in a high-dimensional space can be embedded into a space of much lower dimension in such a way that distances between the points are nearly preserved. The map used for the embedding is at least Lipschitz, and can even be taken to be an orthogonal projection.   

`johnson_lindenstrauss_min_dim` - Knowing only the number of sample, estimates conservatively the minimal size of the random subspace to guarantee a bounded distortion introduced by the random projection  

In [43]:
>>> from sklearn.random_projection import johnson_lindenstrauss_min_dim
>>> print(johnson_lindenstrauss_min_dim(n_samples=1e6, eps=0.5))
>>> print(johnson_lindenstrauss_min_dim(n_samples=1e6, eps=[0.5, 0.1, 0.01]))
>>> print(johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1))

663
[    663   11841 1112658]
[ 7894  9868 11841]


#### Gaussian Random Projection  
`GaussianRandomProjection` - reduces the dimensionality by projecting the original input space on a randomly generated matrix where components are drawn from the following distribution N(0, \frac{1}{n_{components}})  

In [44]:
>>> import numpy as np
>>> from sklearn import random_projection
>>> X = np.random.rand(100, 10000)
>>> transformer = random_projection.GaussianRandomProjection()
>>> X_new = transformer.fit_transform(X)
>>> print(X_new.shape)

(100, 3947)


#### Sparse Random Projection  
`SparseRandomProjection` - reduces the dimensionality by projecting the original input space using a sparse random matrix  
[Detail](http://scikit-learn.org/stable/modules/random_projection.html#sparse-random-projection)  

In [45]:
>>> import numpy as np
>>> from sklearn import random_projection
>>> X = np.random.rand(100,10000)
>>> transformer = random_projection.SparseRandomProjection()
>>> X_new = transformer.fit_transform(X)
>>> print(X_new.shape)

(100, 3947)


### Kernel Approximation  
[Detail](http://scikit-learn.org/stable/modules/kernel_approximation.html)  

### Pairwise Metrics, Affinities, Kernels  
evaluate pairwise distances or affinity of sets of samples(distance metrics and kernels)  
all following functions under `sklearn.metrics.pairwise`  

#### Cosine Similarity  
`cosine_similarity(sparse)`  
cosine_similarity computes the L2-normalized dot product of vectors. That is, if x and y are row vectors, their cosine similarity k is defined as:

k(x, y) = \frac{x y^\top}{\|x\| \|y\|}

This is called cosine similarity, because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.  

#### Linear Kernel  
`linear_kernel` - computes the linear kernel, that is, a special case of polynomial_kernel with degree=1 and coef0=0 (homogeneous). If x and y are column vectors, their linear kernel is:

k(x, y) = x^\top y  

#### Polynomial kernel  
`polynomial_kernel` - computes the degree-d polynomial kernel between two vectors. The polynomial kernel represents the similarity between two vectors. Conceptually, the polynomial kernels considers not only the similarity between vectors under the same dimension, but also across dimensions. When used in machine learning algorithms, this allows to account for feature interaction.

The polynomial kernel is defined as:

k(x, y) = (\gamma x^\top y +c_0)^d

where:

x, y are the input vectors
d is the kernel degree
If c_0 = 0 the kernel is said to be homogeneous.  

#### Sigmoid Kernel  
`sigmoid_kernel` computes the sigmoid kernel between two vectors. The sigmoid kernel is also known as hyperbolic tangent, or Multilayer Perceptron (because, in the neural network field, it is often used as neuron activation function). It is defined as:

k(x, y) = \tanh( \gamma x^\top y + c_0)

where:

x, y are the input vectors
\gamma is known as slope
c_0 is known as intercept

#### RBF Kernel  
`rbf_kernel` computes the radial basis function (RBF) kernel between two vectors. This kernel is defined as:

k(x, y) = \exp( -\gamma \| x-y \|^2)

where x and y are the input vectors. If \gamma = \sigma^{-2} the kernel is known as the Gaussian kernel of variance \sigma^2.  


#### Laplacian Kernel  
`laplacian_kernel` is a variant on the radial basis function kernel defined as:

k(x, y) = \exp( -\gamma \| x-y \|_1)

where x and y are the input vectors and \|x-y\|_1 is the Manhattan distance between the input vectors.

It has proven useful in ML applied to noiseless data.

#### Chi-squared kernel  
The chi-squared kernel is a very popular choice for training non-linear SVMs in computer vision applications. It can be computed using chi2_kernel and then passed to an sklearn.svm.SVC with kernel="precomputed"

In [46]:
>>> from sklearn.svm import SVC
>>> from sklearn.metrics.pairwise import chi2_kernel
>>> X = [[0, 1], [1, 0], [.2, .8], [.7, .3]]
>>> y = [0, 1, 0, 1]
>>> K = chi2_kernel(X, gamma=.5)
>>> print(K)
>>> svm = SVC(kernel='precomputed').fit(K, y)
>>> print(svm.predict(K))
>>> print('or')
>>> svm = SVC(kernel=chi2_kernel).fit(X, y)
>>> print(svm.predict(X))

[[1.         0.36787944 0.89483932 0.58364548]
 [0.36787944 1.         0.51341712 0.83822343]
 [0.89483932 0.51341712 1.         0.7768366 ]
 [0.58364548 0.83822343 0.7768366  1.        ]]
[0 1 0 1]
or
[0 1 0 1]


### Transforming The Prediction Target(y)  
#### LabelBinarizer  
create a label indicator matrix from a list of multi-class labels  

In [47]:
>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> print(lb.fit([1, 2, 6, 4, 2]))
>>> print(lb.classes_)
>>> print(lb.transform([1, 6]))

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
[1 2 4 6]
[[1 0 0 0]
 [0 0 0 1]]


#### LabelEncoder  
normalize labels such that they contain only values between 0 and n_classes-1  

In [48]:
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
>>> print(le.classes_)
>>> print(le.transform([1, 1, 2, 6]))
>>> print(le.inverse_transform([0, 0, 1, 2]))

[1 2 6]
[0 0 1 2]
[1 1 2 6]


  if diff:


It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels

In [49]:
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
>>> print(list(le.classes_))
>>> print(le.transform(["tokyo", "tokyo", "paris"]))
>>> print(list(le.inverse_transform([2, 2, 1])))

['amsterdam', 'paris', 'tokyo']
[2 2 1]
['tokyo', 'tokyo', 'paris']


  if diff:
