## Combining features

Pipelines are containers of steps. A step can be one of the following:

- Transformer
- Estimator
- Pipeline
- FeatureUnion

Now we will inspect feature Union

In [8]:
import sklearn
from sklearn import pipeline

In [19]:
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

### New ###
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import FeatureUnion

# generate some data to play with
X, y = samples_generator.make_classification(n_informative=5, n_redundant=0, random_state=42)
features = [("pca", PCA()), ("kernel_pca",KernelPCA())]

# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')

In [24]:
feature_combiner = FeatureUnion(features)

In [30]:
feature_combiner

FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
     random_state=None, remove_zero_eig=False, tol=0))],
       transformer_weights=None)

In [32]:
X.shape

(100, 20)

In [39]:
PCA().fit_transform(X).shape

(100, 20)

In [40]:
KernelPCA().fit_transform(X).shape

(100, 59)

#### Stacking features from PCA and KernelPCA

In [29]:
feature_combiner.fit_transform(X).shape

(100, 79)

#### Learning on top of the stacked features

In [43]:
svmpipe = Pipeline([("feature_combination",feature_combiner), ('svc', clf)])

In [49]:
svmpipe.fit(X,y)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
  ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [115]:
svmpipe.steps[0][1].transform(X).shape

(100, 79)

### Pipeception: Setting attributes of pipelines inside pipelines

In [116]:
svmpipe2.steps

[('feature_combination', FeatureUnion(n_jobs=1,
         transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
       fit_inverse_transform=False, gamma=None, kernel='linear',
       kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
       random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)),
 ('feature_selector',
  SelectKBest(k=20, score_func=<function f_regression at 0x7fa3160f5620>)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [117]:
svmpipe2.set_params(feature_selector__k=20)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
    ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [118]:
svmpipe2.set_params(feature_combination__pca__n_components=10)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
    ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

#### Fitting the pipeline

In [51]:
svmpipe.score(X, y) 

0.89000000000000001

## Going fancier: Genereating features and prunning them: without model

In [78]:
anova_filter = SelectKBest(f_regression, k=40)

In [79]:
svmpipe2 = Pipeline([("feature_combination",feature_combiner), ("feature_selector", anova_filter), ('svc', clf)])

In [82]:
svmpipe2.fit(X,y)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
  ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [84]:
svmpipe2.steps[0][1].transform(X).shape

(100, 79)

In [94]:
X_transformed = svmpipe2.steps[0][1].transform(X)
svmpipe2.steps[1][1].transform(X_transformed).shape

(100, 40)

In [96]:
svmpipe2.score(X, y) 

0.91000000000000003

### Selecting values for the different parts of a pipeline via CrossValidation

In [122]:
svmpipe2.steps

[('feature_combination', FeatureUnion(n_jobs=1,
         transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
       fit_inverse_transform=False, gamma=None, kernel='linear',
       kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
       random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)),
 ('feature_selector',
  SelectKBest(k=20, score_func=<function f_regression at 0x7fa3160f5620>)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [101]:
from sklearn.model_selection import GridSearchCV

In [168]:
pipeline_grid = {"feature_combination__pca__n_components":[10,15,20], 
                 "feature_combination__kernel_pca__degree":[2,3,4],
                 "feature_selector__k":[20,25,30,25,40]
                }

In [169]:
pipe_cv = GridSearchCV(svmpipe2, param_grid=pipeline_grid, n_jobs = -1, cv=10 )

In [170]:
pipe_cv.fit(X,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
    ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'feature_combination__pca__n_components': [10, 15, 20], 'feature_combination__kernel_pca__degree': [2, 3, 4], 'feature_selector__k': [20, 25, 30, 25, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [171]:
pipe_cv.score(X,y)

0.87

In [172]:
len(pipe_cv.cv_results_)

33

In [176]:
pipe_cv.cv_results_["params"][0]

{'feature_combination__kernel_pca__degree': 2,
 'feature_combination__pca__n_components': 10,
 'feature_selector__k': 20}

In [178]:
pipe_cv.cv_results_["params"][1]

{'feature_combination__kernel_pca__degree': 2,
 'feature_combination__pca__n_components': 10,
 'feature_selector__k': 25}

## Going fancier: Genereating features and prunning them: with model