## Combining features

Pipelines are containers of steps. A step can be one of the following:

- Transformer
- Estimator
- Pipeline
- FeatureUnion

Now we will inspect feature Union

In [1]:
import sklearn
from sklearn import pipeline

In [2]:
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

### New ###
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import FeatureUnion

# generate some data to play with
X, y = samples_generator.make_classification(n_informative=5, n_redundant=0, random_state=42)
features = [("pca", PCA()), ("kernel_pca",KernelPCA())]

# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')

In [3]:
feature_combiner = FeatureUnion(features)

In [4]:
feature_combiner

FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=None, n_jobs=None,
     random_state=None, remove_zero_eig=False, tol=0))],
       transformer_weights=None)

In [5]:
X.shape

(100, 20)

In [6]:
PCA().fit_transform(X).shape

(100, 20)

In [7]:
KernelPCA().fit_transform(X).shape

(100, 65)

### Stacking features from PCA and KernelPCA

In [8]:
feature_combiner.fit_transform(X).shape

(100, 85)

### Learning on top of the stacked features

In [9]:
svmpipe = Pipeline([("feature_combination",feature_combiner), ('svc', clf)])

In [10]:
svmpipe.fit(X,y)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [11]:
svmpipe.steps[0][1].transform(X).shape

(100, 85)

## Pipeception: Setting attributes of pipelines inside pipelines

In [12]:
svmpipe2 = Pipeline([("feature_combination",feature_combiner), ("feature_selector", anova_filter), ('svc', clf)])

In [13]:
svmpipe2.steps

[('feature_combination', FeatureUnion(n_jobs=None,
         transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
       fit_inverse_transform=False, gamma=None, kernel='linear',
       kernel_params=None, max_iter=None, n_components=None, n_jobs=None,
       random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)),
 ('feature_selector',
  SelectKBest(k=5, score_func=<function f_regression at 0x7f389ca6d9d8>)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))]

We modify a regular pipeline parameter

In [14]:
svmpipe2.set_params(feature_selector__k=20)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

Now we modify a parameter inside the feature combinator. In this case we set the nameOfTheFeatureUnion__nameOfTheTransformer__parameter

In [15]:
svmpipe2.set_params(feature_combination__pca__n_components=10)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
 ...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [16]:
svmpipe2.steps

[('feature_combination', FeatureUnion(n_jobs=None,
         transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
       fit_inverse_transform=False, gamma=None, kernel='linear',
       kernel_params=None, max_iter=None, n_components=None, n_jobs=None,
       random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)),
 ('feature_selector',
  SelectKBest(k=20, score_func=<function f_regression at 0x7f389ca6d9d8>)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))]

### Fitting the pipeline

In [17]:
svmpipe2.fit(X, y) 

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
 ...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [18]:
svmpipe2.score(X, y) 

0.89

## Going fancier: Genereating features and prunning them: without model

In [19]:
anova_filter = SelectKBest(f_regression, k=40)

In [20]:
svmpipe2 = Pipeline([("feature_combination",feature_combiner), ("feature_selector", anova_filter), ('svc', clf)])

In [21]:
svmpipe2.fit(X,y)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
 ...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [22]:
svmpipe2.steps[0][1].transform(X).shape

(100, 75)

In [23]:
X_transformed = svmpipe2.steps[0][1].transform(X)
svmpipe2.steps[1][1].transform(X_transformed).shape

(100, 40)

In [24]:
svmpipe2.score(X, y) 

0.89

### Selecting values for the different parts of a pipeline via CrossValidation

In [25]:
svmpipe2.steps

[('feature_combination', FeatureUnion(n_jobs=None,
         transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
       fit_inverse_transform=False, gamma=None, kernel='linear',
       kernel_params=None, max_iter=None, n_components=None, n_jobs=None,
       random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)),
 ('feature_selector',
  SelectKBest(k=40, score_func=<function f_regression at 0x7f389ca6d9d8>)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))]

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
pipeline_grid = {"feature_combination__pca__n_components":[10,15,20], 
                 "feature_combination__kernel_pca__degree":[2,3,4],
                 "feature_selector__k":[20,25,30,25,40]
                }

In [28]:
pipe_cv = GridSearchCV(svmpipe2, param_grid=pipeline_grid, n_jobs = -1, cv=10 )

In [29]:
pipe_cv.fit(X,y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
 ...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'feature_combination__pca__n_components': [10, 15, 20], 'feature_combination__kernel_pca__degree': [2, 3, 4], 'feature_selector__k': [20, 25, 30, 25, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
pipe_cv.score(X,y)

0.89

In [31]:
#THIS SHOULD HAVE 3*3*5 ELEMENTS 
pipe_cv.cv_results_['mean_fit_time'].shape

(45,)

In [32]:
pipe_cv.cv_results_["params"][0]

{'feature_combination__kernel_pca__degree': 2,
 'feature_combination__pca__n_components': 10,
 'feature_selector__k': 20}

In [33]:
pipe_cv.cv_results_["params"][1]

{'feature_combination__kernel_pca__degree': 2,
 'feature_combination__pca__n_components': 10,
 'feature_selector__k': 25}

## Going fancier: Genereating features and prunning them: with model