## Combining features

Pipelines are containers of steps. A step can be one of the following:

- Transformer
- Estimator
- Pipeline
- FeatureUnion

Now we will look at `FeatureUnion`, which applies several transformers to the same input and concatenates their outputs column-wise.

In [8]:
import sklearn
from sklearn import pipeline
import numpy as np

In [68]:
from sklearn import svm
# `sklearn.datasets.samples_generator` was deprecated and later removed from
# scikit-learn; the public entry point is `sklearn.datasets.make_classification`.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# Import the submodules explicitly instead of relying on them being reachable
# through the bare `sklearn` package as a side effect of other imports.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

### New ###
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import FeatureUnion

# Generate a synthetic classification problem to play with
# (seeded, so the data is the same on every run).
X, y = make_classification(n_samples=5000, n_informative=5, n_redundant=0, random_state=42)

# Hold out a test split. The split is seeded as well so that the accuracies
# reported further down are reproducible after "Restart & Run All".
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

# Degree-2 polynomial expansion (squares and pairwise products, not only interactions).
pol_feat = PolynomialFeatures(degree=2, interaction_only=False)

# (name, transformer) pairs for the FeatureUnion. The names are what nested
# parameters are addressed by later on, e.g. `pca__n_components`.
features = [("pca", PCA()), ("pol_feat", pol_feat)]

# ANOVA SVM-C: univariate feature selector followed by a linear SVC
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')

In [69]:
# Combine the PCA and polynomial transformers into a single transformer.
feature_combiner = FeatureUnion(transformer_list=features)

In [70]:
# Display the union's repr to check the registered transformers and their parameters.
feature_combiner

FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('pol_feat', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False))],
       transformer_weights=None)

In [71]:
# Shape of the raw input matrix, before any feature construction.
X.shape

(5000, 20)

In [72]:
# Column count produced by the degree-2 polynomial expansion on its own.
pol_feat.fit_transform(X).shape

(5000, 231)

In [73]:
# Column count produced by a default PCA on its own
# (a fresh instance, not the one stored in `features`).
PCA().fit_transform(X).shape

(5000, 20)

In [74]:
# Rebinds `pol_feat` to a fresh, identically configured transformer.
# Note: the `features` list built earlier still holds the previous instance,
# so this cell does not affect the FeatureUnion.
pol_feat = sklearn.preprocessing.PolynomialFeatures(degree=2, interaction_only=False)
pol_feat.fit_transform(X).shape

(5000, 231)

### Stacking features from PCA and PolynomialFeatures

In [75]:
# The union stacks both transformers' outputs side by side, so the column
# count is the PCA columns plus the polynomial columns.
feature_combiner.fit_transform(X).shape

(5000, 251)

### Learning on top of the stacked features

In [76]:
# Sanity-check the sizes of the train/test split.
X_tr.shape, y_tr.shape, X_te.shape

((3750, 20), (3750,), (1250, 20))

In [77]:
# Rebuild the union from the same (name, transformer) pairs and chain it
# with the linear SVC: features are constructed first, then classified.
feature_combiner = FeatureUnion(transformer_list=features)
svmpipe = Pipeline(
    steps=[
        ("feature_combination", feature_combiner),
        ("svc", clf),
    ]
)

In [78]:
# Fit the feature union and the SVC on the training split only.
svmpipe.fit(X_tr,y_tr)

Pipeline(memory=None,
     steps=[('feature_combination', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('pol_feat', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False))],
...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [79]:
# Look the fitted union up by its step name and check the width of its output.
svmpipe.named_steps["feature_combination"].transform(X).shape

(5000, 251)

We can see that we get better results learning on top of the constructed features than on the original input.

In [80]:
# Accuracy of the feature-union pipeline: fraction of predictions that match the labels.
train_hits = svmpipe.predict(X_tr) == y_tr
test_hits = svmpipe.predict(X_te) == y_te
print("train acc pipe", np.mean(train_hits))
print("test acc pipe", np.mean(test_hits))

train acc pipe 0.9504
test acc pipe 0.904


In [81]:
# Baseline for comparison: a new linear SVC trained directly on the raw inputs.
# `fit` returns the estimator itself, so `clf` is the fitted classifier.
clf = svm.SVC(kernel='linear').fit(X_tr, y_tr)

train_hits = clf.predict(X_tr) == y_tr
test_hits = clf.predict(X_te) == y_te
print("train acc svm", np.mean(train_hits))
print("test acc svm", np.mean(test_hits))

train acc svm 0.8026666666666666
test acc svm 0.7936


## Pipeception: Setting attributes of pipelines inside pipelines

In [None]:
# Three-stage pipeline: build features, keep the best-scoring ones, classify.
svmpipe2 = Pipeline(
    steps=[
        ("feature_combination", feature_combiner),
        ("feature_selector", anova_filter),
        ("svc", clf),
    ]
)

In [None]:
# List the (name, estimator) pairs that make up the pipeline.
svmpipe2.steps

We modify a regular pipeline parameter

In [None]:
# A step's parameter is addressed as <step name>__<parameter>;
# here: `k` of the SelectKBest step named "feature_selector".
svmpipe2.set_params(feature_selector__k=20)

Now we modify a parameter of a transformer inside the feature combiner. Nested parameters are addressed as `nameOfTheFeatureUnionStep__nameOfTheTransformer__parameter`.

In [None]:
# Nested form: <pipeline step>__<transformer name in the union>__<parameter>.
# NOTE(review): the PCA object comes from the shared `features` list, so this
# presumably also changes the PCA used by any other pipeline built from it.
svmpipe2.set_params(feature_combination__pca__n_components=10)

In [None]:
# Inspect the steps again to confirm that both parameter changes were applied.
svmpipe2.steps

### Fitting the pipeline

In [None]:
# Fit every stage on the full dataset (X, y), not just the training split.
svmpipe2.fit(X, y) 

In [None]:
# Scored on the same X, y the pipeline was fitted on, so this is a training
# score rather than a held-out estimate.
svmpipe2.score(X, y) 

## Going fancier: Generating features and pruning them: without model

In [None]:
# Rebind `anova_filter` to a new univariate selector that keeps 40 features.
anova_filter = SelectKBest(f_regression, k=40)

In [None]:
# Rebuild the three-stage pipeline so that it picks up the current
# `anova_filter`: build features, prune them, classify.
svmpipe2 = Pipeline(
    steps=[
        ("feature_combination", feature_combiner),
        ("feature_selector", anova_filter),
        ("svc", clf),
    ]
)

In [None]:
# Fit the rebuilt pipeline on the full dataset.
svmpipe2.fit(X,y)

In [None]:
# Width of the generated feature matrix, before any pruning.
svmpipe2.named_steps["feature_combination"].transform(X).shape

In [None]:
# Run the first two stages by hand: generate the features, then prune them
# with the fitted selector, and check how many columns survive.
union_step = svmpipe2.named_steps["feature_combination"]
selector_step = svmpipe2.named_steps["feature_selector"]
X_transformed = union_step.transform(X)
selector_step.transform(X_transformed).shape

In [None]:
# Score on the same X, y used for fitting (a training score, not a held-out one).
svmpipe2.score(X, y) 

### Selecting values for the different parts of a pipeline via CrossValidation

In [None]:
# Show the step names; the grid-search keys below are built from them.
svmpipe2.steps

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for GridSearchCV. Keys use the nested naming
# <pipeline step>__<transformer name in the FeatureUnion>__<parameter>, so they
# must refer to transformers that are actually registered in the union
# ("pca" and "pol_feat"); an unknown name makes the search fail with
# "Invalid parameter".
# 3 * 3 * 5 = 45 candidate settings in total.
# Note: the polynomial degree grows the feature count quickly (degree 4 on 20
# input columns yields 10,626 columns), so this search is expensive.
pipeline_grid = {
    "feature_combination__pca__n_components": [10, 15, 20],
    "feature_combination__pol_feat__degree": [2, 3, 4],
    "feature_selector__k": [20, 25, 30, 35, 40],
}

In [None]:
# Exhaustive search over `pipeline_grid` with 10-fold cross-validation,
# using all available cores.
pipe_cv = GridSearchCV(
    estimator=svmpipe2,
    param_grid=pipeline_grid,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the search on the full dataset: every grid combination is evaluated
# with the cross-validation configured on `pipe_cv`.
pipe_cv.fit(X,y)

In [None]:
# Score of the fitted search object on X, y. This is the same data used for
# the search, so it is not a held-out estimate.
pipe_cv.score(X,y)

In [None]:
# One entry per parameter combination in the grid: expected 3 * 3 * 5 = 45 elements.
pipe_cv.cv_results_['mean_fit_time'].shape

In [None]:
# Parameter setting of the first evaluated candidate.
pipe_cv.cv_results_["params"][0]

In [None]:
# Parameter setting of the second evaluated candidate, for comparison with the first.
pipe_cv.cv_results_["params"][1]

## Going fancier: Generating features and pruning them: with model