In [3]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data['data'],data['target'])

In [4]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [5]:
# Data normalization
# Dimensionality reduction
# Regression

In [6]:
scaler = StandardScaler()
pca = PCA()
ridge = Ridge()

In [7]:
X_train = scaler.fit_transform(X_train)
X_train = pca.fit_transform(X_train)
ridge.fit(X_train,y_train)

Ridge()

In [9]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', Ridge())
])

In [10]:
pipe.fit(X_train,y_train)
print('Testing score: ', pipe.score(X_test, y_test))

Testing score:  -7898.244407159165


In [11]:
print(pipe.steps[1][1].explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


On every object within the pipeline the methods fit_transform are invoked during training, while transform (or predict) are called during test. So far using pipelines is just a matter of code cleaness and minimization

### Pipeline tuning

In [15]:
import numpy as np 

# PCA
n_features_to_test = np.arange(1,11)

In [16]:
# Regularization
alpha_to_test = 2.0**np.arange(-6,6)

In [18]:
params = {'reduce_dim__n_components': n_features_to_test,'regressor__alpha': alpha_to_test}

In [21]:
from sklearn.model_selection import GridSearchCV

gridsearch = GridSearchCV(pipe,params, verbose=0).fit(X_train,y_train)
print('Final score is: ', gridsearch.score(X_test,y_test))

Final score is:  -6709.855911558575


In [20]:
gridsearch.best_params_

{'reduce_dim__n_components': 10, 'regressor__alpha': 0.015625}

# Advanced Pipeline tuning

In [22]:
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()]
params = {'scaler':scalers_to_test, 'reduce_dim__n_components': n_features_to_test,'regressor__alpha': alpha_to_test}

In theory, we could also apply the same approach to the dimensionality reduction step, for example to choose between PCA and SelectKBest. The only problem in this case is that PCA relies on a parameter named n_components, while SelectKBest requires to optimize a parameter named k.

Luckily, GridSearchCV also allows to optimize lists of parameter dictionaries, which solves this issue as well: 

In [26]:
params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,\
         'regressor__alpha': alpha_to_test},

        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,\
         'regressor__alpha': alpha_to_test}
        ]

In [27]:
gridsearch = GridSearchCV(pipe,params, verbose=1).fit(X_train,y_train)
print('Final score is :', gridsearch.score(X_test,y_test))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Final score is : -14057.348006142514
[Parallel(n_jobs=1)]: Done 3600 out of 3600 | elapsed:   21.4s finished


In [28]:
gridsearch.best_params_

{'reduce_dim': SelectKBest(k=9, score_func=<function f_regression at 0x0000013FB66B1820>),
 'reduce_dim__k': 9,
 'regressor__alpha': 8.0,
 'scaler': StandardScaler()}