In [9]:
import pandas as pd
import sklearn 
import scipy
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_boston, make_blobs, load_iris, load_digits, load_breast_cancer 
from sklearn.cross_validation import train_test_split, cross_val_score, KFold, StratifiedKFold, LeaveOneOut, ShuffleSplit, LabelKFold, StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report, precision_recall_curve, roc_curve, roc_auc_score, accuracy_score
from sklearn.metrics.scorer import SCORERS
from sklearn.ensemble import RandomForestClassifier 
import seaborn as sns

In [11]:
# Baseline without a pipeline: split, scale by hand, then fit an SVC.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# The scaler is fitted on the training split only and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = SVC()
svm.fit(X_train_scaled, y_train)
print("Test score: %.2f" % svm.score(X_test_scaled, y_test))

Test score: 0.95


# Parameter Selection with Preprocessing

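Let's say we want to find better parameters for the SVC. We can't simply run a grid search on the scaled data: the `MinMaxScaler` was fitted on the whole training set, so within each cross-validation split the held-out fold has already influenced the scaling, i.e. information leaks from the validation part of the split into the model-building part. We therefore use the `Pipeline` class, which glues multiple processing steps together into a single estimator, so that the scaler is re-fitted inside every split.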

## Building Pipelines

In [13]:
from sklearn.pipeline import Pipeline

# Each step is a (name, estimator) pair; the steps are applied in the listed order.
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("svm", SVC()),
])

We just created two steps: the first one, called "scaler", is a `MinMaxScaler`, and the second one, called "svm", is an `SVC`.

In [14]:
# Fit the whole pipeline on the raw (unscaled) training data; scaling is one of its steps.
pipe.fit(X_train,y_train)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
# Score the pipeline on the raw test data; no manual scaling call is needed here.
print("test score: %.2f" %pipe.score(X_test,y_test))

test score: 0.95


Using the pipeline, we reduced the amount of code needed for our preprocessing and classification process.

## Using Pipelines in Grid-Searches

In [18]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter name>".
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Search over the whole pipeline, not over a bare SVC on pre-scaled data.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("best cross-validation accuracy: %.2f" % grid.best_score_)
print("test set score : %.2f" % grid.score(X_test, y_test))
print("best parameters: ", grid.best_params_)

best cross-validation accuracy: 0.98
test set score : 0.97
best parameters:  {'svm__gamma': 1, 'svm__C': 1}


Let us take an example that highlights the effect of leaking information in cross-validation.

In [21]:
# Pure-noise regression problem: 100 samples, 10000 features, and a target
# drawn separately from the same seeded generator.
rnd = np.random.RandomState(0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=100)

A model should not be able to learn anything from this dataset, as `X` and `y` are drawn independently of each other, so there is no relation between them.

In [22]:
from sklearn.feature_selection import SelectPercentile, f_regression

# Deliberately "wrong" order: the selector is fitted on ALL samples,
# i.e. outside of any cross-validation loop.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print(X_selected.shape)

(100, 500)


In [23]:
# Cross-validate Ridge on X_selected, whose columns were chosen using every sample
# (including the samples that act as held-out folds here).
print("%.2f" %np.mean(cross_val_score(Ridge(), X_selected, y, cv=5)))

0.91


In [24]:
# Same selector and model, but the selection is now a pipeline step,
# so cross_val_score receives the unselected X and the pipeline as one estimator.
pipe = Pipeline([
    ("select", SelectPercentile(score_func=f_regression, percentile=5)),
    ("ridge", Ridge()),
])
print("%.2f" % np.mean(cross_val_score(pipe, X, y, cv=5)))

-0.25


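What happened here is that in the first experiment the feature selection step ran outside of the cross-validation loop: using all 100 samples, it picked out the features that happened (by pure chance) to be very well correlated with the target, including the targets of the samples later used as held-out folds. That leaked information produces the unrealistically high score of 0.91. When the selection is part of the pipeline, it is fitted only on the training part of each split, and the score of -0.25 correctly shows that there is nothing to learn from this data.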

## The General Pipeline Interface

In [25]:
def fit(self, X, y):
    """Simplified sketch of how a pipeline fits its steps.

    Every step except the last is fitted and applied via ``fit_transform``,
    each one receiving the output of the previous step. The last step is then
    fitted on the fully transformed data.

    Parameters
    ----------
    X : input data passed to the first step.
    y : targets, forwarded unchanged to every step.

    Returns
    -------
    self, to allow call chaining.
    """
    X_transformed = X
    # self.steps is a sequence of (name, estimator) pairs; the name is not needed here.
    for _, estimator in self.steps[:-1]:
        X_transformed = estimator.fit_transform(X_transformed, y)
    # The final step sees the data after all preceding transformations.
    self.steps[-1][1].fit(X_transformed, y)
    return self
def predict(self, X):
    """Simplified sketch of how a pipeline predicts.

    The data is passed through ``transform`` of every step except the last,
    in order, and the last step's ``predict`` is called on the result.

    Parameters
    ----------
    X : input data passed to the first step.

    Returns
    -------
    Whatever the final step's ``predict`` returns for the transformed data.
    """
    X_transformed = X
    # self.steps is a sequence of (name, estimator) pairs; only the estimator is used.
    for _, transformer in self.steps[:-1]:
        X_transformed = transformer.transform(X_transformed)
    return self.steps[-1][1].predict(X_transformed)

## Convenient Pipeline creation with `make_pipeline`

In [26]:
from sklearn.pipeline import make_pipeline

# Explicit construction: every step gets a hand-picked name.
pipe_long = Pipeline([
    ("scaler", MinMaxScaler()),
    ("svm", SVC(C=100)),
])

# Shorthand construction: only the estimators are passed, no names.
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

The only difference between `pipe_long` and `pipe_short` is that `pipe_short` has steps that were automatically named.

In [29]:
# Show the (name, estimator) pairs of the pipeline built with make_pipeline.
pipe_short.steps

[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('svc', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# The same estimator class (StandardScaler) is used for two different steps here;
# displaying the steps shows how make_pipeline names them.
pipe = make_pipeline(StandardScaler(), PCA(n_components=2), StandardScaler())
pipe.steps

[('standardscaler-1',
  StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('pca', PCA(copy=True, n_components=2, whiten=False)),
 ('standardscaler-2',
  StandardScaler(copy=True, with_mean=True, with_std=True))]

## Accessing step attributes

In [32]:
# Fit the scaler/PCA/scaler pipeline on the features only (no target is passed).
pipe.fit(cancer.data)
# named_steps maps a step name to its estimator, giving access to fitted attributes.
components = pipe.named_steps["pca"].components_
print(components.shape)

(2, 30)


## Accessing attributes in grid-searched pipeline

In [34]:
# Fresh split of the cancer data for this experiment.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=4
)

# Scale, then classify; the step name "logisticregression" is the key prefix below.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [35]:
# Show the best pipeline found by the grid search.
print(grid.best_estimator_)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [36]:
# Pick a single step out of the best pipeline by its name.
print(grid.best_estimator_.named_steps["logisticregression"])

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [38]:
# Learned coefficients of the logistic-regression step inside the best pipeline.
print(grid.best_estimator_.named_steps["logisticregression"].coef_)

[[-0.38856355 -0.37529972 -0.37624793 -0.39649439 -0.11519359  0.01709608
  -0.3550729  -0.38995414 -0.05780518  0.20879795 -0.49487753 -0.0036321
  -0.37122718 -0.38337777 -0.04488715  0.19752816  0.00424822 -0.04857196
   0.21023226  0.22444999 -0.54669761 -0.52542026 -0.49881157 -0.51451071
  -0.39256847 -0.12293451 -0.38827425 -0.4169485  -0.32533663 -0.13926972]]


## Grid-searching preprocessing steps and model parameters

In [44]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0
)

# Tune a preprocessing parameter (polynomial degree) together with a model
# parameter (ridge alpha) in one grid search over the whole pipeline.
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())
param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [53]:
# Evaluate the result of the grid search on the held-out test split.
print("%.2f" %grid.score(X_test,y_test))

0.77


In [54]:
# Comparison run: the same alpha search, but without the polynomial-features step.
pipe = make_pipeline(StandardScaler(), Ridge())
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print("%.2f" % grid.score(X_test, y_test))

0.63
