## Pipelines

Pipelines in Sklearn offer a convenient way of building models. Imagine the standard way of building the model: Imuting using Imputer, then PCA, then building model and then prediction. Instead of making it in steps, all of them can be combined using Pipeline. 

In [1]:
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.svm import SVR

n_features = 10
X, y = make_regression(n_informative=5, n_features=n_features)

ImportError: cannot import name 'Imputer' from 'sklearn.preprocessing' (/Users/rahmad/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/__init__.py)

In [None]:
# randomly put nans in X
for rand_row in [np.random.randint(100) for x in range(20)]:
    rand_col = np.random.randint(n_features) 
    X[rand_row][rand_col] = np.nan
    print("X[{0}][{1}] = np.nan".format(rand_row, rand_col))

In [105]:
pipe = Pipeline([("Imputer", Imputer(strategy="mean")), ("Scaler", StandardScaler()), 
                 ("PCA", PCA(n_components=6)), ("SVM", SVR())])
pipe.steps
# lists all the steps in pipeline

[('Imputer',
  Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)),
 ('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('SVM',
  SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [107]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)