Q: What does "pipeline" do?

A: Chains together multiple steps: output of each step is used as input to the next step.

Makes it easy to apply the same preprocessing to train and test!

In [1]:

import pandas as pd
import numpy as np

In [2]:

# tiny labeled training set; 'feat1' contains one missing value
train = pd.DataFrame({
    'feat1': [10, 20, np.nan, 2],
    'feat2': [25., 20, 5, 3],
    'label': ['A', 'A', 'B', 'B'],
})

# unlabeled data to predict on; 'feat2' contains one missing value
test = pd.DataFrame({
    'feat1': [30., 5, 15],
    'feat2': [12, 10, np.nan],
})

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [4]:

# the two pipeline steps, both with default settings
imputer, clf = SimpleImputer(), LogisticRegression()

In [5]:
# 2-step pipeline: impute missing values, then pass the results to the classifier
# (make_pipeline generates the step names automatically from the class names)
pipe = make_pipeline(imputer, clf)

In [6]:
# display the training data (note the missing value in "feat1")
train

Unnamed: 0,feat1,feat2,label
0,10.0,25.0,A
1,20.0,20.0,A
2,,5.0,B
3,2.0,3.0,B


In [7]:
# display the new, unlabeled data (note the missing value in "feat2")
test

Unnamed: 0,feat1,feat2
0,30.0,12.0
1,5.0,10.0
2,15.0,


In [8]:
# columns used as model inputs (every column except "label")
features = ['feat1', 'feat2']

In [9]:

# training inputs and target
X = train[features]
y = train['label']

# inputs to predict on (same feature columns, no label)
X_new = test[features]

In [10]:
# fit: the imputer is fit on X and transforms it, then the classifier is fit on the result
# predict: X_new is transformed with the imputation values learned during "fit"
# (nothing is re-learned from X_new) before the classifier makes its predictions
pipe.fit(X, y).predict(X_new)

array(['A', 'B', 'A'], dtype=object)

Q: How do you examine the intermediate steps in a Pipeline?

A: By using the "named_steps" attribute:

pipe.named_steps.STEP_NAME.ATTRIBUTE

In [11]:
import os
import pandas as pd

# Location of the Titanic training CSV. The default is the original author's
# local copy; set the TITANIC_CSV environment variable to point at your own
# copy so the notebook can run on another machine without editing this cell.
titanic_csv = os.environ.get(
    'TITANIC_CSV',
    'C:\\Users\\Prashant\\Desktop\\Course_Content\\titanic_train.csv')

# keep only the two features and the target used in this example
df = pd.read_csv(titanic_csv)
df = df[['Age', 'Pclass', 'Survived']]
df

Unnamed: 0,Age,Pclass,Survived
0,22.0,3,0
1,38.0,1,1
2,26.0,3,1
3,35.0,1,1
4,35.0,3,0
...,...,...,...
886,27.0,2,0
887,19.0,1,1
888,,3,0
889,26.0,1,1


In [12]:
# features ("Age" has missing values) and binary target
X, y = df[['Age', 'Pclass']], df['Survived']

In [13]:
# impute missing values, then classify; the steps are auto-named
# "simpleimputer" and "logisticregression" (lowercased class names)
pipe = make_pipeline(SimpleImputer(), LogisticRegression())

In [14]:
# fit both steps in one call
# (the trailing semicolon suppresses the cell output in IPython)
pipe.fit(X, y);

In [15]:

# display the imputation values learned during fit, one per column: "Age" and "Pclass"
pipe.named_steps.simpleimputer.statistics_

array([29.69911765,  2.30864198])

In [16]:
# display the fitted model coefficients, one per column: "Age" and "Pclass"
pipe.named_steps.logisticregression.coef_

array([[-0.0370155 , -1.06825731]])

## You can cross-validate and grid search an entire pipeline!

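Preprocessing steps will automatically occur AFTER each cross-validation split, which is critical if you want meaningful scores: the preprocessing is learned from the training portion of each split only, so no information leaks in from the held-out portion.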

In [19]:
import os

# Reload the full dataset (the earlier cell kept only three columns).
# The default path is the original author's local copy; set the TITANIC_CSV
# environment variable to point at your own copy of the file.
titanic_csv = os.environ.get(
    'TITANIC_CSV',
    'C:\\Users\\Prashant\\Desktop\\Course_Content\\titanic_train.csv')
df = pd.read_csv(titanic_csv)

# one categorical column ("Sex") and one free-text column ("Name")
cols = ['Sex', 'Name']
X = df[cols]
y = df['Survived']

In [23]:

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [24]:

ohe, vect = OneHotEncoder(), CountVectorizer()

# "Sex" is given as a list (2-D selection) for the encoder;
# "Name" is given as a plain string (1-D selection) for the text vectorizer
ct = make_column_transformer(
    (ohe, ['Sex']),
    (vect, 'Name'),
)

In [25]:
from sklearn.linear_model import LogisticRegression
# liblinear solver (the grid search below tries both 'l1' and 'l2' penalties with it);
# random_state is fixed so repeated runs give the same result
clf = LogisticRegression(solver='liblinear', random_state=1)

In [26]:
from sklearn.pipeline import make_pipeline
# 2-step pipeline: column transformer (encode "Sex", vectorize "Name"), then the classifier
pipe = make_pipeline(ct, clf)

### Cross-validate the entire pipeline (not just the model)

In [27]:
from sklearn.model_selection import cross_val_score
# mean accuracy across 5 folds for the whole pipeline (preprocessing + model)
cross_val_score(
    estimator=pipe, X=X, y=y, scoring='accuracy', cv=5
).mean()

0.8024543343167408

### Find optimal tuning parameters for the entire pipeline

In [28]:
# parameter values to search, keyed as "<step name>__<parameter name>"
# (nested steps chain their names with further double underscores)
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
}

In [29]:
# try all possible combinations of those parameter values
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over the whole pipeline, scored by accuracy
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5)
grid.fit(X, y);

In [30]:

# what was the best score found during the search?
# (mean cross-validated accuracy of the best parameter combination)
grid.best_score_

0.8215177954930638

In [31]:

# which combination of parameters produced the best score?
# (keys use the same "<step name>__<parameter name>" names as the search grid)
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 1,
 'logisticregression__penalty': 'l1'}

In [None]:
# Q: What's the difference between Pipeline and make_pipeline?
#
# A: Pipeline requires naming of steps, make_pipeline does not.
#
# (Same applies to ColumnTransformer vs make_column_transformer)



In [32]:

# df is the full Titanic frame loaded earlier; select the columns that the
# transformers below encode ("Embarked", "Sex"), impute ("Age") and pass through ("Fare")
cols = ['Embarked', 'Sex', 'Age', 'Fare']
X = df[cols]

In [33]:
# fresh, unfitted building blocks shared by both construction styles shown below
ohe, imp, clf = OneHotEncoder(), SimpleImputer(), LogisticRegression()

In [34]:

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [35]:
# make_column_transformer: transformer names are generated automatically
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),  # encode these columns
    (imp, ['Age']),              # impute missing values in this column
    remainder='passthrough',     # keep every other column unchanged
)

In [36]:

# make_pipeline: step names are generated automatically
pipe = make_pipeline(ct, clf)

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [38]:

# ColumnTransformer: every transformer is given an explicit name
ct = ColumnTransformer(
    transformers=[
        ('encoder', ohe, ['Embarked', 'Sex']),
        ('imputer', imp, ['Age']),
    ],
    remainder='passthrough',
)

In [39]:
# Pipeline: every step is given an explicit name
pipe = Pipeline(steps=[
    ('preprocessor', ct),
    ('classifier', clf),
])