In [2]:
#Pipelines work by allowing for a linear sequence of data transforms to be chained together
# culminating in a modeling process that can be evaluated.
# An easy trap to fall into in applied machine learning is leaking data from 
#your training dataset to your test dataset.

# To avoid this trap you need a robust test harness with strong separation of 
# training and testing. This includes data preparation.
# Pipelines help you prevent data leakage in your test harness by ensuring that 
# data preparation like standardization is constrained to each fold of your cross 
# validation procedure.

# The example below demonstrates this important data preparation and model evaluation workflow. The pipeline is defined with two steps:
# - Standardize the data.
# - Learn a Linear Discriminant Analysis model.
# The pipeline is then evaluated using 10-fold cross validation.


# Create a pipeline that standardizes the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]


In [3]:
# create pipeline
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(estimators)
# evaluate pipeline
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.773462064251538


In [7]:
# Feature extraction is another procedure that is susceptible to data leakage.
# Like data preparation, feature extraction procedures must be restricted 
# to the data in your training dataset.

# The pipeline provides a handy tool called the FeatureUnion which allows the 
# results of multiple feature selection and extraction procedures to be combined 
# into a larger dataset on which a model can be trained. Importantly, 
# all the feature extraction and the feature union occurs within each fold of the cross validation procedure.

# The example below demonstrates the pipeline defined with four steps:
# - Feature Extraction with Principal Component Analysis (3 features)
# - Feature Extraction with Statistical Selection (6 features)
# - Feature Union
# - Learn a Logistic Regression Model

from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# create feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression(solver='lbfgs',n_iter_=10)))
model = Pipeline(estimators)
# evaluate pipeline
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

TypeError: __init__() got an unexpected keyword argument 'n_iter_'