###  Python Basics Tutorial

#### Automate Machine Learning Workflows with Pipelines

####  Machine Learning Mastery with Python
####  Jason Brownlee

### Data Prep and Modeling Pipeline
- pipelines create workflows that prevent data leakage in test harnass

In [2]:
from pandas import read_csv

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
## Read file and designate depen and indepen vars

path = 'D:\OneDrive - QJA\My Files\DataScience\DataSets'
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 
         'mass', 'pedi', 'age', 'class']

df = read_csv(path + '\\' + filename, names = names)
array = df.values
X = array[:, 0:8]
Y = array[:, 8]

## Create Pipeline
## estimator is a list of steps provided to Pipeline
## note that Pipeline itself is evaluted by kfold cv procedure
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))

n_splits = 10
seed = 7

model = Pipeline(estimators)
kfold = KFold(n_splits = n_splits, random_state = seed)
results = cross_val_score(model, X, Y,
                          cv = kfold)

print('Error Estimate: %.4f' % results.mean())


Error Estimate: 0.7735


### Pipeline with Feature Extraction
- feature extraction is susceptible to data leakage

In [9]:
from pandas import read_csv

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

array = df.values
X = array[:, 0:8]
Y = array[:, 8]

## Create Feature Union
## feature untion allows results of multiple feature selection
##     and extraction procedures to be combined and applied to 
##     the cv procedure
features = []
features.append(('pca', PCA(n_components = 3)))
features.append(('select best', SelectKBest(k = 6)))

feature_union = FeatureUnion(features, 
                             #verbose = True
                            )

## create pipeline 
estimators = []
estimators.append(('feature_union', feature_union)) # results of PCA feature selection
estimators.append(('logistic', LogisticRegression(solver = 'liblinear')))

model = Pipeline(estimators)

## evaluate pipeline
kfold = KFold(n_splits = 10, random_state = 7)
results = cross_val_score(model, X, Y,
                          cv = kfold)

print('Error Estimate: % .4f' % results.mean())

[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[FeatureUnion] ........... (step 1 of 2) Processing pca, total=   0.0s
[FeatureUnion] ... (step 2 of 2) Processing select best, total=   0.0s
[Featu