In [1]:
# - How to use pipelines to minimize data leakage.

# - How to construct a data preparation and modeling pipeline.

# - How to construct a feature extraction and modeling pipeline.

In [2]:
from pandas import read_csv

In [3]:
import numpy

In [4]:
import sys

In [5]:
def print_data(_data, n_rows=5, fmt='%5.3f'):
    """Write the leading rows of an array to standard output.

    Args:
        _data: 1-D or 2-D numeric array (or array-like).
        n_rows: How many leading rows to print; defaults to 5.
        fmt: Per-value format string handed to ``numpy.savetxt``.

    Returns:
        None, which is what ``numpy.savetxt`` returns.
    """
    # Slice along the first axis only: for 2-D input this selects the same rows
    # as ``_data[:5, :]``, and it also works for 1-D input, where the two-axis
    # slice raised IndexError.
    _head = numpy.asarray(_data)[:n_rows]
    return numpy.savetxt(sys.stdout, _head, fmt)

In [6]:
# Location of the CSV to load; column names are supplied explicitly below.
# NOTE(review): confirm this UCI URL still resolves before relying on it.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
_col_names = 'preg plas pres skin test mass pedi age class'.split()

# Read the file into a DataFrame, then keep working with its NumPy array.
_dataframe = read_csv(_uri, names=_col_names)
_array = _dataframe.values

# Preview the first five rows across all nine columns.
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [11]:
# Feature matrix: the first eight columns of the loaded array.
_X = _array[:, :8]
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [13]:
# Target: everything from column index 8 onward, kept as a 2-D column slice
# for the row preview below.
_Y = _array[:, 8:]
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [15]:
# Flatten the target column slice into a 1-D vector and show its first five values.
_Y = _Y.ravel()
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [17]:
# 14.1 Automating Machine Learning Workflows

In [18]:
# - Pipelines work by allowing for a linear sequence of data transforms to be chained together culminating in a 
# modeling process that can be evaluated.

# - The goal is to ensure that all of the steps in the pipeline are constrained to the data available for the 
# evaluation, such as the training dataset or each fold of the cross-validation procedure.

# - http://scikit-learn.org/stable/modules/pipeline.html

In [19]:
# 14.2 Data Preparation and Modeling Pipeline

In [20]:
# - Data preparation is one easy way to leak knowledge of the test data to the algorithm. 

# - For example, normalizing or standardizing the entire dataset before splitting it into training 
# and test sets would not give a valid test, because the training data would have been influenced 
# by the scale of the data in the test set.

In [21]:
# - create pipeline

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [24]:
from sklearn.pipeline import Pipeline

In [25]:
# Pipeline steps run in order: standardize the inputs, then fit LDA on the result.
_estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
_model = Pipeline(_estimators)

In [29]:
# - evaluate pipeline

In [30]:
from sklearn.model_selection import KFold

In [31]:
from sklearn.model_selection import cross_val_score

In [32]:
# 10-fold cross-validation without shuffling. `random_state` only has an effect
# together with `shuffle=True`; passed on its own it was ignored, and newer
# scikit-learn releases reject that combination with a ValueError. Omitting it
# keeps exactly the same folds.
_kfold = KFold(n_splits=10)

In [33]:
# Score the whole pipeline per fold, then report mean accuracy as a percentage.
_score = cross_val_score(_model, _X, _Y, scoring='accuracy', cv=_kfold)
format(_score.mean(), '.3%')

'77.346%'

In [35]:
# 14.3 Feature Extraction and Modeling Pipeline

In [36]:
# - Feature extraction is another procedure that is susceptible to data leakage. 

# - Like data preparation, feature extraction procedures must be restricted to the data in your 
# training dataset. 

# - The sklearn.pipeline module provides a handy tool called FeatureUnion, which allows the results 
# of multiple feature selection and extraction procedures to be combined into a larger dataset on 
# which a model can be trained.

In [37]:
from sklearn.pipeline import FeatureUnion

In [38]:
from sklearn.decomposition import PCA

In [39]:
from sklearn.feature_selection import SelectKBest

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
# - create feature union

In [42]:
# Combine two feature sources side by side: 3 PCA components plus the
# 6 features kept by SelectKBest.
_features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
_feature_union = FeatureUnion(_features)

In [46]:
# - create pipeline

In [47]:
# Second pipeline: the feature union feeds a logistic regression classifier.
_estimators = [
    ('feature_union', _feature_union),
    ('logistic', LogisticRegression()),
]
_model = Pipeline(_estimators)

In [51]:
# - evaluate pipeline

In [52]:
# Evaluate with the same folds as the LDA pipeline. `scoring='accuracy'` is
# spelled out to match that earlier call; it is the same metric a classifier
# pipeline's default scorer reports, so the result is unchanged.
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring='accuracy')
'{:.3%}'.format(_score.mean())

'77.604%'