In [1]:
# pipelines with supervised learning

In [5]:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Load the iris measurements into a DataFrame and attach the class labels.
data = load_iris()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df['target'] = data.target

# Disabled experiment: add random noise to the inputs (lowers accuracy).
# df[df.columns[0]] += np.random.normal(2, 2, len(df.index))
# df[df.columns[1]] += np.random.normal(-1, 2, len(df.index))
# df[df.columns[3]] += np.random.normal(0, 2, len(df.index))
# df[df.columns[2]] += np.random.normal(0, 2, len(df.index))

# Add categorical data for one-hot encoding: every row starts as 'first',
# and rows whose index label exceeds half the row count become 'second'.
df['category'] = 'first'
df.loc[df.index > len(df) // 2, 'category'] = 'second'

# Expose the row index as an ordinary feature column.
df['index'] = df.index

# Preview the first and last three rows.
display(df.head(3), df.tail(3))

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,category,index
0,5.1,3.5,1.4,0.2,0,first,0
1,4.9,3.0,1.4,0.2,0,first,1
2,4.7,3.2,1.3,0.2,0,first,2


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,category,index
147,6.5,3.0,5.2,2.0,2,second,147
148,6.2,3.4,5.4,2.3,2,second,148
149,5.9,3.0,5.1,1.8,2,second,149


In [6]:
# Baseline without a pipeline: gradient boosting fitted on the first two
# columns of df. The score below is computed on the same rows used for
# fitting, so it is a training accuracy, not a held-out estimate.
clf = GradientBoostingClassifier().fit(df.iloc[:, :2], df['target'])
clf.score(df.iloc[:, :2], df['target'])

0.92

In [7]:
## train pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Two-step pipeline: standardize the inputs, then fit gradient boosting.
pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('estimator', GradientBoostingClassifier(max_depth=3, n_estimators=100))
    ])

# Fit on the first two columns of df; the score is measured on the same rows
# that were used for fitting, so it is a training accuracy.
pipeline.fit(df[df.columns[:2]],df['target'])
print('score',pipeline.score(df[df.columns[:2]],df['target']))

# Persist the fitted pipeline. The context manager closes (and flushes) the
# file deterministically instead of relying on garbage collection.
with open("pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

score 0.92


In [8]:
## load pipeline and predict on resampled data (10 times to see variance)
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook (or another trusted source) produced.
with open("pipeline.pkl", "rb") as pipeline_file:
    pipeline_loaded = pickle.load(pipeline_file)

# look at trained pipeline params
print('trained scaler params:')
print('scale_',pipeline_loaded['scaler'].scale_)
print('feature_importances_',pipeline_loaded['estimator'].feature_importances_)
print()

# Each round draws a bootstrap sample (with replacement, 10x the frame size)
# from the same df the pipeline was trained on, so these are not unseen rows.
# The sampling is unseeded, so the printed scores differ between runs.
for i in range(10):
    df_new = df.sample(n=len(df)*10,replace=True)
    print('score',pipeline_loaded.score(df_new[df.columns[:2]],df_new['target']))

trained scaler params:
scale_ [0.82530129 0.43441097]
feature_importances_ [0.72342329 0.27657671]

score 0.9233333333333333
score 0.9206666666666666
score 0.9126666666666666
score 0.9073333333333333
score 0.928
score 0.92
score 0.92
score 0.912
score 0.9013333333333333
score 0.9133333333333333


In [9]:
### mixed type transforms in pipeline


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Numeric columns: fill missing values with the median, then standardize.
numeric_features = ['sepal length (cm)','sepal width (cm)']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: one-hot encode; categories unseen during fit are
# ignored at transform time instead of raising.
# NOTE(review): 'category' is derived from row position in df; if the rows are
# ordered by class (the head/tail preview suggests so), this feature leaks
# label information and inflates the score -- confirm before relying on it.
categorical_features = ['category']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier())])

# Fit and score on the same rows (training accuracy).
clf.fit(df[numeric_features+categorical_features], df['target'])
print("model score: %.3f" % clf.score(df[numeric_features+categorical_features], df['target']))

# Persist the fitted pipeline. The context manager closes (and flushes) the
# file deterministically instead of relying on garbage collection.
with open("clf.pkl", "wb") as clf_file:
    pickle.dump(clf, clf_file)


model score: 0.973


In [10]:
## load pipeline and predict on resampled data (10 times to see variance)
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook (or another trusted source) produced.
with open("clf.pkl", "rb") as clf_file:
    clf_loaded = pickle.load(clf_file)

# look at trained pipeline params
print('trained scaler params:')
# Drill into the fitted ColumnTransformer: 'num' sub-pipeline -> its scaler.
print('scale_',clf_loaded['preprocessor'].named_transformers_['num']['scaler'].scale_)
print('feature_importances_',clf_loaded['classifier'].feature_importances_)
print()

# Each round draws a bootstrap sample (with replacement, 10x the frame size)
# from the same df the pipeline was trained on, so these are not unseen rows.
# The sampling is unseeded, so the printed scores differ between runs.
for i in range(10):
    df_new = df.sample(n=len(df)*10,replace=True)
    print('score',clf_loaded.score(df_new[numeric_features+categorical_features],df_new['target']))

trained scaler params:
scale_ [0.82530129 0.43441097]
feature_importances_ [0.49625004 0.2107349  0.15763196 0.13538309]

score 0.97
score 0.9786666666666667
score 0.9706666666666667
score 0.9673333333333334
score 0.9693333333333334
score 0.9746666666666667
score 0.97
score 0.974
score 0.9726666666666667
score 0.9726666666666667


In [11]:
### custom transforms

from sklearn.base import BaseEstimator, TransformerMixin

class PositionalSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of its input by position.

    Parameters
    ----------
    positions
        Index expression applied to the second axis of the input once it
        has been converted with ``np.array`` (e.g. a list of column numbers).
    """

    def __init__(self, positions):
        self.positions = positions

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y``; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` at ``self.positions`` as a NumPy array."""
        as_array = np.array(X)
        return as_array[:, self.positions]