In [1]:
# pipelines with supervised learning

In [27]:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

data = load_iris()
df = pd.DataFrame(data.data,columns=data.feature_names)
df['target'] = data.target

'''
# add random noise to inputs (lower accuracy)
df[df.columns[0]] += np.random.normal(2, 2, len(df.index))
df[df.columns[1]] += np.random.normal(-1, 2, len(df.index))
df[df.columns[3]] += np.random.normal(0, 2, len(df.index))
df[df.columns[2]] += np.random.normal(0, 2, len(df.index))
'''
# add cateogrical data for OHE
df['category'] = 'first'
df.loc[df.index>int(len(df.index)/2),'category'] = 'second'

# index as a feature
df['index'] = df.index


display(df.head(3))
display(df.tail(3))

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,category,index
0,5.1,3.5,1.4,0.2,0,first,0
1,4.9,3.0,1.4,0.2,0,first,1
2,4.7,3.2,1.3,0.2,0,first,2


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,category,index
147,6.5,3.0,5.2,2.0,2,second,147
148,6.2,3.4,5.4,2.3,2,second,148
149,5.9,3.0,5.1,1.8,2,second,149


In [3]:
clf = GradientBoostingClassifier()
clf.fit(df[df.columns[:2]],df['target'])
clf.score(df[df.columns[:2]],df['target'])

0.92

In [4]:
## train pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('estimator', GradientBoostingClassifier(max_depth=3, n_estimators=100))
    ])
pipeline.fit(df[df.columns[:2]],df['target'])
print('score',pipeline.score(df[df.columns[:2]],df['target']))
pickle.dump( pipeline, open( "pipeline.pkl", "wb" ) )

score 0.92


In [5]:
## load pipeline and predict on new data (10 times to see variance)
pipeline_loaded = pickle.load( open( "pipeline.pkl", "rb" ) )

# look at trained pipeline params
print('trained scaler params:')
print('scale_',pipeline_loaded['scaler'].scale_)
print('feature_importances_',pipeline_loaded['estimator'].feature_importances_)
print()
for i in range(10):
    df_new = df.sample(n=len(df)*10,replace=True)
    print('score',pipeline_loaded.score(df_new[df.columns[:2]],df_new['target']))

trained scaler params:
scale_ [0.82530129 0.43441097]
feature_importances_ [0.72321019 0.27678981]

score 0.9266666666666666
score 0.928
score 0.922
score 0.918
score 0.9106666666666666
score 0.9286666666666666
score 0.9106666666666666
score 0.92
score 0.9346666666666666
score 0.9253333333333333


In [6]:
### mixed type transforms in pipeline


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

numeric_features = ['sepal length (cm)','sepal width (cm)']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['category']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier())])

clf.fit(df[numeric_features+categorical_features], df['target'])
print("model score: %.3f" % clf.score(df[numeric_features+categorical_features], df['target']))
pickle.dump( clf,open( "clf.pkl", "wb" ) )


model score: 0.973


In [7]:
## load pipeline and predict on new data (10 times to see variance)
clf_loaded = pickle.load( open( "clf.pkl", "rb" ) )

# look at trained pipeline params
print('trained scaler params:')
print('scale_',clf_loaded['preprocessor'].named_transformers_['num']['scaler'].scale_)
print('feature_importances_',clf_loaded['classifier'].feature_importances_)
print()
for i in range(10):
    df_new = df.sample(n=len(df)*10,replace=True)
    print('score',clf_loaded.score(df_new[numeric_features+categorical_features],df_new['target']))

trained scaler params:
scale_ [0.82530129 0.43441097]
feature_importances_ [0.49117511 0.21078395 0.15371062 0.14433031]

score 0.9746666666666667
score 0.9713333333333334
score 0.9746666666666667
score 0.9726666666666667
score 0.9673333333333334
score 0.9733333333333334
score 0.976
score 0.9773333333333334
score 0.9786666666666667
score 0.9686666666666667


In [8]:
### custom transforms

from sklearn.base import BaseEstimator, TransformerMixin

class RootFunction(BaseEstimator, TransformerMixin):
    def __init__(self, positions):
        self.positions = positions

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        root_function = lambda x: np.sign(x) * np.power(abs(x),.5)
        X_new = X.apply(root_function)
        return np.array(X)[:, self.positions]

In [38]:
def root_function(x):
    return np.sign(x) * np.power(abs(x),.5)
def unroot_function(x):
    return np.sign(x) * np.power(abs(x),2)
def stationarize(X,lag,horizon):
    return X - X.shift(lag)
def unstationarize(X,lag,horizon=None):
    return X + X.shift(lag - horizon)

lag = 7
horizon = 30
dates = pd.date_range('2021-01-01',freq='d',periods=len(df))
y = pd.Series(df['petal width (cm)'].values,index=dates)
X = df[numeric_features+categorical_features].set_index(dates)
### transform
y_new = y.apply(root_function)
y_new = y_new - y_new.shift(lag)
y_new = y_new.fillna(0)

# future dates in X, y
new_dates = pd.date_range('2021-01-01',freq='d',periods=len(df)+horizon)
X = X.reindex(new_dates)
y_new = y_new.reindex(new_dates)

### pipeline: input transformers and estimators
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


pipeline = Pipeline(steps=[('preprocessor',preprocessor),('classifier',GradientBoostingRegressor())])

pipeline.fit(X, y_new)
y_pred_new = pipeline.predict(X)
y_pred_new = pd.Series(y_pred_new,index=y.index)

### untransform
y_pred = y_pred_new + y_pred_new.shift(lag-horizon)
y_pred = y_pred.apply(unroot_function)
y_pred

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [31]:
y

0      0.2
1      0.2
2      0.2
3      0.2
4      0.2
      ... 
145    2.3
146    1.9
147    2.0
148    2.3
149    1.8
Name: petal width (cm), Length: 150, dtype: float64

In [30]:
len(y_pred_new)

150

In [19]:
y.tail(20)

130    2
131    2
132    2
133    2
134    2
135    2
136    2
137    2
138    2
139    2
140    2
141    2
142    2
143    2
144    2
145    2
146    2
147    2
148    2
149    2
Name: target, dtype: int64

In [10]:
root_function = Pipeline(steps=[('root_function',RootFunction)])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('root_function', root_function, ['target'])])


pipeline = Pipeline(steps=[('preprocessor',preprocessor),('classifier',GradientBoostingClassifier())])

pipeline.fit(df[numeric_features+categorical_features], df['target'])
pipeline.score(df[numeric_features+categorical_features], df['target'])


ValueError: A given column is not a column of the dataframe