In [72]:
import pandas as pd
import numpy as np
from sklearn.pipeline import  Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from nltk.stem import RSLPStemmer
import scipy

In [5]:
class DataFrameTransformer:
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
def process_dataframe(input_df):
    input_df['text'] = input_df['text'].map(lambda x: x.upper())
    return input_df

In [7]:
df = pd.DataFrame({
    "id":[1,2,3,4],
    "text":["foo","Bar","BAz","quux"]
})

In [17]:
pipeline = Pipeline([
    ('uppercase', DataFrameTransformer(process_dataframe))
])

In [18]:
# df

In [19]:
pipeline.fit_transform(df)

Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


### Custom Transformer example: To Dense

In [22]:
class ToDenseTransformer:
    def transform(self, X, y=None, **transform_params):
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self

In [44]:
data = scipy.sparse.csr_matrix([
    [1.,0.,0.,0.,0.,0.],
    [0.,1.,0.,0.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [0.,0.,0.,0.,1.,0.],
    [0.,0.,0.,1.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
])

target = np.array([1,1,1,0,0,0,1,1])

In [47]:
# need to make matrices dense because PCA does not work with sparse vectors.
pipeline = Pipeline(steps=[
    ('to_dense', ToDenseTransformer()),
    ('PCA_', PCA()),
    ('clf', DecisionTreeClassifier())
])

In [48]:
pipeline.fit(data, target)



Pipeline(steps=[('to_dense',
                 <__main__.ToDenseTransformer object at 0x12cb36b50>),
                ('PCA_', PCA()), ('clf', DecisionTreeClassifier())])

In [53]:
pipeline.predict(data)



array([1, 1, 1, 0, 0, 1, 1, 1])

In [27]:
metrics.confusion_matrix(pipeline.predict(data), target)



array([[2, 0],
       [1, 5]])

In [54]:
# looking at the data at step "to_dense"
pipeline.named_steps["to_dense"].transform(data, target)

matrix([[1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.]])

### Custom Transformer example: Select Dataframe Columns

In [56]:
class SelectColumnTransformer:
    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [57]:
df = pd.DataFrame({
    'name':['alice','bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

In [58]:
pipeline = Pipeline(steps=[
    ('selector', SelectColumnTransformer(['name']))
])

In [60]:
df

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [59]:
pipeline.fit_transform(df)

Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward


In [61]:
transformer_step = ColumnTransformer([  # sklearn.compose
    ('impute_mean', SimpleImputer(strategy='mean'), ['age'])],
    remainder='passthrough')

In [62]:
pipeline = Pipeline(steps=[
    ('transformer', transformer_step)
])

In [68]:
pipeline.fit(df)

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_mean',
                                                  SimpleImputer(),
                                                  ['age'])]))])

In [67]:
pd.DataFrame(
    data=pipeline.transform(df),
    columns=['age', 'name']
)[["name","age"]]

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,28.5
3,david,38.0
4,edward,20.0


### FunctionTransformer with Parameters

In [69]:
# dummy dataframe
df = pd.DataFrame({
    'text':[
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
        'Sed accumsan congue enim non pretium.',
        'In hac habitasse platea dictumst.',
        'Sed tincidunt ipsum nec urna vulputate luctus.'
    ],
    'target':[0, 1, 0, 1]
})

In [70]:
df

Unnamed: 0,text,target
0,"Lorem ipsum dolor sit amet, consectetur adipis...",0
1,Sed accumsan congue enim non pretium.,1
2,In hac habitasse platea dictumst.,0
3,Sed tincidunt ipsum nec urna vulputate luctus.,1


In [71]:
def stem_str(input_series, stemmer):
    def stem(input_str):
        return " ".join([stemmer.stem(t) for t in input_str.split(" ")]).strip()
    return input_series.apply(stem)

In [74]:
import nltk
nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     /Users/georgemarchenko/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


True

In [86]:
pipeline = Pipeline(steps=[
    ('stemmer', FunctionTransformer(func=stem_str, kw_args={'stemmer': RSLPStemmer()})),
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

In [93]:
pipeline.fit(df["text"],df["target"])

Pipeline(steps=[('stemmer',
                 FunctionTransformer(func=<function stem_str at 0x12caff790>,
                                     kw_args={'stemmer': <nltk.stem.rslp.RSLPStemmer object at 0x12d690c40>})),
                ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

### Pipeline with Preprocessing and Classifier

In [94]:
df = pd.DataFrame({
    'favorite_color':['blue','green','red','green','blue'],
    'age': [10,15,10,np.nan,10],
    'target':[1,0,1,0,1]
})

In [95]:
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])
numerical_preprocessing = Pipeline([('imputation', SimpleImputer())])

In [100]:
preprocess = ColumnTransformer([
    ('cat_preprocessing', categorical_preprocessing, ['favorite_color']),
     ('num_preprocessing',  numerical_preprocessing, ['age'])
])

In [101]:
# creating the final pipeline with preprocessing steps and the final classifier step
pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', DecisionTreeClassifier())
])

In [102]:
df_features = df[['favorite_color', 'age']]
df_target = df['target']

In [103]:
pipeline.fit(df_features, df_target)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat_preprocessing',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder())]),
                                                  ['favorite_color']),
                                                 ('num_preprocessing',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer())]),
                                                  ['age'])])),
                ('clf', DecisionTreeClassifier())])