# Custom Transformers Template

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of the selected columns with the string 'Missing'.

    Parameters
    ----------
    variables : list or single column label
        Columns to impute. Anything that is not a list is wrapped in a
        one-element list.
    """

    def __init__(self, variables=None):
        # Always store a list so transform() can iterate uniformly.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """Learn nothing; exists so the class follows the fit/transform protocol."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose selected columns have NaNs replaced by 'Missing'."""
        imputed = X.copy()
        for column in self.variables:
            imputed[column] = imputed[column].fillna('Missing')
        return imputed

Example with learned dictionary:

In [None]:
class NumericalImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with a per-column value learned during ``fit``.

    ``fit`` records, for every configured column, the first entry of
    ``X[column].mode()`` in ``imputer_dict_``; ``transform`` fills the missing
    entries of that column with the recorded value.

    Parameters
    ----------
    variables : list or single column label
        Columns to impute. Anything that is not a list is wrapped in a
        one-element list.
    """

    def __init__(self, variables=None):
        # we operate on a list of variables
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn the fill value of each configured column and return ``self``."""
        # persist model in a dictionary
        self.imputer_dict_ = {}
        # Iterate over the configured columns stored on the instance
        # (the bare name `variables` does not exist in this scope).
        for feature in self.variables:
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed."""
        X = X.copy()
        for feature in self.variables:
            # Assign the filled column back instead of calling
            # fillna(..., inplace=True) on a column selection, which is not
            # guaranteed to update the parent frame.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X

# Selecting Columns

## Custom Class + FeatureUnion

From Hands-On Machine Learning:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the columns named in ``attribute_names`` and return their ``.values``.

    Assumes ``X`` is a pandas DataFrame (it is indexed by column labels and
    must expose ``.values``).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; returns the selector itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the selected columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

Kaggle Kernel (https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of the data frame so later steps can transform it.
    Intended for text columns: the column is returned as ``X[key]``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Stateless; returns the selector unchanged."""
        return self

    def transform(self, X):
        """Return the column of ``X`` stored under ``self.key``."""
        column = X[self.key]
        return column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of the data frame so later steps can transform it.
    Intended for numeric columns: the column is returned as ``X[[key]]``,
    i.e. selected with a one-element list of labels.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Stateless; returns the selector unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]

Application:

In [None]:
# Imported here so the cell runs on a fresh kernel (these names were
# previously used without being imported first).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def make_scaled_number_pipeline(key):
    """Build a pipeline that selects the column `key` and standard-scales it.

    The step names ('selector', 'standard') match the ones used by the
    hand-written pipelines this helper replaces.
    """
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


# Text branch: select the 'processed' column and vectorise it with TF-IDF.
text = Pipeline([
    ('selector', TextSelector(key='processed')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
])

# Numeric branches: one select-and-scale pipeline per column.
length = make_scaled_number_pipeline('length')
words = make_scaled_number_pipeline('words')
words_not_stopword = make_scaled_number_pipeline('words_not_stopword')
avg_word_length = make_scaled_number_pipeline('avg_word_length')
commas = make_scaled_number_pipeline('commas')

Putting it all together:

In [None]:
from sklearn.pipeline import FeatureUnion

# Combine the per-column pipelines into a single FeatureUnion.
feats = FeatureUnion(
    transformer_list=[
        ('text', text),
        ('length', length),
        ('words', words),
        ('words_not_stopword', words_not_stopword),
        ('avg_word_length', avg_word_length),
        ('commas', commas),
    ]
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Pipeline is imported here so the cell does not depend on a later cell's import.
from sklearn.pipeline import Pipeline

# Full model: the combined features followed by a random forest with a fixed seed.
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the whole pipeline (feature steps and classifier) in one call.
# NOTE(review): X_train / y_train are not defined in the cells shown here — supply your own training split.
pipeline.fit(X_train, y_train)

## ColumnTransformer

In [None]:
# SimpleImputer was used without being imported; import it with the rest.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: median imputation -> custom attribute adder -> standard scaling.
# NOTE(review): CombinedAttributesAdder is not defined in the cells shown here;
# it must be defined or imported before this cell runs.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

**Attention**: when using `remainder='passthrough'`, the non-transformed columns are concatenated with the transformed columns. The original column order might not be preserved.

In [4]:
from sklearn.compose import ColumnTransformer
# OneHotEncoder was used without being imported.
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): `housing` and `housing_num` are not defined in the cells shown
# here (see the recorded NameError); load the data and the numeric subset first.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline, the categorical column is one-hot
# encoded, and every remaining column is passed through untouched.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ], remainder='passthrough')

housing_prepared = full_pipeline.fit_transform(housing)

NameError: name 'housing_num' is not defined

**make_column_transformer** (shorthand for ColumnTransformer) and **make_column_selector**

In [None]:
# Both helpers are needed below; the original line imported make_column_selector twice.
from sklearn.compose import make_column_selector, make_column_transformer

In [None]:
# Imported in this cell so it runs on its own on a fresh kernel.
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

# all SEVEN of these produce the same results
# (each assignment overwrites `ct`; only the last one is used below)
# NOTE(review): the equivalence depends on the columns of X, which is not
# defined in the cells shown here — confirm against your data.
ct = make_column_transformer((ohe, ['Embarked', 'Sex']))
ct = make_column_transformer((ohe, [1, 2]))
ct = make_column_transformer((ohe, slice(1, 3)))
ct = make_column_transformer((ohe, [False, True, True, False]))
ct = make_column_transformer((ohe, make_column_selector(pattern='E|S')))
ct = make_column_transformer((ohe, make_column_selector(dtype_include=object)))
ct = make_column_transformer((ohe, make_column_selector(dtype_exclude='number')))

# One hot encode Embarked and Sex
ct.fit_transform(X)

# Neat Pipeline Template

Put all the custom classes in a package and import them from there. Example from Udemy's course on model deployment.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
 
from regression_model.config import config
from regression_model.processing import preprocessors as pp
 
# Preprocessing steps run in the order listed, ending with the Lasso regressor.
# NOTE(review): the step name 'numerical_inputer' looks like a typo for
# "imputer"; it is kept as-is because step names are part of the pipeline's
# parameter namespace.
price_pipe = Pipeline(steps=[
    ('categorical_imputer',
     pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA)),
    ('numerical_inputer',
     pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA)),
    ('temporal_variable',
     pp.TemporalVariableEstimator(
         variables=config.TEMPORAL_VARS,
         reference_variable=config.REFERENCE_TEMP_VAR,
     )),
    ('rare_label_encoder',
     pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS)),
    ('categorical_encoder',
     pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),
    ('log_transformer',
     pp.LogTransformer(variables=config.NUMERICALS_LOG_VARS)),
    ('drop_features',
     pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES)),
    ('scaler', MinMaxScaler()),
    ('Linear_model', Lasso(alpha=0.005, random_state=0)),
])