# Custom Transformer

ðŸ‘‡ Consider the following dataset

In [4]:
import pandas as pd

data = pd.read_csv("data.csv")

data.head()

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


Each observation of the dataset represents an item being delivered from a  `seller_state` to a `customer_state`. The columns describe the size and weight of each item. The target is the number of days between the order and the delivery.

ðŸ‘‡ In a pipeline:

- Engineer a 'volume' feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Train a default parameters `LinearRegression`

Use your pipeline to predict the delivery of the following order

In [5]:
new_data = pd.read_csv("data_new.csv")
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


## Build custom transformer class for engineering

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

# Create a class
class ColumnMultiplier(TransformerMixin, BaseEstimator): 
# TransformerMixin generates a fit_transform method from fit and transform
# BaseEstimator generates get_params and set_params methods
    
    # Create parameters "column_1", "column_2", "column_3" to choose which columns of dataframe to multiply
    def __init__(self, column_1, column_2, column_3):
        self.column_1 = column_1
        self.column_2 = column_2
        self.column_3 = column_3
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        # Multiplication
        multiplied_features =X[self.column_1]*X[self.column_2]*X[self.column_3]
        
        # Return result as dataframe (for integration into ColumnTransformer)
        return pd.DataFrame(multiplied_features, columns=['volume'])

## Preprocessing pipeline

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Plug custom transformer into a feature engineering ColumnTransformer
# Select and preserve the original features
feature_engineering = ColumnTransformer([
    ('column_multiplier', ColumnMultiplier(column_1="product_length_cm", 
                                           column_2="product_height_cm", 
                                           column_3="product_width_cm"), 
                                             ['product_length_cm','product_height_cm', "product_width_cm"]),
    
    ('select_original_features', FunctionTransformer(lambda x: x), ['product_length_cm',
                                                                                 'product_height_cm',
                                                                                 'product_width_cm'])]) 

# Chain the feature engineering transformer with a scaler
numerical_pipe = Pipeline([
    ('engineering', feature_engineering),
    ('scaling', MinMaxScaler())])

# Create a final preprocessing pipeline that combines the above pipeline with a One hot encoder for categorical features 
final_preprocessor = ColumnTransformer([
    ('categorical_preprocessing', OneHotEncoder(sparse = False, handle_unknown='ignore'), make_column_selector(dtype_include="object")), 
    ('numerical_preprocessing', numerical_pipe, make_column_selector(dtype_include="int64"))])

## Modelling

In [9]:
from sklearn.linear_model import LinearRegression

# Chain the final_preprocessor pipeline with a linear regression model
final_pipe = Pipeline([
    ('preprocessing', final_preprocessor),
    ('linear_regression', LinearRegression())])

In [10]:
from sklearn.model_selection import cross_val_score

# Cross validate the final_pipe
cross_val_score(final_pipe, 
                data.drop(columns="days_until_delivery"), data['days_until_delivery'],
                cv=10,
                scoring='r2').mean()

-2.4100493674088962e+24

In [11]:
# Fit the final pipe to the data
final_pipe.fit(data.drop(columns="days_until_delivery"), data['days_until_delivery'])

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categorical_preprocessing',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x111b92110>),
                                                 ('numerical_preprocessing',
                                                  Pipeline(steps=[('engineering',
                                                                   ColumnTransformer(transformers=[('column_multiplier',
                                                                                                    ColumnMultipli...
                                                                                                    ['product_length_cm',
                                                                  

## Predictions

In [12]:
# Use final pipe for prediction
final_pipe.predict(new_data)

array([20.8125])