# Custom Transformer

In [None]:
from sklearn import set_config; set_config(display='diagram')

👇 Consider the following dataset

In [1]:
import pandas as pd

data = pd.read_csv("data.csv")

data.head()

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


Each observation of the dataset represents an item being delivered from a  `seller_state` to a `customer_state`. The columns describe the size and weight of each item. The target is the number of days between the order and the delivery.

👇 In a pipeline:

- Engineer a 'volume' feature from the dimensions features
- Preserve the original product dimensions features for training
- Scale all numerical features
- Encode the categorical features
- Train a default `Ridge` regression and cross_validate its score on the train set. Low r2 score are expected.

Use your pipeline to predict the delivery of the following order

In [23]:
new_data = pd.read_csv("data_new.csv")
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


<details><summary>Hints</summary>

- There are many ways to create your preprocessed matrix (using `ColumnTransformer` and/or `FeatureUnion`). 
    
- If your transformed feature matrix look wierd, it may be stored as "sparse" by the default behavior of `OneHotEncoder(sparse=True)`. Use `.todense()` to turn it back to a dense matrix

</details>

In [8]:
data.shape

(1000, 7)

In [3]:
X = data.drop(columns = ['days_until_delivery'])
y = data['days_until_delivery']

In [13]:
X.dtypes

customer_state       object
seller_state         object
product_weight_g      int64
product_length_cm     int64
product_height_cm     int64
product_width_cm      int64
dtype: object

In [11]:
X['customer_state'].nunique()

25

In [12]:
X['seller_state'].nunique()

2

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator

# Create a class
class Volume(TransformerMixin, BaseEstimator): 
# TransformerMixin generates a fit_transform method from fit and transform
# BaseEstimator generates get_params and set_params methods
    
    # Create parameters "column_1" and "column_2" to choose which columns of dataframe to multiply
    def __init__(self, column_1, column_2, column_3):
        self.column_1 = column_1
        self.column_2 = column_2
        self.column_3 = column_3
    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        # Multiplication
        multiplied_features =X[self.column_1]*X[self.column_2]*X[self.column_3]
        
        # Return result as dataframe (for integration into ColumnTransformer)
        return pd.DataFrame(multiplied_features, columns=['multiplied'])

In [4]:
Volume(column_1="product_weight_g", column_2="product_length_cm", column_3="product_height_cm").fit_transform(data)

Unnamed: 0,multiplied
0,967250
1,819000
2,967250
3,967250
4,967250
...,...
995,61387200
996,486000
997,1383750
998,23205000


In [27]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer([
    ('volume', Volume(column_1="product_weight_g", column_2="product_length_cm", column_3="product_height_cm"),\
             ['product_weight_g','product_length_cm', 'product_height_cm']),\
    ('scaling', MinMaxScaler(), make_column_selector(dtype_include=['int64'])),\
    ('select_original_features', FunctionTransformer(lambda x: x), ['product_length_cm',
                                                                                 'product_height_cm',
                                                                                 'product_width_cm']), 
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include="object")),
])
pd.DataFrame(preprocessor.fit_transform(X).todense())


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,967250.0,0.057692,0.402439,0.056818,0.271028,53.0,10.0,40.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,819000.0,0.020067,0.548780,0.147727,0.158879,65.0,18.0,28.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,967250.0,0.057692,0.402439,0.056818,0.271028,53.0,10.0,40.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,967250.0,0.057692,0.402439,0.056818,0.271028,53.0,10.0,40.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,967250.0,0.057692,0.402439,0.056818,0.271028,53.0,10.0,40.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,61387200.0,0.558528,0.524390,0.602273,0.121495,63.0,58.0,24.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
996,486000.0,0.019231,0.243902,0.147727,0.224299,40.0,18.0,35.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
997,1383750.0,0.065217,0.304878,0.113636,0.224299,45.0,15.0,35.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
998,23205000.0,0.289298,0.390244,0.522727,0.056075,52.0,51.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
from sklearn.linear_model import LinearRegression
#Pass the combined preprocessor into a Pipeline as a single step
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', LinearRegression())
])


In [33]:
target = 'days_until_delivery'
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, data[target], test_size = 0.3)

In [34]:
final_pipe_trained = final_pipe.fit(X_train, y_train)

In [35]:
final_pipe_trained.score(X_test, y_test)

0.07511543365151263

In [32]:
final_pipe_trained.predict(new_data)

array([15.28014851])