# Data Preparation - Transformers Pipelines

## Import Libraries

In [1]:
import pandas as pd
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder

## Simple Transformer

In [2]:

class SimpleTransformer(BaseEstimator, TransformerMixin):
    """Minimal custom transformer that appends a ``random_int`` column.

    The transformer is stateless: ``fit`` learns nothing from the data and
    ``transform`` only adds one column of random integers.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data (ignored).
        y : ignored
            Present only for compatibility with the pipeline API.
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with an added ``random_int`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data. It is not modified.
        y : ignored
            Present only for compatibility with the pipeline API.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus a ``random_int`` column holding one random
            integer in ``[0, 10)`` per row.
        """
        # Work on a copy so the caller's DataFrame is never altered as a
        # side effect of running the pipeline.
        X = X.copy()
        # Perform arbitrary transformation
        X["random_int"] = randint(0, 10, X.shape[0])
        return X



df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

pipe = Pipeline(
    steps=[
        ("use_simple_transfomer", SimpleTransformer())
    ]
)
transformed_df = pipe.fit_transform(df)

# Show the pipeline's result; `df` itself is left untouched by the transformer.
print(transformed_df)

   a  b  c  random_int
0  1  4  7           2
1  2  5  8           5
2  3  6  9           1


## Simple Transformer with arguments

In [3]:
class SimpleTransformer_args(BaseEstimator, TransformerMixin):
    """Multiply selected DataFrame columns by a constant factor.

    Parameters
    ----------
    by : number, default=1
        Factor that every selected column is multiplied by.
    columns : list of column labels or None, default=None
        Columns to scale. A falsy value (``None`` or an empty list) means
        "scale every column of the input".
    """

    def __init__(self, by=1, columns=None):
        # Store the arguments unchanged under the same names; nothing is
        # validated or derived here.
        self.by = by
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns multiplied by ``by``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data. It is not modified.
        y : ignored
            Present only for compatibility with the pipeline API.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which the selected columns are scaled.
        """
        # Fall back to all columns when no explicit selection was given.
        cols_to_transform = self.columns if self.columns else list(X.columns)

        # Work on a copy so the caller's DataFrame is never altered as a
        # side effect of running the pipeline.
        X = X.copy()
        X[cols_to_transform] = X[cols_to_transform] * self.by
        return X


df = pd.DataFrame({"a": [1, -2, 3], "b": [-4, 5, 6], "c": [-7, -8, 9]})

pipe = Pipeline(
    steps=[
        ("use_simple_transfomer_args", SimpleTransformer_args(3, columns=["a", "c"]))
    ]
)
transformed_df = pipe.fit_transform(df)

# Show the pipeline's result; `df` itself is left untouched by the transformer.
print(transformed_df)

   a  b   c
0  3 -4 -21
1 -6  5 -24
2  9  6  27


## Transformer with function (wrapper)

In [4]:
# The two columns deliberately differ in length. Wrapping each list in a
# Series lets pandas align them on the index, so the shorter "fruit" column
# is padded with a missing value in the last row.
data = {
    "id": [1, 2, 3, 4, 5],
    "fruit": ["Apple", "Apple", "Peach", "Banana"],
}
series_by_column = {name: pd.Series(values) for name, values in data.items()}
df = pd.DataFrame(series_by_column)

# FunctionTransformer turns the plain function `pd.get_dummies` into an
# object that can be used as a pipeline step.
one_hot_step = FunctionTransformer(pd.get_dummies)
pipe = Pipeline(steps=[("simple_one_hot_encode", one_hot_step)])
transformed_df = pipe.fit_transform(df)

print(transformed_df)

   id  fruit_Apple  fruit_Banana  fruit_Peach
0   1            1             0            0
1   2            1             0            0
2   3            0             0            1
3   4            0             1            0
4   5            0             0            0


## Transformer with function (wrapper) and arguments

In [5]:
# The two columns deliberately differ in length. Wrapping each list in a
# Series lets pandas align them on the index, so the shorter "fruit" column
# is padded with a missing value in the last row.
data = {
    "id": [1, 2, 3, 4, 5],
    "fruit": ["Apple", "Apple", "Peach", "Banana"],
}
series_by_column = {name: pd.Series(values) for name, values in data.items()}
df = pd.DataFrame(series_by_column)

# `kw_args` is forwarded to `pd.get_dummies` as keyword arguments:
# `dummy_na=True` adds an indicator column for missing values and
# `dtype="float"` makes the indicator columns floats.
get_dummies_options = {"dummy_na": True, "dtype": "float"}
one_hot_step = FunctionTransformer(pd.get_dummies, kw_args=get_dummies_options)
pipe = Pipeline(steps=[("simple_one_hot_encode", one_hot_step)])
transformed_df = pipe.fit_transform(df)

print(transformed_df)

   id  fruit_Apple  fruit_Banana  fruit_Peach  fruit_nan
0   1          1.0           0.0          0.0        0.0
1   2          1.0           0.0          0.0        0.0
2   3          0.0           0.0          1.0        0.0
3   4          0.0           1.0          0.0        0.0
4   5          0.0           0.0          0.0        1.0


## Transformer from existing sklearn class

In [6]:
class CustomOrdinalEncoder(OrdinalEncoder):
    """``OrdinalEncoder`` whose ``transform`` returns a pandas DataFrame.

    The constructor is inherited unchanged from ``OrdinalEncoder``, so every
    keyword argument it accepts (for example ``dtype``) can be passed here.
    """

    # NOTE: there is deliberately no ``__init__(self, **kwargs)`` override.
    # scikit-learn reads an estimator's parameters from the explicit
    # ``__init__`` signature; a ``**kwargs``-only constructor hides them from
    # ``get_params()``, so ``clone()`` would silently drop settings such as
    # ``dtype``. Inheriting the parent's constructor avoids that.

    def transform(self, X, y=None):
        """Encode ``X`` and wrap the result in a DataFrame.

        Parameters
        ----------
        X : array-like
            Data to encode, as accepted by ``OrdinalEncoder.transform``.
        y : ignored
            Present only for compatibility with the pipeline API.

        Returns
        -------
        pandas.DataFrame
            Encoded values. Columns are named after the features seen during
            ``fit`` when those names are available, and the row index of a
            DataFrame input is preserved.
        """
        transformed_X = super().transform(X)

        # Keep the row labels of a DataFrame input so the result stays
        # aligned with it; other inputs get the default integer index.
        index = X.index if isinstance(X, pd.DataFrame) else None
        # `feature_names_in_` is not set when the encoder was fitted on data
        # without column names; fall back to default column labels then.
        columns = getattr(self, "feature_names_in_", None)

        return pd.DataFrame(transformed_X, index=index, columns=columns)


# Two categorical columns to be encoded as integers.
raw_columns = {
    "fruits": ["Apple", "Pears", "Cherry"],
    "colors": ["Green", "Green", "Red"],
}
data = pd.DataFrame(raw_columns)

# `dtype=int` is passed through to the encoder so the codes are integers.
enc = CustomOrdinalEncoder(dtype=int)
new_data = enc.fit_transform(data)

print(new_data)
# `categories_` holds, per column, the categories found during fitting.
print("Categories: ", enc.categories_)

   fruits  colors
0       0       0
1       2       0
2       1       1
Categories:  [array(['Apple', 'Cherry', 'Pears'], dtype=object), array(['Green', 'Red'], dtype=object)]
