In [4]:
import pandas as pd  # data processing
from sklearn.base import BaseEstimator, TransformerMixin
# import numpy as np  # linear algebra
# import matplotlib.pyplot as plt
# import seaborn as sns

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer

In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selects specific columns from a DataFrame.

    Requested columns that are not present in the input are skipped
    silently; the remaining ones are returned in the order in which they
    appear in ``columns_to_keep``.
    Example:
        selector = ColumnSelector(['Age', 'Test'])
        selector.fit_transform(df)
    """

    def __init__(self, columns_to_keep):
        """
        columns_to_keep (list): List of column names to be selected.
        """
        self.columns_to_keep = columns_to_keep

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X):
        """
        Returns:
        selected_columns (DataFrame): Copy of X restricted to the requested
            columns that exist in X.
        """
        available = set(X.columns)
        # dict.fromkeys drops duplicate requests while keeping the caller's
        # order. A plain set intersection would yield an arbitrary column
        # order that can differ from one run to the next.
        selected_columns = [
            column
            for column in dict.fromkeys(self.columns_to_keep)
            if column in available
        ]
        return X[selected_columns].copy()

In [10]:
class Imputer(BaseEstimator, TransformerMixin):
    """
    Fills missing values in the given columns with sklearn's SimpleImputer.

    The imputation statistics are computed from the frame passed to
    transform() on every call; fit() learns nothing.
    Example: 
        imputer = Imputer(['Age'])
        imputer.fit_transform(df)
    """

    def __init__(self, columns_to_impute, **kwargs):
        """
        columns_to_impute (list): Names of the columns to impute. A single
            column name given as a string is also accepted.
        strategy (str, keyword): Strategy forwarded to SimpleImputer.
            Defaults to 'mean'.
        """
        self.columns_to_impute = columns_to_impute
        self.strategy = kwargs.get('strategy', 'mean')

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X):
        """
        Returns:
        x_copy (DataFrame): Copy of X whose selected columns have their
            missing values filled in. X itself is left untouched.
        """
        columns = self.columns_to_impute
        if isinstance(columns, str):
            # A bare name would select a 1-D Series, which SimpleImputer
            # rejects; wrap it so the selection stays two-dimensional.
            columns = [columns]

        x_copy = X.copy()
        imputer = SimpleImputer(strategy=self.strategy)
        x_copy.loc[:, columns] = imputer.fit_transform(x_copy[columns])

        # Return the imputed copy: returning X would discard the work above.
        return x_copy

In [12]:
class FeatureEncode(BaseEstimator, TransformerMixin):
    """
    Replaces the values of one column according to a mapping.

    Values that are not keys of the mapping are left as they are.
    Example:
        feature_encoder = FeatureEncode('Sex', val_replace={'male': 1, 'female': 0})
        feature_encoder.fit_transform(df)       
    """

    def __init__(self, column_name, **kwargs):
        """
        column_name (str): Name of the column to encode.
        val_replace (dict, keyword): Mapping of old value -> new value.
            Defaults to {'male': 1, 'female': 0}.
        """
        self.column_name = column_name
        self.val_replace = kwargs.get('val_replace', {'male': 1, 'female': 0})

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X):
        """
        Returns:
        x_copy (DataFrame): Copy of X with the column's values replaced.
        """
        x_copy = X.copy()
        replaced = x_copy[self.column_name].replace(self.val_replace)
        # Assign the column directly instead of through .loc[:, [col]]:
        # the .loc form writes into the existing (object) column and keeps
        # its dtype. infer_objects() turns a fully mapped column into a
        # numeric one; a partially mapped column stays object.
        x_copy[self.column_name] = replaced.infer_objects()
        return x_copy

In [None]:
class Ohe(BaseEstimator, TransformerMixin):
    """
    One-hot encoding of a single column with sklearn's OneHotEncoder.

    The encoder is fitted on the frame passed to transform() on every call;
    fit() learns nothing.
    Example:
        ohe = Ohe(column_name='Embarked', drop_original=True)
        ohe.fit_transform(pd.read_csv('./data.csv'))
    """

    def __init__(self, column_name, **kwargs):
        """
        column_name (str): Name of the column to encode.
        drop (keyword): Forwarded to OneHotEncoder(drop=...), e.g. 'first'.
            Defaults to None.
        drop_original (bool, keyword): Whether to remove the source column
            from the result. Defaults to False.
        """
        self.column_name = column_name
        self.drop = kwargs.get('drop', None)  # first
        self.drop_original = kwargs.get('drop_original', False)

    def fit(self, X, y=None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X):
        """
        Returns:
        x_copy (DataFrame): Copy of X with the one-hot columns appended and,
            if drop_original is set, without the source column.
        """
        x_copy = X.copy()
        encoder = OneHotEncoder(sparse_output=False, drop=self.drop)
        matrix = encoder.fit_transform(x_copy.loc[:, [self.column_name]])

        # Retrieve feature names
        feature_names = encoder.get_feature_names_out(input_features=[self.column_name])
        # Reuse the input's index: concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex would misalign rows of a filtered or re-indexed
        # frame and introduce NaN rows.
        encoded = pd.DataFrame(matrix, columns=feature_names, index=x_copy.index)

        if self.drop_original:
            x_copy = x_copy.drop(columns=[self.column_name])
        # Concatenate with the local copy, not a global `df`.
        return pd.concat([x_copy, encoded], axis=1)