In [652]:
import pandas as pd  # data processing
import numpy as np  # linear algebra
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator


In [653]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
# from sklearn.feature_selection import SelectKBest,chi2
# from sklearn.tree import DecisionTreeClassifier

In [654]:
df = pd.read_csv('./data/titanic.csv')
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### Make pipline
- Drop columns 
- Impute Age
- Handle categorical data Sex
- OneHotEncode Embarked

In [835]:
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selects specific columns from a DataFrame.
    Example:
        selector = ColumnSelector(['Age', 'Test'])
        selector.fit_transform(df)
    """

    def __init__(self, columns_to_keep):
        """
        columns_to_keep (list): List of column names to be selected.
        """
        self.columns_to_keep = columns_to_keep

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """
        Returns:
        selected_columns (DataFrame): DataFrame with selected columns.
        """
        x_copy = X.copy()
        # selected_columns = X[self.columns_to_keep]
        selected_columns = list(set(self.columns_to_keep) & set(X.columns))  # list-intersection
        return x_copy[selected_columns]


Unnamed: 0,Pclass,Sex,Survived,Age,Embarked
0,3,1,0,22.000000,S
1,1,0,1,38.000000,C
2,3,0,1,26.000000,S
3,1,0,1,35.000000,S
4,3,1,0,35.000000,S
...,...,...,...,...,...
886,2,1,0,27.000000,S
887,1,0,1,19.000000,S
888,3,0,0,29.699118,S
889,1,1,1,26.000000,C


In [804]:
# col = ColumnSelector(['Pclass', 'Sex', 'Age', 'Embarked', 'Survived', ])
# df = col.fit_transform(df)
# df

In [712]:
from sklearn.base import BaseEstimator, TransformerMixin


class Imputer(BaseEstimator, TransformerMixin):
    """
    Example: 
        imputer = Imputer(['Age'])
        imputer.fit_transform(df)
    """

    def __init__(self, columns_to_impute, **kwargs):
        self.columns_to_impute = columns_to_impute
        self.strategy = kwargs.get('strategy', 'mean')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        x_copy = X.copy()
        selected_columns = x_copy[self.columns_to_impute]
        imputer = SimpleImputer(strategy=self.strategy)
        x_copy.loc[:, self.columns_to_impute] = imputer.fit_transform(selected_columns)
        # X['Age'] = imputer.fit_transform(selected_columns)

        return X

In [715]:
# Imputer(['Age']).fit_transform(df)
# Imputer(['Embarked'], strategy='most_frequent').fit_transform(df)

Unnamed: 0,Pclass,Sex,Age,Embarked,Survived
0,3,1,22.000000,S,0
1,1,0,38.000000,C,1
2,3,0,26.000000,S,1
3,1,0,35.000000,S,1
4,3,1,35.000000,S,0
...,...,...,...,...,...
886,2,1,27.000000,S,0
887,1,0,19.000000,S,1
888,3,0,29.699118,S,0
889,1,1,26.000000,C,1


In [659]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureEncode(BaseEstimator, TransformerMixin):
    """
    Example:
        feature_encoder = FeatureEncode('Sex', val_replace={'male': 1, 'female': 0})
        feature_encoder.fit_transform(df)       
    """

    def __init__(self, column_name, **kwargs):
        self.column_name = column_name
        self.val_replace = kwargs.get('val_replace', {'male': 1, 'female': 0})

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        x_copy = X.copy()
        x_copy.loc[:, [self.column_name]] = x_copy[self.column_name].replace(self.val_replace)
        return x_copy

In [805]:
# df = FeatureEncode('Sex', val_replace={'male': 1, 'female': 0}).fit_transform(df)
# df

In [806]:
# OneHotEncode 
encoder = OneHotEncoder(sparse_output=False)
matrix = encoder.fit_transform(df[['Embarked']])

# Retrieve feature names
feature_names = encoder.get_feature_names_out(input_features=['Embarked'])
dfTemp = pd.concat([df, pd.DataFrame(matrix, columns=feature_names)], axis=1)

print("Missing:", df.Embarked.isna().value_counts())
print("Value counts:", df.Embarked.value_counts())
print("feature_names: ", feature_names)
print("feature_matrix: ", matrix)
print("matrix.Transposed: ", matrix.T)

dfTemp.drop(['Embarked'], axis=1, inplace=True)
dfTemp

Missing: Embarked
False    891
Name: count, dtype: int64
Value counts: Embarked
S    646
C    168
Q     77
Name: count, dtype: int64
feature_names:  ['Embarked_C' 'Embarked_Q' 'Embarked_S']
feature_matrix:  [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
matrix.Transposed:  [[0. 1. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 1. ... 1. 0. 0.]]


Unnamed: 0,Pclass,Sex,Age,Survived,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.000000,0,0.0,0.0,1.0
1,1,0,38.000000,1,1.0,0.0,0.0
2,3,0,26.000000,1,0.0,0.0,1.0
3,1,0,35.000000,1,0.0,0.0,1.0
4,3,1,35.000000,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0.0,0.0,1.0
887,1,0,19.000000,1,0.0,0.0,1.0
888,3,0,29.699118,0,0.0,0.0,1.0
889,1,1,26.000000,1,1.0,0.0,0.0


In [808]:
class Ohe(BaseEstimator, TransformerMixin):
    """
    One-hot encoding
    ohe = Ohe(column_name='Embarked', drop_original=True)
    ohe.fit_transform(pd.read_csv('./data.csv'))
    """

    def __init__(self, column_name, **kwargs):
        self.column_name = column_name
        self.drop = kwargs.get('drop', None)  # first
        self.drop_original = kwargs.get('drop_original', False)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        x_copy = X.copy()
        encoder = OneHotEncoder(sparse_output=False, drop=self.drop)
        matrix = encoder.fit_transform(x_copy.loc[:, [self.column_name]])

        # # Retrieve feature names
        feature_names = encoder.get_feature_names_out(input_features=[self.column_name])
        x_copy = pd.concat([df, pd.DataFrame(matrix, columns=feature_names)], axis=1)

        if self.drop_original:
            x_copy.drop(columns=[self.column_name], inplace=True)
        return x_copy

In [822]:
# Ohe('Embarked', drop_original=True).fit_transform(df)

ColumnSelector(['Pclass', 'Sex', 'Age', 'Embarked', 'Survived'])

TypeError: ColumnSelector() takes no arguments

In [891]:
# Pipeline components
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(sparse_output=False), ['Embarked'])
])

pipe = Pipeline([
    ('col_select', ColumnSelector(['Pclass', 'Sex', 'Age', 'Embarked', 'Survived'])),
    ('encode', FeatureEncode('Sex', val_replace={'male': 1, 'female': 0})),
    ('imputer_age', Imputer(['Age'])),
    ('imputer_emb', Imputer(['Embarked'], strategy='most_frequent')),
    ('ohe', Ohe('Embarked', drop_original=True)),
    # ('preprocessor', preprocessor)
])

df1 = pd.read_csv('./data/titanic.csv')
pipe.fit_transform(df1)

Unnamed: 0,Pclass,Sex,Age,Survived,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.000000,0,0.0,0.0,1.0
1,1,0,38.000000,1,1.0,0.0,0.0
2,3,0,26.000000,1,0.0,0.0,1.0
3,1,0,35.000000,1,0.0,0.0,1.0
4,3,1,35.000000,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0.0,0.0,1.0
887,1,0,19.000000,1,0.0,0.0,1.0
888,3,0,29.699118,0,0.0,0.0,1.0
889,1,1,26.000000,1,1.0,0.0,0.0


In [895]:
from sklearn.model_selection import train_test_split

x = df.loc[:, df.columns != 'Survived']
y = df.loc[:, ['Survived']]

x, y, x_test, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# x.info()
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 709 to 10
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    179 non-null    int64  
 1   Sex       179 non-null    object 
 2   Age       179 non-null    float64
 3   Embarked  179 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 7.0+ KB
