In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import (
    DropDuplicateFeatures, 
    DropConstantFeatures, 
    DropCorrelatedFeatures
    )

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the breast cancer dataset shipped with scikit-learn. The returned object
# is read by key later in the notebook ('data', 'feature_names', 'target').
data = load_breast_cancer()

In [4]:
# Assemble the feature matrix (with the dataset's feature names as column
# labels) and the target vector from the loaded dataset.
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = pd.Series(data['target'])

# Add two synthetic categorical columns derived from numeric features.
# categ_1: binary flag on 'mean smoothness'; values above 0.1 form the 'high'
# group (the labels were previously swapped, tagging the larger values 'low').
X['categ_1'] = np.where(X['mean smoothness'] > 0.1, 'high', 'low')
# categ_2: integer part of 'mean radius', stored as a string label.
X['categ_2'] = X['mean radius'].astype(int).astype(str)

# Split the column names by dtype: numeric columns vs. everything else.
numerics = X.select_dtypes(include="number").columns.to_list()
categoricals = X.select_dtypes(exclude="number").columns.to_list()

In [5]:
# Numeric branch: fill missing values with the "mean" strategy, then cap
# extreme values using the Winsorizer's "gaussian" capping method.
numerical_pipe = Pipeline([
    ("imputation", SimpleImputer(strategy="mean")),
    ("outliers", Winsorizer(capping_method="gaussian")),
])

In [6]:
# Categorical branch: replace missing entries with the constant label
# "Missing", then one-hot encode with top_categories=5.
categorical_pipe = Pipeline([
    ("imputation", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("ohe", OneHotEncoder(top_categories=5)),
])

In [7]:
# Route each group of columns to its own sub-pipeline: the `numerics` columns
# go through `numerical_pipe`, the `categoricals` columns through
# `categorical_pipe`.
preprocessor = ColumnTransformer([
    ("num", numerical_pipe, numerics),
    ("cat", categorical_pipe, categoricals),
])

In [8]:
# Full feature-selection pipeline: preprocess first, then apply the three
# feature_engine selectors in order (constant, duplicate, correlated), the
# last one with a correlation threshold of 0.8.
selection = Pipeline([
    ("preprocess", preprocessor),
    ("drop_constant", DropConstantFeatures()),
    ("drop_duplicates", DropDuplicateFeatures()),
    ("remove_correlation", DropCorrelatedFeatures(threshold=0.8)),
])

In [9]:
# Fit the whole selection pipeline on X and keep the transformed result.
X_transformed = selection.fit_transform(X)
# Show the shape before and after, to see how many columns remain.
print(f"{X.shape} {X_transformed.shape}")

(569, 32) (569, 20)
