In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.datasets import fetch_openml

from dense_feature_mixer import DenseFeatureMixer


# Fixed seed so the train/test split is identical on every run of this cell.
RANDOM_STATE = 42

# Titanic data from OpenML (version 1), returned as a pandas feature frame
# and a target series.
X, y = fetch_openml("titanic", version=1, return_X_y=True, as_frame=True)

# Remove the columns that are not used as features, then discard every row
# that still has a missing value in any of the remaining columns.
X = X.drop(["name", "ticket", "cabin", "boat", "body"], axis=1).dropna()

# Keep only the targets whose rows survived dropna(), cast to integer labels.
y = y.reindex(X.index).astype(int)

# Column groups referenced when encoding and scaling the features.
numeric_vars = ["age", "fare"]
categorical_vars = ["sex", "sibsp", "parch", "embarked", "home.dest", "pclass"]

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [26]:
# Single integer code used for categories the encoder did not see during fit;
# the same value is passed to the mixer as its `unknown_category`.
UNKNOWN_CODE = 999

# Ordinal-encode the categorical columns: the mapping is learned from the
# training rows only and then reused, unchanged, for the test rows.
encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=UNKNOWN_CODE,
)
X_train[categorical_vars] = encoder.fit_transform(X_train[categorical_vars])
X_test[categorical_vars] = encoder.transform(X_test[categorical_vars])

# Standardise the numeric columns; all other columns pass through untouched.
num_transformer = make_column_transformer(
    (StandardScaler(), numeric_vars),
    remainder="passthrough",
)

# First pipeline step, configured with the categorical column names and the
# unknown-category code used by the encoder above.
mixer = DenseFeatureMixer(
    task="classification",
    categorical_vars=categorical_vars,
    unknown_category=UNKNOWN_CODE,
)

# Mixer -> numeric scaling -> logistic regression, fitted on the training split.
pipe = make_pipeline(mixer, num_transformer, LogisticRegression())
pipe.fit(X_train, y_train)

# Last expression of the cell: the fitted pipeline's score on the held-out split.
pipe.score(X_test, y_test)

0.7591240875912408