Ref: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [None]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

In [None]:
import pickle
import platform
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seed NumPy's legacy global random number generator so that any code drawing
# from it produces the same values on every run.
np.random.seed(0)

Load data from https://www.openml.org/d/40945



In [None]:
# Download version 1 of the "titanic" dataset from OpenML as pandas objects,
# already split into the feature table X and the target y.
X, y = fetch_openml(name="titanic", version=1, return_X_y=True, as_frame=True)

# Alternative: without return_X_y=True the call returns a single object whose
# ``frame`` attribute holds features and target together, from which X and y
# can be derived:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

Split the data into training and testing sets.



In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Select only a subset of columns to simplify the example.

In [None]:
# Restrict both splits to the columns used in this example.
subset_feature = ["embarked", "sex", "pclass", "age", "fare"]
X_train = X_train[subset_feature]
X_test = X_test[subset_feature]

In [None]:
# Summarise the selected training columns (dtypes and non-null counts).
X_train.info()

We will train our classifier with the following features:

Numeric Features:

* ``age``: float;
* ``fare``: float.

Categorical Features:

* ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;
* ``sex``: categories encoded as strings ``{'female', 'male'}``;
* ``pclass``: ordinal integers ``{1, 2, 3}``.

We can observe that the `embarked` and `sex` columns were tagged as
`category` columns when loading the data with ``fetch_openml``. This dtype
information could be used to dispatch columns automatically (e.g. with
``make_column_selector``); in this notebook the columns are instead listed
explicitly for the ``categorical_transformer`` and the ``numeric_transformer``.

We create the preprocessing pipelines for both numeric and categorical data.
Note that ``pclass`` could either be treated as a categorical or numeric
feature; here it is treated as categorical.



In [None]:
# Columns routed to each branch of the preprocessor.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "sex", "pclass"]

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode (categories not seen during fit are
# ignored at transform time), then keep the 50% of encoded columns with the
# highest chi-squared score against the target.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)

# Apply each branch to its own columns and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit on the training split only (y_train is required by the chi-squared
# selector) and reuse the fitted preprocessor on the test split.
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)
X_train_processed

In [None]:
# Report the version of the interpreter running this kernel. A shell call to
# `python` reports whichever executable is first on PATH, which is not
# necessarily the kernel's interpreter.
print("Python", platform.python_version())

Finally, train a classification model (feel free to choose any) on the training data and check its performance on the testing data.

In [None]:
# Fit a logistic regression with default settings on the preprocessed
# training features; fit() returns the estimator itself.
clf = LogisticRegression().fit(X_train_processed, y_train)

# Predict labels for the preprocessed test features.
y_predict = clf.predict(X_test_processed)

# Show the confusion matrix followed by the per-class classification report.
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep="\n",
)

In [None]:
# Persist the fitted Titanic preprocessor and classifier as pickle files.
prepro_pkl_file = "../model/titanic_preprocessor.pkl"
model_pkl_file = "../model/titanic_classifier_model.pkl"

for pkl_path, fitted_obj in (
    (prepro_pkl_file, preprocessor),
    (model_pkl_file, clf),
):
    # Create the target directory first; open() raises FileNotFoundError
    # when the parent directory does not exist yet.
    Path(pkl_path).parent.mkdir(parents=True, exist_ok=True)
    with open(pkl_path, "wb") as file:
        pickle.dump(fitted_obj, file)

In [None]:
# Preview the first five test rows (the same rows are scored further down).
X_test.head()

In [None]:
# Read the preprocessor and the classifier back from their pickle files.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(prepro_pkl_file, "rb") as prepro_fh:
    prepro = pickle.load(prepro_fh)
with open(model_pkl_file, "rb") as model_fh:
    model = pickle.load(model_fh)

In [None]:
# Score the first five test rows with the reloaded preprocessor and model.
first_rows = X_test.head()
prepro_data = prepro.transform(first_rows)
model.predict(prepro_data)

In [None]:
# Hand-built single passenger: embarked "S", male, pclass 3, age 38,
# fare 7.8958. Named `passenger` rather than `input` so the built-in input()
# is not shadowed for the rest of the kernel session.
passenger = pd.DataFrame([["S", "male", 3, 38, 7.8958]], columns=X_test.columns)
prepro_data = prepro.transform(passenger)
model.predict(prepro_data)

In [None]:
# Hand-built single passenger: embarked "S", female, pclass 3, age 21,
# fare 7.8958. Named `passenger` rather than `input` so the built-in input()
# is not shadowed for the rest of the kernel session.
passenger = pd.DataFrame([["S", "female", 3, 21, 7.8958]], columns=X_test.columns)
prepro_data = prepro.transform(passenger)
model.predict(prepro_data)