Ref: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [1]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

In [15]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seed NumPy's global random state so NumPy-driven randomness repeats across runs.
np.random.seed(0)

Load data from https://www.openml.org/d/40945



In [3]:
# Download version 1 of the "titanic" dataset from OpenML and unpack it
# directly into the features X and the target y.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

# Alternatively, fetch the dataset object (omit return_X_y=True) and split its
# frame attribute by hand:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

Split the data into training and testing sets.



In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Select only a subset of columns to simplify the example.

In [5]:
# Columns kept for modelling; both splits are narrowed to exactly these.
subset_feature = ["embarked", "sex", "pclass", "age", "fare"]
X_train = X_train[subset_feature]
X_test = X_test[subset_feature]

In [6]:
# Show each column's dtype and non-null count for the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 1118 to 684
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   embarked  1045 non-null   category
 1   sex       1047 non-null   category
 2   pclass    1047 non-null   int64   
 3   age       841 non-null    float64 
 4   fare      1046 non-null   float64 
dtypes: category(2), float64(2), int64(1)
memory usage: 35.0 KB


We will train our classifier with the following features:

Numeric Features:

* ``age``: float;
* ``fare``: float.

Categorical Features:

* ``embarked``: categories encoded as strings ``{'C', 'S', 'Q'}``;
* ``sex``: categories encoded as strings ``{'female', 'male'}``;
* ``pclass``: ordinal integers ``{1, 2, 3}``.

We can observe that the `embarked` and `sex` columns were tagged as
`category` columns when loading the data with ``fetch_openml``. Therefore, we
can use this information to dispatch the categorical columns to the
``categorical_transformer`` and the remaining columns to the
``numeric_transformer``.

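We create the preprocessing pipelines for both numeric and categorical data.
Note that ``pclass`` could be treated as either a categorical or a numeric
feature. Because the columns are dispatched by dtype and ``pclass`` is stored
as ``int64``, it is handled by the numeric pipeline here.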



In [7]:
# `selector` (sklearn.compose.make_column_selector) is imported in the
# notebook's top import cell together with the other dependencies.

# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: one-hot encode (categories unseen during fit are
# ignored at predict time), then keep the top 50% of the encoded columns
# ranked by the chi2 score.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)

# Dispatch columns by dtype: `category` columns go to the categorical
# pipeline, every other column goes to the numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

Finally, you need to train a classification model (feel free to choose any) on training data and check its performance on testing data. 

In [8]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_predict = clf.predict(X_test)

# Print the confusion matrix, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep="\n",
)

[[143  19]
 [ 34  66]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       162
           1       0.78      0.66      0.71       100

    accuracy                           0.80       262
   macro avg       0.79      0.77      0.78       262
weighted avg       0.80      0.80      0.79       262



In [9]:
# Save the fitted Titanic classification pipeline as a pickle file.
model_pkl_file = "../model/titanic_classifier_model.pkl"

# Make sure the target directory exists; open() in write mode does not create
# missing parent directories and would raise FileNotFoundError otherwise.
Path(model_pkl_file).parent.mkdir(parents=True, exist_ok=True)

with open(model_pkl_file, "wb") as file:
    pickle.dump(clf, file)

In [10]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,embarked,sex,pclass,age,fare
1139,S,male,3,38.0,7.8958
533,S,female,2,21.0,21.0
459,S,male,2,42.0,27.0
1150,S,male,3,,14.5
393,S,male,2,25.0,31.5


In [14]:
# Restore the pipeline from the pickle file. Only unpickle files you trust:
# pickle.load can execute arbitrary code embedded in the file.
with open(model_pkl_file, mode="rb") as pkl_in:
    model = pickle.load(pkl_in)

In [22]:
# Check that the reloaded model can predict on the first test rows.
model.predict(X_test.head())

array(['0', '1', '0', '0', '0'], dtype=object)

In [23]:
# Score one hand-built passenger (embarked, sex, pclass, age, fare).
# The frame is called `passenger` rather than `input` so the built-in
# input() is not shadowed for the rest of the kernel session.
passenger = pd.DataFrame(
    [["S", "male", 3, 38, 7.8958]],
    columns=X_test.columns,
)
model.predict(passenger)

array(['0'], dtype=object)

In [26]:
# Score a second hand-built passenger (embarked, sex, pclass, age, fare),
# this time female and aged 21.
# The frame is called `passenger` rather than `input` so the built-in
# input() is not shadowed for the rest of the kernel session.
passenger = pd.DataFrame(
    [["S", "female", 3, 21, 7.8958]],
    columns=X_test.columns,
)
model.predict(passenger)

array(['1'], dtype=object)