In [None]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.encoding import StringSimilarityEncoder

In [None]:
# Helper function for loading and preprocessing data
def load_titanic(
    source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl',
) -> pd.DataFrame:
    """Load the Titanic data and normalise its free-text columns.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything accepted by ``pd.read_csv``. Defaults to the OpenML CSV
        export, so calling ``load_titanic()`` with no arguments behaves as
        before; pass a local path or buffer to work offline.

    Returns
    -------
    pd.DataFrame
        The loaded frame with every ``'?'`` cell replaced by NaN and the
        ``home.dest``, ``name`` and ``ticket`` columns stripped of
        surrounding whitespace, stripped of ASCII punctuation, with double
        spaces replaced by a single space, and lower-cased.
    """
    text_columns = ('home.dest', 'name', 'ticket')
    # Deletion table: maps every character in string.punctuation to nothing.
    translate_table = str.maketrans('', '', string.punctuation)

    data = pd.read_csv(source)
    data = data.replace('?', np.nan)

    for column in text_columns:
        data[column] = (
            data[column]
            .str.strip()
            .str.translate(translate_table)
            # Removing punctuation can leave two adjacent spaces behind
            # (e.g. "a - b"). regex=False pins the literal match so the
            # result does not depend on the pandas version's default.
            .str.replace('  ', ' ', regex=False)
            .str.lower()
        )
    return data

In [None]:
# Load the Titanic dataset; load_titanic() also cleans the text columns
data = load_titanic()

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
# The target and three columns not used in this demo are removed from the predictors.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'sex', 'cabin', 'embarked']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# configure the similarity encoder for the three free-text columns
encoder = StringSimilarityEncoder(
    top_categories=2,
    variables=['name', 'home.dest', 'ticket'],
)

In [None]:
# fit the encoder on the training split only
encoder.fit(X_train)

In [None]:
# inspect what the fitted encoder stored in encoder_dict_
encoder.encoder_dict_

In [None]:
# transform both splits with the encoder fitted on the training data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

In [None]:
# display the encoded training set
train_t

In [None]:
# display the encoded test set
test_t

In [None]:
# plot encoded column - ticket (train on the upper panel, test on the lower)
fig, ax = plt.subplots(2, 1)
for frame, panel_title, panel in zip(
    (train_t, test_t),
    ('Ticket encoding in train', 'Ticket encoding in test'),
    ax,
):
    frame.plot(
        kind='scatter',
        x='ticket_ca 2343',
        y='ticket_ca 2144',
        sharex=True,
        title=panel_title,
        ax=panel,
    )

In [None]:
# defining encoder that ignores NaNs
# The original keyword `handle_missinig` was misspelled, so this cell raised
# TypeError and the following cells kept using the previous encoder.
# NOTE(review): `missing_values` is the name in current feature_engine
# releases; older releases may call it `handle_missing` - confirm against
# the installed version.
encoder = StringSimilarityEncoder(
    top_categories=2,
    missing_values='ignore',
    variables=['name', 'home.dest', 'ticket']
)

In [None]:
# refitting the newly configured encoder on the training split
encoder.fit(X_train)

In [None]:
# inspect what the refitted encoder stored in encoder_dict_
encoder.encoder_dict_

In [None]:
# transform both splits again, overwriting the earlier train_t / test_t
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

In [None]:
# display the re-encoded training set
train_t

In [None]:
# display the re-encoded test set
test_t

In [None]:
# plot encoded column - home.dest (train on the upper panel, test on the lower)
fig, ax = plt.subplots(2, 1)
for frame, panel_title, panel in zip(
    (train_t, test_t),
    ('Home destination encoding in train', 'Home destination encoding in test'),
    ax,
):
    frame.plot(
        kind='scatter',
        x='home.dest_new york ny',
        y='home.dest_london',
        sharex=True,
        title=panel_title,
        ax=panel,
    )