In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Read the train and test CSV files from the local ./titanic folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./titanic/train.csv", "./titanic/test.csv")
)
# Number of rows in each split, shown as the cell output.
train_data.shape[0], test_data.shape[0]

(891, 418)

In [3]:
# Keep only the training rows whose 'Embarked' value is present, restricted to
# the target column plus the six feature columns used further down.
has_embarked = train_data['Embarked'].notna()
df_train = train_data.loc[
    has_embarked,
    ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked'],
]

# Separate the target from the features.
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

# The test split uses the same feature columns; no row filtering is applied.
X_test = test_data[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked']]

In [4]:
def extract_cabin_letter(X):
    """Replace every cell with the first character of its string form.

    Missing cells (anything ``pd.isna`` reports as missing, e.g. ``NaN`` or
    ``None``) become the literal string ``'nan'`` so they form their own
    category for a downstream encoder.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Table of values to transform. Anything that is not already a
        DataFrame is wrapped with ``pd.DataFrame(X)`` first.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as the (wrapped) input, holding the
        extracted characters as strings.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def first_letter(value):
        return 'nan' if pd.isna(value) else str(value)[0]

    # Series.map applied column by column gives the same element-wise result as
    # DataFrame.applymap, which is deprecated since pandas 2.1 and emits a
    # FutureWarning on every call.
    return frame.apply(lambda column: column.map(first_letter))

# Wrap the helper as a scikit-learn transformer so it can be used as a step
# inside a column transformer / pipeline.
cabin_letter = FunctionTransformer(func=extract_cabin_letter)

In [5]:
# First preprocessing stage. Each (transformer, columns) pair is applied to its
# own columns and the outputs are concatenated in the order listed here.
cabin_step = (cabin_letter, ['Cabin'])  # cabin value -> cabin letter
numeric_step = (
    SimpleImputer(missing_values=np.nan, strategy='mean'),  # fill NaN with the column mean
    ['Age', 'Fare'],
)
categorical_step = (
    OneHotEncoder(drop='if_binary'),  # a two-category feature is encoded as one column
    ['Sex', 'Embarked'],
)

columns_transformer_1 = make_column_transformer(
    cabin_step,
    numeric_step,
    categorical_step,
    remainder='passthrough',  # columns not listed above are appended unchanged
)

In [6]:
# Second preprocessing stage: one-hot encode the cabin letter, which is the
# first output column of the first stage; all other columns pass through.
#
# handle_unknown='ignore' encodes a letter that was absent from the data the
# encoder was fitted on as an all-zero row instead of raising. With the default
# ('error'), a rare letter that ends up only in a cross-validation validation
# fold (or only in the test set) makes transform() fail.
#
# drop='if_binary' is intentionally not used here: it only changes the output
# for two-category features, and older scikit-learn releases reject combining
# `drop` with handle_unknown='ignore'.
columns_transformer_2 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), [0]),  # 0 is 'Cabin'
    remainder='passthrough')

In [7]:
# Random forest classifier used as the final estimator; the seed is fixed.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=4,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

In [8]:
# Full pipeline: both preprocessing stages, in order, followed by the classifier.
pipe_full = make_pipeline(
    columns_transformer_1,
    columns_transformer_2,
    model,
)

In [None]:
# Accuracy of the full pipeline on each of 5 cross-validation folds; the cell
# displays the mean across folds.
fold_accuracies = cross_val_score(
    pipe_full, X_train, y_train, scoring='accuracy', cv=5)
fold_accuracies.mean()

In [10]:
# Fit on the whole training set, then predict labels for the test features.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
pipe_full.fit(X_train, y_train).predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,