# Changes compared to previous Notebook

* I no longer call fit_transform on the test data: the pipeline is fitted once on the training features and afterwards only transform is applied. Preprocessing the features and transforming them in the pipeline are now separate steps
* I replace "0" values in Fare with "NaN" and impute them with the median afterwards
* I use a StandardScaler() for the Fare

In [50]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, StandardScaler

# Build a Pipeline

## Central methods to call on the data

In [51]:
def load_titanic_data():
    """Read the Titanic training set from ``data/train.csv``.

    The path is relative to the current working directory.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    csv_path = os.path.join('data', 'train.csv')
    return pd.read_csv(csv_path)

def split_data(df):
    """Split *df* 80/20 into a train and a test part, stratified on ``Pclass``.

    The split is reproducible (``random_state=42``). In each part the
    ``Survived`` column is separated from the remaining columns.

    Returns:
        Tuple ``(train_features, train_label, test_features, test_label)``.
    """
    train_part, test_part = train_test_split(
        df,
        test_size=0.2,
        stratify=df["Pclass"],
        random_state=42,
    )

    def _features_and_label(part):
        # features: everything except the target; label: the target column alone
        return part.drop(columns='Survived'), part['Survived']

    train_features, train_label = _features_and_label(train_part)
    test_features, test_label = _features_and_label(test_part)
    return train_features, train_label, test_features, test_label

In [52]:
# Load the labelled data (train.csv) once ...
titanic = load_titanic_data()
# ... and split it into train/test features and labels.
# Note: "titanic" is the complete labelled set; the actual train part is X_train/y_train.
X_train, y_train, X_test, y_test = split_data(titanic)

## Analyze the data

See Notebook titanic-kaggle-analyze

## Preprocess the data

In [53]:
def drop_columns(df):
    """Return *df* without the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.

    A new DataFrame is returned; *df* itself is left unchanged.
    """
    unused = ["PassengerId", "Name", "Ticket", "Cabin"]
    return df.drop(columns=unused)

In [54]:
def sex_pipeline():
    """Build the pipeline that turns the categorical sex values into numbers.

    It consists of a single ``OrdinalEncoder`` step named ``"encode"``.
    """
    steps = [("encode", OrdinalEncoder())]
    return Pipeline(steps)

In [55]:
def age_pipeline():
    """Build the pipeline that fills missing ages.

    It consists of a single ``SimpleImputer`` step named ``"impute"`` that
    replaces null values with the median.
    """
    steps = [("impute", SimpleImputer(strategy="median"))]
    return Pipeline(steps)

In [56]:
def transform_sipsp_parch(df):
    """Replace ``SibSp`` and ``Parch`` with a single ``Alone`` indicator.

    ``Alone`` is 1 for rows where both ``SibSp`` and ``Parch`` are 0, and 0
    for every other row.

    Args:
        df: DataFrame containing the columns ``SibSp`` and ``Parch``.

    Returns:
        A new DataFrame without ``SibSp``/``Parch`` and with the integer
        column ``Alone`` added. *df* itself is not modified.
    """
    travels_alone = (df["SibSp"] == 0) & (df["Parch"] == 0)

    # Build the result on a fresh frame: writing "Alone" into ``df`` first
    # would silently add the column to the caller's DataFrame as well.
    return df.drop(["SibSp", "Parch"], axis=1).assign(
        Alone=travels_alone.astype("int64")
    )

In [57]:
def embarked_pipeline():
    """Build the pipeline for the port of embarkation.

    Null values are first replaced by the most frequent value (step
    ``"impute"``); the result is then ordinal-encoded (step ``"encode"``).
    """
    fill_missing = ("impute", SimpleImputer(strategy="most_frequent"))
    to_numbers = ("encode", OrdinalEncoder())
    return Pipeline([fill_missing, to_numbers])


In [58]:
def fare_pipeline():
    """Build the pipeline for the fare.

    Null values are replaced by the median (step ``"impute"``) and the
    result is standardized with a ``StandardScaler`` (step ``"scale"``).
    """
    fill_missing = ("impute", SimpleImputer(strategy="median"))
    standardize = ("scale", StandardScaler())
    return Pipeline([fill_missing, standardize])

In [59]:
def preprocess_feature(X):
    """Run the feature preparation that happens before the pipeline is applied.

    In order: drop the unused columns, derive "Alone" from ["SibSp", "Parch"],
    and replace "0" fares with "NaN".

    Returns:
        The prepared features as a new DataFrame.
    """
    trimmed = drop_columns(X)

    # derive column "Alone" from ["SibSp", "Parch"]
    with_alone = FunctionTransformer(transform_sipsp_parch).transform(trimmed)

    # "0" values in Fare become "NaN" here so that the imputer in the
    # pipeline replaces them afterwards
    fare_without_zeros = with_alone['Fare'].replace(0, np.nan)
    return with_alone.assign(Fare=fare_without_zeros)


def apply_pipeline(X, pipeline):
    """Transform *X* with *pipeline* and return the result as a DataFrame.

    Only ``transform`` is called here, never ``fit``, so *pipeline* has to be
    fitted beforehand. Column names are taken from
    ``pipeline.get_feature_names_out()`` and the row index of *X* is kept.
    """
    transformed = pipeline.transform(X)
    feature_names = pipeline.get_feature_names_out()
    return pd.DataFrame(transformed, columns=feature_names, index=X.index)


# One (name, sub-pipeline, columns) entry for every column that gets its own treatment
transformers = [
    ('Sex', sex_pipeline(), ['Sex']),
    ('Age', age_pipeline(), ['Age']),
    ('Embarked', embarked_pipeline(), ['Embarked']),
    ('Fare', fare_pipeline(), ['Fare'])
]

# Create the ColumnTransformer; remainder="passthrough" keeps the columns
# that are not listed above instead of dropping them
ct = ColumnTransformer(transformers, remainder="passthrough")

# Create the pipeline
preprocessing_pipeline = Pipeline([
    ("preprocessor", ct)
])

# Prepare the training features (column drops, "Alone", Fare 0 -> NaN)
X_train = preprocess_feature(X_train)

# fit the pipeline on the preprocessed training features only;
# the other data sets are later transformed with this fitted pipeline
preprocessing_pipeline.fit(X_train)

# apply the fitted pipeline to the training features
X_train = apply_pipeline(X_train, preprocessing_pipeline)

In [60]:
# Show the first ten rows of the preprocessed training features
X_train.head(10)

In [61]:
# Save the preprocessed training features together with their labels
# as train_preprocessed.csv

filepath = os.path.join('data', 'train_preprocessed.csv')
# assign() returns a new frame, so the label never ends up in X_train itself.
# (pd.DataFrame(X_train) is not guaranteed to be an independent copy; on some
# pandas versions adding a column to it also changes X_train.)
# The labels are matched to the rows via the index.
X_train_preprocessed = X_train.assign(Survived=y_train)
X_train_preprocessed.to_csv(filepath, index=False)

# Preprocess and save the test data

In [62]:
# preprocess the test split with the same steps as the training data
X_test = preprocess_feature(X_test)

# apply the pipeline that was fitted on the training data (transform only, no new fit)
X_test = apply_pipeline(X_test, preprocessing_pipeline)

# show the first ten rows of the result
X_test.head(10)

In [63]:
# Save the preprocessed test split together with its labels
filepath = os.path.join('data', 'train_test_preprocessed.csv')
# assign() returns a new frame, so the label never ends up in X_test itself.
# (pd.DataFrame(X_test) is not guaranteed to be an independent copy; on some
# pandas versions adding a column to it also changes X_test.)
# The labels are matched to the rows via the index.
X_test_preprocessed = X_test.assign(Survived=y_test)
X_test_preprocessed.to_csv(filepath, index=False)

# Train and test the model

See Notebook titanic-kaggle-train-test.ipynb

# Create result for Kaggle

Here we preprocess the Kaggle test data (test.csv) with the pipeline that was fitted on the training data

In [64]:
def load_titanic_test_data():
    """Read the Titanic test set from ``data/test.csv``.

    The path is relative to the current working directory.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    csv_path = os.path.join('data', 'test.csv')
    return pd.read_csv(csv_path)

# Features read from test.csv; this file is only used to create the Kaggle result
X_final = load_titanic_test_data()

In [65]:
# we need to save the PassengerId before preprocessing, because
# preprocess_feature drops that column
x_final_passenger_id = X_final["PassengerId"]

In [66]:
# preprocess the data with the same steps as the training data
X_final = preprocess_feature(X_final)

# apply the pipeline that was fitted on the training data (transform only, no new fit)
X_final = apply_pipeline(X_final, preprocessing_pipeline)

# re-attach the saved PassengerId; the values are matched via the row index,
# which apply_pipeline keeps
X_final["PassengerId"] = x_final_passenger_id

# show the first ten rows of the result
X_final.head(10)

In [67]:
# Save the preprocessed test.csv features (including PassengerId) without the index column
filepath = os.path.join("data", "test_preprocessed.csv")
X_final.to_csv(filepath, index=False)