In [21]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, StandardScaler, MinMaxScaler

# Build a Pipeline

## Central methods to call on the data

In [22]:
def load_titanic_data():
    """Read ``data/train.csv`` into a DataFrame.

    The path is relative to the current working directory.
    """
    csv_path = os.path.join('data', 'train.csv')
    return pd.read_csv(csv_path)

def split_data(df):
    """Split the labelled frame into train/test features and labels.

    20% of the rows are held out. The split is stratified on ``Pclass`` and
    uses ``random_state=42`` so it is reproducible.

    Returns
    -------
    tuple
        ``(train_features, train_label, test_features, test_label)``; the label
        is the ``Survived`` column, the features are all remaining columns.
    """
    train_part, test_part = train_test_split(
        df,
        test_size=0.2,
        stratify=df["Pclass"],
        random_state=42,
    )

    def _features_and_label(part):
        # separate the target column from everything else
        return part.drop(columns='Survived'), part['Survived']

    train_features, train_label = _features_and_label(train_part)
    test_features, test_label = _features_and_label(test_part)
    return train_features, train_label, test_features, test_label

In [23]:
titanic = load_titanic_data()
# "titanic" is the complete labelled train.csv; split_data divides it into a
# training part (X_train, y_train) and a held-out part (X_test, y_test)
X_train, y_train, X_test, y_test = split_data(titanic)

## Analyze the data

See Notebook titanic-kaggle-analyze

## Preprocess the data

In [24]:
def drop_columns(df):
    """Return a new frame without the columns that are not used as features.

    Removed: PassengerId, Name, Ticket, Cabin, Age and Embarked.
    """
    unused = ["PassengerId", "Name", "Ticket", "Cabin", "Age", "Embarked"]
    return df.drop(columns=unused)

In [25]:
def sex_pipeline():
    """Build the pipeline for the ``Sex`` column.

    Its single step, an ``OrdinalEncoder``, turns the categories into numbers.
    """
    steps = [("encode", OrdinalEncoder())]
    return Pipeline(steps)

In [26]:
def age_pipeline():
    """Build the pipeline for the ``Age`` column.

    Null values are imputed with the median, then the column is standardized.
    """
    median_imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    return Pipeline([("impute", median_imputer), ("scale", scaler)])

In [27]:
def transform_sipsp_parch(df):
    """Replace ``SibSp`` and ``Parch`` with a single 0/1 column ``Alone``.

    ``Alone`` is 1 where both ``SibSp`` and ``Parch`` equal 0, otherwise 0.

    The input frame is not modified; a new DataFrame is returned. Writing the
    new column into ``df`` itself would leave an ``Alone`` column behind in the
    caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` without ``SibSp`` and ``Parch``, plus ``Alone``.
    """
    is_alone = (df["SibSp"] == 0) & (df["Parch"] == 0)
    return (
        df
        .assign(Alone=is_alone.astype("int64"))  # assign() works on a copy
        .drop(columns=["SibSp", "Parch"])
    )

In [28]:
def transform_sipsp_parch_only_0_1(df):
    """Replace ``SibSp`` and ``Parch`` with two 0/1 indicator columns.

    ``Wo_SibSp`` is 1 where ``SibSp`` equals 0, otherwise 0.
    ``Wo_Parch`` is 1 where ``Parch`` equals 0, otherwise 0.

    The input frame is not modified; a new DataFrame is returned. Writing the
    new columns into ``df`` itself would leave them behind in the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` without ``SibSp`` and ``Parch``, plus
        ``Wo_SibSp`` and ``Wo_Parch``.
    """
    without_sibsp = (df["SibSp"] == 0).astype("int64")
    without_parch = (df["Parch"] == 0).astype("int64")
    return (
        df
        .assign(Wo_SibSp=without_sibsp, Wo_Parch=without_parch)  # works on a copy
        .drop(columns=["SibSp", "Parch"])
    )

In [29]:
def embarked_pipeline():
    """Build the pipeline for the ``Embarked`` column.

    Null values are imputed with the most frequent value; the result is then
    encoded with an ``OrdinalEncoder``.
    """
    steps = [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OrdinalEncoder()),
    ]
    return Pipeline(steps)


In [30]:
def fare_pipeline():
    """Build the pipeline for the ``Fare`` column.

    Null values are imputed with the median, then the column is standardized.
    """
    median_imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    return Pipeline([("impute", median_imputer), ("scale", scaler)])

In [31]:
def preprocess_feature(X):
    """Apply the pipeline-independent preprocessing steps to a feature frame.

    1. Drop the unused columns (``drop_columns``).
    2. Replace ``Fare`` values of 0 with NaN, so that the imputer in the
       pipeline fills them in afterwards.

    ``SibSp`` and ``Parch`` are kept unchanged: the 0/1 indicator transform
    ``transform_sipsp_parch_only_0_1`` is currently not applied here.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame; must contain ``Fare`` and the columns removed by
        ``drop_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` itself is not modified.
    """
    # drop the columns (returns a new frame, so the assignment below does not touch the input)
    X = drop_columns(X)

    # since we have "0" values in Fare, we replace them with "NaN"
    # (the imputer in the pipeline will change those afterwards)
    X['Fare'] = X['Fare'].replace(0, np.nan)

    return X


def apply_pipeline(X, pipeline):
    """Transform ``X`` with ``pipeline`` and wrap the result in a DataFrame.

    Only ``transform`` is called here, never ``fit``. The column names are taken
    from ``pipeline.get_feature_names_out()`` and the index of ``X`` is kept.
    """
    transformed = pipeline.transform(X)
    feature_names = pipeline.get_feature_names_out()
    return pd.DataFrame(transformed, columns=feature_names, index=X.index)


# Column-wise preprocessing: (name, pipeline, columns the pipeline is applied to).
# The Age and Embarked entries are commented out; drop_columns currently
# removes both columns anyway.
transformers = [
    ('Sex', sex_pipeline(), ['Sex']),
    #('Age', age_pipeline(), ['Age']),
    #('Embarked', embarked_pipeline(), ['Embarked']),
    ('Fare', fare_pipeline(), ['Fare']),
]

# Columns without an entry above are passed through unchanged
ct = ColumnTransformer(transformers=transformers, remainder="passthrough")

# Wrap the ColumnTransformer in a pipeline
preprocessing_pipeline = Pipeline(steps=[("preprocessor", ct)])

# Pipeline-independent preprocessing of the training features comes first ...
X_train = preprocess_feature(X_train)

# ... then the pipeline is fitted on the training features only
preprocessing_pipeline.fit(X_train)

# ... and finally applied to them (transform only)
X_train = apply_pipeline(X_train, preprocessing_pipeline)

In [32]:
# Preview the first ten rows of the transformed training features
X_train.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch
820,0.0,1.257206,1.0,1.0,1.0
439,1.0,-0.45455,2.0,0.0,0.0
821,1.0,-0.492446,3.0,0.0,0.0
403,1.0,-0.344214,3.0,1.0,0.0
343,1.0,-0.402991,2.0,0.0,0.0
514,1.0,-0.516507,3.0,0.0,0.0
40,0.0,-0.475689,3.0,1.0,0.0
101,1.0,-0.508258,3.0,0.0,0.0
93,1.0,-0.246767,3.0,1.0,2.0
81,1.0,-0.475173,3.0,0.0,0.0


In [33]:
# Save the preprocessed training features together with their label
# as train_preprocessed.csv

filepath = os.path.join('data', 'train_preprocessed.csv')
# assign() returns a new frame. pd.DataFrame(X_train) does not copy by default
# and may share its data with X_train; adding "Survived" to such a frame can
# also add the label column to X_train.
X_train_preprocessed = X_train.assign(Survived=y_train)
X_train_preprocessed.to_csv(filepath, index=False)

# Preprocess and save the test data

In [34]:
# preprocess the held-out split (same pipeline-independent steps as for the training features)
X_test = preprocess_feature(X_test)

# apply the pipeline fitted on the training features (transform only, no re-fit)
X_test = apply_pipeline(X_test, preprocessing_pipeline)

# preview the first ten transformed rows
X_test.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch
132,0.0,-0.372056,3.0,1.0,0.0
3,0.0,0.424014,1.0,1.0,0.0
270,1.0,-0.031767,1.0,0.0,0.0
421,1.0,-0.511609,3.0,0.0,0.0
154,1.0,-0.520287,3.0,0.0,0.0
292,1.0,-0.405569,2.0,0.0,0.0
304,1.0,-0.505078,3.0,0.0,0.0
202,1.0,-0.537131,3.0,0.0,0.0
810,1.0,-0.508429,3.0,0.0,0.0
108,1.0,-0.508258,3.0,0.0,0.0


In [35]:
# Save the preprocessed held-out features together with their label
filepath = os.path.join('data', 'train_test_preprocessed.csv')
# assign() returns a new frame. pd.DataFrame(X_test) does not copy by default
# and may share its data with X_test; adding "Survived" to such a frame can
# also add the label column to X_test.
X_test_preprocessed = X_test.assign(Survived=y_test)
X_test_preprocessed.to_csv(filepath, index=False)

# Train and test the model

See Notebook titanic-kaggle-train-test.ipynb

# Create result for Kaggle

Here we preprocess the Kaggle test data (`data/test.csv`) with the pipeline that was fitted on the training data.

In [36]:
def load_titanic_test_data():
    """Read ``data/test.csv`` into a DataFrame.

    The path is relative to the current working directory.
    """
    csv_path = os.path.join('data', 'test.csv')
    return pd.read_csv(csv_path)

# test.csv is loaded as X_final; X_test already names the hold-out split taken from train.csv
X_final = load_titanic_test_data()

In [37]:
# we need to save the PassengerId: drop_columns removes it during
# preprocessing, and it is attached to the transformed frame again afterwards
x_final_passenger_id = X_final["PassengerId"]

In [38]:
# preprocess the data (same pipeline-independent steps as for the training features)
X_final = preprocess_feature(X_final)

# apply the pipeline fitted on the training features (transform only, no re-fit)
X_final = apply_pipeline(X_final, preprocessing_pipeline)

# re-attach the saved PassengerId; apply_pipeline keeps the row index, so the values line up
X_final["PassengerId"] = x_final_passenger_id

# preview the first ten rows
X_final.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch,PassengerId
0,1.0,-0.509631,3.0,0.0,0.0,892
1,0.0,-0.526732,3.0,1.0,0.0,893
2,1.0,-0.471306,2.0,0.0,0.0,894
3,1.0,-0.492446,3.0,0.0,0.0,895
4,0.0,-0.417685,3.0,1.0,1.0,896
5,1.0,-0.480845,3.0,0.0,0.0,897
6,0.0,-0.513756,3.0,0.0,0.0,898
7,1.0,-0.073014,2.0,1.0,1.0,899
8,0.0,-0.522005,3.0,0.0,0.0,900
9,1.0,-0.173038,3.0,2.0,0.0,901


In [39]:
# Write the transformed features plus PassengerId to data/test_preprocessed.csv (row index omitted)
filepath = os.path.join("data", "test_preprocessed.csv")
X_final.to_csv(filepath, index=False)