In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, StandardScaler, MinMaxScaler

# Build a Pipeline

## Central methods to call on the data

In [2]:
def load_titanic_data():
    """Read the labelled Titanic training set.

    Returns:
        pandas.DataFrame: the contents of ``data/train.csv``, resolved
        relative to the current working directory.
    """
    csv_path = os.path.join('data', 'train.csv')
    titanic_frame = pd.read_csv(csv_path)
    return titanic_frame

def split_data(df, test_size=0.2, random_state=42, stratify_col="Pclass", label_col="Survived"):
    """Split a labelled frame into stratified train/test features and labels.

    Args:
        df: DataFrame containing the feature columns, ``stratify_col`` and
            ``label_col``.
        test_size: fraction of rows held out for the test split.
        random_state: seed passed to ``train_test_split`` for reproducibility.
        stratify_col: column whose class proportions are preserved in both
            splits.
        label_col: name of the target column, removed from the feature frames.

    Returns:
        tuple: ``(train_features, train_label, test_features, test_label)``.
    """
    train_part, test_part = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=random_state,
    )
    return (
        train_part.drop(columns=label_col),
        train_part[label_col],
        test_part.drop(columns=label_col),
        test_part[label_col],
    )

In [16]:
# Load the labelled training data
titanic = load_titanic_data()
# Hold out a test split; features and label come back separately for each split
X_train, y_train, X_test, y_test = split_data(titanic)

## Analyze the data

See Notebook titanic-kaggle-analyze

## Preprocess the data

In [4]:
def drop_columns(df):
    """Return ``df`` without the columns that are not used as features.

    The removed columns are PassengerId, Name, Ticket, Cabin, Age and
    Embarked. A new frame is returned; ``df`` itself is left untouched.
    """
    unused = ["PassengerId", "Name", "Ticket", "Cabin", "Age", "Embarked"]
    return df.drop(columns=unused)

In [5]:
def sex_pipeline():
    """Build the pipeline used for the ``Sex`` column.

    Returns:
        Pipeline: a single "encode" step holding an ``OrdinalEncoder`` that
        turns the categorical values into numbers.
    """
    steps = [("encode", OrdinalEncoder())]
    return Pipeline(steps=steps)

In [6]:
def age_pipeline():
    """Build the pipeline used for the ``Age`` column.

    Returns:
        Pipeline: an "impute" step that fills missing values with the median
        (the column has many nulls), followed by a "scale" step that
        standardises the values.
    """
    median_imputer = SimpleImputer(strategy="median")
    standardiser = StandardScaler()
    return Pipeline(steps=[("impute", median_imputer), ("scale", standardiser)])

In [7]:
def transform_sipsp_parch(df):
    """Replace ``SibSp`` and ``Parch`` with a single ``Alone`` indicator.

    ``Alone`` is 1 for rows where both ``SibSp`` and ``Parch`` equal 0 and
    0 otherwise.

    Args:
        df: DataFrame containing the ``SibSp`` and ``Parch`` columns.

    Returns:
        pandas.DataFrame: a new frame without ``SibSp``/``Parch`` and with the
        integer ``Alone`` column. The input frame is not modified.
    """
    travels_alone = (df["SibSp"] == 0) & (df["Parch"] == 0)
    # Build the result on a fresh frame (drop + assign both return new
    # objects) so the caller's DataFrame does not silently gain a column.
    return (
        df.drop(columns=["SibSp", "Parch"])
        .assign(Alone=travels_alone.astype("int64"))
    )

In [8]:
def transform_sipsp_parch_only_0_1(df):
    """Replace ``SibSp`` and ``Parch`` with two binary indicator columns.

    ``Wo_SibSp`` is 1 where ``SibSp`` equals 0 (0 otherwise) and ``Wo_Parch``
    is 1 where ``Parch`` equals 0 (0 otherwise).

    Args:
        df: DataFrame containing the ``SibSp`` and ``Parch`` columns.

    Returns:
        pandas.DataFrame: a new frame without ``SibSp``/``Parch`` and with the
        integer ``Wo_SibSp`` and ``Wo_Parch`` columns. The input frame is not
        modified.
    """
    without_sibsp = df["SibSp"] == 0
    without_parch = df["Parch"] == 0
    # Build the result on a fresh frame (drop + assign both return new
    # objects) so the caller's DataFrame does not silently gain columns.
    return (
        df.drop(columns=["SibSp", "Parch"])
        .assign(
            Wo_SibSp=without_sibsp.astype("int64"),
            Wo_Parch=without_parch.astype("int64"),
        )
    )

In [9]:
def embarked_pipeline():
    """Build the pipeline used for the ``Embarked`` column.

    Returns:
        Pipeline: an "impute" step that fills missing values with the most
        frequent value, followed by an "encode" step holding an
        ``OrdinalEncoder``.
    """
    mode_imputer = SimpleImputer(strategy="most_frequent")
    encoder = OrdinalEncoder()
    return Pipeline(steps=[("impute", mode_imputer), ("encode", encoder)])


In [13]:
def fare_pipeline():
    """Build the pipeline used for the ``Fare`` column.

    Returns:
        Pipeline: an "impute" step that fills missing values with the median,
        followed by a "scale" step that standardises the values.
    """
    median_imputer = SimpleImputer(strategy="median")
    standardiser = StandardScaler()
    return Pipeline(steps=[("impute", median_imputer), ("scale", standardiser)])

In [14]:
def get_logarithm(x):
    """Return the natural logarithm of ``x`` as computed by ``np.log``."""
    log_value = np.log(x)
    return log_value

In [17]:
def preprocess_feature(X):
    """Apply the stateless preprocessing steps that run before the pipeline.

    Steps:
        1. Remove the unused columns via ``drop_columns``.
        2. Turn ``Fare`` values of 0 into NaN, so the imputer in the pipeline
           fills them afterwards.
        3. Replace ``Fare`` by its logarithm (``get_logarithm``).

    ``SibSp`` and ``Parch`` are left as they are; the
    ``transform_sipsp_parch_only_0_1`` step is currently not applied.

    Args:
        X: raw feature DataFrame.

    Returns:
        pandas.DataFrame: the preprocessed features.
    """
    features = drop_columns(X)

    fare_without_zeros = features['Fare'].replace(0, np.nan)
    features['Fare'] = fare_without_zeros.apply(get_logarithm)

    return features


def apply_pipeline(X, pipeline):
    """Transform ``X`` with an already fitted pipeline and return a DataFrame.

    Only ``transform`` is called here, never ``fit``: fitting (learning the
    parameters) happens outside this function, so the same learned parameters
    are reused for every data set passed in.

    Args:
        X: DataFrame to transform.
        pipeline: fitted object exposing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        pandas.DataFrame: the transformed values, with the pipeline's output
        feature names as columns and the index of ``X``.
    """
    transformed = pipeline.transform(X)
    feature_names = pipeline.get_feature_names_out()
    return pd.DataFrame(transformed, columns=feature_names, index=X.index)


# Column-specific transformers as (name, pipeline, columns) triples.
# Age and Embarked have no entry: both columns are removed by drop_columns().
transformers = [
    ("Sex", sex_pipeline(), ["Sex"]),
    ("Fare", fare_pipeline(), ["Fare"]),
]

# Columns without an explicit transformer are passed through unchanged
ct = ColumnTransformer(transformers=transformers, remainder="passthrough")

# Wrap the ColumnTransformer in a pipeline
preprocessing_pipeline = Pipeline(steps=[("preprocessor", ct)])

# NOTE: X_train is overwritten below, so this cell can only be run once per
# fresh split (re-run the split cell before running it again).
X_train = preprocess_feature(X_train)

# Learn the preprocessing parameters from the training features only
preprocessing_pipeline.fit(X_train)

# Transform the training features with the fitted pipeline
X_train = apply_pipeline(X_train, preprocessing_pipeline)

In [18]:
# Inspect the first 10 rows of the transformed training features
X_train.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch
820,0.0,1.71094,1.0,1.0,1.0
439,1.0,-0.63935,2.0,0.0,0.0
821,1.0,-0.846125,3.0,0.0,0.0
403,1.0,-0.196726,3.0,1.0,0.0
343,1.0,-0.409786,2.0,0.0,0.0
514,1.0,-1.001615,3.0,0.0,0.0
40,0.0,-0.749759,3.0,1.0,0.0
101,1.0,-0.945735,3.0,0.0,0.0
93,1.0,0.083715,3.0,1.0,2.0
81,1.0,-0.746927,3.0,0.0,0.0


In [19]:
# Save the preprocessed training features together with the label as
# train_preprocessed.csv

filepath = os.path.join('data', 'train_preprocessed.csv')
# .assign returns a new frame (label aligned on the index), so the "Survived"
# column is never added to X_train itself. pd.DataFrame(X_train) followed by
# a column assignment may share storage with X_train depending on the pandas
# version, which would leak the label into the feature frame.
X_train_preprocessed = X_train.assign(Survived=y_train)
X_train_preprocessed.to_csv(filepath, index=False)

# Preprocess and save the test data

In [20]:
# Run the held-out test features through the stateless preprocessing and then
# through the pipeline that was fitted on the training data (transform only).
X_test = apply_pipeline(preprocess_feature(X_test), preprocessing_pipeline)

X_test.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch
132,0.0,-0.292412,3.0,1.0,0.0
3,0.0,1.102797,1.0,1.0,0.0
270,1.0,0.524314,1.0,0.0,0.0
421,1.0,-0.968087,3.0,0.0,0.0
154,1.0,-1.028227,3.0,0.0,0.0
292,1.0,-0.420172,2.0,0.0,0.0
304,1.0,-0.924946,3.0,0.0,0.0
202,1.0,-1.155522,3.0,0.0,0.0
810,1.0,-0.946866,3.0,0.0,0.0
108,1.0,-0.945735,3.0,0.0,0.0


In [21]:
# Save the preprocessed held-out test features together with the label
filepath = os.path.join('data', 'train_test_preprocessed.csv')
# .assign returns a new frame (label aligned on the index), so the "Survived"
# column is never added to X_test itself. pd.DataFrame(X_test) followed by a
# column assignment may share storage with X_test depending on the pandas
# version, which would leak the label into the feature frame.
X_test_preprocessed = X_test.assign(Survived=y_test)
X_test_preprocessed.to_csv(filepath, index=False)

# Train and test the model

See Notebook titanic-kaggle-train-test.ipynb

# Preprocess the Data which will be submitted to Kaggle

Here we preprocess the test data (the data which will be submitted to Kaggle)

In [22]:
def load_titanic_test_data():
    """Read the Titanic data set that is submitted to Kaggle.

    Returns:
        pandas.DataFrame: the contents of ``data/test.csv``, resolved
        relative to the current working directory.
    """
    csv_path = os.path.join('data', 'test.csv')
    submission_frame = pd.read_csv(csv_path)
    return submission_frame

# Load the raw features of the data set that will be submitted to Kaggle
X_final = load_titanic_test_data()

In [23]:
# Keep the PassengerId aside: drop_columns() removes it during preprocessing,
# and it is attached to the preprocessed frame again afterwards.
x_final_passenger_id = X_final["PassengerId"]

In [24]:
# Run the submission features through the stateless preprocessing and then
# through the pipeline that was fitted on the training data (transform only).
X_final = apply_pipeline(preprocess_feature(X_final), preprocessing_pipeline)

# Re-attach the passenger ids that were set aside before preprocessing
X_final["PassengerId"] = x_final_passenger_id

X_final.head(10)

Unnamed: 0,Sex__Sex,Fare__Fare,remainder__Pclass,remainder__SibSp,remainder__Parch,PassengerId
0,1.0,-0.95484,3.0,0.0,0.0,892
1,0.0,-1.075171,3.0,1.0,0.0,893
2,1.0,-0.725919,2.0,0.0,0.0,894
3,1.0,-0.846125,3.0,0.0,0.0,895
4,0.0,-0.470373,3.0,1.0,1.0,896
5,1.0,-0.7785,3.0,0.0,0.0,897
6,0.0,-0.982655,3.0,0.0,0.0,898
7,1.0,0.45263,2.0,1.0,1.0,899
8,0.0,-1.040541,3.0,0.0,0.0,900
9,1.0,0.255917,3.0,2.0,0.0,901


In [25]:
# Write the preprocessed submission features (including PassengerId) to disk;
# the DataFrame index is not written.
filepath = os.path.join("data", "test_preprocessed.csv")
X_final.to_csv(path_or_buf=filepath, index=False)