In [None]:
%%capture
# Install the libraries this notebook uses, then download and unpack the Kaggle
# "titanic" competition archive.  %%capture suppresses the output of these commands.
!pip install pandas scikit-learn numpy matplotlib torch seaborn
!kaggle competitions download titanic
!unzip -o titanic.zip

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sb
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np

# Model I: Birkenhead Model

To get things going, we can make an easy first-pass model built on the knowledge that women and children were prioritized for evacuation under the [Birkenhead drill](https://en.wikipedia.org/wiki/Women_and_children_first).  Simply put, if a passenger is a woman or under the age of 13, the model predicts that the passenger survives.

In [None]:
# Read the training and test splits, then preview a handful of random training rows
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
train.sample(5)

In [None]:
child_age = 12.0


def birkenhead_flags(data: pd.DataFrame, max_child_age: float) -> pd.DataFrame:
    """Encode the two Birkenhead-drill indicators and the resulting prediction.

    Args:
        data (pd.DataFrame): A dataframe with columns "Sex" and "Age"
        max_child_age (float): Oldest age (inclusive) that still counts as a child

    Returns:
        pd.DataFrame: Columns "Sex" (1 = female), "Age" (1 = child) and "predicted"
            (1 when either indicator is set), indexed like ``data``
    """
    flags = pd.DataFrame(index=data.index)
    flags["Sex"] = (data["Sex"] == "female").astype(int)
    # A missing age compares False, so passengers of unknown age are not counted as children
    flags["Age"] = (data["Age"] <= max_child_age).astype(int)
    flags["predicted"] = flags["Sex"] | flags["Age"]
    return flags


# Score the rule on the training data
reduced = birkenhead_flags(train, child_age)
reduced["Survived"] = train["Survived"]
train_acc = (reduced.predicted == reduced.Survived).mean()
sex_only = (reduced.Sex == reduced.Survived).mean()
print(f"Birkenhead drill train accuracy: {train_acc * 100:.2f}%")
print(f"Sex only train accuracy: {sex_only * 100:.2f}%")

# Write out the submission, using exactly the same encoding as for the training data
bh_test = birkenhead_flags(test, child_age)
bh_test["PassengerId"] = test["PassengerId"]
bh_test["Survived"] = bh_test["predicted"]
bh_test.to_csv("birkenhead.csv", columns=["PassengerId", "Survived"], index=False)

In [None]:
#!kaggle competitions submit titanic -f birkenhead.csv -m "Naive Model"

Not bad!  For such a simple model we are nearly 80% accurate.  Adjusting for children didn't make much of a difference, but it did give us an extra half a percent or so.  

# Model II: Logistic Regression with Simple Imputation

Let's get all data points involved by using a simple linear model to classify survival.  We still have to deal with the case where data is missing from some columns, so let's apply the simplest imputation strategy we can: fill missing numeric values (Age, Fare) with the column median and missing categorical values (Embarked) with the most common value for that column, then move on.  This will help establish a baseline for future approaches.

In [None]:
# What columns lack entries?  Count the missing values in each training column.
train.isna().sum()

In [None]:
def simple_data_prep(data: pd.DataFrame) -> pd.DataFrame:
    """Prepare a raw Titanic dataframe for a simple linear model.

    Args:
        data (pd.DataFrame): A dataframe with columns "Sex", "Name", "Cabin", "Ticket",
            "Age", "Fare" and "Embarked"; it is not modified

    Returns:
        pd.DataFrame: A copy with "Sex" encoded as 1 for female, the free-text columns
            dropped, missing values filled, "Embarked" one-hot encoded, "Fare"
            log-scaled and "Age" divided by its maximum
    """
    cleaned = data.copy(deep=True)
    cleaned["Sex"] = (cleaned["Sex"] == "female").astype(int)

    # Drop complex string fields like Name, Cabin, and Ticket
    cleaned = cleaned.drop(columns=["Name", "Cabin", "Ticket"])

    # Impute Age and Fare with the median and Embarked with the most common value.
    # The filled column is assigned back explicitly: `cleaned.Age.fillna(..., inplace=True)`
    # works on a temporary Series and leaves the frame unfilled under pandas Copy-on-Write.
    cleaned["Age"] = cleaned["Age"].fillna(cleaned["Age"].median())
    cleaned["Fare"] = cleaned["Fare"].fillna(cleaned["Fare"].median())
    cleaned["Embarked"] = cleaned["Embarked"].fillna(cleaned["Embarked"].mode().iloc[0])

    # Get dummy variables for Embarked
    cleaned = pd.get_dummies(cleaned, columns=["Embarked"])

    # Log fare to get rid of long tails
    cleaned["Fare"] = np.log10(cleaned["Fare"] + 1)

    # Normalize age by the largest age in this frame
    cleaned["Age"] = cleaned["Age"] / cleaned["Age"].max()
    return cleaned

In [None]:
# Prepare each split on its own (the fill values and the age maximum are therefore
# computed per split), then display the prepared training frame
train_clean = simple_data_prep(train)
test_clean = simple_data_prep(test)
train_clean

In [None]:
# Fit a logistic regression on every prepared column except the label and the id
lgr = LogisticRegression(max_iter=1000, verbose=True)
x_cols = [col for col in train_clean.columns if col not in ("Survived", "PassengerId")]
print(x_cols)
lgr.fit(train_clean[x_cols], y=train_clean.Survived)
train_acc = (lgr.predict(train_clean[x_cols]) == train_clean["Survived"]).mean() * 100
print(f"Accuracy = {train_acc:.2f}")

# Predict survival for the test passengers and write the submission file
test_clean["Survived"] = lgr.predict(test_clean[x_cols])
test_clean[["PassengerId", "Survived"]].to_csv("simple_lgr.csv", index=False)

In [None]:
#!kaggle competitions submit titanic -f simple_lgr.csv -m "Simple imputation and logistic regression"

# Sophisticated Imputation

Before trying out any more sophisticated models, let's try to clean up our data to get the most out of what we have.

## Breaking Down String Fields

String fields like Cabin, Ticket, Name, and Embarked cannot be fed to most models as they are and must be parsed into numerical fields.  Fortunately, these fields have a lot of rich structure that embeds information about the passenger.  This section deals with breaking the information in these fields out into subfields that we can use to impute missing values and ultimately train a model on.

### Passenger Names

Fortunately, passengers' names on the ticket reveal quite a bit of information about their status, which is highly correlated with their age.  For example:

- "Master" is an honorific for young men or boys, meaning the age is likely under 18 years.
- Married women are given "Mrs" while unmarried women or girls are "Miss".
- Some married women have their husband's name in parentheses. Some do not, possibly implying a widow?  Widows tend to be older.

These features in the name field can be extracted to binary variables which can be used to impute ages.

### Tickets

Some tickets are simply numbered, but others have letters or other encodings beforehand.  To extract meaningful information from the ticket field, split it into any letters, which we will call the "ticket modifier", and the digits, or "ticket number".  The modifier is encoded as a binary flag: 0 when the ticket contains no letters and 1 when it does.  Tickets with only letters will have ticket number 0.

### Cabin Number

The least populated field, cabin number seems to encode a number of things - the deck of the ship, the room number, and possibly the number of cabins this passenger has purchased.  Some cabin entries only have a deck associated with them and some have multiple entries.  We can break this field down into three variables: 
- An integer encoding for the deck the passenger was assigned to
- The cabin number or first cabin number in the first cabin entry
- The number of cabin numbers or spaces in the cabin entry

In [None]:
# Determine which fields have missing entries in both datasets
# (training split here; the test split is counted in the next cell)
train.isna().sum()

In [None]:
# Count the missing values in each column of the test split
test.isna().sum()

In [None]:
def process_names(data: pd.DataFrame) -> pd.DataFrame:
    """Process passenger names into several binary columns that represent title information

    Args:
        data (pd.DataFrame): A dataframe with column "Name"

    Returns:
        pd.DataFrame: A copy of ``data`` without "Name" and with the 0/1 columns
            "master", "mrs", "miss", "widow" and "mr" derived from it
    """
    modified = data.copy(deep=True)
    names = data["Name"]

    def has(pattern: str, case: bool = True) -> pd.Series:
        """Return a boolean mask of names matching ``pattern``; missing names never match."""
        return names.str.contains(pattern, case=case, na=False)

    # Develop columns based on the passenger's name
    is_mrs = has("Mrs")
    modified["master"] = has("master", case=False).astype(int)
    modified["mrs"] = is_mrs.astype(int)
    modified["miss"] = has("Miss").astype(int)
    # A "Mrs" without a parenthesised name is treated as a possible widow
    modified["widow"] = (is_mrs & ~has(r"\(")).astype(int)
    # The dot is escaped: as a bare regex, "Mr." also matches the "Mrs" in "Mrs."
    modified["mr"] = has(r"Mr\.").astype(int)
    return modified.drop(columns=["Name"])

In [None]:
def ticket_to_number(ticket):
    """Extract the numeric part of a ticket entry.

    Args:
        ticket: A ticket string such as "A/5 21171", or an already numeric ticket

    Returns:
        int: The last whitespace-separated token made up only of digits; if there is
            none, all digits of the entry joined together; 0 when the entry has no digits
    """
    if isinstance(ticket, (int, float)):
        return int(ticket)

    # Prefer a standalone number so that digits inside the prefix are not glued onto it
    # (joining every digit turned "A/5 21171" into 521171)
    for token in reversed(ticket.split()):
        if token.isdecimal():
            return int(token)

    # No standalone number: fall back to every digit found in the entry
    digits = "".join(filter(str.isdecimal, ticket))
    return int(digits) if digits else 0


def process_ticket(data: pd.DataFrame) -> pd.DataFrame:
    """Replace the "Ticket" column with two numeric columns.

    Args:
        data (pd.DataFrame): A dataframe with a string column "Ticket"

    Returns:
        pd.DataFrame: A copy of ``data`` without "Ticket" and with "ticket_modifier"
            (1 if the ticket contains any letter, else 0) and "ticket_number"
            (the value returned by ``ticket_to_number``)
    """
    tickets = data["Ticket"]
    has_letters = tickets.apply(lambda text: int(any(map(str.isalpha, text))))
    numbers = tickets.apply(ticket_to_number)
    with_fields = data.copy(deep=True).assign(ticket_modifier=has_letters, ticket_number=numbers)
    return with_fields.drop(columns=["Ticket"])

In [None]:
def get_number(cabin_entry):
    """Extract a cabin number if one exists or take the mean of all numbers found

    Args:
        cabin_entry: A cabin string such as "C85" or "C23 C25 C27", a number, or a
            missing value (None / NaN)

    Returns:
        The integer value of a numeric entry, the mean of the numbers found in a
        string entry, or NaN when the entry is missing or contains no digits
    """
    # Missing cabins reach this function as None or a float NaN, and int(nan) raises
    if cabin_entry is None or (isinstance(cabin_entry, float) and np.isnan(cabin_entry)):
        return np.nan
    if isinstance(cabin_entry, (int, float)):
        return int(cabin_entry)

    numbers = []
    for entry in cabin_entry.split(" "):
        digits = "".join(filter(str.isdigit, entry))
        if digits:
            numbers.append(int(digits))

    # Entries that only name a deck (e.g. "D") carry no number
    if not numbers:
        return np.nan
    return sum(numbers) / len(numbers)


def process_cabin(data: pd.DataFrame) -> pd.DataFrame:
    """Replace the "Cabin" column with numeric columns derived from it.

    Args:
        data (pd.DataFrame): A dataframe with column "Cabin" (strings, missing allowed)

    Returns:
        pd.DataFrame: A copy of ``data`` without "Cabin" and with "cabin_entries"
            (number of space-separated cabin entries) and "cabin_level" (1-6 for
            decks A-F); both are NaN where no value could be derived
    """
    modified = data.copy(deep=True)
    cabin = modified["Cabin"]
    has_cabin = cabin.notna()

    # Create both output columns up front so they exist even when nothing matches
    modified["cabin_entries"] = np.nan
    modified["cabin_level"] = np.nan

    # Develop columns based on the cabin
    modified.loc[has_cabin, "cabin_entries"] = cabin[has_cabin].apply(lambda x: len(str(x).split(" ")))

    # Develop an encoding for the deck level, leave missing ones as NA.  The mask is
    # built on the full column (na=False) so it always shares the frame's index; when an
    # entry names several decks, the last matching letter in this list wins.
    for idx, level in enumerate(["a", "b", "c", "d", "e", "f"]):
        on_deck = cabin.str.contains(level, case=False, na=False)
        modified.loc[on_deck, "cabin_level"] = idx + 1

    return modified.drop(columns=["Cabin"])

In [None]:
def process_string_fields(data: pd.DataFrame) -> pd.DataFrame:
    """Turn every string column of the raw data into numeric columns.

    Args:
        data (pd.DataFrame): A dataframe with columns "Sex", "Embarked", "Name",
            "Cabin" and "Ticket"

    Returns:
        pd.DataFrame: A new dataframe with "Sex" encoded as 1 for female, "Embarked"
            expanded into dummy columns, and the name, cabin and ticket columns
            replaced by the fields the respective helpers extract
    """
    encoded = data.copy(deep=True)
    encoded["Sex"] = encoded["Sex"].eq("female").astype(int)
    encoded = pd.get_dummies(encoded, columns=["Embarked"])

    # Apply the per-field extractors in the same fixed order: names, cabin, ticket
    for extract in (process_names, process_cabin, process_ticket):
        encoded = extract(encoded)
    return encoded

Now that we have extracted values from string fields, let's look at correlations between different variables prior to imputation.  This will show us which variables may be important in the broader goal of classification.

In [None]:
# Correlate all extracted fields (identifier column excluded) and plot them as a heatmap
train_imputed = process_string_fields(train).drop(columns='PassengerId')
corr = train_imputed.corr()
sb.set(rc={'figure.figsize': (20, 20)})
sb.heatmap(corr, cmap="Blues", annot=True)

As expected, there are major correlations between survival and sex, class, fare, and extracted proxies for these characteristics such as title.  There appear to be only very weak correlations between survival and the cabin, embarked, and ticket variables, which isn't terribly surprising, but some of these variables do correlate strongly with age, so we can leverage them to impute age using nearest-neighbor methods.

In [None]:
# Columns that preprocess() keeps; every other column is dropped after imputation
desired_columns= ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'master', 'mrs', 'miss', 'widow', 'mr']

In [None]:
def preprocess(
    data: pd.DataFrame,
    n_neighbors: int = 3,
    impute_exclude=("PassengerId", "Survived"),
) -> pd.DataFrame:
    """Extract numeric fields from the raw data and fill missing values with KNN imputation.

    Args:
        data (pd.DataFrame): A raw Titanic dataframe (train or test split)
        n_neighbors (int): Number of neighbours used by the KNN imputer
        impute_exclude: Columns that are neither imputed nor used to find neighbours.
            The label and the row identifier are excluded by default so the label does
            not leak into imputed features and both splits are imputed the same way.

    Returns:
        pd.DataFrame: The imputed dataframe restricted to the columns listed in
            ``desired_columns``
    """
    # Process string fields (this already returns a new frame, ``data`` is untouched)
    modified = process_string_fields(data)

    # Impute on the feature columns only
    feature_cols = [x for x in modified.columns if x not in impute_exclude]
    imputer = KNNImputer(n_neighbors=n_neighbors)
    modified[feature_cols] = imputer.fit_transform(modified[feature_cols].astype(float))

    # Drop unwanted columns
    return modified.drop(columns=[x for x in modified.columns if x not in desired_columns])

Before training on this data, we have to normalize values as linear models do not work well with variables of different scale.  For large, long tailed distributions like Fare, we can take the log and normalize accordingly.  For all others, a simple rescaling will do the trick.

In [None]:
def normalize(data, reference=None):
    """Log-scale "Fare" and min-max scale every column.

    Args:
        data (pd.DataFrame): A numeric dataframe with a "Fare" column; it is not modified
        reference (pd.DataFrame, optional): An un-normalized frame whose minimum and
            maximum are used for scaling, e.g. the training split when normalizing the
            test split.  Defaults to ``data`` itself.

    Returns:
        pd.DataFrame: A scaled copy of ``data``.  Columns that are constant in the
            reference, or absent from it, are left unscaled.
    """
    modified = data.copy(deep=True)

    # Normalize fare by log10
    modified["Fare"] = np.log10(modified["Fare"] + 1)

    if reference is None:
        stats = modified
    else:
        # Bring the reference fare onto the same log scale before taking its range
        stats = reference.copy(deep=True)
        stats["Fare"] = np.log10(stats["Fare"] + 1)

    # Normalize all columns
    for column in modified.columns:
        if column not in stats.columns:
            continue
        lowest = stats[column].min()
        span = stats[column].max() - lowest
        if span != 0:
            modified[column] = (modified[column] - lowest) / span
    return modified

In [None]:
# Build the model-ready frames for both splits.
# NOTE(review): each call scales with the min/max of the frame it is given, so the
# train and test features are not scaled with the same constants.
train_norm = normalize(preprocess(train))
test_norm = normalize(preprocess(test))

After formatting our training data, we can now fit a number of models to check which may be the best classifier.  In trials, the difference between a random forest, an SVM, a kernelized SVM, and logistic regression does not seem to be significant.  Each shows some amount of overfitting, which may be a product of our imputation strategy.

In [None]:
# Seed the split and the random forest so a fresh run reproduces the same numbers
SEED = 42
train_split, val_split = train_test_split(train_norm, test_size=.2, random_state=SEED)

# Every classifier gets a unique label for its output file: both SVCs share a class
# name, so naming the file after the class let the second SVC overwrite the first
classifier_labels = ['logisticregression', 'svc_rbf', 'svc_poly', 'randomforestclassifier']
classifiers = [
    LogisticRegression(max_iter=500),
    SVC(),
    SVC(kernel='poly'),
    RandomForestClassifier(random_state=SEED),
]

feature_cols = [x for x in train_split.columns if x not in ['Survived', 'PassengerId']]
train_x = train_split[feature_cols]
train_y = train_split['Survived']
val_x = val_split[feature_cols]
val_y = val_split['Survived']

for label, classifier in zip(classifier_labels, classifiers):
    print(f'Fitting {classifier.__class__.__name__} ({label})')
    classifier.fit(train_x, train_y)
    train_acc = np.mean(classifier.predict(train_x) == train_y) * 100
    val_acc = np.mean(classifier.predict(val_x) == val_y) * 100
    print(f'  train accuracy: {train_acc:.2f}%')
    print(f'  validation accuracy: {val_acc:.2f}%')

    # Select the test columns by name so they line up with the training features
    test_frame = test.copy(deep=True)
    test_frame['Survived'] = classifier.predict(test_norm[feature_cols]).astype(int)
    test_frame[['PassengerId', 'Survived']].to_csv(f'{label}_output.csv', index=False)

In [None]:
#!kaggle competitions submit titanic -f <output> -m "Age imputation with KNN"