# Titanic - Machine Learning from Disaster

In [1]:
import os, sys

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv("./data/train.csv")
train_X = train.drop(columns=["Survived", "PassengerId"])
train_y = train["Survived"]

In [4]:
test = pd.read_csv("./data/test.csv")
test_X = test.drop(columns=["PassengerId"])

## Data Preprocessing

In [5]:
def encode_sex(sex):
    if sex == "male":
        return 0
    else:
        return 1

In [6]:
def encode_embarked(embarked):
    if embarked == "S":
        return 0
    elif embarked == "C":
        return 1
    elif embarked == "Q":
        return 2
    else:
        return 3

In [7]:
def preprocess(df):
    copy = df.copy(deep=True)
    
    # drop ["Name", "Ticket", "Cabin"] features
    copy = copy.drop(columns=["Name", "Ticket", "Cabin"], axis=1)
    
    # encode ["Sex", "Embarked"] feature
    copy["Sex"] = copy["Sex"].apply(encode_sex)
    copy["Embarked"] = copy["Embarked"].apply(encode_embarked)
    
    # impute ["Age"] using mean
    copy["Age"] = copy["Age"].fillna(int(copy["Age"].mean()))
    
    # impute ["Fare"] using mean
    copy["Fare"] = copy["Fare"].fillna(copy["Fare"].mean())
    
    return copy

In [8]:
train_X_proc = preprocess(train_X)
test_X_proc = preprocess(test_X)

## Random Forest Baseline

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [10]:
clf = RandomForestClassifier()

In [11]:
np.mean(cross_val_score(clf, train_X_proc, train_y, cv=5))

0.8092628272986577

In [12]:
clf.fit(train_X_proc, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Prepare "submission.csv"

In [13]:
def prepare_submission(clf, test_X, ids):
    predictions = clf.predict(test_X)
    submission = pd.DataFrame({"PassengerId":ids, "Survived":predictions})
    return submission

In [14]:
submission = prepare_submission(clf, test_X_proc, test["PassengerId"])

In [15]:
submission.to_csv("./submissions/submission.csv", index=False)