# Titanic Survivors

Data analysis and prediction practice

In [1]:
import pandas as pd
import numpy as np

In [2]:
training = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [3]:
training.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Naive Prediction

Let's try predicting the survival from just gender and class, try to get to submitting SOME prediction as quickly as possible.

In [78]:
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers as reg
import csv

def prep_input(df):
    pclass = np.array(pd.get_dummies(df['Pclass']))
    sex = np.array(pd.get_dummies(df['Sex']))
    return np.concatenate((pclass, sex), axis=1)

def prep_input_with_age(df):
    x_train_1 = prep_input(df)
    age = np.array(list(df['Age']))
    just_nums = age[~np.isnan(age)]
    mu = just_nums.mean()
    sigma = just_nums.std()
    age[np.isnan(age)] = mu
    X_age = ((age - mu) / sigma).reshape(len(age), 1)
    X2_train = np.concatenate((x_train_1, X_age), axis=1)
    return X2_train

def simple_net(input_units):
    model = Sequential()
    model.add(Dense(units=15,activation='relu', input_dim=input_units))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model

def regularized_net(input_units):
    model = Sequential()
    model.add(Dense(units=15,activation='relu', input_dim=input_units, kernel_regularizer=reg.l2(0.001)))
    model.add(Dense(units=1, activation='sigmoid', kernel_regularizer=reg.l2(0.001)))
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model

def naive_net():
    return simple_net(5)

def age_net():
    return simple_net(6)

def naive_reg_net():
    return simple_net(5)

def age_reg_net():
    return simple_net(6)
    

def write_prediction_file(predictions, data):
    ids = np.array(list(data['PassengerId'])).reshape(len(data['PassengerId']), 1).astype(int)
    output = np.concatenate((ids, predictions), axis=1)
    with open("predictions.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerow(["PassengerId","Survived"])
        for row in output:
            writer.writerow(row)

In [6]:
x_train = prep_input(training)
y_train = np.array(list(training['Survived']))

In [44]:
model = simple_net(5)
model.fit(x_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1186f2588>

In [45]:
x_test = prep_input(test)
predictions = np.round(model.predict(x_test)).astype(int)

In [48]:
write_prediction_file(predictions, test)

## Including Age

Intuitively, children are allowed to get on boats first.  Including properly normalized age should be a useful feature.

In [50]:
X2_train = prep_input_with_age(training)

In [55]:
model2 = simple_net(6)
model2.fit(X2_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x118fa7160>

In [57]:
x2_test = prep_input_with_age(test)
predictions2 = np.round(model2.predict(x2_test)).astype(int)

In [59]:
write_prediction_file(predictions2, test)

In [33]:
x_train.shape

(891, 5)

In [34]:
X_age.shape

(891,)

## Model validation

so far no work has gone into validating the accuracy of these models on data it hasn't seen yet, which could allow overfitting.  Let's do some cross-validation.

In [72]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [79]:
naive_model = KerasClassifier(build_fn=naive_net, epochs=10, batch_size=32, verbose=0)
naive_reg_model = KerasClassifier(build_fn=naive_reg_net, epochs=10, batch_size=32, verbose=0)
age_model = KerasClassifier(build_fn=age_net, epochs=10, batch_size=32, verbose=0)
age_reg_model = KerasClassifier(build_fn=age_reg_net, epochs=10, batch_size=32, verbose=0)

naive_data = prep_input(training)
age_data = prep_input_with_age(training)
y_train = np.array(list(training['Survived']))

In [80]:
cross_val_score(naive_model, naive_data, y_train, cv=4)

array([ 0.70403588,  0.56502242,  0.61883408,  0.51801802])

In [81]:
cross_val_score(naive_reg_model, naive_data, y_train, cv=4)

array([ 0.76233184,  0.78923766,  0.70403587,  0.73423424])

In [82]:
cross_val_score(age_model, age_data, y_train, cv=4)

array([ 0.77130045,  0.69955157,  0.73542601,  0.72072072])

In [83]:
cross_val_score(age_reg_model, age_data, y_train, cv=4)

array([ 0.65022422,  0.60538117,  0.66816144,  0.77027027])