# Titanic: Machine Learning from Disaster

## Get Train/Test Data

In [1]:
import pandas as pd

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [2]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Scale/Transform Data

### Create DataFrameSelector to select Numerical/Categorical Features

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

### Create pipeline for numerical features

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", Imputer(strategy="median")),
    ])

In [5]:
num_pipeline.fit_transform(train_data)

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])

### Create imputer to handle null categorical values

In [6]:
# We also need an imputer for the string categorical columns
# (The regular imputer does not work on those)
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                       index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [7]:
from future_encoders import OneHotEncoder

### Create pipeline for categorical features

In [8]:
cat_pipeline = Pipeline([
    ('select_cat', DataFrameSelector(['Pclass', 'Sex', 'Embarked'])),
    ('imputer', MostFrequentImputer()),
    ('cat_encoder', OneHotEncoder(sparse=False))
])

### Merge numerical and categorical pipelines together

In [9]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

### Apply pipeline transformations to training and testing data

In [10]:
X_train = preprocess_pipeline.fit_transform(train_data)
y_train = train_data['Survived']
X_test = preprocess_pipeline.transform(test_data)

In [11]:
accuracy = []

## Create Neural Network

In [12]:
import tensorflow as tf

tf.reset_default_graph()
n_inputs = X_train.shape[1] #12
n_hidden1 = 25
n_hidden2 = 15
n_outputs = 2

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int64, shape=(None), name='y')

  from ._conv import register_converters as _register_converters


In [13]:
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.layers import dropout

is_training = tf.placeholder_with_default(False, shape=(), name='is_training')
keep_prob = 0.5
X_drop = dropout(X, keep_prob, is_training=is_training)

with tf.name_scope('dnn'):
    hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
    hidden1_drop = dropout(hidden1, keep_prob, is_training=is_training)
    
    hidden2 = fully_connected(hidden1_drop, n_hidden2, scope='hidden2')
    hidden2_drop = dropout(hidden2, keep_prob, is_training=is_training)
    
    logits = fully_connected(hidden2_drop, n_outputs, scope='outputs', activation_fn=None)

In [14]:
with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')
# softmax_cross_entropy_with_logits can handle one-hot encoding, while sparse_softmax_cross_entropy_with_loss handles
# ints from 0 to the number of classes-1

In [15]:
learning_rate = 0.001

with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [16]:
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [17]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [18]:
import numpy as np

def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch

## Train Neural Network

In [19]:
n_epochs = 2500
batch_size = 30

In [20]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        if epoch % 100 == 0:
            acc_total = accuracy.eval(feed_dict={X: X_train, y: y_train})
            print(epoch, "Batch accuracy:", acc_batch, "Train accuracy", acc_total)

    save_path = saver.save(sess, "./adam_model.ckpt")
    Z = logits.eval(feed_dict={X: X_test})
    y_preds = np.argmax(Z, axis=1)

0 Batch accuracy: 0.8 Train accuracy 0.6857464
100 Batch accuracy: 0.93333334 Train accuracy 0.82828283
200 Batch accuracy: 0.8333333 Train accuracy 0.83613914
300 Batch accuracy: 0.8666667 Train accuracy 0.84511787
400 Batch accuracy: 0.9 Train accuracy 0.84960717
500 Batch accuracy: 0.93333334 Train accuracy 0.85409653
600 Batch accuracy: 0.9 Train accuracy 0.8664422
700 Batch accuracy: 0.8666667 Train accuracy 0.8563412
800 Batch accuracy: 0.9 Train accuracy 0.86419755
900 Batch accuracy: 0.9 Train accuracy 0.86419755
1000 Batch accuracy: 0.8 Train accuracy 0.8765432
1100 Batch accuracy: 0.96666664 Train accuracy 0.88776654
1200 Batch accuracy: 1.0 Train accuracy 0.8855219
1300 Batch accuracy: 0.93333334 Train accuracy 0.8888889
1400 Batch accuracy: 0.93333334 Train accuracy 0.8821549
1500 Batch accuracy: 0.93333334 Train accuracy 0.8922559
1600 Batch accuracy: 1.0 Train accuracy 0.8967452
1700 Batch accuracy: 0.96666664 Train accuracy 0.89113355
1800 Batch accuracy: 0.8666667 Train

In [21]:
with tf.Session() as sess:
    saver.restore(sess, "./adam_model.ckpt")
    Z = logits.eval(feed_dict={X: X_test})
    y_predictions = np.argmax(Z, axis=1)

INFO:tensorflow:Restoring parameters from ./adam_model.ckpt


## Save submission

In [22]:
d = {'PassengerId':[i+892 for i in range(len(list(y_predictions)))],
       'Survived': list(y_predictions)}
df = pd.DataFrame(data=d, index=None)
df.to_csv('adam_submission.csv', index=False)

## XGBoost Implementation

In [27]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [34]:
xgb_clf = xgb.XGBClassifier(max_depth=2)
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [35]:
xgb_predictions = xgb_clf.predict(X_test)
accuracy = accuracy_score(xgb_clf.predict(X_train), y_train)
print('Training set accuracy:', accuracy)

Training set accuracy: 0.8507295173961841


  if diff:
  if diff:


In [26]:
d = {'PassengerId':[i+892 for i in range(len(list(xgb_predictions)))],
       'Survived': list(xgb_predictions)}
df = pd.DataFrame(data=d, index=None)
df.to_csv('xgb_submission.csv', index=False)

In [32]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score