In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline
import re # use to split "Cabin" variable

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import Imputer

import tensorflow as tf
from functools import partial

import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


In [2]:
# Import the data
train = pd.read_csv('train.csv')
train_survived = train['Survived']
train.drop(['PassengerId','Survived'],axis = 1, inplace = True)

test = pd.read_csv('test.csv')
PassengerId = test['PassengerId']
test.drop('PassengerId',axis = 1, inplace = True)

In [3]:
## split numerical and catigorical variable
train_num = train.select_dtypes(exclude = ["object"])
train_cat = train.select_dtypes(include = ['object'])

test_num = test.select_dtypes(exclude = ['object'])
test_cat = test.select_dtypes(include = ['object'])

In [4]:
train_num.isnull().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [5]:
imputer_train = Imputer(strategy="median")
imputer_train.fit(train_num)
X_train = imputer_train.transform(train_num)
train_num = pd.DataFrame(X_train, columns=train_num.columns,
                                  index = list(train_num.index.values))

imputer_test = Imputer(strategy="median")
imputer_test.fit(test_num)
X_test = imputer_test.transform(test_num)
test_num = pd.DataFrame(X_test, columns=test_num.columns,
                                  index = list(test_num.index.values))

In [6]:
train_cat.isnull().sum()

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

Get the title of the name, the front part of the name.

In [7]:
train_cat["Name_front"] = "None"
for i in range(len(train_cat)):
    for j in train_cat.Name[i].split():
        if "." in j:
            train_cat["Name_front"][i] = j
            
test_cat["Name_front"] = "None"
for i in range(len(test_cat)):
    for j in test_cat.Name[i].split():
        if "." in j:
            test_cat["Name_front"][i] = j

In [8]:
train_cat["Name_front"].value_counts()

Mr.          517
Miss.        182
Mrs.         124
Master.       40
Dr.            7
Rev.           6
Major.         2
Mlle.          2
Col.           2
Don.           1
Lady.          1
Ms.            1
Jonkheer.      1
Capt.          1
Sir.           1
L.             1
Mme.           1
Countess.      1
Name: Name_front, dtype: int64

In [9]:
test_cat["Name_front"].value_counts()

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Ms.          1
Dr.          1
Dona.        1
Name: Name_front, dtype: int64

In [10]:
## we will put those frequency smaller than 10 into one group
train_rare_name_list = list(train_cat["Name_front"].value_counts().index[4:])
for i in range(len(train_cat['Name_front'])):
    if train_cat['Name_front'][i] in train_rare_name_list:
        train_cat['Name_front'][i] = "Rare"

## do the same for the test set
test_rare_name_list = list(test_cat["Name_front"].value_counts().index[4:])
for i in range(len(test_cat['Name_front'])):
    if test_cat['Name_front'][i] in test_rare_name_list:
        test_cat['Name_front'][i] = "Rare"

train_cat.drop(['Name'], axis = 1, inplace = True)
test_cat.drop(['Name'], axis = 1, inplace = True)

In [11]:
train_cat["Name_front"].value_counts()

Mr.        517
Miss.      182
Mrs.       124
Master.     40
Rare        28
Name: Name_front, dtype: int64

In [12]:
## dealing with Cabin variable
train_cat["Cabin_front"] = "None"
for i in range(len(train_cat["Cabin"])):
    if not pd.isnull(train_cat["Cabin"][i]):
        match = re.match(r"([a-z]+)([0-9]+)", train_cat["Cabin"][i], re.I)
        if match:
            train_cat["Cabin_front"][i] = match.groups()[0]
            
test_cat["Cabin_front"] = "None"
for i in range(len(test_cat["Cabin"])):
    if not pd.isnull(test_cat["Cabin"][i]):
        match = re.match(r"([a-z]+)([0-9]+)", test_cat["Cabin"][i], re.I)
        if match:
            test_cat["Cabin_front"][i] = match.groups()[0]
            
train_cat.drop(["Cabin"], axis = 1, inplace = True)
test_cat.drop(["Cabin"], axis = 1, inplace = True)

In [13]:
test_cat["Cabin_front"].value_counts()

None    332
C        35
B        18
D        12
E         9
A         7
F         4
G         1
Name: Cabin_front, dtype: int64

In [14]:
train_cat["Cabin_front"].value_counts()

None    695
C        59
B        47
E        32
D        30
A        15
F         9
G         4
Name: Cabin_front, dtype: int64

In [15]:
train_cat.isnull().sum()

Sex            0
Ticket         0
Embarked       2
Name_front     0
Cabin_front    0
dtype: int64

In [16]:
## "Embarked" for train_cat set, using its most frequent
train_cat = train_cat.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [17]:
train_cat.isnull().sum()

Sex            0
Ticket         0
Embarked       0
Name_front     0
Cabin_front    0
dtype: int64

In [18]:
test_cat.isnull().sum()

Sex            0
Ticket         0
Embarked       0
Name_front     0
Cabin_front    0
dtype: int64

In [19]:
## deal with ticket variable
train_cat["Ticket_prep"] = "None"
for i in range(len(train_cat.Ticket)):
    train_cat["Ticket_prep"][i] = train_cat.Ticket[i].split()[0]
    
test_cat["Ticket_prep"] = "None"
for i in range(len(test_cat.Ticket)):
    test_cat["Ticket_prep"][i] = test_cat.Ticket[i].split()[0]
    
ticket_common_name = list(set(list(test_cat["Ticket_prep"].value_counts()[:15].index)) & 
                         set(list(train_cat["Ticket_prep"].value_counts()[:15].index)))

print(ticket_common_name)

['PC', 'SOTON/O.Q.', 'A/5.', 'CA', 'C.A.', 'CA.', 'W./C.']


In [20]:
train_cat["Ticket_prep"] = "None"
for i in range(len(train_cat.Ticket)):
    if train_cat.Ticket[i].split()[0] in ticket_common_name:
        train_cat["Ticket_prep"][i] = train_cat.Ticket[i].split()[0]

test_cat["Ticket_prep"] = "None"
for i in range(len(test_cat.Ticket)):
    if test_cat.Ticket[i].split()[0] in ticket_common_name:
        test_cat["Ticket_prep"][i] = test_cat.Ticket[i].split()[0]
        
train_cat.drop(["Ticket"], axis = 1, inplace = True)
test_cat.drop(["Ticket"], axis = 1, inplace = True)

In [21]:
train_cat["Ticket_prep"].value_counts()

None          766
PC             60
C.A.           27
W./C.           9
SOTON/O.Q.      8
CA.             8
A/5.            7
CA              6
Name: Ticket_prep, dtype: int64

In [22]:
test_cat.isnull().sum()

Sex            0
Embarked       0
Name_front     0
Cabin_front    0
Ticket_prep    0
dtype: int64

In [23]:
## get rid of the outliers in the train set
# Removie the outliers
from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples = 100, random_state = 42)
clf.fit(train_num[["Age",'Fare']])
y_noano = clf.predict(train_num[["Age",'Fare']])
y_noano = pd.DataFrame(y_noano, columns = ['Outlier'])
useful_index = y_noano[y_noano['Outlier'] == 1].index.values

train_num = train_num.iloc[useful_index]
train_num.reset_index(drop = True, inplace = True)

train_survived = train_survived.iloc[useful_index]

train_cat = train_cat.iloc[useful_index]
train_cat.reset_index(drop = True, inplace = True)

train = train.iloc[useful_index]
train.reset_index(drop = True, inplace = True)

In [24]:
test_num.Parch[test_num.Parch > 6] = 6

In [25]:
train_num_prepared = pd.concat([pd.get_dummies(train_num[["Pclass","SibSp","Parch"]].astype(str)),
                                train_num[["Age",'Fare']]], axis = 1)
test_num_prepared = pd.concat([pd.get_dummies(test_num[["Pclass","SibSp","Parch"]].astype(str)),
                                test_num[["Age",'Fare']]], axis = 1)

In [26]:
## conbind train and test set
train_cat_prepared = pd.get_dummies(train_cat)
test_cat_prepared = pd.get_dummies(test_cat)

train_prepared = pd.concat([train_num_prepared, train_cat_prepared],axis = 1)
test_prepared = pd.concat([test_num_prepared, test_cat_prepared],axis = 1)

train_prepared = train_prepared.reset_index(drop=True)
test_prepared = test_prepared.reset_index(drop=True)

In [27]:
print("the shape of the prepared train set is {}".format(train_prepared.shape))
print("the shape of the prepared test set is {}".format(test_prepared.shape))

the shape of the prepared train set is (801, 45)
the shape of the prepared test set is (418, 45)


## model fitting

In [28]:
train_prepared = train_prepared.values
test_prepared = test_prepared.values
train_survived = train_survived.values
X_train, X_valid, y_train, y_valid = train_test_split(train_prepared,
                                                      train_survived, test_size=0.33, random_state=42)

With normal Neural Nets

In [29]:
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch

In [37]:
tf.reset_default_graph()

n_inputs = 45  # number of features
n_hidden1 = 32
n_hidden2 = 16
n_hidden3 = 4
n_outputs = 2

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.selu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.selu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.selu, name="hidden3")
    logits = tf.layers.dense(hidden3, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss") ##
     
learning_rate = 0.001

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 2001
batch_size = 40

In [38]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if epoch % 200 == 0:
            acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
            print(epoch, "Batch accuracy:", acc_batch, "Validation accuracy:", acc_valid)

0 Batch accuracy: 0.4878049 Validation accuracy: 0.61509436
200 Batch accuracy: 0.85365856 Validation accuracy: 0.76981133
400 Batch accuracy: 0.9512195 Validation accuracy: 0.76981133
600 Batch accuracy: 0.902439 Validation accuracy: 0.76981133
800 Batch accuracy: 0.9268293 Validation accuracy: 0.7773585
1000 Batch accuracy: 0.9512195 Validation accuracy: 0.76603776
1200 Batch accuracy: 0.9756098 Validation accuracy: 0.754717
1400 Batch accuracy: 0.9756098 Validation accuracy: 0.7433962
1600 Batch accuracy: 0.9268293 Validation accuracy: 0.7509434
1800 Batch accuracy: 0.9512195 Validation accuracy: 0.7433962
2000 Batch accuracy: 0.9268293 Validation accuracy: 0.7283019


### add batch normalization

In [45]:
tf.reset_default_graph()

n_inputs = 45 
n_hidden1 = 32
n_hidden2 = 16
n_hidden3 = 8
n_outputs = 2

batch_norm_momentum = 0.99

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()

    my_batch_norm_layer = partial(
            tf.layers.batch_normalization,
            training=training,
            momentum=batch_norm_momentum)

    my_dense_layer = partial(
            tf.layers.dense, #  whether we could change the order
            kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    hidden3 = my_dense_layer(bn1, n_hidden2, name="hidden3")
    bn3 = tf.nn.elu(my_batch_norm_layer(hidden3))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.001
    
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

n_epochs = 2001
batch_size = 100

In [46]:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
        accuracy_bat = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        if epoch % 200 == 0:
            print(epoch, "Train accarcy:", accuracy_bat,
                  "Validation accuracy:", accuracy_val)

0 Train accarcy: 0.6635514 Validation accuracy: 0.6679245
200 Train accarcy: 0.92523366 Validation accuracy: 0.76981133
400 Train accarcy: 0.8785047 Validation accuracy: 0.7811321
600 Train accarcy: 0.93457943 Validation accuracy: 0.7735849
800 Train accarcy: 0.92523366 Validation accuracy: 0.7471698
1000 Train accarcy: 0.92523366 Validation accuracy: 0.75849056
1200 Train accarcy: 0.95327103 Validation accuracy: 0.7509434
1400 Train accarcy: 0.93457943 Validation accuracy: 0.7433962
1600 Train accarcy: 0.95327103 Validation accuracy: 0.7509434
1800 Train accarcy: 0.95327103 Validation accuracy: 0.7358491
2000 Train accarcy: 0.94392526 Validation accuracy: 0.75849056


### Add regularization
Dropout

In [440]:
n_inputs

45

In [47]:
tf.reset_default_graph()

n_inputs = 45 
n_hidden1 = 32
n_hidden2 = 16
n_hidden3 = 8
n_outputs = 2

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.15  # == 1 - keep_prob
X_drop = tf.layers.dropout(X, dropout_rate, training=training)
he_init = tf.variance_scaling_initializer()

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
                              kernel_initializer=he_init, name="hidden1")
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu,
                              name="hidden2")
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
    hidden3 = tf.layers.dense(hidden2_drop, n_hidden3, activation=tf.nn.relu,
                              name="hidden3")
    hidden3_drop = tf.layers.dropout(hidden3, dropout_rate, training=training)
    logits = tf.layers.dense(hidden3_drop, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.001
with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss)    

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [49]:
n_epochs = 2801
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
        accuracy_bat = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        if epoch % 100 == 0:
            print(epoch, "Train accarcy:", accuracy_bat,
                  "Validation accuracy:", accuracy_val)
    Z = logits.eval(feed_dict = {X: test_prepared})
    y_pred =np.argmax(Z, axis = 1)

0 Train accarcy: 0.71910113 Validation accuracy: 0.6415094
100 Train accarcy: 0.6853933 Validation accuracy: 0.6301887
200 Train accarcy: 0.71348315 Validation accuracy: 0.6301887
300 Train accarcy: 0.7022472 Validation accuracy: 0.6339623
400 Train accarcy: 0.6741573 Validation accuracy: 0.6528302
500 Train accarcy: 0.73595506 Validation accuracy: 0.6679245
600 Train accarcy: 0.7022472 Validation accuracy: 0.6792453
700 Train accarcy: 0.7303371 Validation accuracy: 0.7018868
800 Train accarcy: 0.8258427 Validation accuracy: 0.76603776
900 Train accarcy: 0.8258427 Validation accuracy: 0.7811321
1000 Train accarcy: 0.7752809 Validation accuracy: 0.8037736
1100 Train accarcy: 0.7752809 Validation accuracy: 0.79622644
1200 Train accarcy: 0.80898875 Validation accuracy: 0.8188679
1300 Train accarcy: 0.8146067 Validation accuracy: 0.7735849
1400 Train accarcy: 0.7977528 Validation accuracy: 0.79622644
1500 Train accarcy: 0.83707863 Validation accuracy: 0.75849056
1600 Train accarcy: 0.83707

In [52]:
n_epochs = 4701
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        sess.run(training_op, feed_dict={X: train_prepared, y: train_survived, training: True})
        accuracy_bat = accuracy.eval(feed_dict={X: train_prepared, y: train_survived})
        if epoch % 200 == 0:
            print(epoch, "Train accarcy:", accuracy_bat)
    Z = logits.eval(feed_dict = {X: test_prepared})
    y_pred =np.argmax(Z, axis = 1)

0 Train accarcy: 0.6416979
200 Train accarcy: 0.6691635
400 Train accarcy: 0.67790264
600 Train accarcy: 0.67540574
800 Train accarcy: 0.67540574
1000 Train accarcy: 0.67915106
1200 Train accarcy: 0.6828964
1400 Train accarcy: 0.6828964
1600 Train accarcy: 0.6866417
1800 Train accarcy: 0.6866417
2000 Train accarcy: 0.6841448
2200 Train accarcy: 0.6866417
2400 Train accarcy: 0.69538075
2600 Train accarcy: 0.71660423
2800 Train accarcy: 0.72659177
3000 Train accarcy: 0.7802746
3200 Train accarcy: 0.7852684
3400 Train accarcy: 0.7940075
3600 Train accarcy: 0.7977528
3800 Train accarcy: 0.8002497
4000 Train accarcy: 0.80524343
4200 Train accarcy: 0.80898875
4400 Train accarcy: 0.8227216
4600 Train accarcy: 0.8164794


In [53]:
# Generate Submission File 
StackingSubmission = pd.DataFrame({'PassengerId': PassengerId,
                                   'Survived': y_pred })
StackingSubmission.to_csv("NN_4.csv", index=False)