# Section 3-2 - Deep Learning

For detailed steps on extracting and cleaning data, please review Sections 1-0 to 1-2.

## Pandas - Extracting data

In [3]:
import pandas as pd
import numpy as np

# Read the Titanic training set from the repository's data folder.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Pandas - Cleaning data

In [4]:
# Drop the columns that are not used as model features.
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Fill missing ages with the mean age of the training set. `age_mean` stays a
# notebook-level variable because the test-set preparation cell reuses it.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)

# Import retained so that any other cell relying on `mode` keeps working; the
# Embarked imputation below no longer needs it.
from scipy.stats import mode

# Fill missing embarkation ports with the most frequent port. Series.mode()
# ignores NaN and handles string data, whereas indexing the result of
# scipy.stats.mode with [0][0] depends on the SciPy release in use.
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

# Encode sex as an integer flag: female -> 0, male -> 1.
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# One-hot encode the embarkation port. The cast to int keeps the columns
# numeric even on pandas versions whose get_dummies() returns booleans, so
# that df.values below is a purely numeric array.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked').astype(int)
df = pd.concat([df, embarked_dummies], axis=1)

# The original text columns are now redundant.
df = df.drop(['Sex', 'Embarked'], axis=1)

# Move the label to the first column. Later cells slice the array by position
# (column 0 = Survived, column 1 = PassengerId, columns 2+ = features), so the
# label is selected by name rather than by its current position.
cols = df.columns.tolist()
cols = ['Survived'] + [col for col in cols if col != 'Survived']

df = df[cols]


train_data = df.values

In [5]:
# Show the first ten rows of the cleaned training frame.
df.iloc[:10]

Unnamed: 0,Survived,PassengerId,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_C,Embarked_Q,Embarked_S
0,0,1,3,22.0,1,0,7.25,1,0,0,1
1,1,2,1,38.0,1,0,71.2833,0,1,0,0
2,1,3,3,26.0,0,0,7.925,0,0,0,1
3,1,4,1,35.0,1,0,53.1,0,0,0,1
4,0,5,3,35.0,0,0,8.05,1,0,0,1
5,0,6,3,29.699118,0,0,8.4583,1,0,1,0
6,0,7,1,54.0,0,0,51.8625,1,0,0,1
7,0,8,3,2.0,3,1,21.075,1,0,0,1
8,1,9,3,27.0,0,2,11.1333,0,0,0,1
9,1,10,2,14.0,1,0,30.0708,0,1,0,0


In [6]:
# Show only the first row of the cleaned training frame.
df.head(1)

Unnamed: 0,Survived,PassengerId,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_C,Embarked_Q,Embarked_S
0,0,1,3,22,1,0,7.25,1,0,0,1


## TensorFlow - Training the model

In [61]:
import skflow
import random
import tensorflow as tf
from sklearn.cross_validation import train_test_split

# Seed the random number generators for repeatable runs. random.seed() only
# covers Python's standard-library generator, so NumPy's global generator is
# seeded as well. The split below is already fixed by random_state=42.
random.seed(42)
np.random.seed(42)

# Only the first 800 rows are used in this cell. Column 0 is the Survived
# label, column 1 (PassengerId) is skipped, and columns 2 onward are features.
# 10% of those rows are held out as a test split.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[:800, 2:], train_data[:800, 0],
    test_size=0.1, random_state=42)

# Optional custom learning-rate schedule: to use it, pass
# learning_rate=exp_decay when calling TensorFlowEstimator().
def exp_decay(global_step):
    """Return an exponentially decaying learning rate for ``global_step``.

    Delegates to ``tf.train.exponential_decay`` with an initial rate of 0.01,
    ``decay_steps=2`` and ``decay_rate=0.001``, and returns its result as-is.

    NOTE(review): the estimator built in this cell is given a constant
    ``learning_rate=0.01``, so this schedule is defined but not used there.
    """
    schedule = dict(learning_rate=0.01, decay_steps=2, decay_rate=0.001)
    return tf.train.exponential_decay(global_step=global_step, **schedule)
def my_model(X, y):
    """Model function handed to ``skflow.TensorFlowEstimator`` as ``model_fn``.

    Passes ``X`` through ``skflow.ops.dnn`` with the list ``[40, 20, 10]``
    (presumably the hidden-layer sizes -- confirm against skflow), then gives
    that result together with ``y`` to ``skflow.models.logistic_regression``
    and returns whatever it returns.
    """
    dnn_spec = [40, 20, 10]
    hidden = skflow.ops.dnn(X, dnn_spec)
    return skflow.models.logistic_regression(hidden, y)

# Alternative kept for reference -- skflow's ready-made DNN classifier:
# model = skflow.TensorFlowDNNClassifier(hidden_units=[20, 40, 10], n_classes=2, batch_size=128, steps=1000,
#                                       learning_rate=0.05)

# Wrap my_model in a generic estimator, train it on the training split and
# save the fitted model to disk.
model = skflow.TensorFlowEstimator(
    model_fn=my_model,
    n_classes=2,
    batch_size=64,
    steps=10000,
    learning_rate=0.01,
)
model.fit(X_train, y_train)
model.save('/tmp/tf_examples/my_model_1/')

Step #100, epoch #8, avg. train loss: 0.64138
Step #200, epoch #16, avg. train loss: 0.60496
Step #300, epoch #25, avg. train loss: 0.59895
Step #400, epoch #33, avg. train loss: 0.59931
Step #500, epoch #41, avg. train loss: 0.58970
Step #600, epoch #50, avg. train loss: 0.58940
Step #700, epoch #58, avg. train loss: 0.58570
Step #800, epoch #66, avg. train loss: 0.57638
Step #900, epoch #75, avg. train loss: 0.57474
Step #1000, epoch #83, avg. train loss: 0.57221
Step #1100, epoch #91, avg. train loss: 0.56770
Step #1200, epoch #100, avg. train loss: 0.56261
Step #1300, epoch #108, avg. train loss: 0.56148
Step #1400, epoch #116, avg. train loss: 0.55801
Step #1500, epoch #125, avg. train loss: 0.55661
Step #1600, epoch #133, avg. train loss: 0.54321
Step #1700, epoch #141, avg. train loss: 0.54437
Step #1800, epoch #150, avg. train loss: 0.53698
Step #1900, epoch #158, avg. train loss: 0.53476
Step #2000, epoch #166, avg. train loss: 0.52711
Step #2100, epoch #175, avg. train loss: 

## TensorFlow - Making predictions

In [62]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred): the
# ground-truth labels go first and the model's predictions second.
print('Train accuracy')
print(accuracy_score(y_train, model.predict(X_train)))


print('Test accuracy')
print(accuracy_score(y_test, model.predict(X_test)))

# X_test.shape

# y_test = train_data[800:, 0]
# y_prediction = model.predict(train_data[800:, 2:])
# print accuracy_score(y_prediction, y_test)
#print "prediction accuracy:", np.sum(y_test == y_prediction)*1./len(y_test)

Train accuracy
0.826388888889
Test accuracy
0.8125


## Preparing the submission output

In [63]:
# Load the test set and apply the same cleaning steps used for training.
df_test = pd.read_csv('../data/test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Missing ages are filled with the mean age computed on the training data.
df_test['Age'] = df_test['Age'].fillna(age_mean)

# Mean training fare per passenger class, as a Series indexed by Pclass.
# groupby() gives a Series on every pandas version, whereas pivot_table()
# returns a DataFrame on newer versions, which turns `fare_means[pclass]`
# into a column lookup that fails.
fare_means = df.groupby('Pclass')['Fare'].mean()
# Vectorised fill: each missing fare takes the mean fare of its class.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Pclass'].map(fare_means))

# Encode sex as an integer flag: female -> 0, male -> 1.
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)

# One-hot encode the embarkation port, then align the dummy columns with the
# ones present in the training frame. A port missing from the test data still
# gets an all-zero column, so the feature layout matches what the model saw.
embarked_columns = [col for col in df.columns if str(col).startswith('Embarked_')]
embarked_dummies = (pd.get_dummies(df_test['Embarked'], prefix='Embarked')
                    .reindex(columns=embarked_columns, fill_value=0)
                    .astype(int))
df_test = pd.concat([df_test, embarked_dummies], axis=1)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

# Function-call form of print works on Python 2 and 3 and matches the
# accuracy-reporting cell.
print(df_test.head(10))
print(df_test.shape)

test_data = df_test.values


   PassengerId  Pclass   Age  SibSp  Parch     Fare  Gender  Embarked_C  \
0          892       3  34.5      0      0   7.8292       1           0   
1          893       3  47.0      1      0   7.0000       0           0   
2          894       2  62.0      0      0   9.6875       1           0   
3          895       3  27.0      0      0   8.6625       1           0   
4          896       3  22.0      1      1  12.2875       0           0   
5          897       3  14.0      0      0   9.2250       1           0   
6          898       3  30.0      0      0   7.6292       0           0   
7          899       2  26.0      1      1  29.0000       1           0   
8          900       3  18.0      0      0   7.2292       0           1   
9          901       3  21.0      2      0  24.1500       1           0   

   Embarked_Q  Embarked_S  
0           1           0  
1           0           1  
2           1           0  
3           0           1  
4           0           1  
5     



In [64]:

# Predict a label for every test passenger. Column 0 of test_data holds the
# PassengerId and is not a feature, so it is excluded.
test_features = test_data[:, 1:]
output = model.predict(test_features)

In [65]:
# Pair each PassengerId with its predicted label, both cast to int, as a
# two-column array with one row per test passenger.
passenger_ids = test_data[:, 0].astype(int)
predicted_labels = output.astype(int)
result = np.c_[passenger_ids, predicted_labels]

# Write the submission file without the DataFrame index.
# NOTE(review): the file name says "3-3" while this notebook is Section 3-2 --
# confirm that this is the intended output name.
df_result = pd.DataFrame(result[:, :2], columns=['PassengerId', 'Survived'])
df_result.to_csv('../results/titanic_3-3.csv', index=False)

## Appendix: Installation

For Mac:

For Ubuntu: