In [151]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os  

In [152]:
# Read the training CSV into `titanic_df`; the trailing expression renders
# summary statistics of its numeric columns as the cell output.
titanic_df = pd.read_csv(filepath_or_buffer="./data/train.csv")
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [153]:
# Binary-encode sex: 1 for rows whose Sex is exactly "male", 0 for everything else.
is_male = titanic_df["Sex"] == "male"
titanic_df["sex_enc"] = np.where(is_male, 1, 0)

In [154]:
# Impute missing ages with the mean of the known ages. `mean_age` is kept as a
# separate name because the test-set preprocessing reuses it.
mean_age = titanic_df["Age"].mean()
titanic_df["new_age"] = titanic_df["Age"].fillna(value=mean_age)

In [155]:
# Bucket the imputed ages into 20-year bins, each labelled by its upper edge,
# then one-hot encode the bucket into age_bin_<label> columns.
age_bins = [0, 20, 40, 60, 80, 100]
age_lbl = age_bins[1:]  # [20, 40, 60, 80, 100]
titanic_df["age_bin"] = pd.cut(
    titanic_df["new_age"],
    bins=age_bins,
    labels=age_lbl,
)
titanic_df = pd.get_dummies(data=titanic_df, columns=["age_bin"])

In [156]:
# One-hot encode the embarkation port, then display how many passengers fall
# in each ticket class (the value counts are the cell output).
titanic_df = pd.get_dummies(data=titanic_df, columns=["Embarked"])
titanic_df.loc[:, "Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [157]:
# One-hot encode the ticket class into Pclass_<value> columns.
titanic_df = pd.get_dummies(data=titanic_df, columns=["Pclass"])

In [158]:
# Shuffle once with a fixed seed so the 80/10/10 train/val/test split (and
# therefore the reported metrics) can be reproduced on a fresh kernel.
SPLIT_SEED = 42
shuffled_df = titanic_df.sample(frac=1, random_state=SPLIT_SEED)

# Slice positionally instead of calling np.split on a DataFrame, which relies
# on the deprecated DataFrame.swapaxes in recent pandas releases.
n_rows = len(shuffled_df)
train_end = int(0.8 * n_rows)
val_end = int(0.9 * n_rows)
train = shuffled_df.iloc[:train_end]
val = shuffled_df.iloc[train_end:val_end]
test = shuffled_df.iloc[val_end:]

# Columns fed to the model, in the order the network sees them.
feature_list = [
    "sex_enc", "Parch", "SibSp",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "age_bin_20", "age_bin_40", "age_bin_60", "age_bin_80", "age_bin_100",
]

In [159]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        dataframe: Frame holding a 'Survived' column plus every column named
            in the notebook-level ``feature_list``. The caller's frame is not
            modified (a copy is taken first).
        shuffle: When True, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches, with
        the features as float32.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop('Survived')
    # Cast to one numeric dtype before handing the data to TensorFlow: a frame
    # mixing int columns with bool dummy columns (the pandas >= 2.0 default for
    # get_dummies) becomes an object array that cannot be turned into a tensor.
    features = dataframe[feature_list].to_numpy(dtype="float32")
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [160]:
# Wrap each split in a batched dataset. Shuffling is disabled for all three,
# so rows keep the order produced by the split.
batch_size = 10
train_ds, val_ds, test_ds = (
    df_to_dataset(split_df, shuffle=False, batch_size=batch_size)
    for split_df in (train, val, test)
)

In [161]:
# Single-hidden-layer binary classifier: one input per engineered feature,
# a 32-unit ReLU layer with 50% dropout, and a sigmoid output unit.
input_shape = (len(feature_list),)
inputs = tf.keras.layers.Input(shape=input_shape)
hidden = tf.keras.layers.Dense(units=32, activation="relu")(inputs)
hidden = tf.keras.layers.Dropout(rate=0.5)(hidden)
output = tf.keras.layers.Dense(units=1, activation="sigmoid")(hidden)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)
# Train for 50 epochs, validating on the held-out validation split each epoch.
model.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1fe103551c8>

In [162]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(test_ds)

for metric_label, metric_value in (("Loss: ", loss), ("Accuracy: ", accuracy)):
    print(metric_label, metric_value)

Loss:  0.36911922693252563
Accuracy:  0.8333333134651184


In [164]:
# Load the unlabeled test file and apply the same feature engineering that was
# applied to the training frame.
test_data = pd.read_csv("./data/test.csv")
pred_in = test_data.copy()
pred_in["sex_enc"] = np.where(pred_in["Sex"] == "male", 1, 0)
# Impute with the mean age computed on the training data, not on this file.
pred_in["new_age"] = pred_in["Age"].fillna(mean_age)
pred_in = pd.get_dummies(pred_in, columns=["Embarked"])
pred_in = pd.get_dummies(pred_in, columns=["Pclass"])
pred_in['age_bin'] = pd.cut(pred_in['new_age'], bins=age_bins, labels=age_lbl)
pred_in = pd.get_dummies(pred_in, columns=["age_bin"])

# get_dummies only creates columns for categories that occur in this file.
# Add any feature column the model expects but this file did not produce, so
# that selecting `feature_list` later cannot fail with a KeyError.
for missing_col in [col for col in feature_list if col not in pred_in.columns]:
    pred_in[missing_col] = 0

In [165]:
# Cast the features to a single float32 array before building the dataset:
# a frame mixing int columns with bool dummy columns (the pandas >= 2.0
# get_dummies default) would otherwise become an object array that TensorFlow
# cannot convert to a tensor.
pred_features = pred_in[feature_list].to_numpy(dtype="float32")
pred_ds = tf.data.Dataset.from_tensor_slices(pred_features)
pred_ds = pred_ds.batch(batch_size)

In [166]:
# Run inference on the test batches and keep the single sigmoid output per
# row as a one-column frame.
predictions = model.predict(pred_ds)
pred_pd = pd.DataFrame(data=predictions, columns=["pred_survived"])

In [171]:
# Turn the predicted probability into a hard 0/1 label: strictly above 0.5 -> 1.
above_threshold = pred_pd["pred_survived"] > 0.5
pred_pd["Survived"] = np.where(above_threshold, 1, 0)

In [172]:
# Attach the predictions to the test rows by matching on the row index.
out = pred_in.merge(pred_pd, left_index=True, right_index=True)

In [173]:
# Keep only the passenger id and the predicted label for the output file.
out_pd = out.loc[:, ["PassengerId", "Survived"]]

In [174]:
# Make sure the output directory exists, then write the predictions without
# the index column.
os.makedirs(name='./data/', exist_ok=True)
out_pd.to_csv(path_or_buf='./data/out.csv', index=False)