<h1>Titanic - Deep learning</h1>

In [1]:
# I case you are using a older version of Sklearn warnings will be generated
# Let's turn this off to keep the code clean
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

# Location of the Titanic training data, relative to this notebook
TRAIN_CSV_PATH = './data/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# X has every column except Survived.
# DataFrame.drop returns a new frame rather than a selection of train_df, so
# modifying X afterwards does not raise pandas' SettingWithCopyWarning
X = train_df.drop(columns=['Survived'])
# y is the label column we want to predict
y = train_df['Survived']

In [4]:
# Let's clean up X: remove, in place, the columns we do not use as features
for unused_column in ('PassengerId', 'Name', 'Ticket'):
    del X[unused_column]

In [5]:
# Remove empty values
# The sklearn imputer replaces empty values with whatever we set it to be
# We import numpy because empty values get represented as a numpy.nan object by default
import numpy as np
from sklearn.impute import SimpleImputer

# Replace all empty values (numpy.nan objects) inside the Age column with the median
age_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X['Age'] = age_imputer.fit_transform(X[['Age']])


# Replace all empty values (numpy.nan objects) inside the Cabin column with the constant 'Unknown'
cabin_imputer = SimpleImputer(missing_values=np.nan, fill_value='Unknown', strategy='constant')
X['Cabin'] = cabin_imputer.fit_transform(X[['Cabin']])

# Replace all empty values (numpy.nan objects) inside the Embarked column with the most frequent value.
# This has to go through embarked_imputer: using cabin_imputer here would fill the
# gaps with 'Unknown' instead of the most frequent value
embarked_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X['Embarked'] = embarked_imputer.fit_transform(X[['Embarked']])

In [6]:
# Let's encode all the categorical values so that the machine learning model can work with them
# We will use onehot encoding:
# https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f


# I wrote a function that will make our lives easier. You can forget the syntax
from sklearn.preprocessing import OneHotEncoder

def encode(dataframe, column):
    """One-hot encode ``column`` of ``dataframe`` in place.

    A fresh OneHotEncoder is fitted on the single column. Every column of the
    encoded matrix is stored as a new dataframe column named ``column`` followed
    by its position in the encoded matrix (for example ``Sex0``, ``Sex1``).
    The original column is then removed.

    Parameters:
        dataframe: the pandas DataFrame to modify.
        column: name of the column to encode.

    Returns:
        The same ``dataframe`` object that was passed in, after modification.
    """
    one_hot = OneHotEncoder().fit_transform(dataframe[[column]]).toarray()

    # Transposing lets us walk over the encoded matrix one output column at a time
    for position, indicator_values in enumerate(one_hot.T):
        dataframe[column + str(position)] = indicator_values

    # The raw column is no longer needed once its encoded columns exist
    del dataframe[column]
    return dataframe

# One-hot encode each categorical column of X in turn
for categorical_column in ('Cabin', 'Sex', 'Embarked'):
    X = encode(X, categorical_column)

In [7]:
# Create train and test sets (20% of the rows are held out for testing)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split identical on every run, so the rows used
# for training and for evaluation do not change between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Deeeeeep learning time
# Install: conda install keras
from keras.models import Sequential
from keras.layers import Dense

# The first argument of Dense is the number of neurons in the layer.
# activation: the function that gets applied at the end of a neuron
# input_dim: the number of features, in this case the number of columns of X
n_features = X.shape[1]

# Define the model by handing the whole stack of layers to Sequential at once
model = Sequential([
    Dense(32, activation='relu', input_dim=n_features),  # Input layer
    Dense(30, activation='relu'),  # Hidden layer
    Dense(30, activation='relu'),  # Hidden layer
    Dense(1, activation='sigmoid'),  # Output layer
])

# Compile the layers
model.compile(
    optimizer='adam',  # The optimization algorithm
    loss='binary_crossentropy',  # Because we predict a binary value (1=survived, 0=died)
    metrics=['accuracy'],  # Report accuracy while training
)

Using TensorFlow backend.


In [9]:
# Train the model
# epochs is the number of complete passes the training makes over the training data
model.fit(x=X_train, y=y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a3ce900d0>

In [10]:
# Predicting
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
# The model's sigmoid output lies between 0 and 1; rounding turns it into a 0/1 label
y_pred_labels = y_pred.round()
accuracy_score(y_true=y_test, y_pred=y_pred_labels)

0.7877094972067039