In [None]:
! pip install keras
import pandas as pd
import sklearn 
import numpy as np
import dotscience as ds
import matplotlib.pyplot as plt
import keras



from numpy.random import seed
seed(1)

%matplotlib inline


In [None]:
ds.interactive()

In [None]:
ds.start()

In [None]:
df = pd.read_csv("train.csv")

In [None]:
df.head()


In [None]:
df.Sex.unique()

In [None]:
# Encode sex as an indicator column (female -> 1, anything else -> 0),
# then discard the original text column.
is_female = df['Sex'] == "female"
df['Sex_binary'] = np.where(is_female, 1, 0)
df.drop(columns=['Sex'], inplace=True)

In [None]:
# Encode the port of embarkation: 1 for C, 2 for Q, 3 for everything else
# (S, and any row whose Embarked value matches neither C nor Q).
port = df["Embarked"]
df['Embarked_ternary'] = np.select([port == "C", port == "Q"], [1, 2], default=3)

In [None]:
# Number of rows that have no Cabin value.
count_nan = df['Cabin'].isna().sum()
print(count_nan)

OK, so most of the Cabin values are NaN, so we can probably drop this column.

In [None]:
df.drop(columns='Cabin', inplace=True)

Slightly more likely to survive if younger:

In [None]:
df.groupby(['Survived']).Age.mean()

Higher average fare amongst survivors:

In [None]:

df.groupby(['Survived']).Fare.mean()

How does survival vary with embarkation point?

In [None]:
# For each embarkation code, report how many passengers boarded there and
# the mean of their Survived flag; a blank-line print separates the groups.
port_reports = [
    (1, "number embarked at Cherbourg"),
    (2, "number embarked at Queenstown"),
    (3, "number embarked at Southampton"),
]
for position, (code, count_label) in enumerate(port_reports):
    if position > 0:
        print("\n")
    boarded = df.loc[df['Embarked_ternary'] == code]
    print(count_label, len(boarded))
    print("average survival status", boarded.Survived.mean())

OK, so there is some evidence that embarking at Cherbourg is correlated with survival. So let's turn `Embarked_ternary` into a binary condition: `Embarked_at_cherbourg`.

In [None]:
# Collapse the three-way port code into a single Cherbourg indicator (1/0),
# then remove the intermediate column.
from_cherbourg = df['Embarked_ternary'] == 1
df['Embarked_at_cherbourg'] = from_cherbourg.astype(int)
df.drop(columns=["Embarked_ternary"], inplace=True)

In [None]:
# how do the name lengths vary? And are they correlated with survival?

# Series.iteritems() was removed in pandas 2.0; the vectorised .str accessor
# gives the same per-name character counts without a Python-level loop.
name_lens = df["Name"].str.len().tolist()
plt.hist(name_lens)
plt.show()

In [None]:
# add an int value for name length
# (Series.iteritems() no longer exists in pandas 2.x, so use the .str accessor)
df['Name_len'] = df["Name"].str.len()

In [None]:
# Name, Ticket and Embarked are still text columns at this point; pandas >= 2.0
# raises on them instead of silently skipping them, so correlate only the
# numeric columns.
df.select_dtypes(include="number").corr()["Survived"]

Whoa, having a long name is almost as highly correlated with survival as passenger class!

In [None]:
# todo: engineer ticket class
# for now, we'll just drop it

df.drop(columns=['PassengerId', 'Embarked', 'Name', 'Ticket'], inplace=True)

In [None]:
df.dtypes

In [None]:
df.isnull().any()

We have some null ages. Let's see how many.

In [None]:
len(df.loc[df.Age.isnull()])

In [None]:
# Naive imputation for now: every null is replaced by the mean of its column.
# (Predicting the missing ages from the other columns would be better.)

# The test data has nulls too, and they get filled the same way.
column_means = df.mean()
df.fillna(column_means, inplace=True)

In [None]:
# Features are every column except Survived. The labels are Survived turned into a
# categorical (one column per class) float32 array, so they are compatible with the
# network and with categorical_crossentropy as the loss.
train_passengers = df.drop(columns=['Survived'])
train_labels = keras.utils.to_categorical(df['Survived'], num_classes=None, dtype='float32')

In [None]:
train_labels.shape

In [None]:
from sklearn import preprocessing

# Learn the per-column mean and standard deviation from the training features.
scaler = preprocessing.StandardScaler().fit(train_passengers)

# Preview of the scaled values only: the result is displayed, not stored.
# copy=False was dropped so this preview can never modify train_passengers.
scaler.transform(train_passengers)

In [None]:
# train_passengers is a DataFrame: overwrite every column with its scaled values
feature_cols = train_passengers.columns
train_passengers[feature_cols] = scaler.fit_transform(train_passengers[feature_cols])

In [None]:
# make a function with preceding data-wrangling steps, so that we can perform the same operations on the test set
def wrangle(df):
    """
    Apply the training-set feature engineering to a raw passenger frame.

    takes a df with same format as training set (it must contain the columns
    Sex, Embarked, Cabin, Name, PassengerId and Ticket).
    returns df in same format as modified training set. The frame passed in
    is modified in place and is also the object returned.

    Steps:
      * Sex_binary: 1 where Sex == "female", else 0
      * Embarked_at_cherbourg: 1 where Embarked == "C", else 0
      * Name_len: number of characters in Name
      * drop Sex, Cabin, PassengerId, Embarked, Name and Ticket
      * fill remaining nulls with the mean of their column
    """
    df['Sex_binary'] = np.where(df['Sex'] == "female", 1, 0)
    # Only "embarked at C or not" survives into the output, so the intermediate
    # three-way encoding is unnecessary.
    df['Embarked_at_cherbourg'] = np.where(df["Embarked"] == "C", 1, 0)
    # Series.iteritems() was removed in pandas 2.0; .str.len() is the vectorised equivalent.
    df['Name_len'] = df["Name"].str.len()
    df.drop(columns=['Sex', 'Cabin', 'PassengerId', 'Embarked', 'Name', 'Ticket'], inplace=True)

    # naively fill in nulls for now
    df.fillna(df.mean(numeric_only=True), inplace=True)
    return df

In [None]:
# read the raw test set and run it through the same wrangling as the training data
df_test = wrangle(pd.read_csv("test.csv"))

In [None]:
test_passengers = df_test
# Re-use the statistics the scaler learned from the training features. Calling
# fit_transform here would re-fit the scaler on the test set, putting the test
# features on a different scale from the one the network is trained on.
test_passengers[test_passengers.columns] = scaler.transform(test_passengers[test_passengers.columns])

In [None]:
train_passengers.shape

In [None]:
train_labels.shape

In [None]:
# `models`, `layers` and `regularizers` are used below but were never imported,
# so this cell raised NameError on a fresh kernel.
from keras import callbacks, layers, models, regularizers
from keras.layers import Dense, Activation, Dropout


network = models.Sequential()

# First layer: 128 units, L2-regularised; the penalty value comes from
# ds.parameter("reg_lambda", 0.005).
network.add(layers.Dense(input_dim=train_passengers.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros', kernel_regularizer=regularizers.l2(ds.parameter("reg_lambda", 0.005))))

network.add(Activation('relu'))

# Five further blocks of Dense(128) -> ReLU -> Dropout(0.25)
for i in range(0, 5):
    network.add(layers.Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    network.add(Activation('relu'))
    network.add(Dropout(.25))

# Two outputs with softmax, matching the two-column categorical labels
network.add(layers.Dense(units=2))
network.add(Activation('softmax'))

network.compile(loss='categorical_crossentropy', optimizer=ds.parameter("optimizer", 'adam'), metrics=['accuracy'])

# TODO: record loss history: https://keras.io/callbacks/#example-recording-loss-history
# from keras.callbacks import ModelCheckpoint
# checkpointer = keras.callbacks.ModelCheckpoint(filepath="weights.h5", monitor='categorical_crossentropy', verbose=1, save_best_only=False, save_weights_only=False, mode='auto', period=1)

# NOTE(review): the 'acc' / 'val_acc' names used here and in the plotting cells are the
# history keys older Keras versions record for metrics=['accuracy']; newer releases may
# name them 'accuracy' / 'val_accuracy' -- confirm against the installed version.
network.fit(train_passengers, train_labels, epochs=ds.parameter("epochs", 700), verbose=2, validation_split=0.1, callbacks=[callbacks.EarlyStopping(monitor='val_acc', patience=2)])

# network.fit(train_passengers, train_labels, epochs=700, verbose=2, validation_split=0.1)

#get the highest validation accuracy of the training epochs
# (this previously read the *training* accuracy key 'acc', contradicting the
# comment above and the message printed below)
acc = np.amax(network.history.history['val_acc'])

ds.add_summary('acc%', acc)
# ds.add_parameters(regulariser="none", epocs=700, batch_size="default", optimizer="adam")


print('Best validation acc of epoch:', acc)


With L2 regularisation:

In [None]:
# One figure per metric, each showing the training curve and the validation
# curve against the epoch number.
history = network.history.history
curves = (
    ('acc', 'val_acc', 'model accuracy', 'accuracy'),
    # summarize history for loss
    ('loss', 'val_loss', 'model loss', 'loss'),
)
for train_key, val_key, title, y_label in curves:
    plt.plot(history[train_key])
    plt.plot(history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

## With early stopping:

In [None]:
# Plot accuracy per epoch (training vs. validation) from network.history.
# NOTE(review): this cell's code is the same as the other history-plotting cells in
# this notebook; presumably each copy was run after a different training
# configuration -- confirm, since they all read the same network.history object.
plt.plot(network.history.history['acc'])
plt.plot(network.history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(network.history.history['loss'])
plt.plot(network.history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

## without early stopping:

In [None]:
# Plot accuracy per epoch (training vs. validation) from network.history.
# NOTE(review): the training cell in this notebook always passes an EarlyStopping
# callback, so a "without early stopping" run presumably came from an earlier
# version of that cell -- confirm before relying on this figure.
plt.plot(network.history.history['acc'])
plt.plot(network.history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(network.history.history['loss'])
plt.plot(network.history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# looks like overfitting -- add more regularisation

## Save weights

In [None]:
# Save the network's weights to the location given by ds.output('weights').
# NOTE(review): the original note said this defaults to the TensorFlow checkpoint
# file format; that depends on which Keras implementation/version is installed and
# on the file name used -- confirm the on-disk format.
network.save_weights(ds.output('weights'))


In [None]:
results = network.predict(test_passengers)

In [None]:
results.shape

In [None]:
# Second column of each prediction row is the network's output for class 1 (Survived).
survived = [row[1] for row in results]

# x = predicted value for "survived", y = row position in the test set
plt.scatter(survived, range(len(survived)))
plt.show()

In [None]:
# we need binary survival prediction, not a probability
# Sequential.predict_classes() has been removed from recent Keras/TensorFlow releases;
# the arg-max over the two softmax outputs gives the same class index (0 or 1) per row.
binary_results = np.argmax(network.predict(test_passengers), axis=1)

In [None]:
binary_results

In [None]:
# get unwrangled version of test set with the passenger IDs
df_test = pd.read_csv("test.csv") 

In [None]:
# add column for binary survival status
df_test['Survived'] = binary_results


In [None]:
ds.publish("did it work?")

In [None]:
# write out results
# Select the two submission columns instead of dropping all the others in place:
# the old drop list named 'Ticket' and 'Embarked' twice, and the in-place drop
# raised KeyError if this cell was run a second time.
df_test = df_test[['PassengerId', 'Survived']].reset_index(drop=True)
df_test.to_csv("predictions.csv", columns=['PassengerId', 'Survived'], index=False)
