In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, files_here in os.walk('/kaggle/input')
    for file_name in files_here
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import tensorflow for later use in the NN
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# Read the training CSV from the Kaggle input directory into a DataFrame
TRAIN_CSV_PATH = '/kaggle/input/titanic/train.csv'
full_train_data = pd.read_csv(TRAIN_CSV_PATH)

# Bare last expression so the notebook renders the loaded frame as the cell output
full_train_data

In [None]:
# Import the random module. We will need it for randomly initialising the NN and also randomly splitting the data into training and cross-validation
import random

# Randomly permute the passenger order.
# The row labels are taken from the frame itself rather than from a hard-coded
# row count, so the split keeps working if the CSV has a different number of rows.
# NOTE: the shuffle is not seeded, so the split differs from run to run.
rand_passenger = list(full_train_data.index)
random.shuffle(rand_passenger)

# Hold out the last CV_SIZE shuffled passengers for cross-validation and train
# on all the others. No separate test split is built in this cell.
CV_SIZE = 178
passengers_train = rand_passenger[:-CV_SIZE]
passengers_cv = rand_passenger[-CV_SIZE:]

# Select the rows by label for each subset
train_data = full_train_data.loc[passengers_train, :]
cv_data = full_train_data.loc[passengers_cv, :]

train_data

In [None]:
# Here we use a data set that has the Ticket, Cabin, Name, PassengerId and Embarked columns removed
# It seems likely that these will not play a role in determining whether a passenger survived but we can check later to see if we were wrong
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# .copy() makes each subset an independent frame. Later cells write into these
# frames with `.loc[...] = ...`; doing that on a selection derived from another
# frame can raise SettingWithCopyWarning.
train_data_1 = train_data[selected_columns].copy()
cv_data_1 = cv_data[selected_columns].copy()

train_data_1

In [None]:
# Mean-normalise Age and Fare: subtract the training mean and divide by the
# training range (max - min). Everything else is roughly on the same range.
age_column = train_data_1['Age']
age_mean = age_column.mean()
age_range = age_column.max() - age_column.min()

fare_column = train_data_1['Fare']
fare_mean = fare_column.mean()
fare_range = fare_column.max() - fare_column.min()

# Overwrite the training columns with their normalised values.
train_data_1.loc[:, 'Fare'] = (train_data_1.loc[:, 'Fare'] - fare_mean) / fare_range

# Age is rewritten only where it is present; missing ages are left as NaN here.
age_present_train = pd.notnull(train_data_1['Age'])
train_data_1.loc[age_present_train, 'Age'] = (
    train_data_1.loc[age_present_train, 'Age'] - age_mean
) / age_range

train_data_1

In [None]:
# The cv data has to be normalised as well. This reuses the mean and range
# computed on the training data (fare_mean/fare_range, age_mean/age_range).
# The equivalent step for a separate test frame is not performed here.
cv_fare = cv_data_1.loc[:, 'Fare']
cv_data_1.loc[:, 'Fare'] = (cv_fare - fare_mean) / fare_range

# Only rows with a known Age are rewritten; NaN ages stay NaN.
age_present_cv = pd.notnull(cv_data_1['Age'])
cv_known_ages = cv_data_1.loc[age_present_cv, 'Age']
cv_data_1.loc[age_present_cv, 'Age'] = (cv_known_ages - age_mean) / age_range

cv_data_1

In [None]:
# Some of the ages are missing, so replacement values are needed.
# A NaN age will be replaced by the mean age of the passengers with the same
# Sex and Pclass. The means are taken from train_data_1 only (not from
# full_train_data), and no rounding is applied to them.

# Split the training rows into those without and those with an Age
no_age = train_data_1[pd.isnull(train_data_1['Age'])]
with_age = train_data_1[pd.notnull(train_data_1['Age'])]


def _age_subset(sex, pclass):
    """Return the rows of with_age whose Sex and Pclass equal the given values."""
    return with_age[(with_age['Sex'] == sex) & (with_age['Pclass'] == pclass)]


# Rows with a known Age, one frame per (Pclass, Sex) combination
with_age_1m = _age_subset('male', 1)
with_age_2m = _age_subset('male', 2)
with_age_3m = _age_subset('male', 3)
with_age_1f = _age_subset('female', 1)
with_age_2f = _age_subset('female', 2)
with_age_3f = _age_subset('female', 3)

# Mean Age within each of those groups
average_1m = with_age_1m['Age'].mean()
average_2m = with_age_2m['Age'].mean()
average_3m = with_age_3m['Age'].mean()
average_1f = with_age_1f['Age'].mean()
average_2f = with_age_2f['Age'].mean()
average_3f = with_age_3f['Age'].mean()

average_1m, average_2m, average_3m, average_1f, average_2f, average_3f

In [None]:
# Replace every missing Age in the training data with the mean Age of the
# passenger's own (Sex, Pclass) group.
# Fix: female passengers in Pclass 3 are now filled with average_3f; they were
# previously filled with the male third-class mean (average_3m).
train_group_age_means = {
    ('male', 1): average_1m,
    ('male', 2): average_2m,
    ('male', 3): average_3m,
    ('female', 1): average_1f,
    ('female', 2): average_2f,
    ('female', 3): average_3f,
}

for (sex, pclass), group_mean in train_group_age_means.items():
    # Rows of this group whose Age is still NaN
    missing_in_group = (
        pd.isnull(train_data_1['Age'])
        & (train_data_1['Sex'] == sex)
        & (train_data_1['Pclass'] == pclass)
    )
    train_data_1.loc[missing_in_group, 'Age'] = group_mean

In [None]:
# Fill the missing Age values of the cv data in the same way, using the group
# means that were computed from the training data.
# (The equivalent step for a separate test frame is not performed here.)
# Fix: female passengers in Pclass 3 are now filled with average_3f; they were
# previously filled with the male third-class mean (average_3m).
cv_group_age_means = {
    ('male', 1): average_1m,
    ('male', 2): average_2m,
    ('male', 3): average_3m,
    ('female', 1): average_1f,
    ('female', 2): average_2f,
    ('female', 3): average_3f,
}

for (sex, pclass), group_mean in cv_group_age_means.items():
    # Rows of this group whose Age is still NaN
    missing_in_group = (
        pd.isnull(cv_data_1['Age'])
        & (cv_data_1['Sex'] == sex)
        & (cv_data_1['Pclass'] == pclass)
    )
    cv_data_1.loc[missing_in_group, 'Age'] = group_mean

In [None]:
# The 'male'/'female' strings need a numerical form.
# The Sex column is split into two boolean masks (male / female) per data set;
# these are the basis for the 1/0 indicator columns.

# Training data
train_sex = train_data_1['Sex']
is_male_train = (train_sex == 'male')
is_female_train = (train_sex == 'female')

# Cross-validation data
cv_sex = cv_data_1['Sex']
is_male_cv = (cv_sex == 'male')
is_female_cv = (cv_sex == 'female')


In [None]:
# Join each frame with its sex indicator columns (c stands for cleaned)
CLEANED_COLUMNS = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Male', 'Female']


def _append_sex_flags(frame, male_mask, female_mask):
    """Return frame with 'Male' and 'Female' indicator columns appended on the right.

    Each boolean mask is multiplied by 1, which turns True/False into 1/0.
    """
    male_flag = pd.Series(data=male_mask * 1, index=male_mask.index, name='Male')
    female_flag = pd.Series(data=female_mask * 1, index=female_mask.index, name='Female')
    return pd.concat([frame, male_flag, female_flag], axis=1)


# The cleaned training data; the string 'Sex' column is not part of the selection
hold = _append_sex_flags(train_data_1, is_male_train, is_female_train)
c_train_data_1 = hold[CLEANED_COLUMNS]

# The cleaned cv data, built the same way
hold = _append_sex_flags(cv_data_1, is_male_cv, is_female_cv)
c_cv_data_1 = hold[CLEANED_COLUMNS]

In [None]:
# Display the cleaned training frame as the cell output
c_train_data_1

In [None]:
# With the data prepared, build the first NN using Keras (via tensorflow).
# It takes 7 input features, passes them through a stack of fully connected
# ReLU layers, and ends in a single sigmoid output unit.
hidden_widths_1 = (10, 10, 10, 10, 5, 5)

layers_1 = [keras.Input(shape=(7,))]
layers_1 += [keras.layers.Dense(width, activation='relu') for width in hidden_widths_1]
layers_1.append(keras.layers.Dense(1, activation='sigmoid'))

model_1 = keras.Sequential(layers_1)

In [None]:
# Compile model_1: this fixes the optimisation algorithm, the loss that is
# minimised, and the metric reported to judge how well the model has done.
compile_settings_1 = {
    'optimizer': 'adam',
    'loss': 'mean_squared_error',
    'metrics': ['accuracy'],
}
model_1.compile(**compile_settings_1)

In [None]:
# Assemble float32 inputs and targets for training and validation, then train model_1
feature_columns_1 = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Male', 'Female']

# Training set
X = c_train_data_1[feature_columns_1].astype('float32')
y = c_train_data_1['Survived'].astype('float32')

# Cross-validation set, passed to fit() as validation data
X_val = c_cv_data_1[feature_columns_1].astype('float32')
y_val = c_cv_data_1['Survived'].astype('float32')

model_1.fit(
    X,
    y,
    epochs=70,
    validation_data=(X_val, y_val),
)

In [None]:
# Plot the per-epoch training and validation accuracy recorded while fitting model_1.
# The curves are labelled and a legend is drawn so the two lines can be told apart.
history_1 = model_1.history.history
plt.plot(history_1['accuracy'], label='train accuracy')
plt.plot(history_1['val_accuracy'], label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend();

In [None]:
# Reduced data set for the second model: the target plus Pclass, Age and the
# two sex indicator columns only.
target_column_2 = ['Survived']
feature_columns_2 = ['Pclass', 'Age', 'Male', 'Female']

c_train_data_2 = c_train_data_1[target_column_2 + feature_columns_2]
c_cv_data_2 = c_cv_data_1[target_column_2 + feature_columns_2]

# The target is selected with a list, so y and y_val are single-column DataFrames
X = c_train_data_2[feature_columns_2].astype('float32')
y = c_train_data_2[target_column_2].astype('float32')

X_val = c_cv_data_2[feature_columns_2].astype('float32')
y_val = c_cv_data_2[target_column_2].astype('float32')

In [None]:
# Second model: a smaller network that takes the 4 reduced features and ends in
# a single sigmoid output unit.
model_2 = keras.Sequential([
    keras.Input(shape = (4,)),
    keras.layers.Dense(10,activation = 'relu'),
    keras.layers.Dense(10,activation = 'relu'),
    keras.layers.Dense(10,activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid')
])

# Recall is passed as a metric object with an explicit name instead of the
# string 'recall'. The string alias may not be resolvable on every Keras
# release; the named object guarantees the history keys read below are
# exactly 'recall' and 'val_recall'.
model_2.compile(optimizer='adam',
               loss = 'mean_squared_error',
               metrics = ['accuracy', keras.metrics.Recall(name='recall')])

model_2.fit(X,y, epochs=50,validation_data = (X_val,y_val))

# Plot every recorded metric per epoch; labels and a legend make the four
# curves distinguishable.
history_2 = model_2.history.history
curves_2 = [
    ('accuracy', 'train accuracy'),
    ('val_accuracy', 'validation accuracy'),
    ('recall', 'train recall'),
    ('val_recall', 'validation recall'),
]
for history_key, curve_label in curves_2:
    plt.plot(history_2[history_key], label=curve_label)
plt.xlabel('epoch')
plt.ylabel('metric value')
plt.legend();