In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

pd.set_option('display.max_columns', None)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

print(tf.__version__)

2.15.0


# Extract data

In [2]:
# Load the training CSV from the notebook-relative data/ folder and summarise its columns.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the file has to be
# present at data/space_titanic_train.csv before any later cell can run.
space_titanic_train = pd.read_csv('data/space_titanic_train.csv')
space_titanic_train.info()

FileNotFoundError: [Errno 2] No such file or directory: 'data/space_titanic_train.csv'

# Explore the data

In [None]:
# Summary statistics for every column (include='all' also covers the non-numeric ones).
space_titanic_train.describe(include='all')

In [None]:
# 2x3 grid of histograms, each split by the 'Transported' target.
fig, axes = plt.subplots(2, 3, figsize=(13, 10))
axes = axes.flatten()

# (column, seaborn `multiple` mode) for each panel, in panel order.
# 'fill' normalises every bar to 1 so the bars show proportions; the last panel
# repeats HomePlanet with 'stack' so the absolute counts are visible as well.
panels = [
    ('Age', 'fill'),
    ('CryoSleep', 'fill'),
    ('HomePlanet', 'fill'),
    ('Destination', 'fill'),
    ('VIP', 'fill'),
    ('HomePlanet', 'stack'),
]

for ax, (column, mode) in zip(axes, panels):
    sns.histplot(data=space_titanic_train, x=column, hue='Transported',
                 multiple=mode, palette='Set2', ax=ax)
    ax.set_title(f'{column} ({mode})')

fig.tight_layout()


# Cleaning and filling missing values

In [None]:
# Boolean flags: a missing value is treated as False. Both columns are cast explicitly
# so they end up with a real bool dtype; without the cast CryoSleep's dtype would depend
# on fillna silently downcasting an object column.
space_titanic_train['VIP'] = space_titanic_train['VIP'].fillna(False).astype('bool')
space_titanic_train['CryoSleep'] = space_titanic_train['CryoSleep'].fillna(False).astype('bool')

# Missing ages take the most frequent age in this frame.
space_titanic_train['Age'] = space_titanic_train['Age'].fillna(space_titanic_train['Age'].mode()[0])

# Missing categorical values get their own 'other' category.
space_titanic_train[['HomePlanet', 'Destination']] = space_titanic_train[['HomePlanet', 'Destination']].fillna('other')
space_titanic_train[['HomePlanet', 'Destination']] = space_titanic_train[['HomePlanet', 'Destination']].astype('string')

# Cabin is a '/'-separated triple; split it into three columns and fill the first and
# third parts with the placeholder 'O' when the cabin is missing.
space_titanic_train[['cabin_1', 'cabin_2', 'cabin_3']] = space_titanic_train['Cabin'].str.split('/', expand=True)
space_titanic_train[['cabin_1', 'cabin_3']] = space_titanic_train[['cabin_1', 'cabin_3']].fillna('O')

# A missing amount in any spend column is treated as zero spend.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
space_titanic_train[spend_columns] = space_titanic_train[spend_columns].fillna(0)
space_titanic_train.info()

Potential additional feature engineering:
 - adult/child
 - travel as a family
 - total spend
 - spent anything 1/0

In [None]:
# Remove the identifier/free-text columns and the cabin part that is not used as a feature.
unused_columns = ['PassengerId', 'Name', 'Cabin', 'cabin_2']
space_titanic_train = space_titanic_train.drop(columns=unused_columns)
space_titanic_train.head()

Create train, test and validation sets

In [None]:
# Fixed seed so that the rows landing in train / validation / test (and therefore every
# metric computed below) are the same on each Restart & Run All.
SPLIT_SEED = 42

# 80/20 split into train+val and test, then 80/20 again into train and validation.
train, test = train_test_split(space_titanic_train, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

# Create input pipeline

In [None]:
def to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched tf.data.Dataset of (features, labels).

    Args:
        dataframe: frame containing the feature columns plus a 'Transported'
            column, which is used as the label. The frame itself is not modified.
        shuffle: when True, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch (also used as the prefetch size).

    Returns:
        A dataset yielding ``(dict of column name -> array slice, label)`` batches.
    """
    features = dataframe.copy()
    labels = features.pop('Transported')
    # Build the feature dict from the label-free copy. Iterating over the original
    # frame here would put 'Transported' itself among the features.
    # Indexing with [:, tf.newaxis] gives every column a trailing axis of length 1.
    feature_arrays = {key: value.values[:, tf.newaxis] for key, value in features.items()}
    ds = tf.data.Dataset.from_tensor_slices((feature_arrays, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

Visualise a small batch

In [None]:
# Tiny batches (5 rows) purely for inspection; train_ds is rebuilt with a larger batch size later.
train_ds = to_dataset(train, batch_size=5)

In [None]:
# Pull a single batch out of the dataset and show what the model will receive.
train_features, label_batch = next(iter(train_ds.take(1)))
feature_names = list(train_features.keys())
print('Every feature:', feature_names)
print('A batch of HomePlanet:', train_features['HomePlanet'])
print('A batch of targets:', label_batch )

# Create preprocessing layers

### For numerical features

In [None]:
def get_normalization_layer(name, dataset):
  """Build a Normalization layer adapted to one feature of `dataset`.

  Args:
    name: key of the feature inside the dataset's feature dict.
    dataset: dataset of (features, label) pairs to adapt on.

  Returns:
    The adapted `layers.Normalization` instance.
  """
  def _select_feature(features, _label):
    # Keep only the requested feature; the label is not needed for adapting.
    return features[name]

  normalizer = layers.Normalization(axis=None)
  normalizer.adapt(dataset.map(_select_feature))
  return normalizer

Let's look at the results for 'ShoppingMall'

In [None]:
# Apply a normalisation layer adapted on train_ds to the sample batch taken above.
shopping_mall = train_features['ShoppingMall']
layer = get_normalization_layer('ShoppingMall', train_ds)
layer(shopping_mall)

### For categorical features

In [None]:
def get_category_encoding_layer(name, dataset, max_tokens=None):
  """Build a string-lookup + category-encoding step for one feature.

  Args:
    name: key of the feature inside the dataset's feature dict.
    dataset: dataset of (features, label) pairs used to learn the vocabulary.
    max_tokens: forwarded to `layers.StringLookup`.

  Returns:
    A callable that maps a feature tensor through the lookup and then the encoder.
  """
  def _select_feature(features, _label):
    return features[name]

  # Learn the vocabulary of this feature from the dataset.
  index = layers.StringLookup(max_tokens=max_tokens)
  index.adapt(dataset.map(_select_feature))

  # The encoder width is the size of the learned vocabulary.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    return encoder(index(feature))

  return encode


Let's look at the results for the column 'HomePlanet'

In [None]:
# Apply the lookup + encoding step, adapted on train_ds, to the sample batch taken above.
# Note that `layer` is rebound here; it previously held the ShoppingMall normaliser.
type_col = train_features['HomePlanet']
layer = get_category_encoding_layer('HomePlanet', train_ds)
layer(type_col)

In [None]:
# Rebuild the datasets with the batch size used for training (replaces the 5-row demo train_ds).
batch_size = 256
train_ds = to_dataset(train, batch_size=batch_size)
# shuffle=False keeps test/val batches in DataFrame row order; the confusion matrix later
# compares predictions on test_ds against test['Transported'] position by position.
test_ds = to_dataset(test, shuffle=False, batch_size=batch_size)
val_ds = to_dataset(val, shuffle=False, batch_size=batch_size)


In [None]:
# Columns that get a Normalization layer.
numeric_feature_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Columns fed to the model as float32 inputs without any preprocessing layer.
# NOTE(review): 'Age' is therefore not normalised, unlike the spend columns — confirm this is intended.
other_feature_columns = ['Age', 'CryoSleep', 'VIP']
# Columns that go through StringLookup + CategoryEncoding.
categ_feature_columns = ['HomePlanet', 'Destination', 'cabin_1', 'cabin_3']

In [None]:
# Accumulators filled by the three loops below: the Keras Input tensors and their
# preprocessed counterparts. The loops only append, so re-run this cell before
# re-running any of them, otherwise the lists collect duplicate entries.
inputs = []
preprocessed = []

In [None]:
# One scalar input per numeric column, each followed by its own adapted normaliser.
for name in numeric_feature_columns:
    numeric_input = tf.keras.Input(shape=(1,), name=name)
    inputs.append(numeric_input)

    normalizer = get_normalization_layer(name, train_ds)
    preprocessed.append(normalizer(numeric_input))

In [None]:
# One string input per categorical column, each followed by lookup + category encoding.
for name in categ_feature_columns:
    categ_input = tf.keras.Input(shape=(1,), dtype='string', name=name)
    inputs.append(categ_input)

    encode = get_category_encoding_layer(name, train_ds)
    preprocessed.append(encode(categ_input))

In [None]:
# The remaining columns get a float32 input and go into the model without any
# preprocessing layer, so the same tensor is recorded in both lists.
for name in other_feature_columns:
    passthrough = tf.keras.Input(shape=(1,), dtype='float32', name=name)
    inputs.append(passthrough)
    preprocessed.append(passthrough)

Let's look at 'inputs' and 'preprocessed' lists before and after preprocessing

In [None]:
# The raw Keras Input tensors, one per feature column.
inputs

In [None]:
# The same features after their normalisation / encoding step (pass-through ones are unchanged).
preprocessed

### Create, compile and train the model

In [None]:
# Join every preprocessed feature into one vector, then a small dense head:
# 32 ReLU units -> dropout 0.5 -> 2-way softmax.
all_features = layers.concatenate(preprocessed)
hidden = layers.Dense(32, activation="relu")(all_features)
hidden = layers.Dropout(0.5)(hidden)
output = layers.Dense(2, activation='softmax')(hidden)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer structure of the model.
model.summary()

In [None]:
# Train for 10 epochs on train_ds, evaluating on val_ds after each epoch.
model.fit(train_ds, epochs=10, validation_data=val_ds)

### Evaluate the model

In [None]:
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

In [None]:
# Softmax output has two columns; column 1 is used as the score for Transported == True.
prediction = model.predict(test_ds)
transported_score = prediction[:, 1]
prediction_binary = transported_score > 0.5

In [None]:
# Compare the true labels of the held-out split with the thresholded predictions.
# This relies on test_ds having been built with shuffle=False so that row order matches.
cm = confusion_matrix(test['Transported'], prediction_binary)

print("Confusion Matrix:")
print(cm)

Save the model for later use

In [None]:
# Persist the trained model under ./space_titanic and load it back straight away,
# so the prediction cells below run against the reloaded copy.
model.save('space_titanic')
reloaded_model = tf.keras.models.load_model('space_titanic')

# Predict on unseen data

In [None]:
# Load the unlabeled CSV from the notebook-relative data/ folder and summarise its columns.
space_titanic_test = pd.read_csv('data/space_titanic_test.csv')
space_titanic_test.info()

### Prepare the data

Since the normalisation and encoding are performed in the preprocessing layers, we only need to set the right column types, fill the missing values and drop unused columns.

In [None]:
# Boolean flags: a missing value is treated as False. Both columns are cast explicitly
# so they end up with a real bool dtype; without the cast CryoSleep's dtype would depend
# on fillna silently downcasting an object column.
space_titanic_test['VIP'] = space_titanic_test['VIP'].fillna(False).astype('bool')
space_titanic_test['CryoSleep'] = space_titanic_test['CryoSleep'].fillna(False).astype('bool')

# Missing ages take the most frequent age in this frame.
# NOTE(review): this uses the test file's own mode rather than the training one — confirm intended.
space_titanic_test['Age'] = space_titanic_test['Age'].fillna(space_titanic_test['Age'].mode()[0])

# Missing categorical values get their own 'other' category.
space_titanic_test[['HomePlanet', 'Destination']] = space_titanic_test[['HomePlanet', 'Destination']].fillna('other')
space_titanic_test[['HomePlanet', 'Destination']] = space_titanic_test[['HomePlanet', 'Destination']].astype('string')

# Cabin is a '/'-separated triple; split it into three columns and fill the first and
# third parts with the placeholder 'O' when the cabin is missing.
space_titanic_test[['cabin_1', 'cabin_2', 'cabin_3']] = space_titanic_test['Cabin'].str.split('/', expand=True)
space_titanic_test[['cabin_1', 'cabin_3']] = space_titanic_test[['cabin_1', 'cabin_3']].fillna('O')

# A missing amount in any spend column is treated as zero spend.
test_spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
space_titanic_test[test_spend_columns] = space_titanic_test[test_spend_columns].fillna(0)
space_titanic_test.info()

In [None]:
# Remove the identifier/free-text columns and the cabin part that is not used as a feature.
space_titanic_test = space_titanic_test.drop(columns=['PassengerId', 'Name', 'Cabin', 'cabin_2'])

In [None]:
# Same feature layout as in to_dataset: one array with a trailing length-1 axis per column.
# (There is no label column to pop, so no defensive copy of the frame is needed.)
df = {key: value.values[:, tf.newaxis] for key, value in space_titanic_test.items()}

# Batch the dataset so predict() receives proper (batch, 1) inputs, as during training,
# instead of one un-batched row per step. Row order is preserved.
PREDICT_BATCH_SIZE = 256
ds = tf.data.Dataset.from_tensor_slices(df).batch(PREDICT_BATCH_SIZE)

### Predict from reloaded model

In [None]:
# Score the unlabeled rows with the model that was reloaded from disk.
predictions = reloaded_model.predict(ds)

In [None]:
# Raw model output: one row per passenger, one column per softmax class.
predictions

In [None]:
# Column 1 is used as the score for Transported == True; threshold it at 0.5.
prediction = predictions[:, 1] > 0.5

### Prepare submission file

In [None]:
# Re-read the original test file (PassengerId was dropped from the working frame)
# and attach the thresholded predictions as the 'Transported' column.
pred_space_titanic_test = pd.read_csv('data/space_titanic_test.csv').assign(Transported=prediction)

# The submission keeps only the passenger id and the predicted label.
submission = pred_space_titanic_test.loc[:, ['PassengerId', 'Transported']]
submission.to_csv('claires_submission_titanic.csv', index=False)