In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

# Titanic with TensorFlow

In this notebook you will learn how to classify structured data using Keras preprocessing layers. You will use [Keras](https://www.tensorflow.org/guide/keras) to define the model, and [preprocessing layers](https://www.tensorflow.org/guide/keras/preprocessing_layers) as a bridge to map from columns in a CSV to features used to train the model. 

The content is based on a [tutorial](https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers) from the TensorFlow team. Some utility functions used in this notebook are copied from this tutorial, and we recommend referring to it for more details.

You will train and deploy a Neural Network to predict which passengers survived the Titanic shipwreck (see [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/overview) at Kaggle).

#### Install Required TensorFlow version

Some code snippets used in this notebook only work with TensorFlow 2.3.1 and above.

Uncomment and run the below upgrade command if needed and restart the kernel afterwards via `Kernel > Restart` to use the updated packages. 

In [None]:
# Display the installed TensorFlow version to decide whether the upgrade below is needed.
tf.__version__

In [None]:
# %pip install --upgrade tensorflow

### Use Pandas to create a dataframe

Import the training data from a public GitHub URL and load it into a dataframe.

*In previous notebooks you imported the CSV file from the project assets in Watson Studio. The code snippet below illustrates another way to accomplish the same task.*

In [None]:
# Download the training CSV (using the current directory as the Keras cache
# directory) and parse the downloaded file into a pandas dataframe.
dataset_url = "https://raw.githubusercontent.com/daka1510/hhz-artificial-intelligence-vl-s21/main/Notebooks/Titanic/train.csv"
csv_file = tf.keras.utils.get_file(fname="train.csv", origin=dataset_url, cache_dir=".")
dataframe = pd.read_csv(csv_file)

In [None]:
# Preview the first rows to get a feel for the available columns.
dataframe.head()

### Prepare data

Data preparation for this dataset was covered in depth in a different notebook (see [HHZ - Titanic Data Preparation](https://github.com/daka1510/ai-workshop-hhz/blob/master/Notebooks/Titanic/%5BHHZ%5D%20Titanic%20Data%20Preparation.ipynb)).

In [None]:
# count the missing values in every column
dataframe.isna().sum()

In [None]:
# 'Cabin' has too many missing values to be useful, so remove the column
dataframe = dataframe.drop(columns=["Cabin"])

In [None]:
# fill the gaps in 'Embarked' with the most frequent value of that column
most_frequent_embarked = dataframe["Embarked"].mode().iloc[0]
dataframe["Embarked"] = dataframe["Embarked"].fillna(most_frequent_embarked)

In [None]:
# fill the gaps in 'Age' with the mean of the known ages
mean_age = dataframe["Age"].mean()
dataframe["Age"] = dataframe["Age"].fillna(mean_age)

In [None]:
# verify results: recount the missing values per column after the clean-up
dataframe.isna().sum()

### Split the dataframe into train, validation, and test

In [None]:
# Fixed seed so the train/validation/test partition is the same on every run
# (Restart & Run All then trains and evaluates on the same rows each time).
SPLIT_SEED = 42

# Hold out 20% of all rows for testing, then 20% of the remaining rows for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

print(f"{len(train)} train examples")
print(f"{len(val)} validation examples")
print(f"{len(test)} test examples")

### Create an input pipeline using tf.data

Next, you will wrap the dataframes with tf.data, in order to shuffle and batch the data. If you were working with a very large CSV file (so large that it does not fit into memory), you would use tf.data to read it from disk directly. That is not covered in this tutorial.

Note: The utility functions below are copied from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers and have been adapted for our use case.

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Wrap a dataframe in a batched tf.data.Dataset of (features, label) pairs.

    Args:
        dataframe: Pandas dataframe containing a "Survived" column; it is
            copied first, so the caller's dataframe is not modified.
        shuffle: When true, shuffle with a buffer as large as the dataframe.
        batch_size: Number of rows per batch; also passed to `prefetch`.

    Returns:
        A dataset yielding `(dict of column name -> values, labels)` batches.
    """
    features = dataframe.copy()
    # Split the target column off; every remaining column stays a feature.
    target = features.pop("Survived")
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size).prefetch(batch_size)

Now that you have created the input pipeline, let's call it to see the format of the data it returns. A small batch size is used to keep the output readable.

In [None]:
# Small, unshuffled batches: used only to inspect the pipeline output in the next cells.
batch_size = 5
train_ds = df_to_dataset(train, shuffle=False, batch_size=batch_size)

In [None]:
# Pull exactly one (features, labels) batch out of the dataset for inspection.
first_batch = train_ds.take(1)
((train_features, label_batch),) = first_batch

print(f"Every feature: {list(train_features.keys())}")
print(f'A batch of ages: {train_features["Age"]}')
print(f"A batch of targets {label_batch}")

In [None]:
# check documentation for details:
# - https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/Normalization
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to a single feature of `dataset`.

    Args:
        name: Key of the feature inside the dataset's feature dictionary.
        dataset: Dataset yielding `(features, labels)` pairs, as produced by
            `df_to_dataset`.

    Returns:
        The adapted `preprocessing.Normalization` layer.
    """

    def select_feature(features, _labels):
        # Keep only the requested feature; the labels are not needed here.
        return features[name]

    norm_layer = preprocessing.Normalization()
    # Let the layer learn its statistics from the selected feature values.
    norm_layer.adapt(dataset.map(select_feature))
    return norm_layer

In [None]:
# check documentation for details:
# - https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/CategoryEncoding
# - https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/IntegerLookup
# - https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Return a callable that encodes one categorical feature of `dataset`.

    The feature is first mapped to integer indices by a lookup layer that is
    adapted on `dataset`; the indices are then passed through a
    CategoryEncoding layer sized to the lookup's vocabulary.

    Args:
        name: Key of the feature inside the dataset's feature dictionary.
        dataset: Dataset yielding `(features, labels)` pairs, as produced by
            `df_to_dataset`.
        dtype: "string" selects a StringLookup; any other value selects an
            IntegerLookup.
        max_tokens: Optional vocabulary size limit forwarded to the lookup layer.

    Returns:
        A function taking a feature tensor (or Keras input) and returning the
        encoder applied to the looked-up indices.
    """
    # Create a lookup layer which turns raw values into integer indices.
    if dtype == "string":
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create the CategoryEncoding layer for our integer indices. Its width is
    # fixed explicitly through `num_tokens`, so no separate adapt pass over the
    # data is performed. NOTE(review): the TensorFlow 2.5 release notes, as
    # recalled, say CategoryEncoding no longer supports `adapt` - confirm
    # against the installed version.
    encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply the encoding to our indices. The lambda function captures the
    # layers so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))

Let's use a few examples to get an understanding of how the encoding works:

*Create a normalization layer for 'Age'*

In [None]:
# Raw 'Age' values of the sample batch fetched above.
age_col = train_features["Age"]
age_col

In [None]:
# Adapt a normalization layer on the 'Age' feature of train_ds and apply it to the sample ages.
layer = get_normalization_layer("Age", train_ds)
layer(age_col)

*Create a category encoding layer for 'Embarked' (string)*

In [None]:
# Raw 'Embarked' values (strings) of the sample batch.
embarked_col = train_features["Embarked"]
embarked_col

In [None]:
# Build the string lookup + category encoding for 'Embarked' and apply it to the sample values.
category_encoding_layer = get_category_encoding_layer("Embarked", train_ds, "string")
category_encoding_layer(embarked_col)

*Create a category encoding layer for 'Pclass' (numeric)*

In [None]:
# Raw 'Pclass' values (integers) of the sample batch.
pclass_col = train_features["Pclass"]
pclass_col

In [None]:
# Build the integer lookup + category encoding for 'Pclass' and apply it to the sample values.
category_encoding_layer = get_category_encoding_layer("Pclass", train_ds, "int64")
category_encoding_layer(pclass_col)

#### Choose which columns to use

You have seen how to use several types of preprocessing layers. Now you will use them to train a model. You will be using the [Keras functional API](https://www.tensorflow.org/guide/keras/functional) to build the model. The Keras functional API is a way to create models that are more flexible than the [tf.keras.Sequential API](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential).

In [None]:
# Rebuild the datasets with a larger batch size for training.
# Only the training set is shuffled (df_to_dataset shuffles by default);
# validation and test data keep their order.
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Review column names and dtypes to decide how each column has to be encoded.
dataframe.info()

In [None]:
# Collect the Keras inputs and their preprocessed counterparts; the lists are
# extended by the following cells and consumed when the model is assembled.
all_inputs = []
encoded_features = []

# Numeric features: one scalar input per column, scaled by a normalization
# layer adapted on the training data.
numeric_headers = ["Age", "SibSp", "Parch", "Fare"]
for header in numeric_headers:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    all_inputs.append(numeric_col)
    normalize = get_normalization_layer(header, train_ds)
    encoded_features.append(normalize(numeric_col))

In [None]:
# Categorical feature stored as integer: 'Pclass' gets an int64 input that is
# passed through the lookup + category encoding built from the training data.
pclass_col = tf.keras.Input(shape=(1,), name="Pclass", dtype="int64")
all_inputs.append(pclass_col)

encoding_layer = get_category_encoding_layer("Pclass", train_ds, dtype="int64")
encoded_features.append(encoding_layer(pclass_col))

In [None]:
# Categorical features stored as strings: each gets a string input that is
# passed through the lookup + category encoding built from the training data.
categorical_cols = ["Sex", "Embarked"]
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype="string")
    all_inputs.append(categorical_col)
    encoding_layer = get_category_encoding_layer(header, train_ds, dtype="string")
    encoded_features.append(encoding_layer(categorical_col))

#### Create, compile, and train the model

In [None]:
# Concatenate all preprocessed features and feed them through one hidden layer.
all_features = tf.keras.layers.concatenate(encoded_features)
x1 = tf.keras.layers.Dense(32, activation="relu")(all_features)

# The output layer applies a sigmoid, so the model emits a probability in [0, 1].
output = tf.keras.layers.Dense(1, activation="sigmoid")(x1)
model = tf.keras.Model(all_inputs, output)

# The loss must treat the model output as a probability (from_logits=False):
# with from_logits=True the loss would apply a second sigmoid on top of the
# one in the output layer.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)


In [None]:
# Print the layers and parameter counts of the assembled model.
model.summary()

In [None]:
# Train for 50 epochs with val_ds as validation data; the returned History object is plotted below.
history = model.fit(train_ds, epochs=50, validation_data=val_ds)

Review how training and validation accuracy evolved

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]
loss = history.history["loss"]
val_loss = history.history["val_loss"]

epochs = range(len(acc))

# Draw all curves on one explicitly created figure. No further figure is
# opened before plt.show(), so no empty extra figure is rendered.
fig, ax = plt.subplots()
ax.plot(epochs, acc, "darkgreen", label="Training accuracy")
ax.plot(epochs, val_acc, "darkblue", label="Validation accuracy")
ax.plot(epochs, loss, "lightgreen", label="Training loss")
ax.plot(epochs, val_loss, "lightblue", label="Validation loss")
ax.set_title("Training and validation accuracy")
ax.set_xlabel("Epoch")
ax.legend(loc=0)

plt.show()

In [None]:
# Evaluate on the held-out test set. Dedicated names are used so that the
# per-epoch `loss` list from the plotting cell is not overwritten, which would
# break re-running that cell.
test_loss, test_accuracy = model.evaluate(test_ds)
print("Accuracy", test_accuracy)

Try different neural network configurations (e.g. add additional layers, change the number of neurons per layer, or train for more epochs). Do you get a better accuracy than using the initial configuration?

Continue once you are satisfied with the results.

### Make local prediction

Save model to filesystem and reload it for test purposes.

In [None]:
# Remove any previous export so the directory only contains the current model.
!rm -rf titanic_model
# Save into a numbered subdirectory ("1"), matching the /models/titanic_model/1
# path that appears in the serving log further below.
model.save('titanic_model/1', save_format='tf')
# Load the export again to verify that it can be used for predictions.
reloaded_model = tf.keras.models.load_model('titanic_model/1')

Make a local prediction with the aid of the reloaded model

In [None]:
# Select the row at position 1 (the second passenger) as a one-row dataframe.
sample_passenger_df = dataframe.iloc[1:2]
sample_passenger_df

In [None]:
# Drop the target ('Survived') and the columns that are not model inputs.
columns_to_drop = ["Survived", "Name", "Ticket", "PassengerId"]
sample_passenger_df = sample_passenger_df.drop(columns=columns_to_drop)

In [None]:
# convert the one-row dataframe into a plain {column name: value} dictionary
sample_passenger_dict = sample_passenger_df.to_dict(orient="records")[0]
sample_passenger_dict

In [None]:
# Wrap every feature value in a one-element tensor, i.e. a batch of size 1.
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample_passenger_dict.items()}
predictions = reloaded_model.predict(input_dict)

# `predictions[0]` is still an array, not a number. Extract the scalar
# explicitly instead of relying on implicit array-to-scalar conversion during
# string formatting.
survival_probability = float(predictions[0][0])

print("This passenger had a %.1f percent probability of surviving the Titanic shipwreck." % (100 * survival_probability))

In [None]:
# Show the raw model output for the sample passenger.
predictions[0]


### Deploy the model

At the time of writing TensorFlow 2.3 was not yet supported by Watson Machine Learning (https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/pm_service_supported_frameworks.html?audience=wdp). In the following you will deploy your model with the aid of https://www.tensorflow.org/tfx/serving/docker.

Working through the below section is **optional** and requires a local Docker installation (installation instructions available at the referenced [link](https://www.tensorflow.org/tfx/serving/docker#install_docker)).

#### Archive the model directory and export it

In a first step, download and unzip the exported model to your local machine. In Watson Studio, below code snippets help to export the model.

In [None]:
# review content of the exported model directory (version subfolder "1")
!ls -ll titanic_model/1

In [None]:
# zip the whole model directory (recursively) into titanic_model_v1.zip
!zip -r titanic_model_v1.zip titanic_model

In [None]:
# review the working directory content; titanic_model_v1.zip should now be listed
!ls -ll

In [None]:
# If you run this notebook in Watson Studio, make sure to run "Insert project token" first
# (that step provides the `project` object used below).
# The context manager closes the archive file once its bytes have been handed over.
with open("titanic_model_v1.zip", "rb") as f:
    project.save_data("titanic_model_v1.zip", f.read(), overwrite=True)

Now navigate to your project's asset list, download and unzip the file.

#### Deploy the model

Once Docker is installed, you can run the commands below to start a serving image. Run them from the directory that contains the unzipped `titanic_model` folder, or replace `$(pwd)` with the absolute path to that directory:

```
docker pull tensorflow/serving
docker run -t --rm -p 8501:8501 \
    -v "$(pwd)/titanic_model:/models/titanic_model" \
    -e MODEL_NAME=titanic_model \
    tensorflow/serving

```

If everything worked, you should see output similar to

```
2020-11-29 17:26:24.774657: I tensorflow_serving/model_servers/server.cc:87] Building single TensorFlow model file config:  model_name: titanic_model model_base_path: /models/titanic_model
2020-11-29 17:26:24.779699: I tensorflow_serving/model_servers/server_core.cc:464] Adding/updating models.
2020-11-29 17:26:24.779750: I tensorflow_serving/model_servers/server_core.cc:575]  (Re-)adding model: titanic_model
2020-11-29 17:26:24.894871: I tensorflow_serving/core/basic_manager.cc:739] Successfully reserved resources to load servable {name: titanic_model version: 1}
2020-11-29 17:26:24.894991: I tensorflow_serving/core/loader_harness.cc:66] Approving load for servable version {name: titanic_model version: 1}
2020-11-29 17:26:24.895039: I tensorflow_serving/core/loader_harness.cc:74] Loading servable version {name: titanic_model version: 1}
2020-11-29 17:26:24.896290: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:31] Reading SavedModel from: /models/titanic_model/1
2020-11-29 17:26:24.917508: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:54] Reading meta graph with tags { serve }
2020-11-29 17:26:24.917573: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:234] Reading SavedModel debug info (if present) from: /models/titanic_model/1
2020-11-29 17:26:24.920399: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-11-29 17:26:24.977981: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:199] Restoring SavedModel bundle.
2020-11-29 17:26:25.081348: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:183] Running initialization op on SavedModel bundle at path: /models/titanic_model/1
2020-11-29 17:26:25.104055: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:303] SavedModel load for tags { serve }; Status: success: OK. Took 207773 microseconds.
2020-11-29 17:26:25.107537: I tensorflow_serving/servables/tensorflow/saved_model_warmup_util.cc:59] No warmup data file found at /models/titanic_model/1/assets.extra/tf_serving_warmup_requests
2020-11-29 17:26:25.114425: I tensorflow_serving/core/loader_harness.cc:87] Successfully loaded servable version {name: titanic_model version: 1}
2020-11-29 17:26:25.123920: I tensorflow_serving/model_servers/server.cc:367] Running gRPC ModelServer at 0.0.0.0:8500 ...
[warn] getaddrinfo: address family for nodename not supported
2020-11-29 17:26:25.126379: I tensorflow_serving/model_servers/server.cc:387] Exporting HTTP/REST API at:localhost:8501 ...
[evhttp_server.cc : 238] NET_LOG: Entering the event loop ...


```

You can now make a sample prediction as below. See https://www.tensorflow.org/tfx/serving/api_rest for details.
```
curl -d '{"instances": [{"Pclass":[3],"Sex":["male"],"Age":[22.0],"SibSp":[1],"Parch":[0],"Fare":[7.25],"Embarked":["S"]}]}' -X POST http://localhost:8501/v1/models/titanic_model:predict
```
If everything worked, you should see output similar to 
```
{
    "predictions": [[0.00236016512]]
}
```
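The predicted probability matches the result of the local prediction from your notebook only if the request describes the same passenger. The helper cell below generates the matching `curl` command for the selected sample passenger.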

In [None]:
# Use the helper below to generate the curl command for the selected sample passenger.
# Every feature value is wrapped in a one-element list, and the single
# passenger record is sent as the only entry of "instances".
input_dict = {}
for name, value in sample_passenger_dict.items():
    input_dict[name] = [value]
instances = {"instances": [input_dict]}
print(f"curl -d '{json.dumps(instances)}' -X POST http://localhost:8501/v1/models/titanic_model:predict")