### Keras Example
Binary classification (survival prediction) for structured data on the Titanic dataset, using a small Keras neural network with a logistic (sigmoid) output

***
#### Environment
`conda activate tf-env`

***
#### Goals
- Explore TensorFlow via the Keras API

***
#### References

https://keras.io/  
https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers  

#### Basic python imports

In [None]:
import numpy as np
import pandas as pd
np.set_printoptions(precision=3, suppress=True)
from IPython.display import display

#### Load the dataset from OpenML (https://www.openml.org) using the scikit-learn API

https://www.openml.org/d/40945

If the URL does not work, the dataset can be loaded from the data folder `./data/titanic/` instead.

In [None]:
from sklearn.datasets import fetch_openml

# Titanic passenger table from OpenML (https://www.openml.org/d/40945),
# requested as a pandas DataFrame via as_frame=True.
raw_dataset = fetch_openml(name="titanic", version=1, as_frame=True).frame
raw_dataset.head(n=5)

In [None]:
# pclass, sibsp and parch are categorical codes, so store them as ints, and
# drop the columns the model does not use.
# The working frame is built in a single chain so raw_dataset stays exactly as
# it was loaded and re-running this cell always starts from the same input.
dataset = (
    raw_dataset
    .astype({'pclass': int, 'sibsp': int, 'parch': int})
    .drop(columns=['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest'])
)
display(dataset.describe().transpose())
# info() prints its report itself and returns None, so it is called directly:
# wrapping it in display() would render a stray "None" under the report.
dataset.info()
display(dataset.isna().sum())

In [None]:
# Keep only fully populated rows: a row with any missing value is discarded.
dataset = dataset.dropna(axis=0, how='any')

### Prepare train, validation and test datasets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test membership is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
import tensorflow as tf

def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='survived'):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs from a DataFrame.

    Args:
        dataframe: pandas DataFrame holding the feature columns plus the label
            column. It is copied first, so the caller's frame is not modified.
        shuffle: When True, shuffle the rows using a buffer as large as the frame.
        batch_size: Number of rows per batch.
        label_column: Name of the column that is removed from the features and
            used (cast to int) as the label. Defaults to ``'survived'``.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``(dict of column name -> tensor, label tensor)`` batches.
    """
    features = dataframe.copy()
    labels = features.pop(label_column).astype(int)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # shuffle() requires a positive buffer size, so skip it for an empty frame.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    # prefetch() counts dataset elements, and after batch() one element is a
    # whole batch. Let tf.data size the buffer instead of holding `batch_size`
    # batches in memory.
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to one feature of ``dataset``.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset yielding ``(features_dict, label)`` pairs; the labels
            are ignored here.

    Returns:
        The adapted ``preprocessing.Normalization`` layer.
    """
    def select_feature(features, _label):
        # Keep just the requested feature; the label plays no part in adapt().
        return features[name]

    layer = preprocessing.Normalization()
    # adapt() learns the layer's statistics from the selected feature values.
    layer.adapt(dataset.map(select_feature))
    return layer

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Return a callable that encodes one categorical feature of ``dataset``.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset yielding ``(features_dict, label)`` pairs; used to
            learn the vocabulary.
        dtype: ``'string'`` selects a StringLookup; any other value selects an
            IntegerLookup.
        max_tokens: Forwarded to the lookup layer as its ``max_tokens``.

    Returns:
        A function mapping a feature tensor to its category encoding by
        applying the lookup layer followed by a CategoryEncoding layer.
    """
    # Choose the lookup layer that maps raw values to integer indices.
    lookup_cls = (preprocessing.StringLookup if dtype == 'string'
                  else preprocessing.IntegerLookup)
    lookup = lookup_cls(max_tokens=max_tokens)

    def select_feature(features, _label):
        return features[name]

    # Learn the set of values present in this feature and index them.
    lookup.adapt(dataset.map(select_feature))

    # Encoder sized to the vocabulary the lookup layer just learned.
    category_encoder = preprocessing.CategoryEncoding(
        num_tokens=lookup.vocabulary_size())

    def encode(feature):
        # The closure keeps both layers alive so they can be applied to a
        # Keras Input and become part of the functional model.
        return category_encoder(lookup(feature))

    return encode

In [None]:
# One batch size for all three splits. Only the training split is shuffled;
# validation and test keep their row order.
batch_size = 100
train_ds, val_ds, test_ds = (
    df_to_dataset(frame, shuffle=do_shuffle, batch_size=batch_size)
    for frame, do_shuffle in ((train, True), (val, False), (test, False))
)

In [None]:
# Pull a single batch from the training pipeline to inspect what it yields.
train_features, label_batch = next(iter(train_ds.take(1)))
print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['age'])
print('A batch of targets:', label_batch )

In [None]:
# Peek at the first rows of the held-out test split.
test.head(n=5)

### Build the pipeline components

In [None]:
# Every Keras Input is collected next to its preprocessed tensor; the cells
# below append to these same two lists.
all_inputs, encoded_features = [], []

# Numeric features: one scalar Input each, passed through a Normalization
# layer adapted on the training dataset.
for header in ('age', 'fare'):
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    encoded_numeric_col = get_normalization_layer(header, train_ds)(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

In [None]:
# Categorical features stored as integers: each gets an int64 Input and an
# IntegerLookup-based category encoding learned from the training dataset.
int_categorical_col = ['pclass', 'sibsp', 'parch']
for header in int_categorical_col:
    numeric_cat_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
    encoding_layer = get_category_encoding_layer(
        name=header, dataset=train_ds, dtype='int64', max_tokens=5)
    encoded_numeric_cat_col = encoding_layer(numeric_cat_col)
    all_inputs.append(numeric_cat_col)
    encoded_features.append(encoded_numeric_cat_col)


In [None]:
# Categorical features stored as strings: each gets a string Input and a
# StringLookup-based category encoding learned from the training dataset.
categorical_cols = ['sex', 'embarked']
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(
        name=header, dataset=train_ds, dtype='string', max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)


### Assemble the pipeline

In [None]:
# Join every preprocessed feature into one vector, then apply a single hidden
# layer with dropout.
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
# No activation on the output: the model emits a raw logit, and the loss is
# told so through from_logits=True.
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The "accuracy" shortcut thresholds predictions at 0.5, which is
              # only correct for probabilities. The output here is a logit, so
              # the p = 0.5 decision boundary sits at 0.0. The metric keeps the
              # name "accuracy" so history/log keys are unchanged.
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name="accuracy")])


### Visualize the pipeline

In [None]:
# rankdir="LR" lays the graph out left-to-right so it reads horizontally.
tf.keras.utils.plot_model(model, rankdir="LR", show_shapes=True)


### Train the model

In [None]:
%%time
model.fit(train_ds, epochs=10, validation_data=val_ds)

### Evaluate the model

In [None]:
# evaluate() returns the loss followed by the compiled metric.
test_scores = model.evaluate(test_ds)
loss, accuracy = test_scores
print("Accuracy", accuracy)
print("Loss", loss)


### Save the model

In [None]:
# Write the trained model to a path relative to the current working directory.
model.save(filepath='my_keras_titanic_classifier')

### Reload the model

In [None]:
# Load the saved model back and score it on the same test batches, so the
# result can be compared with the in-memory model's evaluation.
reloaded_model = tf.keras.models.load_model(filepath='my_keras_titanic_classifier')
loss, accuracy = reloaded_model.evaluate(x=test_ds)

In [None]:
def predict_survival(sample, model=None):
    """Print the predicted survival probability for a single passenger.

    Args:
        sample: Mapping of feature name -> scalar value for one passenger.
        model: Model used for the prediction. Defaults to the notebook-level
            ``reloaded_model`` when omitted.
    """
    if model is None:
        model = reloaded_model
    # Wrap each scalar in a list so every feature becomes a batch of size one.
    input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
    predictions = model.predict(input_dict)
    # The model is compiled with from_logits=True, so its output is a logit and
    # sigmoid turns it into a probability. Index down to the single scalar so
    # the formatting below receives a plain float rather than relying on
    # implicit conversion of a length-1 tensor.
    prob = float(tf.nn.sigmoid(predictions[0][0]))
    print( "This person had a %.1f percent probability of survival." % (100 * prob))

### Predict using the model

In [None]:
%%time

predict_survival({
    'pclass': 1,
    'sex': 'female',
    'age': 7,
    'sibsp': 1,
    'parch': 2,
    'fare': 39.4000,
    'embarked': 'S'
})

predict_survival({
    'pclass': 1,
    'sex': 'male',
    'age': 7,
    'sibsp': 1,
    'parch': 2,
    'fare': 39.4000,
    'embarked': 'S'
})