## Setup 

Run these cells, but don't worry about understanding the implementation of these helper functions.

In [1]:
# Install the What-If Tool widget if running in Colab 

# If running in Colab then pip install, otherwise no need.
try:
  import google.colab
  !pip install --upgrade witwidget
except Exception:
  pass

In [2]:
# Define helper functions

import pandas as pd
import numpy as np
import tensorflow as tf
import functools

# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns=None):
    feature_spec = {}
    if columns == None:
        columns = df.columns.values.tolist()
    for f in columns:
        if df[f].dtype is np.dtype(np.int64):
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.int64)
        elif df[f].dtype is np.dtype(np.float64):
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.float32)
        else:
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.string)
    return feature_spec

# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(columns, feature_spec):
    ret = []
    for col in columns:
        if feature_spec[col].dtype is tf.int64 or feature_spec[col].dtype is tf.float32:
            ret.append(tf.feature_column.numeric_column(col))
        else:
            ret.append(tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(col, list(df[col].unique()))))
    return ret

# An input function for providing input to a model from tf.Examples
def tfexamples_input_fn(examples, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None, 
                       batch_size=64):
    def ex_generator():
        for i in range(len(examples)):
            yield examples[i].SerializeToString()
    dataset = tf.data.Dataset.from_generator(
      ex_generator, tf.dtypes.string, tf.TensorShape([]))
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda tf_example: parse_tf_example(tf_example, label, feature_spec))
    dataset = dataset.repeat(num_epochs)
    return dataset

# Parses Tf.Example protos into features for the input function.
def parse_tf_example(example_proto, label, feature_spec):
    parsed_features = tf.io.parse_example(serialized=example_proto, features=feature_spec)
    target = parsed_features.pop(label)
    return parsed_features, target

# Converts a dataframe into a list of tf.Example protos.
def df_to_examples(df, columns=None):
    examples = []
    if columns == None:
        columns = df.columns.values.tolist()
    for index, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            if df[col].dtype is np.dtype(np.int64):
                example.features.feature[col].int64_list.value.append(int(row[col]))
            elif df[col].dtype is np.dtype(np.float64):
                example.features.feature[col].float_list.value.append(row[col])
            elif row[col] == row[col]:
                example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))
        examples.append(example)
    return examples

# Converts a dataframe column into a column of 0's and 1's based on the provided test.
# Used to force label columns to be numeric for binary classification using a TF estimator.
def make_label_column_numeric(df, label_column, test):
  df[label_column] = np.where(test(df[label_column]), 1, 0)

## 1. The Dataset: Titanic data

#### The Titanic dataset is a classification dataset: it is used for a prediction task where the goal is to determine whether a person survived the 1912 shipwreck of the RMS Titanic. The list of attributes is as follows:

- output variable: **Survival**, 0 = did not survive, 1 = did survive
- input features:
    - **Pclass**: ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
    - **Sex**: male or female
    - **Age**: quantitative continuous variable
    - **Sibsp**: # of siblings / spouses aboard the Titanic
    - **Parch**: # of parents / children aboard the Titanic	
    - **Ticket**: ticket number
    - **Fare**: passenger fare
    - **Cabin**: cabin number
    - **Embarked**: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [16]:
# # OLD CODE (introduced new errors, but fixed type issue)

# # Read training dataset from CSV

# import pandas as pd

# df = pd.read_csv("titanic-train.csv", na_values=['none'])
# df['Age'] = df['Age'].fillna(df['Age'].mean())
# df.fillna('', inplace=True)
# df["Age"] = df["Age"].apply(lambda x: 0 if x == "" else x)
# df["Cabin"] = df["Cabin"].apply(lambda x: "Unknown" if x == "" else x)
# df["Embarked"] = df["Embarked"].apply(lambda x: "Unknown" if x == "" else x)
# df["Pclass"] = df["Pclass"].apply(lambda x: str(x))

# # Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# # the column names, then set this to None.
# csv_columns = [
#   "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
#   "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]

# df_train = df.iloc[:, :445]
# df_test = df.iloc[:, 445:]

# # df["Age"] = pd.Categorical(df["Age"])
# # df["Age"] = df["Age"].cat.codes
# #df["Age"] = df["Age"].astype(float).fillna(0.0)

# df.info()


# Read training dataset from CSV

import pandas as pd

# Set the path to the CSV containing the dataset to train on.
csv_path = "titanic-train.csv"

# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = [
  "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
  "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]

# Read the dataset from the provided CSV and print out information about it.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  892 non-null    object
 1   Survived     892 non-null    object
 2   Pclass       892 non-null    object
 3   Name         892 non-null    object
 4   Sex          892 non-null    object
 5   Age          715 non-null    object
 6   SibSp        892 non-null    object
 7   Parch        892 non-null    object
 8   Ticket       892 non-null    object
 9   Fare         892 non-null    object
 10  Cabin        205 non-null    object
 11  Embarked     890 non-null    object
dtypes: object(12)
memory usage: 83.8+ KB


In [17]:
# Use this cell for any data-related exploration

## 2. The Model: What-If Tool (Post-hoc framework)

#### TODO [ Description: what do we want to say about the tool ]

In [18]:
# Specify input columns and column to predict
import numpy as np

# Set the column in the dataset you wish for the model to predict
label_column = 'Survived'

# Set list of all columns from the dataset we will use for model input.
input_features = [
  "PassengerId", "Pclass", "Name", "Sex", "Age",
  "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]

# Create a list containing all input features and the label column
features_and_labels = input_features + [label_column]

In [19]:
# Convert dataset to tf.Example protos 

examples = df_to_examples(df_train)

In [20]:
# Create and train the classifier 

num_steps = 5000

# Create a feature spec for the classifier
feature_spec = create_feature_spec(df, features_and_labels)

# Define and train the classifier
train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)
classifier = tf.estimator.LinearClassifier(
    feature_columns=create_feature_columns(input_features, feature_spec))
classifier.train(train_inpf, steps=num_steps)



TypeError: in user code:

    File "/Users/davisrule/opt/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/canned/linear.py", line 1668, in call  *
        return self.layer(features)
    File "/Users/davisrule/opt/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/canned/linear.py", line 1496, in call  *
        weighted_sum = fc_v2._create_weighted_sum(  # pylint: disable=protected-access

    TypeError: Failed to convert elements of ('Age', '22', '38', '26', '35', nan, '54', '2', '27', '14', '4', '58', '20', '39', '55', '31', '34', '15', '28', '8', '19', '40', '66', '42', '21', '18', '3', '7', '49', '29', '65', '28.5', '5', '11', '45', '17', '32', '16', '25', '0.83', '30', '33', '23', '24', '46', '59', '71', '37', '47', '14.5', '70.5', '32.5', '12', '9', '36.5', '51', '55.5', '40.5', '44', '1', '61', '56', '50', '36', '45.5', '20.5', '62', '41', '52', '63', '23.5', '0.92', '43', '60', '10', '64', '13', '48', '0.75', '53', '57', '80', '70', '24.5', '6', '0.67', '30.5', '0.42', '34.5', '74') to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


In [21]:
## OLD CODE (introduced new errors)

# # Invoke What-If Tool for test data and the trained model 

# num_datapoints = 445
# tool_height_in_px = 1000 

# from witwidget.notebook.visualization import WitConfigBuilder
# from witwidget.notebook.visualization import WitWidget

# # Load up the test dataset

# # make_label_column_numeric(df_test, label_column, lambda val: val == 'Survived')
# test_examples = df_to_examples(df_test[0:num_datapoints])

# # Setup the tool with the test examples and the trained classifier
# config_builder = WitConfigBuilder(test_examples).set_estimator_and_feature_spec(
#     classifier, feature_spec).set_label_vocab(['Did not survive', 'Survived'])
# WitWidget(config_builder, height=tool_height_in_px)


# Invoke What-If Tool for test data and the trained model 

num_datapoints = 2000 
tool_height_in_px = 1000 

from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget

# Load up the test dataset
test_csv_path = "titanic-test.csv"
test_df = pd.read_csv(test_csv_path, names=csv_columns, skipinitialspace=True,
  skiprows=1)
test_examples = df_to_examples(test_df[0:num_datapoints])

# Setup the tool with the test examples and the trained classifier
config_builder = WitConfigBuilder(test_examples).set_estimator_and_feature_spec(
    classifier, feature_spec).set_label_vocab(['Did not survive', 'Survived'])
WitWidget(config_builder, height=tool_height_in_px)

WitWidget(config={'model_type': 'classification', 'label_vocab': ['Did not survive', 'Survived'], 'are_sequenc…