# Boosted Tree Model

This end-to-end walkthrough trains a logistic regression model using the tf.estimator API followed by saving the model and then re-loading it with a different name for predictions. See TensorFlow's example tutorial at:

https://www.tensorflow.org/tutorials/estimator/boosted_trees

**TF 2.0.x Only**

We can only predict real-valued outputs due to version constraints with the on-device build. You must run this notebook with TF 2.0.x.

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output

In [3]:
print (tf.__version__)

2.0.3


# Versions

2.0: Does not allow multi-classification on BoostedTrees

2.2: Allows mutli-class, but issues training the model with the current setup

2.4: Works! However, may NOT work on the TensorIO

**Load the dataset**

We will use the Titanic dataset with the (rather morbid) goal of predicting passenger survival, given characteristics such as gender, age, class, etc.

In [4]:
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')

In [5]:
LABEL = "fare" # for now let label stay as a string... other issues if we also convert this into class #s

for col in dftrain.columns:
    if dftrain[col].dtype == "object" and col != LABEL:
        dftrain[col] = dftrain[col].astype('category').cat.codes
        dfeval[col] = dfeval[col].astype('category').cat.codes        

In [6]:
dftrain.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,1,22.0,1,0,7.25,2,7,2,0
1,1,0,38.0,1,0,71.2833,0,2,0,0
2,1,0,26.0,0,0,7.925,2,7,2,1
3,1,0,35.0,1,0,53.1,0,2,2,0
4,0,1,28.0,0,0,8.4583,2,7,1,1


In [7]:
dfeval.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,1,35.0,0,0,8.05,2,6,2,1
1,0,1,54.0,0,0,51.8625,0,4,2,1
2,1,0,58.0,0,0,26.55,0,2,2,1
3,1,0,55.0,0,0,16.0,1,6,2,1
4,1,1,34.0,0,0,13.0,1,3,2,1


In [8]:
dftrain.dtypes

survived                int64
sex                      int8
age                   float64
n_siblings_spouses      int64
parch                   int64
fare                  float64
class                    int8
deck                     int8
embark_town              int8
alone                    int8
dtype: object

In [9]:
print(dftrain[LABEL])

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.4583
        ...   
622    10.5000
623     7.0500
624    30.0000
625    23.4500
626     7.7500
Name: fare, Length: 627, dtype: float64


**Feature Engineering for the Model**

Estimators use a system called feature columns to describe how the model should interpret each of the raw input features. An Estimator expects a vector of numeric inputs, and feature columns describe how the model should convert each feature.

In [10]:
feature_columns = []
NUMERIC_COLUMNS = ['age']
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 
                       'survived', 'alone', 'embark_town']

def one_hot_cat_column(feature_name, vocab):
    return tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(feature_name,vocab))

for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = dftrain[feature_name].unique()
#     feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [11]:
# test encoding
example = dict(dftrain.head(1))
class_fc = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('class', [0, 1, 2]))
print('Feature value: "{}"'.format(example['class'].iloc[0]))
print('One-hot encoded: ', tf.keras.layers.DenseFeatures([class_fc])(example).numpy())

Feature value: "2"
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
One-hot encoded:  [[0. 0. 1.]]


The below two functions are responsible for feeding the data to the model for training and evaluation respectively.

In [12]:
def make_train_input_fn(df, num_epochs):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = df,
        y = df[LABEL],
        batch_size = 128,
        num_epochs = num_epochs,
        shuffle = True,
        queue_capacity = 1000
    )

In [13]:
def make_prediction_input_fn(df):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = df,
        y = None,
        batch_size = 128,
        shuffle = False,
        queue_capacity = 1000
    )

## Train the model

In [14]:
# Instantiate the pre-made estimator
n_batches = 1
model = tf.estimator.BoostedTreesRegressor(feature_columns, 
                                            n_batches_per_layer=n_batches,
                                            pruning_mode='none')

# Train the model
model.train(make_train_input_fn(dftrain, num_epochs=10)), # max_steps=100)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/3k/znx7sn113fg2z9z5_0mzfkn80000gn/T/tmp105zuege', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1372f4850>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions fo

(<tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor at 0x1371c7310>,)

**Predict on evaluation dataset**

In [15]:
predDicts = list(model.predict(make_prediction_input_fn(dfeval)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/3k/znx7sn113fg2z9z5_0mzfkn80000gn/T/tmp105zuege/model.ckpt-48
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Let's see first 10 predictions

In [16]:
preds = []
for pred in predDicts[:10]:
    # preds.append(np.argmax(pred["probabilities"]))
    preds.extend(pred["predictions"])
preds

[10.168745,
 37.892624,
 92.88942,
 10.930451,
 11.129454,
 10.585346,
 32.096348,
 9.020223,
 24.899055,
 9.352524]

In [17]:
print (predDicts[0])

{'predictions': array([10.168745], dtype=float32)}


**Save the model**

In [19]:
# tf.compat.v1.disable_eager_execution()

SHAPE = [None]

def serving_input_receiver_fn():
    x = {
        'survived': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='survived'),
        'sex': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='sex'),
        'age': tf.compat.v1.placeholder(tf.float32, shape=SHAPE, name='age'),
        'n_siblings_spouses': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='n_siblings_spouses'),
        'parch': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='parch'),
        'class': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='class'),
        'deck': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='deck'),
        'embark_town': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='embark_town'),
        'alone': tf.compat.v1.placeholder(tf.int64, shape=SHAPE, name='alone')
    }
    
    return tf.estimator.export.ServingInputReceiver(x, x)

inputFn = serving_input_receiver_fn

In [20]:
OUTDIR = 'saved_graph'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
modelPath = model.export_saved_model(OUTDIR, inputFn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Regression input must be a single string Tensor; got {'survived': <tf.Tensor 'survived:0' shape=(None,) dtype=int64>, 'sex': <tf.Tensor 'sex:0' shape=(None,) dtype=int64>, 'age': <tf.Tensor 'age:0' shape=(None,) dtype=float32>, 'n_siblings_spouses': <tf.Tens

## Test Prediction

We will use different model name just to be sure that we are not using the existing model.

In [23]:
savedModelPath = modelPath
importedModel = tf.saved_model.load(savedModelPath)

In [24]:
importedModel.signatures['predict']

<tensorflow.python.eager.wrap_function.WrappedFunction at 0x137a2b850>

**Predict using imported model**

To predict on an unseen data set using loaded estimator model, we will have to follow following small steps:


<ol>

<li>Loop through the whole dataset rows.

<ol>
<li>Create tf.train.Example() object. This object will be responsible for passing our data to the model for prediction.</li>
<li>Loop through all the columns and based on the datatype of the column add that column value to the example object using the appropriate type out of bytes_list, float_list, int64_list. More info about these types here: https://www.tensorflow.org/tutorials/load_data/tfrecord</li>
<li>Predict using this example object and the imported model. Note that this example object will serve the same purpose as passing a single row to a sklearn model for prediction.</li>
</ol>
</li>
</ol>

Below is the implementation of the same.

In [25]:
# Deleting the label column from dfeval since we will be passing the 
# dataset itself instead of a function which does it for us.

dfeval.drop(columns=[LABEL], inplace=True)

In [26]:
dfeval.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,class,deck,embark_town,alone
0,0,1,35.0,0,0,2,6,2,1
1,0,1,54.0,0,0,0,4,2,1
2,1,0,58.0,0,0,0,2,2,1
3,1,0,55.0,0,0,1,6,2,1
4,1,1,34.0,0,0,1,3,2,1


In [27]:
def predict(dfeval, importedModel):
    colNames = dfeval.columns
    dtypes = dfeval.dtypes
    predictions = []
    for row in dfeval.iterrows():
        example = tf.train.Example()
        for i in range(len(colNames)):
            dtype = dtypes[i]
            colName = colNames[i]
            value = row[1][colName].astype(dtype)
            if dtype == "object":
                value = bytes(value, "utf-8")
                example.features.feature[colName].bytes_list.value.extend(
                    [value])
            elif dtype == "float":
                example.features.feature[colName].float_list.value.extend(
                    [value])
            elif dtype == "int":
                example.features.feature[colName].int64_list.value.extend(
                    [value])
        # print (tf.constant([example.SerializeToString()]))
                
        predictions.append(
            importedModel.signatures["predict"](
                examples=tf.constant([example.SerializeToString()])
                )
        )
            
    return predictions

In [28]:
individual_ex = {
    'survived': tf.constant([0], dtype=tf.int64),
    'sex': tf.constant([1], dtype=tf.int64),
    'age': tf.constant([35.1], dtype=tf.float32),
    'n_siblings_spouses': tf.constant([0], dtype=tf.int64),
    'parch': tf.constant([0], dtype=tf.int64),
    'class': tf.constant([2], dtype=tf.int64),
    'deck': tf.constant([6], dtype=tf.int64),
    'embark_town': tf.constant([2], dtype=tf.int64),
    'alone': tf.constant([1], dtype=tf.int64),
}

predict_fn = importedModel.signatures['predict']
ps = predict_fn(**individual_ex)
ps['predictions'].numpy()[0][0]

10.168745