In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import functools
import io
import re

import numpy as np
import pandas as pd

import tensorflow as tf

In [3]:
# graph-level random seed for ops created on the current default graph
tf.set_random_seed(1)
# only show TensorFlow log messages at ERROR level or above
tf.logging.set_verbosity(tf.logging.ERROR)

### Load data

In [4]:
# download data: each TFRecord file is stored under the last path component of its URL
train_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/train.tfrecord'
test_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/test.tfrecord'
train_path, test_path = (
    tf.keras.utils.get_file(url.rsplit('/', 1)[-1], url)
    for url in (train_url, test_url)
)

In [5]:
# load data
def parse_fn(record):
    """Decode one serialized example into a ``(features, labels)`` pair.

    Args:
        record: a serialized example, as passed to ``tf.parse_single_example``.

    Returns:
        A tuple ``({'terms': <string values>}, <int32 labels of shape [1]>)``.
    """
    feature_spec = {
        "terms": tf.VarLenFeature(dtype=tf.string),
        "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32)
    }
    example = tf.parse_single_example(record, feature_spec)

    # the variable-length feature is parsed as a sparse tensor; keep only its values
    term_values = example['terms'].values
    # labels are stored as float32 in the file; the model consumes them as int32
    int_labels = tf.cast(example['labels'], tf.int32)

    return {'terms': term_values}, int_labels

def input_fn(input_filenames, num_epochs=None, batch_size=25, shuffle=True):
    """Build the input pipeline and return the tensors for the next batch.

    Args:
        input_filenames: TFRecord file name(s) handed to ``tf.data.TFRecordDataset``.
        num_epochs: value passed to ``Dataset.repeat``; the default is ``None``.
        batch_size: number of examples per padded batch.
        shuffle: when true, shuffle examples with a 100000-element buffer
            before batching.

    Returns:
        The ``get_next()`` result of a one-shot iterator over the dataset,
        i.e. a ``(features, labels)`` structure as produced by ``parse_fn``.
    """
    dataset = tf.data.TFRecordDataset(input_filenames).map(parse_fn)

    if shuffle:
        dataset = dataset.shuffle(100000)

    # 'terms' has a different length per example, so batches are padded
    # using the dataset's own (partially unknown) output shapes
    batched = dataset.padded_batch(batch_size, dataset.output_shapes)
    repeated = batched.repeat(num_epochs)

    iterator = repeated.make_one_shot_iterator()
    return iterator.get_next()

In [6]:
# download the vocabulary file.
terms_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/terms.txt'
terms_path = tf.keras.utils.get_file(terms_url.split('/')[-1], terms_url)

# De-duplicate the whitespace-separated terms and put them in a deterministic
# order. A term's position in this list is the id the vocabulary feature column
# assigns to it (and therefore its embedding row). Iterating a bare set of
# strings gives an order that can change between interpreter sessions (string
# hash randomization), which would make runs non-reproducible and would pair a
# restored checkpoint with a differently ordered vocabulary.
with io.open(terms_path, 'r', encoding='utf8') as f:
    vocab = sorted(set(f.read().split()))

### Build model with classifier

In [7]:
# set up feature columns: vocabulary lookup on 'terms', then a 10-dimensional embedding
terms_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key='terms',
    vocabulary_list=vocab)
terms_embedding_column = tf.feature_column.embedding_column(
    terms_feature_column,
    dimension=10)

# the model function reads this module-level list
feature_columns = [terms_embedding_column]

# create estimator spec
def make_model(features, labels, mode):
    """Model function for the Estimator: embedding -> dense layers -> 2-way logits.

    Args:
        features: feature dict consumed through the module-level ``feature_columns``.
        labels: class-index labels fed to the sparse softmax cross-entropy loss
            and to the evaluation metrics; not used in PREDICT mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode:
        predictions and export outputs for PREDICT, loss and train op for
        TRAIN, and loss plus accuracy/AUC metrics otherwise.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # network body; dropout is only active while training
    embedded = tf.feature_column.input_layer(features, feature_columns)
    hidden = tf.layers.dense(embedded, units=10, activation=tf.nn.leaky_relu)
    hidden = tf.layers.dropout(hidden, rate=0.3, training=is_training)
    hidden = tf.layers.dense(hidden, units=10)
    logits = tf.layers.dense(hidden, 2)

    # predictions derived from the logits
    class_ids = tf.argmax(logits, 1)
    class_probs = tf.nn.softmax(logits)

    # PREDICT: no labels are available, so return before building the loss
    if mode == tf.estimator.ModeKeys.PREDICT:
        outputs = {
            'class': class_ids,
            'prob': class_probs
        }
        signatures = {
            'predict': tf.estimator.export.PredictOutput(outputs=outputs)
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=outputs,
                                          export_outputs=signatures)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # EVAL: report accuracy and AUC (AUC uses the probability of class 1)
    if not is_training:
        metrics = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=class_ids),
            'auc': tf.metrics.auc(labels=labels, predictions=class_probs[:, 1])
        }
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    # TRAIN: Adagrad with a cosine-annealed learning rate and gradient-norm clipping
    step = tf.train.get_global_step()
    annealed_rate = tf.train.cosine_decay(learning_rate=0.2, global_step=step, alpha=0.01, decay_steps=10000)
    clipped_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.AdagradOptimizer(learning_rate=annealed_rate), 5.0)
    train_op = clipped_optimizer.minimize(loss, global_step=step)
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)


# create estimator
# The Estimator builds its own graph for train/evaluate/export and seeds it
# from RunConfig.tf_random_seed, so a seed set on the notebook's default graph
# does not reach weight initialization, dropout or input shuffling. Pass the
# seed through the run config to make training repeatable.
classifier = tf.estimator.Estimator(model_fn=make_model,
                                    config=tf.estimator.RunConfig(tf_random_seed=1))

In [8]:
# train
def train_input_fn():
    """Input function for training: 10 passes over the training file."""
    return input_fn([train_path], num_epochs=10)


classifier.train(input_fn=train_input_fn)

<tensorflow.python.estimator.estimator.Estimator at 0x7f73fc899cc0>

### Evaluate classifier

In [9]:
# Evaluate with a single pass over each split. The metrics are aggregated over
# the whole pass, so example order does not matter: skip shuffling to avoid
# filling the 100000-element shuffle buffer for nothing.
train_metrics = classifier.evaluate(
    input_fn=lambda: input_fn([train_path], num_epochs=1, shuffle=False))
test_metrics = classifier.evaluate(
    input_fn=lambda: input_fn([test_path], num_epochs=1, shuffle=False))

# one row per metric, one column per split
train_stats = pd.DataFrame.from_dict(train_metrics, orient='index', columns=['train'])
test_stats = pd.DataFrame.from_dict(test_metrics, orient='index', columns=['test'])
stats = train_stats.join(test_stats)
stats

Unnamed: 0,train,test
accuracy,0.92372,0.86764
auc,0.973638,0.941962
loss,0.205227,0.335154
global_step,10000.0,10000.0


### Export classifier

In [10]:
def serving_input_receiver_fn():
    """Build the serving-time input: raw review strings -> padded term matrix.

    Returns:
        A ``tf.estimator.export.ServingInputReceiver`` that accepts a batch of
        review strings under the key ``'reviews'`` and feeds the model a dense
        string tensor under the feature key ``'terms'``.
    """
    # shape=[None] declares a 1-D batch of strings. (Writing shape=(None) is
    # just shape=None, i.e. unknown rank, which leaves the exported signature
    # without the rank that tf.string_split needs.)
    reviews = tf.placeholder(dtype=tf.string, shape=[None], name='reviews')

    # NOTE: tf.py_func is NOT serialized in the GraphDef, so Python-side
    #       tokenization (e.g. a regex that splits off punctuation) cannot be
    #       used in a serving graph. Reviews are therefore only split on
    #       whitespace here; a graph-native way to do the remaining word
    #       pre-processing is still needed.
    # Rows shorter than the longest review are padded with ''.
    terms = tf.sparse_tensor_to_dense(tf.string_split(reviews), default_value='')
    return tf.estimator.export.ServingInputReceiver({'terms': terms}, {'reviews': reviews})


# Export a SavedModel under ./exports. The returned path is a bytes object,
# so decode it to str before it is interpolated into the shell command below.
export_path = classifier.export_savedmodel(
    export_dir_base='exports',
    serving_input_receiver_fn=serving_input_receiver_fn
).decode('utf-8')

In [11]:
# Smoke-test the exported model's 'predict' signature on two sample reviews; `!!` captures the CLI output as a list of lines.
!!saved_model_cli run --input_exprs 'reviews=["this is a terrible movie", "this is a great movie"]'  \
--dir $export_path --tag_set serve --signature_def predict

['2018-08-31 18:18:10.559376: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA',
 'Result for output key class:',
 '[0 1]',
 'Result for output key prob:',
 '[[1.0000000e+00 1.1786348e-10]',
 ' [3.6376296e-15 1.0000000e+00]]']