Merge pull request #2 from conversationai/master
Pulling back latest edits in fork
fprost committed Jul 19, 2018
2 parents 21ae5eb + 361457c commit 118c6d4
Showing 10 changed files with 1,392 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ This repository contains example code to train machine learning models for te
 
 # Outline of the codebase
 
-* `experiments/` contains out ML training framework.
+* `experiments/` contains the ML training framework.
 * `annotator-models/` contains a Dawid-Skene implementation for modelling rater quality to produce better annotations.
 * `attention-colab/` contains an introductory ipython notebook for RNNs with attention, as presented at Devoxx talk ["Tensorflow, deep learning and modern RNN architectures, without a PhD by Martin Gorner"](https://www.youtube.com/watch?v=pzOzmxCR37I)
 * `kaggle-classification/` early experiments with Keras and Estimator for training on [the Jigsaw Toxicity Kaggle competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). Will be superseded by `experiments/` shortly.
10 changes: 9 additions & 1 deletion experiments/tf_trainer/common/model_trainer.py
@@ -33,6 +33,8 @@
                            'Name of comet team that tracks results.')
 tf.app.flags.DEFINE_string('comet_project_name', None,
                            'Name of comet project that tracks results.')
+tf.app.flags.DEFINE_bool('enable_profiling', False,
+                         'Enable profiler hook in estimator.')
 
 tf.app.flags.mark_flag_as_required('train_path')
 tf.app.flags.mark_flag_as_required('validate_path')
@@ -94,8 +96,14 @@ def train_with_eval(self, steps, eval_period, eval_steps):
     num_itr = int(steps / eval_period)
 
     for _ in range(num_itr):
+      hooks = None
+      if FLAGS.enable_profiling:
+        hooks = [tf.train.ProfilerHook(save_steps=10,
+            output_dir=os.path.join(self._model_dir(), 'profiler'))]
       self._estimator.train(
-          input_fn=self._dataset.train_input_fn, steps=eval_period)
+          input_fn=self._dataset.train_input_fn,
+          steps=eval_period,
+          hooks=hooks)
       metrics = self._estimator.evaluate(
           input_fn=self._dataset.validate_input_fn, steps=eval_steps)
       if experiment is not None:
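
For context, enable_profiling wires tf.train.ProfilerHook into each training leg of the train/eval loop. A minimal standalone sketch of the same pattern, assuming an illustrative estimator, input function, and directory layout rather than the repo's own:

    import os
    import tensorflow as tf

    def train_leg(estimator, train_input_fn, model_dir,
                  eval_period=50, enable_profiling=False):
      # Same shape as the diff: hooks stays None unless profiling is on.
      hooks = None
      if enable_profiling:
        # ProfilerHook writes a timeline trace every 10 steps; the
        # resulting files can be inspected in chrome://tracing.
        hooks = [tf.train.ProfilerHook(
            save_steps=10,
            output_dir=os.path.join(model_dir, 'profiler'))]
      estimator.train(input_fn=train_input_fn,
                      steps=eval_period,
                      hooks=hooks)

A run launched with --enable_profiling would then leave its traces under <model_dir>/profiler.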
39 changes: 37 additions & 2 deletions experiments/tf_trainer/common/serving_input.py
@@ -7,9 +7,16 @@
 import tensorflow as tf
 from tensorflow.python.ops import array_ops
 
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_string("serving_format", "TFRECORDS",
+                           "Format of inputs in inference."
+                           " Can be either JSON or TFRECORDS.")
+
+
 def create_serving_input_fn(feature_preprocessor_init, text_feature_name, key_name):
 
-  def serving_input_fn():
+  def serving_input_fn_json():
     features_placeholders = {}
     features_placeholders[text_feature_name] = array_ops.placeholder(
         dtype=tf.string, name=text_feature_name)
@@ -26,4 +33,32 @@ def serving_input_fn():
         features,
         features_placeholders)
 
-  return serving_input_fn
+  def serving_input_fn_tfrecords():
+    serialized_example = tf.placeholder(
+        shape=[None],
+        dtype=tf.string,
+        name="input_example_tensor")
+    feature_spec = {
+        text_feature_name: tf.FixedLenFeature([], dtype=tf.string),
+        key_name: tf.FixedLenFeature([], dtype=tf.int64)
+    }
+
+    features = tf.parse_example(serialized_example, feature_spec)
+    feature_preprocessor = feature_preprocessor_init()
+    features[text_feature_name] = feature_preprocessor(
+        features[text_feature_name])
+
+    return tf.estimator.export.ServingInputReceiver(
+        features,
+        serialized_example)
+
+  if FLAGS.serving_format == 'TFRECORDS':
+    return serving_input_fn_tfrecords
+  elif FLAGS.serving_format == 'JSON':
+    return serving_input_fn_json
+  else:
+    raise ValueError('Serving format not implemented.'
+                     ' Should be one of ["JSON", "TFRECORDS"].')
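
With this change, the export's request format is chosen at export time by --serving_format. For the TFRECORDS path, a client sends serialized tf.train.Example protos into the input_example_tensor placeholder. A sketch of building one such request, using illustrative feature names ('comment_text', 'comment_key') in place of whatever text_feature_name and key_name are bound to at export time:

    import tensorflow as tf

    # One serialized tf.Example matching the TFRECORDS feature_spec:
    # a scalar string text feature plus an int64 key.
    example = tf.train.Example(features=tf.train.Features(feature={
        'comment_text': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b'you are great'])),
        'comment_key': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[12345])),
    }))
    serialized = example.SerializeToString()

    # The placeholder has shape [None], so a request carries a batch of
    # serialized examples bound to the 'input_example_tensor' input.
    request_inputs = {'input_example_tensor': [serialized]}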
15 changes: 6 additions & 9 deletions experiments/tf_trainer/common/text_preprocessor.py
@@ -54,23 +54,20 @@ def _tokenize_tensor_op(text: types.Tensor) -> types.Tensor:
     '''Converts a string Tensor to an array of integers.
     Args:
-      text: must be a scalar string tensor (rank 0).
+      text: must be a 1-D string Tensor.
     Returns:
-      A 1-D Tensor of word integers.
+      A 2-D Tensor of word integers.
     '''
 
     # TODO: Improve tokenizer.
     # TODO: Ensure utf-8 encoding. Currently the string is parsed with default encoding (unclear).
-    words = tf.string_split([text])
+    words = tf.string_split(text)
     words_int_sparse = vocabulary_table.lookup(words)
-    words_int_dense = tf.sparse_to_dense(
-        words_int_sparse.indices,
-        words_int_sparse.dense_shape,
-        words_int_sparse.values,
+    words_int_dense = tf.sparse_tensor_to_dense(
+        words_int_sparse,
         default_value=0)
 
-    return tf.squeeze(words_int_dense)
+    return words_int_dense
 
   return _tokenize_tensor_op
 
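
The tokenizer change is about shapes: without the [text] wrapping, tf.string_split consumes a 1-D batch of strings, and the op now returns a 2-D padded id matrix instead of squeezing back down to 1-D. A toy version of the same three ops, with a made-up vocabulary standing in for the embedding-derived table:

    import tensorflow as tf

    # Illustrative 4-word vocabulary; id 0 doubles as padding/unknown.
    vocabulary_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(['<pad>', 'you', 'are', 'great']), default_value=0)

    text = tf.constant(['you are great', 'you are'])   # 1-D string batch
    words = tf.string_split(text)                      # SparseTensor of tokens
    words_int_sparse = vocabulary_table.lookup(words)  # SparseTensor of ids
    words_int_dense = tf.sparse_tensor_to_dense(words_int_sparse,
                                                default_value=0)

    with tf.Session() as sess:
      sess.run(tf.tables_initializer())
      print(sess.run(words_int_dense))  # [[1 2 3]
                                        #  [1 2 0]]  <- padded to max length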
4 changes: 3 additions & 1 deletion experiments/tf_trainer/common/tfrecord_input.py
@@ -87,7 +87,9 @@ def _read_tf_example(self,
 
     text = parsed[self._text_feature]
     # I think this could be a feature column, but feature columns seem so beta.
-    preprocessed_text = feature_preprocessor(text)
+    expanded_text = tf.expand_dims(text, 0)
+    preprocessed_text = tf.squeeze(
+        feature_preprocessor(expanded_text))
     features = {self._text_feature: preprocessed_text}
     if self._round_labels:
       labels = {label: tf.round(parsed[label]) for label in self._labels}
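
This is the consumer side of the shape change above: _read_tf_example still handles one record at a time, so the scalar text is wrapped into a batch of one before the now-batched preprocessor, and the batch dimension is stripped afterwards. In isolation, with a hash-bucket stand-in for the real tokenizer:

    import tensorflow as tf

    def toy_preprocessor(texts):
      # Stand-in for the batched tokenizer: 1-D string batch in,
      # 2-D id matrix out (hash buckets instead of a vocabulary).
      tokens = tf.string_split(texts)
      ids = tf.SparseTensor(tokens.indices,
                            tf.string_to_hash_bucket_fast(tokens.values, 1000),
                            tokens.dense_shape)
      return tf.sparse_tensor_to_dense(ids)

    text = tf.constant('you are great')      # scalar, as parsed per record
    expanded_text = tf.expand_dims(text, 0)  # shape [] -> [1]
    preprocessed_text = tf.squeeze(          # shape [1, n] -> [n]
        toy_preprocessor(expanded_text))

    with tf.Session() as sess:
      print(sess.run(preprocessed_text).shape)  # (3,)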
4 changes: 2 additions & 2 deletions experiments/tf_trainer/keras_cnn/run.py
@@ -48,14 +48,14 @@ def main(argv):
 
   preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
   nltk.download("punkt")
-  tokenize_op = preprocessor.tokenize_tensor_op(nltk.word_tokenize)
+  tokenize_op_init = lambda: preprocessor.tokenize_tensor_op_py_func(nltk.word_tokenize)
 
   dataset = tfrecord_input.TFRecordInput(
       train_path=FLAGS.train_path,
       validate_path=FLAGS.validate_path,
       text_feature=text_feature_name,
       labels=LABELS,
-      feature_preprocessor=tokenize_op,
+      feature_preprocessor_init=tokenize_op_init,
       batch_size=FLAGS.batch_size)
 
   # TODO: Move embedding *into* Keras model.
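
The switch from a prebuilt tokenize_op to a zero-argument tokenize_op_init matters because tf.estimator builds a separate tf.Graph for each train/evaluate/export call: an op (and any lookup table behind it) created up front would belong to the wrong graph. Passing a factory lets each input_fn construct its own copy. A sketch of the pattern, with illustrative names:

    import tensorflow as tf

    def tokenize_op_init():
      # Called once per graph, from inside the owning input_fn, so the
      # lookup table lands in the graph the Estimator actually runs.
      table = tf.contrib.lookup.index_table_from_tensor(
          tf.constant(['<pad>', 'hello', 'world']), default_value=0)
      def tokenize(texts):
        return tf.sparse_tensor_to_dense(
            table.lookup(tf.string_split(texts)), default_value=0)
      return tokenize

    def train_input_fn():
      preprocessor = tokenize_op_init()  # built inside this graph
      features = {'text': preprocessor(tf.constant(['hello world']))}
      labels = {'toxicity': tf.constant([[1.0]])}
      return features, labels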
6 changes: 2 additions & 4 deletions experiments/tf_trainer/keras_gru_attention/run.deploy.sh
@@ -4,8 +4,7 @@
 # Edit these!
 MODEL_NAME=keras_gru_attention
 # By default, the model is the last one from the user.
-MODEL_SAVED_PATH_FOLDER=$(gsutil ls gs://kaggle-model-experiments/tf_trainer_runs/${USER}/${MODEL_NAME}/)
-MODEL_SAVED_PATH=${MODEL_SAVED_PATH_FOLDER}model_dir
+MODEL_SAVED_PATH=$(gsutil ls gs://kaggle-model-experiments/tf_trainer_runs/${USER}/${MODEL_NAME}/ | tail -1)
 
 # Create a new model.
 # Will raise an error if the model already exists.
@@ -17,5 +16,4 @@ MODEL_VERSION=v_$(date +"%Y%m%d_%H%M%S")
 gcloud ml-engine versions create $MODEL_VERSION \
   --model $MODEL_NAME \
   --origin $MODEL_SAVED_PATH \
-  --runtime-version 1.8
-
+  --runtime-version 1.8
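
One note on the simplified path lookup: gsutil ls prints matches in lexicographic order, so tail -1 selects the most recent export only on the assumption that run directories are named with sortable timestamps, as the MODEL_VERSION convention above (v_YYYYMMDD_HHMMSS) suggests.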
2 changes: 1 addition & 1 deletion experiments/tf_trainer/keras_gru_attention/run.py
@@ -32,7 +32,7 @@
                            "required for serving.")
 tf.app.flags.DEFINE_integer("batch_size", 64,
                             "The batch size to use during training.")
-tf.app.flags.DEFINE_integer("train_steps", 100,
+tf.app.flags.DEFINE_integer("train_steps", 1000,
                             "The number of steps to train for.")
 tf.app.flags.DEFINE_integer("eval_period", 50,
                             "The number of steps per eval period.")