Merge pull request #2 from conversationai/master
Pulling back latest edits in fork
fprost committed Jul 19, 2018
2 parents 21ae5eb + 361457c commit 118c6d4
Showing 10 changed files with 1,392 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ This repository contains example code to train machine learning models for te
 
 # Outline of the codebase
 
-* `experiments/` contains out ML training framework.
+* `experiments/` contains the ML training framework.
 * `annotator-models/` contains a Dawid-Skene implementation for modelling rater quality to produce better annotations.
 * `attention-colab/` contains an introductory ipython notebook for RNNs with attention, as presented at Devoxx talk ["Tensorflow, deep learning and modern RNN architectures, without a PhD by Martin Gorner"](https://www.youtube.com/watch?v=pzOzmxCR37I)
 * `kaggle-classification/` early experiments with Keras and Estimator for training on [the Jigsaw Toxicity Kaggle competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). Will be superseded by `experiments/` shortly.
10 changes: 9 additions & 1 deletion experiments/tf_trainer/common/model_trainer.py
@@ -33,6 +33,8 @@
                            'Name of comet team that tracks results.')
 tf.app.flags.DEFINE_string('comet_project_name', None,
                            'Name of comet project that tracks results.')
+tf.app.flags.DEFINE_bool('enable_profiling', False,
+                         'Enable profiler hook in estimator.')
 
 tf.app.flags.mark_flag_as_required('train_path')
 tf.app.flags.mark_flag_as_required('validate_path')
@@ -94,8 +96,14 @@ def train_with_eval(self, steps, eval_period, eval_steps):
     num_itr = int(steps / eval_period)
 
     for _ in range(num_itr):
+      hooks = None
+      if FLAGS.enable_profiling:
+        hooks = [tf.train.ProfilerHook(save_steps=10,
+            output_dir=os.path.join(self._model_dir(), 'profiler'))]
       self._estimator.train(
-          input_fn=self._dataset.train_input_fn, steps=eval_period)
+          input_fn=self._dataset.train_input_fn,
+          steps=eval_period,
+          hooks=hooks)
       metrics = self._estimator.evaluate(
           input_fn=self._dataset.validate_input_fn, steps=eval_steps)
       if experiment is not None:
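
For context, enable_profiling wires tf.train.ProfilerHook into each training leg of the train/eval loop. A minimal standalone sketch of the same pattern, assuming an illustrative estimator, input function, and directory layout rather than the repo's own:

    import os
    import tensorflow as tf

    def train_leg(estimator, train_input_fn, model_dir,
                  eval_period=50, enable_profiling=False):
      # Same shape as the diff: hooks stays None unless profiling is on.
      hooks = None
      if enable_profiling:
        # ProfilerHook writes a timeline trace every 10 steps; the
        # resulting files can be inspected in chrome://tracing.
        hooks = [tf.train.ProfilerHook(
            save_steps=10,
            output_dir=os.path.join(model_dir, 'profiler'))]
      estimator.train(input_fn=train_input_fn,
                      steps=eval_period,
                      hooks=hooks)

A run launched with --enable_profiling would then leave its traces under <model_dir>/profiler.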
39 changes: 37 additions & 2 deletions experiments/tf_trainer/common/serving_input.py
@@ -7,9 +7,16 @@
 import tensorflow as tf
 from tensorflow.python.ops import array_ops
 
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_string("serving_format", "TFRECORDS",
+                           "Format of inputs in inference."
+                           " Can be either JSON or TFRECORDS.")
+
+
 def create_serving_input_fn(feature_preprocessor_init, text_feature_name, key_name):
 
-  def serving_input_fn():
+  def serving_input_fn_json():
     features_placeholders = {}
     features_placeholders[text_feature_name] = array_ops.placeholder(
         dtype=tf.string, name=text_feature_name)
@@ -26,4 +33,32 @@ def serving_input_fn():
         features,
         features_placeholders)
 
-  return serving_input_fn
+  def serving_input_fn_tfrecords():
+    serialized_example = tf.placeholder(
+        shape=[None],
+        dtype=tf.string,
+        name="input_example_tensor")
+    feature_spec = {
+        text_feature_name: tf.FixedLenFeature([], dtype=tf.string),
+        key_name: tf.FixedLenFeature([], dtype=tf.int64)
+    }
+
+    features = tf.parse_example(serialized_example, feature_spec)
+    feature_preprocessor = feature_preprocessor_init()
+    features[text_feature_name] = feature_preprocessor(
+        features[text_feature_name])
+
+    return tf.estimator.export.ServingInputReceiver(
+        features,
+        serialized_example)
+
+  if FLAGS.serving_format == 'TFRECORDS':
+    return serving_input_fn_tfrecords
+  elif FLAGS.serving_format == 'JSON':
+    return serving_input_fn_json
+  else:
+    raise ValueError('Serving format not implemented.'
+                     ' Should be one of ["JSON", "TFRECORDS"].')
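
With this change, the export's request format is chosen at export time by --serving_format. For the TFRECORDS path, a client sends serialized tf.train.Example protos into the input_example_tensor placeholder. A sketch of building one such request, using illustrative feature names ('comment_text', 'comment_key') in place of whatever text_feature_name and key_name are bound to at export time:

    import tensorflow as tf

    # One serialized tf.Example matching the TFRECORDS feature_spec:
    # a scalar string text feature plus an int64 key.
    example = tf.train.Example(features=tf.train.Features(feature={
        'comment_text': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b'you are great'])),
        'comment_key': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[12345])),
    }))
    serialized = example.SerializeToString()

    # The placeholder has shape [None], so a request carries a batch of
    # serialized examples bound to the 'input_example_tensor' input.
    request_inputs = {'input_example_tensor': [serialized]}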
15 changes: 6 additions & 9 deletions experiments/tf_trainer/common/text_preprocessor.py
@@ -54,23 +54,20 @@ def _tokenize_tensor_op(text: types.Tensor) -> types.Tensor:
     '''Converts a string Tensor to an array of integers.
     Args:
-      text: must be a scalar string tensor (rank 0).
+      text: must be a 1-D string Tensor.
     Returns:
-      A 1-D Tensor of word integers.
+      A 2-D Tensor of word integers.
     '''
 
     # TODO: Improve tokenizer.
     # TODO: Ensure utf-8 encoding. Currently the string is parsed with default encoding (unclear).
-    words = tf.string_split([text])
+    words = tf.string_split(text)
     words_int_sparse = vocabulary_table.lookup(words)
-    words_int_dense = tf.sparse_to_dense(
-        words_int_sparse.indices,
-        words_int_sparse.dense_shape,
-        words_int_sparse.values,
+    words_int_dense = tf.sparse_tensor_to_dense(
+        words_int_sparse,
         default_value=0)
 
-    return tf.squeeze(words_int_dense)
+    return words_int_dense
 
   return _tokenize_tensor_op
 
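
The tokenizer change is about shapes: without the [text] wrapping, tf.string_split consumes a 1-D batch of strings, and the op now returns a 2-D padded id matrix instead of squeezing back down to 1-D. A toy version of the same three ops, with a made-up vocabulary standing in for the embedding-derived table:

    import tensorflow as tf

    # Illustrative 4-word vocabulary; id 0 doubles as padding/unknown.
    vocabulary_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(['<pad>', 'you', 'are', 'great']), default_value=0)

    text = tf.constant(['you are great', 'you are'])   # 1-D string batch
    words = tf.string_split(text)                      # SparseTensor of tokens
    words_int_sparse = vocabulary_table.lookup(words)  # SparseTensor of ids
    words_int_dense = tf.sparse_tensor_to_dense(words_int_sparse,
                                                default_value=0)

    with tf.Session() as sess:
      sess.run(tf.tables_initializer())
      print(sess.run(words_int_dense))  # [[1 2 3]
                                        #  [1 2 0]]  <- padded to max length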
4 changes: 3 additions & 1 deletion experiments/tf_trainer/common/tfrecord_input.py
@@ -87,7 +87,9 @@ def _read_tf_example(self,
 
     text = parsed[self._text_feature]
     # I think this could be a feature column, but feature columns seem so beta.
-    preprocessed_text = feature_preprocessor(text)
+    expanded_text = tf.expand_dims(text, 0)
+    preprocessed_text = tf.squeeze(
+        feature_preprocessor(expanded_text))
     features = {self._text_feature: preprocessed_text}
     if self._round_labels:
       labels = {label: tf.round(parsed[label]) for label in self._labels}
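
This is the consumer side of the shape change above: _read_tf_example still handles one record at a time, so the scalar text is wrapped into a batch of one before the now-batched preprocessor, and the batch dimension is stripped afterwards. In isolation, with a hash-bucket stand-in for the real tokenizer:

    import tensorflow as tf

    def toy_preprocessor(texts):
      # Stand-in for the batched tokenizer: 1-D string batch in,
      # 2-D id matrix out (hash buckets instead of a vocabulary).
      tokens = tf.string_split(texts)
      ids = tf.SparseTensor(tokens.indices,
                            tf.string_to_hash_bucket_fast(tokens.values, 1000),
                            tokens.dense_shape)
      return tf.sparse_tensor_to_dense(ids)

    text = tf.constant('you are great')      # scalar, as parsed per record
    expanded_text = tf.expand_dims(text, 0)  # shape [] -> [1]
    preprocessed_text = tf.squeeze(          # shape [1, n] -> [n]
        toy_preprocessor(expanded_text))

    with tf.Session() as sess:
      print(sess.run(preprocessed_text).shape)  # (3,)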
4 changes: 2 additions & 2 deletions experiments/tf_trainer/keras_cnn/run.py
@@ -48,14 +48,14 @@ def main(argv):
 
   preprocessor = text_preprocessor.TextPreprocessor(embeddings_path)
   nltk.download("punkt")
-  tokenize_op = preprocessor.tokenize_tensor_op(nltk.word_tokenize)
+  tokenize_op_init = lambda: preprocessor.tokenize_tensor_op_py_func(nltk.word_tokenize)
 
   dataset = tfrecord_input.TFRecordInput(
       train_path=FLAGS.train_path,
       validate_path=FLAGS.validate_path,
       text_feature=text_feature_name,
       labels=LABELS,
-      feature_preprocessor=tokenize_op,
+      feature_preprocessor_init=tokenize_op_init,
       batch_size=FLAGS.batch_size)
 
   # TODO: Move embedding *into* Keras model.
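
The switch from a prebuilt tokenize_op to a zero-argument tokenize_op_init matters because tf.estimator builds a separate tf.Graph for each train/evaluate/export call: an op (and any lookup table behind it) created up front would belong to the wrong graph. Passing a factory lets each input_fn construct its own copy. A sketch of the pattern, with illustrative names:

    import tensorflow as tf

    def tokenize_op_init():
      # Called once per graph, from inside the owning input_fn, so the
      # lookup table lands in the graph the Estimator actually runs.
      table = tf.contrib.lookup.index_table_from_tensor(
          tf.constant(['<pad>', 'hello', 'world']), default_value=0)
      def tokenize(texts):
        return tf.sparse_tensor_to_dense(
            table.lookup(tf.string_split(texts)), default_value=0)
      return tokenize

    def train_input_fn():
      preprocessor = tokenize_op_init()  # built inside this graph
      features = {'text': preprocessor(tf.constant(['hello world']))}
      labels = {'toxicity': tf.constant([[1.0]])}
      return features, labels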
6 changes: 2 additions & 4 deletions experiments/tf_trainer/keras_gru_attention/run.deploy.sh
@@ -4,8 +4,7 @@
 # Edit these!
 MODEL_NAME=keras_gru_attention
 # By default, the model is the last one from the user.
-MODEL_SAVED_PATH_FOLDER=$(gsutil ls gs://kaggle-model-experiments/tf_trainer_runs/${USER}/${MODEL_NAME}/)
-MODEL_SAVED_PATH=${MODEL_SAVED_PATH_FOLDER}model_dir
+MODEL_SAVED_PATH=$(gsutil ls gs://kaggle-model-experiments/tf_trainer_runs/${USER}/${MODEL_NAME}/ | tail -1)
 
 # Create a new model.
 # Will raise an error if the model already exists.
@@ -17,5 +16,4 @@ MODEL_VERSION=v_$(date +"%Y%m%d_%H%M%S")
 gcloud ml-engine versions create $MODEL_VERSION \
   --model $MODEL_NAME \
   --origin $MODEL_SAVED_PATH \
-  --runtime-version 1.8
-
+  --runtime-version 1.8
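
One note on the simplified path lookup: gsutil ls prints matches in lexicographic order, so tail -1 selects the most recent export only on the assumption that run directories are named with sortable timestamps, as the MODEL_VERSION convention above (v_YYYYMMDD_HHMMSS) suggests.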
2 changes: 1 addition & 1 deletion experiments/tf_trainer/keras_gru_attention/run.py
@@ -32,7 +32,7 @@
                            "required for serving.")
 tf.app.flags.DEFINE_integer("batch_size", 64,
                             "The batch size to use during training.")
-tf.app.flags.DEFINE_integer("train_steps", 100,
+tf.app.flags.DEFINE_integer("train_steps", 1000,
                             "The number of steps to train for.")
 tf.app.flags.DEFINE_integer("eval_period", 50,
                             "The number of steps per eval period.")