From 410b44eef86d1edc96cc992ca317547538ada1a3 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 19 Dec 2023 15:12:00 -0800 Subject: [PATCH] Internal change. PiperOrigin-RevId: 592356708 --- seqio/benchmarks/preprocessors_benchmark.py | 188 ++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 seqio/benchmarks/preprocessors_benchmark.py diff --git a/seqio/benchmarks/preprocessors_benchmark.py b/seqio/benchmarks/preprocessors_benchmark.py new file mode 100644 index 00000000..6675eb30 --- /dev/null +++ b/seqio/benchmarks/preprocessors_benchmark.py @@ -0,0 +1,188 @@ +# Copyright 2023 The SeqIO Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Microbenchmarks for SeqIO preprocessor functions.

Each benchmark builds its input dataset once, before the timed loop, and then
repeatedly calls a single preprocessor on that dataset. The dataset returned by
the preprocessor is discarded; it is not iterated inside the loop.
"""

import os

import google_benchmark
from seqio import dataset_providers
from seqio import feature_converters
from seqio import preprocessors
from seqio import test_utils
from seqio import vocabularies
import tensorflow.compat.v2 as tf


Feature = dataset_providers.Feature

# `test_data` lives in the parent of the directory that contains this file.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_TEST_DIR = os.path.join(os.path.dirname(_THIS_DIR), 'test_data')
_VOCAB_MODEL_PATH = os.path.join(
    _TEST_DIR, 'sentencepiece', 'sentencepiece.model'
)
_SENTENCEPIECE_VOCAB = vocabularies.SentencePieceVocabulary(_VOCAB_MODEL_PATH)
_OUTPUT_FEATURES = {
    'prefix': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
    'suffix': Feature(_SENTENCEPIECE_VOCAB, add_eos=False),
}


def _prefix_suffix_dataset():
  """Returns the plain-string prefix/suffix dataset used for tokenizing."""
  return tf.data.Dataset.from_tensors(
      {'prefix': 'This is', 'suffix': 'a test.'}
  )


def _eos_inputs():
  """Returns the `(dataset, output_features)` pair for the EOS benchmarks.

  The dataset carries a 'bows' feature that has no entry in the returned
  output features.
  """
  examples = tf.data.Dataset.from_tensors({
      'inputs': [1, 2, 3],
      'targets': [4, 5, 6, 7],
      'arrows': [8, 9, 10, 11],
      'strings': [[14, 15], [16, 17], [18, 19]],
      'feathers': tf.ragged.constant([[20, 21], [], [22, 23, 24, 25, 26]]),
      'bows': [12, 13],
  })
  eos_by_feature = (
      ('inputs', False),
      ('targets', True),
      ('arrows', True),
      ('strings', True),
      ('feathers', True),
  )
  features = {
      name: Feature(_SENTENCEPIECE_VOCAB, add_eos=add_eos)
      for name, add_eos in eos_by_feature
  }
  return examples, features


@google_benchmark.register
def rekey(state):
  """Microbenchmark for `preprocessors.rekey`."""
  examples = tf.data.Dataset.from_tensors(
      {'text': 'That is good.', 'other': 'That is bad.'}
  )
  key_map = {'inputs': 'other', 'targets': 'text'}
  while state:
    _ = preprocessors.rekey(examples, key_map)


@google_benchmark.register
def tokenize(state):
  """Microbenchmark for `preprocessors.tokenize` on plain strings."""
  examples = _prefix_suffix_dataset()
  while state:
    _ = preprocessors.tokenize(examples, output_features=_OUTPUT_FEATURES)


@google_benchmark.register
def tokenize_3_rank(state):
  """Microbenchmark for `preprocessors.tokenize` on triply nested raggeds."""
  prefix = tf.ragged.constant(
      [[['a', 'b'], ['c']], [['d', 'e'], ['f']], [['g', 'h'], ['i']]]
  )
  suffix = tf.ragged.constant([[['j'], ['k', 'l', 'm']], [['n'], ['o', 'p']]])
  examples = tf.data.Dataset.from_tensors({'prefix': prefix, 'suffix': suffix})
  while state:
    _ = preprocessors.tokenize(examples, output_features=_OUTPUT_FEATURES)


@google_benchmark.register
def tokenize_and_append_eos(state):
  """Microbenchmark for `preprocessors.tokenize_and_append_eos`."""
  examples = _prefix_suffix_dataset()
  while state:
    _ = preprocessors.tokenize_and_append_eos(
        examples, output_features=_OUTPUT_FEATURES
    )


@google_benchmark.register
def append_eos(state):
  """Microbenchmark for appending EOS."""
  examples, features = _eos_inputs()
  while state:
    _ = preprocessors.append_eos(examples, features)


@google_benchmark.register
def append_eos_after_trim(state):
  """Microbenchmark for appending EOS after trimming."""
  examples, features = _eos_inputs()
  lengths = {
      'inputs': 4,
      'targets': 3,
      'arrows': 5,
      'strings': 3,
      'feathers': 4,
  }
  while state:
    _ = preprocessors.append_eos_after_trim(
        examples,
        output_features=features,
        sequence_length=lengths,
    )


@google_benchmark.register
def truncate_inputs_left(state):
  """Microbenchmark for `preprocessors.truncate_inputs_left`."""
  examples = tf.data.Dataset.from_tensors({
      'inputs': [1, 2, 3],
      'targets': [4, 5, 6, 7],
  })
  lengths = {'inputs': 2, 'targets': 4}
  while state:
    _ = preprocessors.truncate_inputs_left(examples, lengths)


@google_benchmark.register
def apply_feature_converter(state):
  """Microbenchmark for applying feature converter."""
  example = {
      'inputs': [8, 7, 1, 0],
      'targets': [4, 1, 0],
      'redundant_feature': [0],
  }
  # Dict iteration order yields ('inputs', 'targets', 'redundant_feature').
  examples = test_utils.create_default_dataset(
      [example], feature_names=tuple(example)
  )
  lengths = {'inputs': 8, 'targets': 7}
  converter = feature_converters.EncDecFeatureConverter()
  while state:
    _ = preprocessors.apply_feature_converter(
        examples, sequence_length=lengths, feature_converter=converter
    )


# TODO(b/315985098): Ask mishragaurav@ for a good example and create a test.
# @google_benchmark.register
# def hash_and_tile_subtask_id(state):
#   og_dataset = tf.data.Dataset.from_tensors({
#       'inputs': 'This is',
#       'targets': 'a test.',
#       'provenance/task': 'test_task_name',
#   })
#   while state:
#     _ = preprocessors.hash_and_tile_subtask_id(og_dataset)


@google_benchmark.register
def preprocess_tensorflow_examples(state):
  """Microbenchmark for `preprocessors.preprocess_tensorflow_examples`."""
  examples = tf.data.Dataset.from_tensors({'text': 'Hello', 'label': 'World'})
  while state:
    _ = preprocessors.preprocess_tensorflow_examples(
        examples, 'Input: {text}', 'Output: {label}'
    )


if __name__ == '__main__':
  google_benchmark.main()