From f4650ca47c5a8730fa6228f213552db39cb37e87 Mon Sep 17 00:00:00 2001
From: Carlos Ezequiel
Date: Wed, 4 Nov 2020 11:05:00 -0500
Subject: [PATCH] Release/2.0 (#56)

* Update check_tfrecords to use new dataset load function.
* Add tfrecord_dir to create_tfrecords output.
* Restructure test image directory to match expected format.
* Feature/dataclass (#44)
* Added data classes for types.
* Checking in progress.
* Checking in more changes.
* Converted types to classes and refactored schema into OO pattern.
* Changed OrderedDict import to support py3.6.
* Changed OrderedDict import to support py3.6.
* Updated setup.py for version.
* fixing setup.py
* Patched requirements and setup.
* Addressed comments in code review.
* Addressed code comments round 2.
* refactored IMAGE_CSV_SCHEMA.
* Merged check_test.py from dev

Co-authored-by: Carlos Ezequiel

* Feature/structured data tutorial (#45)
* Converted types to classes and refactored schema into OO pattern.
* Add tutorial on structured data conversion. This changes types.FloatInput to use tf.float32 for its feature_spec attribute to address potential incompatibility with using tf.float64 type in TensorFlow Transform.

Co-authored-by: Mike Bernico

* Update structured data tutorial to use output dir.
* Clarify need for proper header when using create_tfrecords. Fixes #47.
* Clean up README and update image directory notebook.
* Feature/test image dir (#49)
* Restructure test image directory to match expected format.
* Clean up README and update image directory notebook.
* Fix minor issues
* Add an explicit error message for missing train split
* Configure automated tests for Jupyter notebooks.
* Add convert_and_load function. Also refactor create_tfrecords to convert.
* Refactor check and common modules to utils.
* Add test targets for py files and notebooks.
* Feature/convert and load (#55)
* Add convert_and_load function. Also refactor create_tfrecords to convert.
* Refactor check and common modules to utils.
* Add test targets for py files and notebooks.
* Update version in setup.py and release notes.
* Fix issues with GCS path parsing.
Co-authored-by: Mike Bernico Co-authored-by: Sergii Khomenko --- .github/workflows/python-cicd.yml | 11 +- .gitignore | 3 + Makefile | 13 +- README.md | 189 +-- RELEASE.md | 8 + requirements.txt | 3 + samples/Basic-TFRecorder-Usage.ipynb | 1026 ++++++++++++++++- samples/Convert-image-directory.ipynb | 200 ++++ samples/Convert-structured-data.ipynb | 400 +++++++ samples/Loading-a-TF-Dataset.ipynb | 27 +- ...FRecorder-with-Google-Cloud-Dataflow.ipynb | 4 +- setup.py | 8 +- tfrecorder/__init__.py | 8 +- tfrecorder/accessor.py | 12 +- tfrecorder/beam_image_test.py | 15 +- tfrecorder/beam_pipeline.py | 84 +- tfrecorder/beam_pipeline_test.py | 44 +- tfrecorder/cli.py | 8 +- tfrecorder/common.py | 42 - tfrecorder/common_test.py | 53 - tfrecorder/{client.py => converter.py} | 125 +- .../{client_test.py => converter_test.py} | 159 ++- tfrecorder/{dataset.py => dataset_loader.py} | 11 +- ...dataset_test.py => dataset_loader_test.py} | 34 +- tfrecorder/input_schema.py | 102 ++ .../{schema_test.py => input_schema_test.py} | 37 +- tfrecorder/schema.py | 187 --- tfrecorder/test_data/data.csv | 12 +- .../images/{ => TEST}/cat/cat-800x600-3.jpg | Bin .../images/{ => TEST}/goat/goat-640x427-3.jpg | Bin .../images/{ => TRAIN}/cat/cat-640x853-1.jpg | Bin .../{ => TRAIN}/goat/goat-640x640-1.jpg | Bin .../{ => VALIDATION}/cat/cat-800x600-2.jpg | Bin .../{ => VALIDATION}/goat/goat-320x320-2.jpg | Bin tfrecorder/test_utils.py | 13 +- tfrecorder/types.py | 68 +- tfrecorder/{check.py => utils.py} | 77 +- tfrecorder/{check_test.py => utils_test.py} | 92 +- 38 files changed, 2333 insertions(+), 742 deletions(-) create mode 100644 samples/Convert-image-directory.ipynb create mode 100644 samples/Convert-structured-data.ipynb delete mode 100644 tfrecorder/common.py delete mode 100644 tfrecorder/common_test.py rename tfrecorder/{client.py => converter.py} (76%) rename tfrecorder/{client_test.py => converter_test.py} (66%) rename tfrecorder/{dataset.py => dataset_loader.py} (92%) rename tfrecorder/{dataset_test.py => dataset_loader_test.py} (64%) create mode 100644 tfrecorder/input_schema.py rename tfrecorder/{schema_test.py => input_schema_test.py} (58%) delete mode 100644 tfrecorder/schema.py rename tfrecorder/test_data/images/{ => TEST}/cat/cat-800x600-3.jpg (100%) rename tfrecorder/test_data/images/{ => TEST}/goat/goat-640x427-3.jpg (100%) rename tfrecorder/test_data/images/{ => TRAIN}/cat/cat-640x853-1.jpg (100%) rename tfrecorder/test_data/images/{ => TRAIN}/goat/goat-640x640-1.jpg (100%) rename tfrecorder/test_data/images/{ => VALIDATION}/cat/cat-800x600-2.jpg (100%) rename tfrecorder/test_data/images/{ => VALIDATION}/goat/goat-320x320-2.jpg (100%) rename tfrecorder/{check.py => utils.py} (60%) rename tfrecorder/{check_test.py => utils_test.py} (60%) diff --git a/.github/workflows/python-cicd.yml b/.github/workflows/python-cicd.yml index 2a25bd4..9d80335 100644 --- a/.github/workflows/python-cicd.yml +++ b/.github/workflows/python-cicd.yml @@ -7,7 +7,6 @@ on: [push] jobs: build: - runs-on: ubuntu-latest strategy: matrix: @@ -23,10 +22,14 @@ jobs: run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Run all tests + run: | + export PYTHONPATH="$GITHUB_WORKSPACE" + make test + - name: Lint with pylint run: | make pylint - - name: Run tests - run: | - make test + diff --git a/.gitignore b/.gitignore index 1cda03e..2873027 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +build/ +dist/ +tfrecorder.egg-info .idea/ .ipynb_checkpoints/ .vscode/ 
diff --git a/Makefile b/Makefile index fae2a05..f69cb7e 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,17 @@ -all: init test pylint +all: init testnb test pylint init: pip install -r requirements.txt -test: +test: test-nb test-py + +test-py: nosetests --with-coverage -v --cover-package=tfrecorder +test-nb: + ls -1 samples/*.ipynb | grep -v '^.*Dataflow.ipynb' | xargs py.test --nbval-lax -p no:python + pylint: - pylint tfrecorder + pylint -j 0 tfrecorder -.PHONY: all init test pylint +.PHONY: all init test pylint diff --git a/README.md b/README.md index 72f8048..1ad9f2a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ TFRecorder can convert any Pandas DataFrame or CSV file into TFRecords. If your [Release Notes](RELEASE.md) ## Why TFRecorder? -Using the TFRecord storage format is important for optimal machine learning pipelines and getting the most from your hardware (in cloud or on prem). The TFRecorder project started inside [Google Cloud AI Services](https://cloud.google.com/consulting) when we realized we were writing TFRecord conversion code over and over again. +Using the TFRecord storage format is important for optimal machine learning pipelines and getting the most from your hardware (in cloud or on prem). The TFRecorder project started inside [Google Cloud AI Services](https://cloud.google.com/consulting) when we realized we were writing TFRecord conversion code over and over again. When to use TFRecords: * Your model is input bound (reading data is impacting training time). @@ -71,7 +71,7 @@ df.tensorflow.to_tfr(output_dir='/my/output/path') Google Cloud Platform Dataflow workers need to be supplied with the tfrecorder package that you would like to run remotely. To do so first download or build -the package (a python wheel file) and then specify the path the the file when +the package (a python wheel file) and then specify the path the file when tfrecorder is called. Step 1: Download or create the wheel file. @@ -109,7 +109,7 @@ Using Python interpreter: ```python import tfrecorder -tfrecorder.create_tfrecords( +tfrecorder.convert( source='/path/to/data.csv', output_dir='gs://my/bucket') ``` @@ -126,10 +126,9 @@ tfrecorder create-tfrecords \ ```python import tfrecorder -tfrecorder.create_tfrecords( +tfrecorder.convert( source='/path/to/image_dir', - output_dir='gs://my/bucket', -) + output_dir='gs://my/bucket') ``` The image directory should have the following general structure: @@ -159,7 +158,7 @@ images/ ### Loading a TF Dataset from TFRecord files -You can load a TensorFlow dataset from TFRecord files generated by TFRecorder +You can load a TensorFlow dataset from TFRecord files generated by TFRecorder on your local machine. ```python @@ -175,8 +174,9 @@ Using Python interpreter: ```python import tfrecorder -tfrecorder.check_tfrecords( - file_pattern='/path/to/tfrecords/train*.tfrecord.gz', +tfrecorder.inspect( + tfrecord_dir='/path/to/tfrecords/', + split='TRAIN', num_records=5, output_dir='/tmp/output') ``` @@ -187,16 +187,17 @@ representing the images encoded into TFRecords. Using the command line: ```bash -tfrecorder check-tfrecords \ - --file_pattern=/path/to/tfrecords/train*.tfrecord.gz \ +tfrecorder inspect \ + --tfrecord-dir=/path/to/tfrecords/ \ + --split='TRAIN' \ --num_records=5 \ --output_dir=/tmp/output ``` ## Default Schema -If you don't specify an input schema, TFRecorder expects data to be in the same format as -[AutoML Vision input](https://cloud.google.com/vision/automl/docs/prepare). 
+If you don't specify an input schema, TFRecorder expects data to be in the same format as +[AutoML Vision input](https://cloud.google.com/vision/automl/docs/prepare). This format looks like a Pandas DataFrame or CSV formatted as: | split | image_uri | label | @@ -205,139 +206,139 @@ This format looks like a Pandas DataFrame or CSV formatted as: where: * `split` can take on the values TRAIN, VALIDATION, and TEST -* `image_uri` specifies a local or Google Cloud Storage location for the image file. -* `label` can be either a text based label that will be integerized or integer +* `image_uri` specifies a local or Google Cloud Storage location for the image file. +* `label` can be either a text-based label that will be integerized or an integer ## Flexible Schema -TFRecorder's flexible schema system allows you to use any schema you want for your input data. To support any input data schema, provide a schema map to TFRecorder. A TFRecorder schema_map creates a mapping between your dataframe column names and their types in the resulting -TFRecord. +TFRecorder's flexible schema system allows you to use any schema you want for your input data. -### Creating and using a schema map -A schema map is a Python dictionary that maps DataFrame column names to [supported -TFRecorder types.](#Supported-types) +For example, the default image CSV input schema can be defined like this: +```python +import pandas as pd +import tfrecorder +from tfrecorder import input_schema +from tfrecorder import types -For example, the default image CSV input can be defined like this: +image_csv_schema = input_schema.Schema({ + 'split': types.SplitKey, + 'image_uri': types.ImageUri, + 'label': types.StringLabel +}) -```python -from tfrecorder import schema +# You can then pass the schema to `tfrecorder.convert` or, as below, to `to_tfr`. -image_csv_schema = { - 'split': schema.split_key, - 'image_uri': schema.image_uri, - 'label': schema.string_label -} +df = pd.read_csv(...) +df.tensorflow.to_tfr( + output_dir='gs://my/bucket', + schema_map=image_csv_schema, + runner='DataflowRunner', + project='my-project', + region='us-central1') ``` -Once created a schema_map can be sent to TFRecorder. + +### Flexible Schema Example + +Imagine that you have a dataset that you would like to convert to TFRecords that +looks like this: + +| split | x | y | label | +|-------|-------|------|-------| +| TRAIN | 0.32 | 42 | 1 | + +You can use TFRecorder as shown below: ```python import pandas as pd -from tfrecorder import schema import tfrecorder +from tfrecorder import input_schema +from tfrecorder import types + +# First create a schema map +schema = input_schema.Schema({ + 'split': types.SplitKey, + 'x': types.FloatInput, + 'y': types.IntegerInput, + 'label': types.IntegerLabel, +}) + +# Now call TFRecorder with the specified schema_map df = pd.read_csv(...) df.tensorflow.to_tfr( output_dir='gs://my/bucket', - schema_map=schema.image_csv_schema, + schema=schema, runner='DataflowRunner', project='my-project', region='us-central1') ``` +After calling TFRecorder's `to_tfr()` function, TFRecorder will create an Apache Beam pipeline, either locally or in this case +using Google Cloud's Dataflow runner. This Beam pipeline will use the schema map to identify the types you've associated with +each data column and process your data using [TensorFlow Transform](https://www.tensorflow.org/tfx/transform/get_started) and TFRecorder's image processing functions to convert the data into TFRecords.
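The following sketch is an editorial illustration, not part of this patch: a minimal way to read the train split produced above back with plain `tf.data`. The feature names and dtypes are assumptions taken from the example schema (TensorFlow Transform scales `x` and `y`, while `IntegerLabel` passes through untransformed); the authoritative feature spec is the `transformed_metadata/` directory written next to the TFRecord files, and TFRecorder's own loading utilities (see "Loading a TF Dataset from TFRecord files" above) can do this for you.

```python
# Hypothetical sketch: read the train shards produced by the example above.
# Feature names/dtypes are assumptions; check transformed_metadata/ for the
# real feature spec of your run.
import tensorflow as tf

feature_spec = {
    'x': tf.io.FixedLenFeature([], tf.float32),    # scaled by TF Transform
    'y': tf.io.FixedLenFeature([], tf.float32),    # IntegerInput is scaled too
    'label': tf.io.FixedLenFeature([], tf.int64),  # IntegerLabel is not transformed
}

def parse_fn(serialized_example):
  """Parses one serialized tf.train.Example into a dict of tensors."""
  return tf.io.parse_single_example(serialized_example, feature_spec)

# TFRecorder writes gzipped shards such as train-00000-of-00001.tfrecord.gz
# inside an output directory named tfrecorder-<timestamp>-to-tfr.
files = tf.data.Dataset.list_files(
    'gs://my/bucket/tfrecorder-*-to-tfr/train-*.tfrecord.gz')
dataset = (
    tf.data.TFRecordDataset(files, compression_type='GZIP')
    .map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(32)
    .prefetch(1))
```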
### Supported types -TFRecorder's schema system supports several types, all listed below. You can use -these types by referencing them in the schema map. Each type informs TFRecorder how -to treat your DataFrame columns. For example, the schema mapping -`my_split_key: schema.SplitKeyType` tells TFRecorder to treat the column `my_split_key` as -type `schema.SplitKeyType` and create dataset splits based on it's contents. -#### schema.ImageUriType -* Specifies the path to an image. When specified, TFRecorder -will load the specified image and store the image as a [base64 encoded](https://docs.python.org/3/library/base64.html) - [tf.string](https://www.tensorflow.org/tutorials/load_data/unicode) in the key 'image' -along with the height, width, and image channels as integers using they keys 'image_height', 'image_width', and 'image_channels'. -* A schema can contain only one imageUriType +TFRecorder's schema system supports several types. +You can use these types by referencing them in the schema map. +Each type informs TFRecorder how to treat your DataFrame columns. + +#### types.SplitKey -#### schema.SplitKeyType * A split key is required for TFRecorder at this time. * Only one split key is allowed. -* Specifies a split key that TFRecorder will use to partition the +* Specifies a split key that TFRecorder will use to partition the input dataset. * Allowed values are 'TRAIN', 'VALIDATION', and 'TEST' -Note: If you do not want your data to be partitioned please include a split_key and -set all rows to TRAIN. +Note: If you do not want your data to be partitioned, include a column with +`types.SplitKey` and set all the elements to `TRAIN`. + +#### types.ImageUri + +* Specifies the path to an image. When specified, TFRecorder +will load the specified image and store the image as a [base64 encoded](https://docs.python.org/3/library/base64.html) + [tf.string](https://www.tensorflow.org/tutorials/load_data/unicode) in the key 'image' +along with the height, width, and image channels as integers using the keys 'image_height', 'image_width', and 'image_channels'. +* A schema can contain only one `ImageUri` column + +#### types.IntegerInput -#### schema.IntegerInputType * Specifies an int input. * Will be scaled to mean 0, variance 1. -#### schema.FloatInputType +#### types.FloatInput + * Specifies a float input. * Will be scaled to mean 0, variance 1. -#### schema.CategoricalInputType +#### types.CategoricalInput + * Specifies a string input. * Vocabulary computed and output integerized. -#### schema.IntegerLabelType +#### types.IntegerLabel + * Specifies an integer target. * Not transformed. -#### schema.StringLabelType +#### types.StringLabel + * Specifies a string target. * Vocabulary computed and *output integerized.* -### Flexible Schema Example - -Imagine that you have a dataset that you would like to convert to TFRecords that -looks like this: - -| split | x | y | label | -|-------|-------|------|-------| -| TRAIN | 0.32 | 42 |1 | - -You can use TFRecorder as shown below: - -```python -import pandas as pd -import tfrecorder -from tfrecorder import schema - -# First create a schema map -schema_map = { - 'split':schema.SplitKeyType, - 'x':schema.FloatInputType, - 'y':schema.IntegerInputType, - 'label':schema.IntegerLabelType -} - -# Now call TFRecorder with the specified schema_map - -df = pd.read_csv(...)
-df.tensorflow.to_tfr( - output_dir='gs://my/bucket', - schema_map=schema_map, - runner='DataflowRunner', - project='my-project', - region='us-central1') -``` -After calling TFRecorder's to_tfr() function, TFRecorder will create an Apache beam pipeline, either locally or in this case -using Google Cloud's Dataflow runner. This beam pipeline will use the schema map to identify the types you've associated with -each data column and process your data using [TensorFlow Transform](https://www.tensorflow.org/tfx/transform/get_started) and TFRecorder's image processing functions to convert the data into into TFRecords. - ## Contributing -Pull requests are welcome. Please see our [code of conduct](docs/code-of-conduct.md) and [contributing guide](docs/contributing.md). +Pull requests are welcome. +Please see our [code of conduct](docs/code-of-conduct.md) and [contributing guide](docs/contributing.md). ## Why TFRecorder? -Using the TFRecord storage format is important for optimal machine learning pipelines and getting the most from your hardware (in cloud or on prem). + +Using the TFRecord storage format is important for optimal machine learning pipelines and getting the most from your hardware (in cloud or on prem). TFRecords help when: * Your model is input bound (reading data is impacting training time). * Anytime you want to use tf.Dataset * When your dataset can't fit into memory - -In our work at [Google Cloud AI Services](https://cloud.google.com/consulting) we wanted to help our users spend their time writing AI/ML applications, and spend less time converting data. - +Need help with using AI in the cloud? +Visit [Google Cloud AI Services](https://cloud.google.com/consulting). diff --git a/RELEASE.md b/RELEASE.md index fedc739..0319e6f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,11 @@ +# Release 2.0 + +* Changes `create_tfrecords` and `check_tfrecords` to `convert` and `inspect` respectively +* Adds `convert_and_load` function +* Changes flexible schema to use `dataclasses` +* Adds automated testing for notebooks +* Minor fixes and usability improvements + # Hotfix 1.1.3 * Adds note regarding DataFrame header specification in README.md. diff --git a/requirements.txt b/requirements.txt index c669ff5..37ec989 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,6 @@ jupyter >= 1.0.0 tensorflow >= 2.3.1 pyarrow <0.18,>=0.17 frozendict >= 1.2 +dataclasses >= 0.5;python_version<"3.7" +nbval >= 0.9.6 +pytest >= 6.1.1 diff --git a/samples/Basic-TFRecorder-Usage.ipynb b/samples/Basic-TFRecorder-Usage.ipynb index 7afbf7b..6d0ebd1 100644 --- a/samples/Basic-TFRecorder-Usage.ipynb +++ b/samples/Basic-TFRecorder-Usage.ipynb @@ -7,75 +7,1049 @@ "# Basic TFRUtil Usage\n", "\n", "This notebook demonstrates the basic usage of TFRUtil. It is meant to be run from the /sample/ path and uses test images included with TFRUtil stored in /tfrutil/test_data.\n", - " \n", + "\n", "Before running this notebook, please install TFUtil with the command `python setup.py` from the repository root." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd \n", + "import os\n", + "import pathlib\n", + "\n", + "import pandas as pd\n", "import tfrecorder" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "OUTPUT_PATH=\"./out\" # YOUR LOCAL OUTPUT PATH HERE" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(\"data.csv\")" + "input_file = pathlib.Path(os.getcwd())/'../tfrecorder/test_data/data.csv'\n", + "output_dir = './out' # YOUR LOCAL OUTPUT PATH HERE" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
splitimage_urilabel
0TESTtfrecorder/test_data/images/TEST/cat/cat-800x6...cat
1TESTtfrecorder/test_data/images/TEST/goat/goat-640...goat
2TRAINtfrecorder/test_data/images/TRAIN/cat/cat-640x...cat
3TRAINtfrecorder/test_data/images/TRAIN/goat/goat-64...goat
4VALIDATIONtfrecorder/test_data/images/VALIDATION/cat/cat...cat
\n", + "
" + ], + "text/plain": [ + " split image_uri label\n", + "0 TEST tfrecorder/test_data/images/TEST/cat/cat-800x6... cat\n", + "1 TEST tfrecorder/test_data/images/TEST/goat/goat-640... goat\n", + "2 TRAIN tfrecorder/test_data/images/TRAIN/cat/cat-640x... cat\n", + "3 TRAIN tfrecorder/test_data/images/TRAIN/goat/goat-64... goat\n", + "4 VALIDATION tfrecorder/test_data/images/VALIDATION/cat/cat... cat" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df" + "df = pd.read_csv(input_file)\n", + "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Logging output to /tmp/tfrecorder-beam.log " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " var import_html = () => {\n", + " ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n", + " var link = document.createElement('link');\n", + " link.rel = 'import'\n", + " link.href = href;\n", + " document.head.appendChild(link);\n", + " });\n", + " }\n", + " if ('import' in document.createElement('link')) {\n", + " import_html();\n", + " } else {\n", + " var webcomponentScript = document.createElement('script');\n", + " webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n", + " webcomponentScript.type = 'text/javascript';\n", + " webcomponentScript.onload = function(){\n", + " import_html();\n", + " };\n", + " document.head.appendChild(webcomponentScript);\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'job_id': 'DirectRunner',\n", + " 'metrics': {'rows': 6, 'good_images': None, 'bad_images': 6},\n", + " 'tfrecord_dir': './out/tfrecorder-20201028-160301-to-tfr'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.tensorflow.to_tfr(output_dir=OUTPUT_PATH)" + "results = df.tensorflow.to_tfr(output_dir)\n", + "results" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-121014-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-123151-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-121047-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-132122-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-121052-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-132135-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-122052-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-132406-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-122403-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-132701-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-122505-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-133529-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-122646-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-133624-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-122743-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201027-173444-to-tfr\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtfrecorder-20201022-123126-to-tfr\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47mtfrecorder-20201028-160301-to-tfr\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m4068da78afd34722a84c51ceac547efa\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m64e013b30bc74404802fe2460761f588\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mcdcca595e2b641e3849589a01521bf9e\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/4068da78afd34722a84c51ceac547efa:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/4068da78afd34722a84c51ceac547efa/variables:\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/64e013b30bc74404802fe2460761f588:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/64e013b30bc74404802fe2460761f588/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/64e013b30bc74404802fe2460761f588/variables:\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/cdcca595e2b641e3849589a01521bf9e:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + 
"./out/tfrecorder-20201022-121014-to-tfr/tft_tmp/tftransform_tmp/cdcca595e2b641e3849589a01521bf9e/variables:\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-121014-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m3f3303365f734478af972a71d5ad8f3c\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mae6826008f984da98176a5c3ddcb25c4\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47md6b56908c11648cfa4534371dbb028b0\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/3f3303365f734478af972a71d5ad8f3c:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/3f3303365f734478af972a71d5ad8f3c/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/3f3303365f734478af972a71d5ad8f3c/variables:\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/ae6826008f984da98176a5c3ddcb25c4:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/ae6826008f984da98176a5c3ddcb25c4/variables:\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/d6b56908c11648cfa4534371dbb028b0:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/tft_tmp/tftransform_tmp/d6b56908c11648cfa4534371dbb028b0/variables:\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-121047-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + 
"train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m3698f3931fd14bb9bff583e64c54ffb6\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m38b82a6b79aa407db7893cfe1c77ecb7\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47md4824d2bad824e3ca30a0b1a4c0bf9b4\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/3698f3931fd14bb9bff583e64c54ffb6:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/3698f3931fd14bb9bff583e64c54ffb6/variables:\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/38b82a6b79aa407db7893cfe1c77ecb7:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/38b82a6b79aa407db7893cfe1c77ecb7/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/38b82a6b79aa407db7893cfe1c77ecb7/variables:\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/d4824d2bad824e3ca30a0b1a4c0bf9b4:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/tft_tmp/tftransform_tmp/d4824d2bad824e3ca30a0b1a4c0bf9b4/variables:\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-121052-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m37663f58e2004143b615f97402b28947\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mc96550c0a81b402480820390cde50a79\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mcea75278cf5c4e43b99c8298a541d291\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/37663f58e2004143b615f97402b28947:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/37663f58e2004143b615f97402b28947/variables:\n", + "\n", + 
"./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/c96550c0a81b402480820390cde50a79:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/c96550c0a81b402480820390cde50a79/variables:\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/cea75278cf5c4e43b99c8298a541d291:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/cea75278cf5c4e43b99c8298a541d291/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/tft_tmp/tftransform_tmp/cea75278cf5c4e43b99c8298a541d291/variables:\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-122052-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m3def6fa94a85469fba815ffce7bad60e\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m836430f3087143909ebe6f18152d2ba0\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47ma8e39089895748f3a7f014bf71c766c6\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/3def6fa94a85469fba815ffce7bad60e:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/3def6fa94a85469fba815ffce7bad60e/variables:\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/836430f3087143909ebe6f18152d2ba0:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/836430f3087143909ebe6f18152d2ba0/variables:\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/a8e39089895748f3a7f014bf71c766c6:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/a8e39089895748f3a7f014bf71c766c6/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/tft_tmp/tftransform_tmp/a8e39089895748f3a7f014bf71c766c6/variables:\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m 
saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-122403-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m284f49ccb1ec48f081168bd4b85380d3\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m45d603331df040eb81f19a4cd0d87708\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47ma32121048ec347b1a46632d69345bb72\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/284f49ccb1ec48f081168bd4b85380d3:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/284f49ccb1ec48f081168bd4b85380d3/variables:\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/45d603331df040eb81f19a4cd0d87708:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/45d603331df040eb81f19a4cd0d87708/variables:\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/a32121048ec347b1a46632d69345bb72:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/a32121048ec347b1a46632d69345bb72/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/tft_tmp/tftransform_tmp/a32121048ec347b1a46632d69345bb72/variables:\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-122505-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + 
"./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m20307815fc944228be997a062f893ff4\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m51489082db1646b8a19ca2c21a092ca0\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m8001daeb500344e692ee9cc7d5a37e41\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/20307815fc944228be997a062f893ff4:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/20307815fc944228be997a062f893ff4/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/20307815fc944228be997a062f893ff4/variables:\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/51489082db1646b8a19ca2c21a092ca0:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/51489082db1646b8a19ca2c21a092ca0/variables:\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/8001daeb500344e692ee9cc7d5a37e41:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/tft_tmp/tftransform_tmp/8001daeb500344e692ee9cc7d5a37e41/variables:\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-122646-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m0a7173cf1670405db8519cc86e17c845\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m440c9bef6def42e3acf7b327b6295170\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mf6bc80df88424de69e461e9b8b3cc2ea\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/0a7173cf1670405db8519cc86e17c845:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/0a7173cf1670405db8519cc86e17c845/variables:\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/440c9bef6def42e3acf7b327b6295170:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + 
"./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/440c9bef6def42e3acf7b327b6295170/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/440c9bef6def42e3acf7b327b6295170/variables:\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/f6bc80df88424de69e461e9b8b3cc2ea:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/tft_tmp/tftransform_tmp/f6bc80df88424de69e461e9b8b3cc2ea/variables:\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-122743-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m1cb2ecd7d5ae47c1a6fc2d5bea3ce504\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m3877b46d90094b89a8f95fa1d625525a\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m9a83c37d67f54e1cab8ab824c53926b6\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/1cb2ecd7d5ae47c1a6fc2d5bea3ce504:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/1cb2ecd7d5ae47c1a6fc2d5bea3ce504/variables:\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/3877b46d90094b89a8f95fa1d625525a:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/3877b46d90094b89a8f95fa1d625525a/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/3877b46d90094b89a8f95fa1d625525a/variables:\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/9a83c37d67f54e1cab8ab824c53926b6:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/tft_tmp/tftransform_tmp/9a83c37d67f54e1cab8ab824c53926b6/variables:\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + 
"./out/tfrecorder-20201022-123126-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-123126-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m383f6db81e674c18aa598803b2735b12\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m765e3ac1b7794cc19b90025229929c91\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47ma9c454acb0b94209b42b589953dc40dd\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/383f6db81e674c18aa598803b2735b12:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/383f6db81e674c18aa598803b2735b12/variables:\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/765e3ac1b7794cc19b90025229929c91:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/765e3ac1b7794cc19b90025229929c91/variables:\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/a9c454acb0b94209b42b589953dc40dd:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/a9c454acb0b94209b42b589953dc40dd/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/tft_tmp/tftransform_tmp/a9c454acb0b94209b42b589953dc40dd/variables:\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-123151-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr:\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m1b7a7957b4c84bb8817a610ae5597254\u001B[m\u001B[m \u001B[1m\u001B[34m\u001B[47m3ab241e4838c47faa1307430f674cba0\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp/tftransform_tmp/1b7a7957b4c84bb8817a610ae5597254:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp/tftransform_tmp/1b7a7957b4c84bb8817a610ae5597254/variables:\n", + 
"\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp/tftransform_tmp/3ab241e4838c47faa1307430f674cba0:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132122-to-tfr/tft_tmp/tftransform_tmp/3ab241e4838c47faa1307430f674cba0/variables:\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m617158aef51446619f1193dfa6ee30fa\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m882ecbb5ad6f4e0a8692237ed2af6d72\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47md2a2597fb5764de39563d1e3828b2cd4\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/617158aef51446619f1193dfa6ee30fa:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/617158aef51446619f1193dfa6ee30fa/variables:\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/882ecbb5ad6f4e0a8692237ed2af6d72:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/882ecbb5ad6f4e0a8692237ed2af6d72/variables:\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/d2a2597fb5764de39563d1e3828b2cd4:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/d2a2597fb5764de39563d1e3828b2cd4/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/tft_tmp/tftransform_tmp/d2a2597fb5764de39563d1e3828b2cd4/variables:\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-132135-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp:\n", + 
"\u001B[1m\u001B[34m\u001B[47m4a28588557c1419895114bc6edb695be\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m542d64a62cf74f8cbe9a8916790419ce\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m5cd38819d7c24c23a6ede32bd4547a2a\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/4a28588557c1419895114bc6edb695be:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/4a28588557c1419895114bc6edb695be/variables:\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/542d64a62cf74f8cbe9a8916790419ce:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/542d64a62cf74f8cbe9a8916790419ce/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/542d64a62cf74f8cbe9a8916790419ce/variables:\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/5cd38819d7c24c23a6ede32bd4547a2a:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/tft_tmp/tftransform_tmp/5cd38819d7c24c23a6ede32bd4547a2a/variables:\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-132406-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m201f1f94018a490bbf5877bcf6b5c5db\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m6438afee6e484bbd810a20d437f84a84\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mc338e97fe31b4fc084327897c9a07ecc\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/201f1f94018a490bbf5877bcf6b5c5db:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/201f1f94018a490bbf5877bcf6b5c5db/variables:\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/6438afee6e484bbd810a20d437f84a84:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + 
"./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/6438afee6e484bbd810a20d437f84a84/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/6438afee6e484bbd810a20d437f84a84/variables:\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/c338e97fe31b4fc084327897c9a07ecc:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/tft_tmp/tftransform_tmp/c338e97fe31b4fc084327897c9a07ecc/variables:\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-132701-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m89e69e97f7bb4ab791373df35a281826\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mc957d83ca4f340b6882ce01e84e225c7\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mec546d0b3388408b90d460a19ec2e16b\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/89e69e97f7bb4ab791373df35a281826:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/89e69e97f7bb4ab791373df35a281826/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/89e69e97f7bb4ab791373df35a281826/variables:\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/c957d83ca4f340b6882ce01e84e225c7:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/c957d83ca4f340b6882ce01e84e225c7/variables:\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/ec546d0b3388408b90d460a19ec2e16b:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/tft_tmp/tftransform_tmp/ec546d0b3388408b90d460a19ec2e16b/variables:\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + 
"./out/tfrecorder-20201022-133529-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-133529-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m220e294d5b9a44ba90e84f357b24ec28\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m91b0706c6a334575b676aa43a21b29ea\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mac1204e611dd42228c5a4a04d5dd92ef\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/220e294d5b9a44ba90e84f357b24ec28:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/220e294d5b9a44ba90e84f357b24ec28/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/220e294d5b9a44ba90e84f357b24ec28/variables:\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/91b0706c6a334575b676aa43a21b29ea:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/91b0706c6a334575b676aa43a21b29ea/variables:\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/ac1204e611dd42228c5a4a04d5dd92ef:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/tft_tmp/tftransform_tmp/ac1204e611dd42228c5a4a04d5dd92ef/variables:\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201022-133624-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m4c3803300da547198d92c67e7e22b276\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47m6a1eb8662ccc4409874be42eac9c8819\u001B[m\u001B[m\n", + 
"\u001B[1m\u001B[34m\u001B[47m7a7ae073f1924e0e9e25fbf07adc03bb\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/4c3803300da547198d92c67e7e22b276:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/4c3803300da547198d92c67e7e22b276/variables:\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/6a1eb8662ccc4409874be42eac9c8819:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/6a1eb8662ccc4409874be42eac9c8819/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/6a1eb8662ccc4409874be42eac9c8819/variables:\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/7a7ae073f1924e0e9e25fbf07adc03bb:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/tft_tmp/tftransform_tmp/7a7ae073f1924e0e9e25fbf07adc03bb/variables:\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201027-173444-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr:\n", + "discarded-data-00000-of-00001 \u001B[1m\u001B[34m\u001B[47mtransform_fn\u001B[m\u001B[m\n", + "test-00000-of-00001.tfrecord.gz \u001B[1m\u001B[34m\u001B[47mtransformed_metadata\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mtft_tmp\u001B[m\u001B[m validation-00000-of-00001.tfrecord.gz\n", + "train-00000-of-00001.tfrecord.gz\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47mtftransform_tmp\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp:\n", + "\u001B[1m\u001B[34m\u001B[47m70019930e8eb4d1a8fe3d6b076f38e00\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mb6f679e2f2784489a9b8dfe767fc702b\u001B[m\u001B[m\n", + "\u001B[1m\u001B[34m\u001B[47mbb0deba40f3e448289ca27fc9abdcf6a\u001B[m\u001B[m\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/70019930e8eb4d1a8fe3d6b076f38e00:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/70019930e8eb4d1a8fe3d6b076f38e00/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/70019930e8eb4d1a8fe3d6b076f38e00/variables:\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/b6f679e2f2784489a9b8dfe767fc702b:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + 
"./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/b6f679e2f2784489a9b8dfe767fc702b/variables:\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/bb0deba40f3e448289ca27fc9abdcf6a:\n", + "saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/tft_tmp/tftransform_tmp/bb0deba40f3e448289ca27fc9abdcf6a/variables:\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/transform_fn:\n", + "\u001B[1m\u001B[34m\u001B[47massets\u001B[m\u001B[m saved_model.pb \u001B[1m\u001B[34m\u001B[47mvariables\u001B[m\u001B[m\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/transform_fn/assets:\n", + "vocab_compute_and_apply_vocabulary_vocabulary\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/transform_fn/variables:\n", + "\n", + "./out/tfrecorder-20201028-160301-to-tfr/transformed_metadata:\n", + "schema.pbtxt\n" + ] + } + ], "source": [ - "!ls -R ./out" + "!ls -R $output_dir" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "# That's it!\n", "\n", "As you can see, TFRUtil has taken the supplied CSV and transformed it into TFRecords, ready for consumption, along with the transform function" ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -94,9 +1068,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/samples/Convert-image-directory.ipynb b/samples/Convert-image-directory.ipynb new file mode 100644 index 0000000..71a06b7 --- /dev/null +++ b/samples/Convert-image-directory.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert image directory to TFRecord files" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error importing tfx_bsl_extension.arrow.array_util. 
Some tfx_bsl functionalities are not available" + ] + } + ], + "source": [ + "import os\n", + "import pathlib\n", + "import requests\n", + "import shutil\n", + "import tempfile\n", + "\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tfrecorder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert sample image directory" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "image_dir = pathlib.Path(os.getcwd())/'../tfrecorder/test_data/images'\n", + "assert image_dir.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " var import_html = () => {\n", + " ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n", + " var link = document.createElement('link');\n", + " link.rel = 'import'\n", + " link.href = href;\n", + " document.head.appendChild(link);\n", + " });\n", + " }\n", + " if ('import' in document.createElement('link')) {\n", + " import_html();\n", + " } else {\n", + " var webcomponentScript = document.createElement('script');\n", + " webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n", + " webcomponentScript.type = 'text/javascript';\n", + " webcomponentScript.onload = function(){\n", + " import_html();\n", + " };\n", + " document.head.appendChild(webcomponentScript);\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'job_id': 'DirectRunner', 'metrics': {'rows': 6, 'good_images': 6, 'bad_images': None}, 'tfrecord_dir': '/tmp/tfrecords/tfrecorder-20201027-173455-create-tfrecords'}\n" + ] + } + ], + "source": [ + "output_dir = pathlib.Path('/tmp/tfrecords')\n", + "results = tfrecorder.convert(str(image_dir), output_dir)\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a TF dataset from generated TFRecords" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = tfrecorder.load(results['tfrecord_dir'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"dict_keys(['image', 'image_channels', 'image_height', 'image_name', 'image_width', 'label', 'split'])\n" + ] + } + ], + "source": [ + "for x in datasets['TRAIN'].take(1):\n", + " print(x.keys())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/samples/Convert-structured-data.ipynb b/samples/Convert-structured-data.ipynb new file mode 100644 index 0000000..0df4e36 --- /dev/null +++ b/samples/Convert-structured-data.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert structured data to TFRecords " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error importing tfx_bsl_extension.arrow.array_util. Some tfx_bsl functionalities are not available" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import pathlib\n", + "\n", + "import tensorflow as tf\n", + "\n", + "import tfrecorder\n", + "from tfrecorder import input_schema\n", + "from tfrecorder import types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load [Titanic](https://www.openml.org/d/40945) dataset " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading data from https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv\n", + "49152/44225 [=================================] - 1s 12us/step\n" + ] + } + ], + "source": [ + "data_path = pathlib.Path('/tmp/datasets/titanic.csv')\n", + "if not data_path.exists():\n", + " tf.keras.utils.get_file(\n", + " 'titanic.csv',\n", + " origin='https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',\n", + " extract=False,\n", + " cache_dir='/tmp', cache_subdir='datasets')\n", + " \n", + "assert data_path.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(str(data_path))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add `split` column " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassNameSexAgeSiblings/Spouses AboardParents/Children AboardFaresplit
003Mr. Owen Harris Braundmale22.0107.2500TRAIN
111Mrs. John Bradley (Florence Briggs Thayer) Cum...female38.01071.2833TRAIN
213Miss. Laina Heikkinenfemale26.0007.9250TRAIN
311Mrs. Jacques Heath (Lily May Peel) Futrellefemale35.01053.1000TRAIN
403Mr. William Henry Allenmale35.0008.0500TRAIN
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Name \\\n", + "0 0 3 Mr. Owen Harris Braund \n", + "1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... \n", + "2 1 3 Miss. Laina Heikkinen \n", + "3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle \n", + "4 0 3 Mr. William Henry Allen \n", + "\n", + " Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare \\\n", + "0 male 22.0 1 0 7.2500 \n", + "1 female 38.0 1 0 71.2833 \n", + "2 female 26.0 0 0 7.9250 \n", + "3 female 35.0 1 0 53.1000 \n", + "4 male 35.0 0 0 8.0500 \n", + "\n", + " split \n", + "0 TRAIN \n", + "1 TRAIN \n", + "2 TRAIN \n", + "3 TRAIN \n", + "4 TRAIN " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['split'] = 'TRAIN'\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert to TFRecords " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " var import_html = () => {\n", + " ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n", + " var link = document.createElement('link');\n", + " link.rel = 'import'\n", + " link.href = href;\n", + " document.head.appendChild(link);\n", + " });\n", + " }\n", + " if ('import' in document.createElement('link')) {\n", + " import_html();\n", + " } else {\n", + " var webcomponentScript = document.createElement('script');\n", + " webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n", + " webcomponentScript.type = 'text/javascript';\n", + " webcomponentScript.onload = function(){\n", + " import_html();\n", + " };\n", + " document.head.appendChild(webcomponentScript);\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results = tfrecorder.convert(\n", + " df, \n", + " './tfrecords', \n", + " schema=input_schema.Schema({\n", + " 'Survived': types.IntegerInput,\n", + " 'Pclass': types.IntegerInput,\n", + " 'Name': types.StringInput,\n", + " 'Sex': types.StringInput,\n", + " 'Age': types.FloatInput,\n", + " 'Siblings/Spouses Aboard': types.IntegerInput,\n", + " 'Parents/Children Aboard': types.IntegerInput,\n", + " 'Fare': types.FloatInput,\n", + " 'split': types.SplitKey,\n", + " })\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'job_id': 'DirectRunner', 'metrics': {'rows': 887, 'good_images': None, 'bad_images': None}, 'tfrecord_dir': './tfrecords/tfrecorder-20201027-173544-create-tfrecords'}\n" + ] + } + ], + "source": [ + "print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a dataset from the generated TFRecord files " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = tfrecorder.load(results['tfrecord_dir'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Age: \n", + "Fare: \n", + "Name: \n", + "Parents/Children Aboard: \n", + "Pclass: \n", + "Sex: \n", + "Siblings/Spouses Aboard: \n", + "Survived: \n", + "split: \n" + ] + } + ], + "source": [ + "for x in datasets['TRAIN'].take(1):\n", + " for k, v in x.items():\n", + " print(f'{k}: {v.dtype}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/samples/Loading-a-TF-Dataset.ipynb b/samples/Loading-a-TF-Dataset.ipynb index ef640fe..d73c147 100644 --- a/samples/Loading-a-TF-Dataset.ipynb +++ b/samples/Loading-a-TF-Dataset.ipynb @@ -8,7 +8,7 @@ "\n", "This notebook briefly demonstrates how to load a TF Dataset from TFRecord files generated by TFRecorder.\n", "Note that currently, the TFRecord files must be in a directory on your local machine.\n", - "The directory is expected to have the following structure, based on TFRecorder's `create_tfrecords` default output:\n", + "The directory is expected to have the following structure, based on TFRecorder's `convert` default output:\n", "```\n", "tfrecord_dir/\n", " train-*.tfrecord.gz\n", @@ -79,7 +79,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's examine the contains of the training set " + "Let's examine the contains of the training set" ] }, { @@ -109,6 +109,25 @@ "for d in train:\n", " print(d['image_name'])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -127,9 +146,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/samples/Using-TFRecorder-with-Google-Cloud-Dataflow.ipynb b/samples/Using-TFRecorder-with-Google-Cloud-Dataflow.ipynb index 15c8f3e..3ee3d71 100644 --- a/samples/Using-TFRecorder-with-Google-Cloud-Dataflow.ipynb +++ b/samples/Using-TFRecorder-with-Google-Cloud-Dataflow.ipynb @@ -133,9 +133,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/setup.py b/setup.py index 83517a6..fa50856 100644 --- a/setup.py +++ b/setup.py @@ -15,13 +15,13 
@@ # limitations under the License. """Package setup.""" - +import sys from setuptools import find_packages from setuptools import setup # Semantic versioning (PEP 440) -VERSION = '1.1.2' +VERSION = '2.0' REQUIRED_PACKAGES = [ "apache-beam[gcp] >= 2.22.0", @@ -42,6 +42,10 @@ "tensorflow_transform >= 0.22", ] +if sys.version_info < (3,7,0,0,0): + print("Version less than 3.7, appending dataclasses") + REQUIRED_PACKAGES.append("dataclasses >= 0.5") + setup( name='tfrecorder', diff --git a/tfrecorder/__init__.py b/tfrecorder/__init__.py index 391b2b9..df89039 100644 --- a/tfrecorder/__init__.py +++ b/tfrecorder/__init__.py @@ -15,7 +15,9 @@ # limitations under the License. """Imports.""" + from tfrecorder import accessor -from tfrecorder.client import create_tfrecords -from tfrecorder.check import check_tfrecords -from tfrecorder.dataset import load +from tfrecorder.converter import convert +from tfrecorder.dataset_loader import load +from tfrecorder.converter import convert_and_load +from tfrecorder.utils import inspect diff --git a/tfrecorder/accessor.py b/tfrecorder/accessor.py index f2a13b5..7ae67a6 100644 --- a/tfrecorder/accessor.py +++ b/tfrecorder/accessor.py @@ -25,9 +25,9 @@ import pandas as pd from IPython.core import display -from tfrecorder import client +from tfrecorder import converter from tfrecorder import constants -from tfrecorder import schema +from tfrecorder import input_schema @pd.api.extensions.register_dataframe_accessor('tensorflow') @@ -41,7 +41,7 @@ def __init__(self, pandas_obj): def to_tfr( self, output_dir: str, - schema_map: Dict[str, schema.SchemaMap] = schema.image_csv_schema, + schema: input_schema.Schema = input_schema.IMAGE_CSV_SCHEMA, runner: str = 'DirectRunner', project: Optional[str] = None, region: Optional[str] = None, @@ -65,7 +65,7 @@ def to_tfr( num_shards=10) Args: - schema_map: A dict mapping column names to supported types. + schema: An instance of input_schema.Schema that describes the schema. output_dir: Local directory or GCS Location to save TFRecords to. Note: GCS required for DataflowRunner runner: Beam runner. Can be DirectRunner or DataflowRunner. 
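For reference, a minimal sketch of how the updated accessor might be called with the new `schema` argument, assuming the default IMAGE_CSV_SCHEMA; the CSV path and output directory below are hypothetical:

import pandas as pd
import tfrecorder  # importing tfrecorder registers the `.tensorflow` DataFrame accessor
from tfrecorder import input_schema

df = pd.read_csv('data.csv')  # hypothetical CSV with split, image_uri and label columns
df.tensorflow.to_tfr(
    output_dir='/tmp/tfrecords',
    schema=input_schema.IMAGE_CSV_SCHEMA,  # Schema object replacing the old schema_map dict
    runner='DirectRunner')
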
@@ -86,10 +86,10 @@ def to_tfr( display.HTML( 'Logging output to /tmp/{} '.format(constants.LOGFILE))) - r = client.create_tfrecords( + r = converter.convert( self._df, output_dir=output_dir, - schema_map=schema_map, + schema=schema, runner=runner, project=project, region=region, diff --git a/tfrecorder/beam_image_test.py b/tfrecorder/beam_image_test.py index 4da6d76..aaebc82 100644 --- a/tfrecorder/beam_image_test.py +++ b/tfrecorder/beam_image_test.py @@ -27,7 +27,9 @@ from tfrecorder import beam_image from tfrecorder import test_utils -from tfrecorder import schema +from tfrecorder import input_schema + +RANDOM_SEED = 42 class BeamImageTests(unittest.TestCase): @@ -36,7 +38,8 @@ class BeamImageTests(unittest.TestCase): def setUp(self): self.pipeline = test_utils.get_test_pipeline() self.df = test_utils.get_test_df() - self.image_file = 'tfrecorder/test_data/images/cat/cat-640x853-1.jpg' + self.image_file = self.df.image_uri.sample( + random_state=RANDOM_SEED).values[0] def test_load(self): """Tests the image loading function.""" @@ -46,7 +49,7 @@ def test_load(self): def test_file_not_found_load(self): """Test loading an image that doesn't exist.""" with self.assertRaises(OSError): - _ = beam_image.load('tfrecorder/test_data/images/cat/food.jpg') + _ = beam_image.load('/some/image/dir/food.jpg') def test_mode_to_channel(self): """Tests `mode_to_channel`.""" @@ -83,10 +86,8 @@ def test_extract_image_dofn(self): with self.pipeline as p: - converter = schema.get_tft_coder(['split', 'image_uri', 'label'], - schema.image_csv_schema) - - + schema = input_schema.IMAGE_CSV_SCHEMA + converter = schema.get_input_coder() extract_images_fn = beam_image.ExtractImagesDoFn('image_uri') data = ( diff --git a/tfrecorder/beam_pipeline.py b/tfrecorder/beam_pipeline.py index 94157b7..7f77527 100644 --- a/tfrecorder/beam_pipeline.py +++ b/tfrecorder/beam_pipeline.py @@ -33,36 +33,10 @@ from tensorflow_transform import beam as tft_beam from tfrecorder import beam_image -from tfrecorder import common -from tfrecorder import schema +from tfrecorder import input_schema from tfrecorder import types -def _get_job_name(job_label: str = None) -> str: - """Returns Beam runner job name. - - Args: - job_label: A user defined string that helps define the job. - - Returns: - A job name compatible with apache beam runners, including a time stamp to - insure uniqueness. 
- """ - - job_name = 'tfrecorder-' + common.get_timestamp() - if job_label: - job_label = job_label.replace('_', '-') - job_name += '-' + job_label - - return job_name - - -def _get_job_dir(output_path: str, job_name: str) -> str: - """Returns Beam processing job directory.""" - - return os.path.join(output_path, job_name) - - def _get_pipeline_options( runner: str, job_name: str, @@ -102,13 +76,15 @@ def _partition_fn( split_key: str = 'split') -> int: """Returns index used to partition an element from a PCollection.""" del unused_num_partitions - dataset_type = element[split_key].decode('utf-8') + dataset_type = element[split_key] + if isinstance(dataset_type, bytes): + dataset_type = element[split_key].decode('utf-8') try: - index = schema.SplitKeyType.allowed_values.index(dataset_type) + index = types.SplitKey.allowed_values.index(dataset_type) except ValueError as e: logging.warning('Unable to index dataset type %s: %s.', dataset_type, str(e)) - index = schema.SplitKeyType.allowed_values.index('DISCARD') + index = types.SplitKey.allowed_values.index('DISCARD') return index def _get_write_to_tfrecord(output_dir: str, @@ -147,7 +123,7 @@ def _preprocessing_fn(inputs: Dict[str, Any], outputs = {} for name, supported_type in schema_map.items(): - if supported_type.type_name == 'string_label': + if supported_type == types.StringLabel: outputs[name] = tft.compute_and_apply_vocabulary(inputs[name]) else: outputs[name] = inputs[name] @@ -188,13 +164,13 @@ def get_split_counts(df: pd.DataFrame, split_key: str): def _transform_and_write_tfr( dataset: pvalue.PCollection, tfr_writer: Callable[[], beam.io.tfrecordio.WriteToTFRecord], - raw_metadata: types.BeamDatasetMetadata, + metadata: types.BeamDatasetMetadata, preprocessing_fn: Optional[Callable] = None, transform_fn: Optional[types.TransformFn] = None, label: str = 'data'): """Applies TF Transform to dataset and outputs it as TFRecords.""" - dataset_metadata = (dataset, raw_metadata) + dataset_metadata = (dataset, metadata) if transform_fn: transformed_dataset, transformed_metadata = ( @@ -222,29 +198,26 @@ def _transform_and_write_tfr( # pylint: disable=too-many-locals def build_pipeline( df: pd.DataFrame, - job_label: str, + job_dir: str, runner: str, project: str, region: str, - output_dir: str, compression: str, num_shards: int, - schema_map: Dict[str, collections.namedtuple], + schema: input_schema.Schema, tfrecorder_wheel: str, dataflow_options: Dict[str, Any]) -> beam.Pipeline: """Runs TFRecorder Beam Pipeline. Args: df: Pandas DataFrame - job_label: User description for the beam job. + job_dir: GCS or Local Path for output. runner: Beam Runner: (e.g. DataflowRunner, DirectRunner). project: GCP project ID (if DataflowRunner) region: GCP compute region (if DataflowRunner) - output_dir: GCS or Local Path for output. compression: gzip or None. num_shards: Number of shards. - schema_map: A schema map (Dictionary mapping Dataframe columns to types) - used to derive the input and target schema. + schema: A Schema object defining the input schema. 
tfrecorder_wheel: Path to TFRecorder wheel for DataFlow dataflow_options: Dataflow Runner Options (optional) @@ -254,8 +227,7 @@ def build_pipeline( Note: These inputs must be validated upstream (by client.create_tfrecord()) """ - job_name = _get_job_name(job_label) - job_dir = _get_job_dir(output_dir, job_name) + _, job_name = os.path.split(job_dir) options = _get_pipeline_options( runner, job_name, @@ -268,7 +240,7 @@ def build_pipeline( p = beam.Pipeline(options=options) with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')): - converter = schema.get_tft_coder(df.columns, schema_map) + converter = schema.get_input_coder() flatten_rows = ToCSVRows() # Each element in the data PCollection will be a dict @@ -282,7 +254,7 @@ def build_pipeline( ) # Extract images if an image_uri key exists. - image_uri_key = schema.get_key(schema.ImageUriType, schema_map) + image_uri_key = schema.image_uri_key if image_uri_key: extract_images_fn = beam_image.ExtractImagesDoFn(image_uri_key) @@ -291,8 +263,8 @@ def build_pipeline( | 'ReadImage' >> beam.ParDo(extract_images_fn) ) - # If the schema contains a valid split key, partition the dataset. - split_key = schema.get_key(schema.SplitKeyType, schema_map) + # Get the split key from schema. + split_key = schema.split_key # Note: This will not always reflect actual number of samples per dataset # written as TFRecords. The succeeding `Partition` operation may mark @@ -301,45 +273,43 @@ def build_pipeline( # file for that split, albeit empty. split_counts = get_split_counts(df, split_key) - # Raw metadata is the TFT metadata after image insertion but before TFT - # e.g Image columns have been added if necessary. - raw_metadata = schema.get_raw_metadata(df.columns, schema_map) - # Require training set to be available in the input data. The transform_fn # and transformed_metadata will be generated from the training set and # applied to the other datasets, if any - assert 'TRAIN' in split_counts + if 'TRAIN' not in split_counts: + raise AttributeError('`TRAIN` set expected to be present in splits') # Split dataset into train, validation, test sets. 
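  # Each row is routed by beam.Partition to one of the four outputs below
  # (TRAIN, VALIDATION, TEST, DISCARD): _partition_fn looks up the row's split
  # value in types.SplitKey.allowed_values and falls back to DISCARD for any
  # unrecognized value, so discarded rows are still written out separately.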
partition_fn = functools.partial(_partition_fn, split_key=split_key) train_data, val_data, test_data, discard_data = ( data | 'SplitDataset' >> beam.Partition( - partition_fn, len(schema.SplitKeyType.allowed_values))) + partition_fn, len(types.SplitKey.allowed_values))) - raw_schema_map = schema.get_raw_schema_map(schema_map=schema_map) preprocessing_fn = functools.partial( _preprocessing_fn, - schema_map=raw_schema_map) + schema_map=schema.pre_tft_schema_map) tfr_writer = functools.partial( _get_write_to_tfrecord, output_dir=job_dir, compress=compression, num_shards=num_shards) + pre_tft_metadata = schema.get_pre_tft_metadata() + transform_fn = _transform_and_write_tfr( train_data, tfr_writer, preprocessing_fn=preprocessing_fn, - raw_metadata=raw_metadata, + metadata=pre_tft_metadata, label='Train') if 'VALIDATION' in split_counts: _transform_and_write_tfr( val_data, tfr_writer, transform_fn=transform_fn, - raw_metadata=raw_metadata, + metadata=pre_tft_metadata, label='Validation') if 'TEST' in split_counts: _transform_and_write_tfr( test_data, tfr_writer, transform_fn=transform_fn, - raw_metadata=raw_metadata, + metadata=pre_tft_metadata, label='Test') _ = ( diff --git a/tfrecorder/beam_pipeline_test.py b/tfrecorder/beam_pipeline_test.py index 29d9ad2..3885485 100644 --- a/tfrecorder/beam_pipeline_test.py +++ b/tfrecorder/beam_pipeline_test.py @@ -26,11 +26,13 @@ import apache_beam as beam import frozendict import tensorflow as tf +import tensorflow_transform as tft from tensorflow_transform import beam as tft_beam from tfrecorder import beam_pipeline -from tfrecorder import schema +from tfrecorder import input_schema from tfrecorder import test_utils +from tfrecorder import types # pylint: disable=protected-access @@ -45,9 +47,9 @@ def test_processing_fn_with_int_label(self): 'image_uri': 'gs://foo/bar.jpg', 'label': 1} my_schema = frozendict.FrozenOrderedDict({ - 'split': schema.SplitKeyType, - 'image_uri': schema.ImageUriType, - 'label': schema.IntegerLabelType}) + 'split': types.SplitKey, + 'image_uri': types.ImageUri, + 'label': types.IntegerLabel}) result = beam_pipeline._preprocessing_fn(element, schema_map=my_schema) self.assertEqual(element, result) @@ -61,8 +63,8 @@ def test_processing_fn_with_string_label(self, mock_transform): 'split': 'TRAIN', 'image_uri': 'gs://foo/bar.jpg', 'label': tf.constant('cat', dtype=tf.string)} - result = beam_pipeline._preprocessing_fn(element, - schema_map=schema.image_csv_schema) + result = beam_pipeline._preprocessing_fn( + element, schema_map=input_schema.IMAGE_CSV_SCHEMA.input_schema_map) result['label'] = result['label'].numpy() self.assertEqual(0, result['label']) @@ -97,8 +99,9 @@ class GetSplitCountsTest(unittest.TestCase): def setUp(self): self.df = test_utils.get_test_df() - self.schema_map = schema.image_csv_schema - self.split_key = schema.get_key(schema.SplitKeyType, self.schema_map) + self.schema = input_schema.IMAGE_CSV_SCHEMA + self.schema_map = self.schema.input_schema_map + self.split_key = self.schema.split_key def test_all_splits(self): """Tests case where train, validation and test data exists""" @@ -126,16 +129,18 @@ class TransformAndWriteTfrTest(unittest.TestCase): def setUp(self): self.pipeline = test_utils.get_test_pipeline() - self.raw_df = test_utils.get_raw_feature_df() + self.pre_tft_df = test_utils.get_pre_tft_feature_df() self.temp_dir_obj = tempfile.TemporaryDirectory(dir='/tmp', prefix='test-') self.test_dir = self.temp_dir_obj.name self.tfr_writer = functools.partial( beam_pipeline._get_write_to_tfrecord, 
output_dir=self.test_dir, compress='gzip', num_shards=2) - self.raw_schema = schema.get_raw_schema_map(schema.image_csv_schema) - self.raw_metadata = schema.get_raw_metadata(self.raw_df.columns, - self.raw_schema) - self.converter = schema.get_tft_coder(self.raw_df.columns, self.raw_schema) + self.schema = input_schema.Schema( + input_schema.IMAGE_CSV_SCHEMA.input_schema_map) + self.pre_tft_metadata = self.schema.get_pre_tft_metadata() + self.converter = tft.coders.CsvCoder( + list(self.schema.pre_tft_schema_map.keys()), + self.pre_tft_metadata.schema) self.transform_fn_path = ('./tfrecorder/test_data/sample_tfrecords') def tearDown(self): @@ -153,15 +158,16 @@ def test_train(self): with self.pipeline as p: with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')): - df = self.raw_df[self.raw_df.split == 'TRAIN'] + df = self.pre_tft_df[self.pre_tft_df.split == 'TRAIN'] dataset = self._get_dataset(p, df) - preprocessing_fn = functools.partial(beam_pipeline._preprocessing_fn, - schema_map=self.raw_schema) + preprocessing_fn = functools.partial( + beam_pipeline._preprocessing_fn, + schema_map=self.schema.pre_tft_schema_map) transform_fn = ( beam_pipeline._transform_and_write_tfr( dataset, self.tfr_writer, preprocessing_fn=preprocessing_fn, - raw_metadata=self.raw_metadata, + metadata=self.pre_tft_metadata, label='Train')) _ = transform_fn | tft_beam.WriteTransformFn(self.test_dir) @@ -179,12 +185,12 @@ def test_non_training(self): with self.pipeline as p: with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')): - df = self.raw_df[self.raw_df.split == 'TEST'] + df = self.pre_tft_df[self.pre_tft_df.split == 'TEST'] dataset = self._get_dataset(p, df) transform_fn = p | tft_beam.ReadTransformFn(self.transform_fn_path) beam_pipeline._transform_and_write_tfr( dataset, self.tfr_writer, transform_fn=transform_fn, - raw_metadata=self.raw_metadata, label='Test') + metadata=self.pre_tft_metadata, label='Test') self.assertFalse(glob.glob(os.path.join(self.test_dir, 'train*.gz'))) self.assertFalse(glob.glob(os.path.join(self.test_dir, 'validation*.gz'))) diff --git a/tfrecorder/cli.py b/tfrecorder/cli.py index 7afde2b..493e310 100644 --- a/tfrecorder/cli.py +++ b/tfrecorder/cli.py @@ -18,16 +18,16 @@ import fire -from tfrecorder import client -from tfrecorder import check +from tfrecorder import converter +from tfrecorder import utils def main(): """Entry point for command-line interface.""" fire.Fire({ - 'create-tfrecords': client.create_tfrecords, - 'check-tfrecords': check.check_tfrecords, + 'convert': converter.convert, + 'inspect': utils.inspect, }) diff --git a/tfrecorder/common.py b/tfrecorder/common.py deleted file mode 100644 index 09df3e8..0000000 --- a/tfrecorder/common.py +++ /dev/null @@ -1,42 +0,0 @@ -# Lint as: python3 - -# Copyright 2020 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Common utility functions.""" - -from datetime import datetime -import os - -import tensorflow as tf - -from tfrecorder import constants - - -def get_timestamp() -> str: - """Returns current date and time as formatted string.""" - return datetime.now().strftime('%Y%m%d-%H%M%S') - - -def copy_logfile_to_gcs(logfile: str, output_dir: str): - """Copies a logfile from local to gcs storage.""" - try: - with open(logfile, 'r') as log_reader: - out_log = os.path.join(output_dir, constants.LOGFILE) - with tf.io.gfile.GFile(out_log, 'w') as gcs_logfile: - log = log_reader.read() - gcs_logfile.write(log) - except FileNotFoundError as e: - raise FileNotFoundError("Unable to copy log file {} to gcs.".format( - e.filename)) from e diff --git a/tfrecorder/common_test.py b/tfrecorder/common_test.py deleted file mode 100644 index ea3fda5..0000000 --- a/tfrecorder/common_test.py +++ /dev/null @@ -1,53 +0,0 @@ -# Lint as: python3 - -# Copyright 2020 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for common.""" - -import os -import unittest -import tempfile - -from tfrecorder import common -from tfrecorder import constants - - -class CopyLogTest(unittest.TestCase): - """Misc tests for _copy_logfile_to_gcs.""" - - def test_valid_copy(self): - """Test valid file copy.""" - with tempfile.TemporaryDirectory() as tmpdirname: - text = 'log test log test' - infile = os.path.join(tmpdirname, 'foo.log') - with open(infile, 'w') as f: - f.write(text) - common.copy_logfile_to_gcs(infile, tmpdirname) - - outfile = os.path.join(tmpdirname, constants.LOGFILE) - with open(outfile, 'r') as f: - data = f.read() - self.assertEqual(text, data) - - def test_invalid_copy(self): - """Test invalid file copy.""" - with tempfile.TemporaryDirectory() as tmpdirname: - infile = os.path.join(tmpdirname, 'foo.txt') - with self.assertRaises(FileNotFoundError): - common.copy_logfile_to_gcs(infile, tmpdirname) - -# pylint: disable=protected-access -if __name__ == '__main__': - unittest.main() diff --git a/tfrecorder/client.py b/tfrecorder/converter.py similarity index 76% rename from tfrecorder/client.py rename to tfrecorder/converter.py index a321005..5d46eea 100644 --- a/tfrecorder/client.py +++ b/tfrecorder/converter.py @@ -16,10 +16,10 @@ """Provides a common interface for TFRecorder to DF Accessor and CLI. -client.py provides create_tfrecords() to upstream clients including +converter.py provides create_tfrecords() to upstream clients including the Pandas DataFrame Accessor (accessor.py) and the CLI (cli.py). """ -import collections + import logging import os from typing import Any, Dict, Optional, Sequence, Tuple, Union @@ -29,21 +29,25 @@ import tensorflow as tf from tfrecorder import beam_pipeline -from tfrecorder import common +from tfrecorder import dataset_loader from tfrecorder import constants -from tfrecorder import schema +from tfrecorder import input_schema +from tfrecorder import types +from tfrecorder import utils + # TODO(mikebernico) Add test for only one split_key. 
-def _validate_data(df: pd.DataFrame, - schema_map: Dict[str, collections.namedtuple]): +def _validate_data(df: pd.DataFrame, schema: input_schema.Schema): """Verifies data is consistent with schema.""" - for key, value in schema_map.items(): + for key, value in schema.input_schema_map.items(): _ = value # TODO(mikebernico) Implement type checking. if key not in df.columns: + schema_keys = list(schema.input_schema_map.keys()) raise AttributeError( f'DataFrame does not contain expected column: {key}. ' - f'Ensure header matches schema keys: {list(schema_map.keys())}.') + f'Ensure header matches schema keys: {schema_keys}.') + def _validate_runner( runner: str, @@ -79,7 +83,7 @@ def _path_split(filepath: str) -> Tuple[str, str]: if filepath.startswith(constants.GCS_PREFIX): _, path = filepath.split(constants.GCS_PREFIX) - head, tail = os.path.split(path) + head, tail = os.path.split(os.path.normpath(path)) return constants.GCS_PREFIX + head, tail return os.path.split(filepath) @@ -113,7 +117,7 @@ def _read_image_directory(image_dir: str) -> pd.DataFrame: """ rows = [] - split_values = schema.allowed_split_values + split_values = types.SplitKey.allowed_values for root, _, files in tf.io.gfile.walk(image_dir): if files: root_, label = _path_split(root) @@ -127,13 +131,47 @@ def _read_image_directory(image_dir: str) -> pd.DataFrame: row = [split, image_uri, label] rows.append(row) - return pd.DataFrame(rows, columns=schema.image_csv_schema.keys()) + return pd.DataFrame( + rows, columns=input_schema.IMAGE_CSV_SCHEMA.get_input_keys()) def _is_directory(input_data) -> bool: """Returns True if `input_data` is a directory; False otherwise.""" - return tf.io.gfile.isdir(input_data) + # Note: First check will flag if user has the necessary credentials + # to access the directory (if it is in GCS) + return tf.io.gfile.exists(input_data) and tf.io.gfile.isdir(input_data) + + +def _get_job_name(job_label: str = None) -> str: + """Returns Beam runner job name. + + Args: + job_label: A user defined string that helps define the job. + + Returns: + A job name compatible with apache beam runners, including a time stamp to + insure uniqueness. 
+ """ + + job_name = 'tfrecorder-' + utils.get_timestamp() + if job_label: + job_label = job_label.replace('_', '-') + job_name += '-' + job_label + + return job_name + + +def _get_job_dir(output_path: str, job_name: str) -> str: + """Returns Beam processing job directory.""" + + return os.path.join(output_path, job_name) + + +def _get_dataflow_url(job_id: str, project: str, region: str) -> str: + """Returns Cloud DataFlow URL for Apache Beam job.""" + + return f'{constants.CONSOLE_DATAFLOW_URI}{region}/{job_id}?=project={project}' def read_csv( @@ -143,7 +181,7 @@ def read_csv( """Returns a a Pandas DataFrame from a CSV file.""" if header is None and not names: - names = list(schema.image_csv_schema.keys()) + names = list(input_schema.IMAGE_CSV_SCHEMA.get_input_keys()) with tf.io.gfile.GFile(csv_file) as f: return pd.read_csv(f, names=names, header=header) @@ -203,13 +241,14 @@ def _configure_logging(logfile): tf_logger.handlers = [] tf_logger.addHandler(handler) + # pylint: disable=too-many-arguments # pylint: disable=too-many-locals -def create_tfrecords( +def convert( source: Union[str, pd.DataFrame], - output_dir: str, - schema_map: Dict[str, collections.namedtuple] = schema.image_csv_schema, + output_dir: str = './tfrecords', + schema: input_schema.Schema = input_schema.IMAGE_CSV_SCHEMA, header: Optional[Union[str, int, Sequence]] = 'infer', names: Optional[Sequence] = None, runner: str = 'DirectRunner', @@ -217,7 +256,7 @@ def create_tfrecords( region: Optional[str] = None, tfrecorder_wheel: Optional[str] = None, dataflow_options: Optional[Dict[str, Any]] = None, - job_label: str = 'create-tfrecords', + job_label: str = 'convert', compression: Optional[str] = 'gzip', num_shards: int = 0) -> Dict[str, Any]: """Generates TFRecord files from given input data. @@ -228,18 +267,19 @@ def create_tfrecords( Usage: import tfrecorder - job_id = tfrecorder.client.create_tfrecords( + job_id = tfrecorder.convert( train_df, output_dir='gcs://foo/bar/train', - runner='DirectFlowRunner) + runner='DirectRunner) Args: source: Pandas DataFrame, CSV file or image directory path. output_dir: Local directory or GCS Location to save TFRecords to. - schema_map: A dict mapping column names to supported types. + schema: An instance of input_schema.Schema. header: Indicates row/s to use as a header. Not used when `input_data` is a Pandas DataFrame. If 'infer' (default), header is taken from the first line of a CSV + names: List of column names to use for CSV or DataFrame input. runner: Beam runner. 
Can be 'DirectRunner' or 'DataFlowRunner' project: GCP project name (Required if DataflowRunner) region: GCP region name (Required if DataflowRunner) @@ -259,24 +299,27 @@ def create_tfrecords( df = to_dataframe(source, header, names) - _validate_data(df, schema_map) + _validate_data(df, schema) _validate_runner(runner, project, region, tfrecorder_wheel) logfile = os.path.join('/tmp', constants.LOGFILE) _configure_logging(logfile) + job_name = _get_job_name(job_label) + job_dir = _get_job_dir(output_dir, job_name) + p = beam_pipeline.build_pipeline( df, - job_label=job_label, + job_dir=job_dir, runner=runner, project=project, region=region, - output_dir=output_dir, compression=compression, num_shards=num_shards, - schema_map=schema_map, + schema=schema, tfrecorder_wheel=tfrecorder_wheel, - dataflow_options=dataflow_options) + dataflow_options=dataflow_options, + ) result = p.run() @@ -292,7 +335,6 @@ def create_tfrecords( good_image_count = _get_beam_metric(good_image_filter, result) bad_image_count = _get_beam_metric(bad_image_filter, result) - # TODO(mikebernico): Profile metric impact with larger dataset. metrics = { 'rows': row_count, 'good_images': good_image_count, @@ -305,28 +347,27 @@ def create_tfrecords( } logging.info("Job Complete.") - else: + elif runner == 'DataflowRunner': logging.info("Using Dataflow Runner.") - # Construct Dataflow URL - job_id = result.job_id() - - url = ( - constants.CONSOLE_DATAFLOW_URI + - region + - '/' + - job_id + - '?project=' + - project) + url = _get_dataflow_url(job_id, project, region) job_result = { 'job_id': job_id, - 'dataflow_url': url + 'dataflow_url': url, } + # Copy the logfile to GCS output dir + utils.copy_logfile_to_gcs(logfile, output_dir) - logging.shutdown() + else: + raise ValueError(f'Unsupported runner: {runner}') - if runner == 'DataflowRunner': - # if this is a Dataflow job, copy the logfile to GCS - common.copy_logfile_to_gcs(logfile, output_dir) + job_result['tfrecord_dir'] = job_dir return job_result + + +def convert_and_load(*args, **kwargs): + """Converts data into TFRecords and loads them as TF Datasets.""" + + job_result = convert(*args, **kwargs) + return dataset_loader.load(job_result['tfrecord_dir']) diff --git a/tfrecorder/client_test.py b/tfrecorder/converter_test.py similarity index 66% rename from tfrecorder/client_test.py rename to tfrecorder/converter_test.py index c00c6e6..64d8bb4 100644 --- a/tfrecorder/client_test.py +++ b/tfrecorder/converter_test.py @@ -26,16 +26,36 @@ import mock import pandas as pd +import tensorflow as tf -from tfrecorder import client +from tfrecorder import beam_pipeline +from tfrecorder import converter +from tfrecorder import dataset_loader from tfrecorder import test_utils -from tfrecorder import schema +from tfrecorder import input_schema # pylint: disable=protected-access -class ClientTest(unittest.TestCase): +class IsDirectoryTest(unittest.TestCase): + """Tests `_is_directory`.""" + + def test_local_ok(self): + """Test function returns True on local directory.""" + + with tempfile.TemporaryDirectory() as dirname: + self.assertTrue(converter._is_directory(dirname)) + + def test_local_exists_but_not_dir(self): + """Test function returns False on local (non-directory) file.""" + + with tempfile.NamedTemporaryFile(prefix='test_', dir='/tmp') as f: + self.assertFalse(converter._is_directory(f.name)) + + +# TODO(cezequiel): Refactor to per-function test case classes +class MiscTest(unittest.TestCase): """Misc tests for `client` module.""" def setUp(self): @@ -44,51 +64,52 @@ def 
setUp(self): self.test_project = 'foo' self.test_wheel = '/my/path/wheel.whl' - @mock.patch('tfrecorder.client.beam_pipeline') - def test_create_tfrecords_direct_runner(self, mock_beam): + @mock.patch.object(beam_pipeline, 'build_pipeline', autospec=True) + def test_create_tfrecords_direct_runner(self, _): """Tests `create_tfrecords` Direct case.""" - mock_beam.build_pipeline().run().wait_until_finished.return_value = { - 'rows':6} - r = client.create_tfrecords( + r = converter.convert( self.test_df, runner='DirectRunner', output_dir='/tmp/direct_runner') - self.assertTrue('metrics' in r) + self.assertCountEqual(r.keys(), ['job_id', 'metrics', 'tfrecord_dir']) + self.assertCountEqual( + r['metrics'].keys(), ['rows', 'good_images', 'bad_images']) - @mock.patch('tfrecorder.client.beam_pipeline') - def test_create_tfrecords_dataflow_runner(self, mock_beam): + @mock.patch.object(converter, '_get_dataflow_url') + @mock.patch.object(beam_pipeline, 'build_pipeline') + def test_create_tfrecords_dataflow_runner(self, mock_pipeline, mock_url): """Tests `create_tfrecords` Dataflow case.""" - mock_beam.build_pipeline().run().job_id.return_value = 'foo_id' - + job_id = 'foo_id' + dataflow_url = 'http://some/job/url' + mock_pipeline().run().job_id.return_value = job_id + mock_url.return_value = dataflow_url df2 = self.test_df.copy() df2['image_uri'] = 'gs://' + df2['image_uri'] outdir = '/tmp/dataflow_runner' - - expected = { - 'job_id': 'foo_id', - 'dataflow_url': 'https://console.cloud.google.com/dataflow/jobs/' + - 'us-central1/foo_id?project=foo'} - os.makedirs(outdir, exist_ok=True) - r = client.create_tfrecords( + r = converter.convert( df2, runner='DataflowRunner', output_dir=outdir, region=self.test_region, project=self.test_project, tfrecorder_wheel=self.test_wheel) - self.assertEqual(r, expected) + + self.assertCountEqual(r.keys(), ['job_id', 'dataflow_url', 'tfrecord_dir']) + self.assertEqual(r['job_id'], job_id) + self.assertEqual(r['dataflow_url'], dataflow_url) + self.assertRegex(r['tfrecord_dir'], fr'{outdir}/tfrecorder-.+-?.*') def test_path_split(self): """Tests `_path_split`.""" filename = 'image_file.jpg' - dirpaths = ['/path/to/image/dir', 'gs://path/to/image/dir'] + dirpaths = ['/path/to/image/dir/', 'gs://path/to/image/dir/'] for dir_ in dirpaths: filepath = os.path.join(dir_, filename) - act_dirpath, act_filename = client._path_split(filepath) - self.assertEqual(act_dirpath, dir_) + act_dirpath, act_filename = converter._path_split(filepath) + self.assertEqual(act_dirpath, dir_.rsplit('/', 1)[0]) self.assertEqual(act_filename, filename) @@ -100,42 +121,40 @@ def setUp(self): self.test_region = 'us-central1' self.test_project = 'foo' self.test_wheel = '/my/path/wheel.whl' - self.test_schema_map = schema.image_csv_schema + self.test_schema = input_schema.IMAGE_CSV_SCHEMA def test_valid_dataframe(self): """Tests valid DataFrame input.""" - self.assertIsNone( - client._validate_data( - self.test_df, - schema.image_csv_schema)) + self.assertIsNone(converter._validate_data(self.test_df, self.test_schema)) def test_missing_image(self): """Tests missing image column.""" with self.assertRaises(AttributeError): df2 = self.test_df.copy() df2.drop('image_uri', inplace=True, axis=1) - client._validate_data(df2, schema.image_csv_schema) + converter._validate_data(df2, self.test_schema) def test_missing_label(self): """Tests missing label column.""" with self.assertRaises(AttributeError): df2 = self.test_df.copy() df2.drop('label', inplace=True, axis=1) - client._validate_data(df2, 
schema.image_csv_schema) + converter._validate_data(df2, self.test_schema) def test_missing_split(self): """Tests missing split column.""" split_key = 'split' - schema_keys = re.escape(str(list(self.test_schema_map.keys()))) + schema_keys = re.escape( + str(list(self.test_schema.input_schema_map.keys()))) regex = fr'^.+column: {split_key}.+keys: {schema_keys}.$' with self.assertRaisesRegex(AttributeError, regex): df2 = self.test_df.copy() df2.drop(split_key, inplace=True, axis=1) - client._validate_data(df2, schema.image_csv_schema) + converter._validate_data(df2, self.test_schema) def test_valid_runner(self): """Tests valid runner.""" - self.assertIsNone(client._validate_runner( + self.assertIsNone(converter._validate_runner( runner='DirectRunner', project=self.test_project, region=self.test_region, @@ -144,7 +163,7 @@ def test_valid_runner(self): def test_invalid_runner(self): """Tests invalid runner.""" with self.assertRaises(AttributeError): - client._validate_runner( + converter._validate_runner( runner='FooRunner', project=self.test_project, region=self.test_region, @@ -156,7 +175,7 @@ def test_gcs_path_with_dataflow_runner_missing_param(self): for p, r in [ (None, self.test_region), (self.test_project, None), (None, None)]: with self.assertRaises(AttributeError) as context: - client._validate_runner( + converter._validate_runner( runner='DataflowRunner', project=p, region=r, @@ -168,7 +187,7 @@ def test_gcs_path_with_dataflow_runner_missing_param(self): def test_gcs_path_with_dataflow_runner_missing_wheel(self): """Tests DataflowRunner with missing required whl path.""" with self.assertRaises(AttributeError) as context: - client._validate_runner( + converter._validate_runner( runner='DataflowRunner', project=self.test_project, region=self.test_region, @@ -202,40 +221,35 @@ class ReadImageDirectoryTest(unittest.TestCase): def setUp(self): self.image_data = test_utils.get_test_df() - self.split_key = schema.get_key( - schema.SplitKeyType, schema.image_csv_schema) - self.label_key = schema.get_key( - schema.StringLabelType, schema.image_csv_schema) - self.image_uri_key = schema.get_key( - schema.ImageUriType, schema.image_csv_schema) self.tempfiles = [] self.tempdir = None + self.schema = input_schema.Schema( + input_schema.IMAGE_CSV_SCHEMA.input_schema_map) def tearDown(self): for fp in self.tempfiles: fp.close() - self.tempdir.cleanup() def test_normal(self): """Tests conversion of expected directory structure on local machine.""" - g = self.image_data.groupby([self.split_key, self.label_key]) + g = self.image_data.groupby([self.schema.split_key, self.schema.label_key]) self.tempdir = tempfile.TemporaryDirectory() rows = [] for (split, label), indices in g.groups.items(): dir_ = os.path.join(self.tempdir.name, split, label) os.makedirs(dir_) - for f in list(self.image_data.loc[indices, self.image_uri_key]): + for f in list(self.image_data.loc[indices, self.schema.image_uri_key]): _, name = os.path.split(f) fp = tempfile.NamedTemporaryFile( dir=dir_, suffix='.jpg', prefix=name) self.tempfiles.append(fp) rows.append([split, fp.name, label]) - columns = list(schema.image_csv_schema.keys()) - actual = client._read_image_directory(self.tempdir.name) + columns = list(input_schema.IMAGE_CSV_SCHEMA.get_input_keys()) + actual = converter._read_image_directory(self.tempdir.name) actual.sort_values(by=columns, inplace=True) actual.reset_index(drop=True, inplace=True) expected = pd.DataFrame(rows, columns=columns) @@ -255,14 +269,16 @@ def setUp(self): def 
test_valid_csv_no_header_no_names_specified(self): """Tests a valid CSV without a header and no header names given.""" f = _make_csv_tempfile(self.sample_data) - actual = client.read_csv(f.name, header=None) - self.assertEqual(list(actual.columns), list(schema.image_csv_schema.keys())) + actual = converter.read_csv(f.name, header=None) + self.assertEqual( + list(actual.columns), + list(input_schema.IMAGE_CSV_SCHEMA.get_input_keys())) self.assertEqual(actual.values.tolist(), self.sample_data) def test_valid_csv_no_header_names_specified(self): """Tests valid CSV without a header, but header names are given.""" f = _make_csv_tempfile(self.sample_data) - actual = client.read_csv(f.name, header=None, names=self.header) + actual = converter.read_csv(f.name, header=None, names=self.header) self.assertEqual(list(actual.columns), self.header) self.assertEqual(actual.values.tolist(), self.sample_data) @@ -270,7 +286,7 @@ def test_valid_csv_with_header_no_names_specified(self): """Tests valid CSV with header, and no header names given (inferred).""" f = _make_csv_tempfile([self.header] + self.sample_data) - actual = client.read_csv(f.name) + actual = converter.read_csv(f.name) self.assertEqual(list(actual.columns), self.header) self.assertEqual(actual.values.tolist(), self.sample_data) @@ -278,7 +294,7 @@ def test_valid_csv_with_header_names_specified(self): """Tests valid CSV with header, and header names given (override).""" f = _make_csv_tempfile([self.header] + self.sample_data) - actual = client.read_csv(f.name, names=self.header, header=0) + actual = converter.read_csv(f.name, names=self.header, header=0) self.assertEqual(list(actual.columns), self.header) self.assertEqual(actual.values.tolist(), self.sample_data) @@ -291,34 +307,34 @@ def setUp(self) -> None: columns = sample_data.pop(0) self.input_df = pd.DataFrame(sample_data, columns=columns) - @mock.patch.object(client, 'read_csv', autospec=True) + @mock.patch.object(converter, 'read_csv', autospec=True) def test_input_csv(self, read_csv): """Tests valid input CSV file.""" expected = self.input_df read_csv.return_value = expected f = _make_csv_tempfile(get_sample_image_csv_data()) - actual = client.to_dataframe(f.name) + actual = converter.to_dataframe(f.name) pd.testing.assert_frame_equal(actual, expected) def test_input_dataframe_no_names_specified(self): """Tests valid input dataframe with no header names specified.""" - actual = client.to_dataframe(self.input_df) + actual = converter.to_dataframe(self.input_df) pd.testing.assert_frame_equal(actual, self.input_df) def test_input_dataframe_with_header(self): """Tests valid input dataframe with header specified.""" names = list(self.input_df.columns[0:-1]) - actual = client.to_dataframe(self.input_df, names=names) + actual = converter.to_dataframe(self.input_df, names=names) pd.testing.assert_frame_equal(actual, self.input_df[names]) - @mock.patch.object(client, '_read_image_directory') + @mock.patch.object(converter, '_read_image_directory') def test_input_image_dir(self, mock_fn): """Tests valid input image directory.""" mock_fn.return_value = self.input_df with tempfile.TemporaryDirectory() as input_data: - actual = client.to_dataframe(input_data) + actual = converter.to_dataframe(input_data) pd.testing.assert_frame_equal(actual, self.input_df) def test_error_invalid_inputs(self): @@ -326,7 +342,30 @@ def test_error_invalid_inputs(self): inputs = [0, 'not_a_csv_file', list(), dict()] for input_data in inputs: with self.assertRaises(ValueError): - client.to_dataframe(input_data) + 
converter.to_dataframe(input_data) + + +class ConvertAndLoadTest(unittest.TestCase): + """Tests `convert_and_load`.""" + + def setUp(self): + self.tfrecord_dir = '/path/to/tfrecords' + self.dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + self.datasets = { + 'TRAIN': self.dataset, + 'VALIDATION': self.dataset, + 'TEST': self.dataset, + } + + @mock.patch.object(dataset_loader, 'load', autospec=True) + @mock.patch.object(converter, 'convert', autospec=True) + def test_convert_and_load_normal(self, convert_fn, load_fn): + """Tests normal case.""" + convert_fn.return_value = dict(tfrecord_dir=self.tfrecord_dir) + load_fn.return_value = self.datasets + source = '/path/to/data.csv' + datasets = converter.convert_and_load(source) + self.assertEqual(datasets, self.datasets) if __name__ == '__main__': diff --git a/tfrecorder/dataset.py b/tfrecorder/dataset_loader.py similarity index 92% rename from tfrecorder/dataset.py rename to tfrecorder/dataset_loader.py index 7185a14..1fd0821 100644 --- a/tfrecorder/dataset.py +++ b/tfrecorder/dataset_loader.py @@ -23,7 +23,7 @@ import tensorflow as tf import tensorflow_transform as tft -from tfrecorder import schema +from tfrecorder import types TRANSFORMED_METADATA_DIR = tft.TFTransformOutput.TRANSFORMED_METADATA_DIR @@ -38,6 +38,10 @@ def _validate_tfrecord_dir(tfrecord_dir: str): """Verifies that the TFRecord directory contains expected files.""" + # Check that input is a valid directory. + if not os.path.isdir(tfrecord_dir): + raise ValueError(f'Not a directory: {tfrecord_dir}') + # Check that TensorFlow Transform directories are present. for dirname in [TRANSFORMED_METADATA_DIR, TRANSFORM_FN_DIR]: if not os.path.isdir(os.path.join(tfrecord_dir, dirname)): @@ -49,10 +53,11 @@ def _get_tfrecord_files_per_split(tfrecord_dir: str): """Returns TFRecord files for each split. The TFRecord filenames should have a prefix based on lowercase versions of - items in `schema.allowed_split_values`. DISCARD split is not checked. + items in `types.SplitKey.allowed_split_values`. DISCARD split is + not checked. """ split_to_files = {} - for split in schema.allowed_split_values[:-1]: + for split in types.SplitKey.allowed_values[:-1]: prefix = split.lower() files = glob.glob(os.path.join(tfrecord_dir, prefix + '*')) if files: diff --git a/tfrecorder/dataset_test.py b/tfrecorder/dataset_loader_test.py similarity index 64% rename from tfrecorder/dataset_test.py rename to tfrecorder/dataset_loader_test.py index 55195b7..28f189f 100644 --- a/tfrecorder/dataset_test.py +++ b/tfrecorder/dataset_loader_test.py @@ -14,15 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Tests for `dataset.py`.""" +"""Tests for `dataset_loader.py`.""" import os import tempfile import unittest -from tfrecorder import dataset -from tfrecorder import schema +from tfrecorder import dataset_loader from tfrecorder import test_utils +from tfrecorder import types # pylint: disable=protected-access @@ -38,28 +38,36 @@ def tearDown(self): def test_ok(self): """Checks that function works as expected when TFT dirs are present.""" - os.makedirs(os.path.join(self.temp_dir, dataset.TRANSFORMED_METADATA_DIR)) - os.makedirs(os.path.join(self.temp_dir, dataset.TRANSFORM_FN_DIR)) - dataset._validate_tfrecord_dir(self.temp_dir) + os.makedirs( + os.path.join(self.temp_dir, dataset_loader.TRANSFORMED_METADATA_DIR)) + os.makedirs(os.path.join(self.temp_dir, dataset_loader.TRANSFORM_FN_DIR)) + dataset_loader._validate_tfrecord_dir(self.temp_dir) def test_missing_metadata_dir(self): """Check exception raised when metadata directory missing.""" with self.assertRaises(FileNotFoundError): - os.makedirs(os.path.join(self.temp_dir, dataset.TRANSFORM_FN_DIR)) - dataset._validate_tfrecord_dir(self.temp_dir) + os.makedirs(os.path.join(self.temp_dir, dataset_loader.TRANSFORM_FN_DIR)) + dataset_loader._validate_tfrecord_dir(self.temp_dir) def test_missing_transform_fn_dir(self): """Check exception raised when transform_fn directory missing.""" with self.assertRaises(FileNotFoundError): os.makedirs( - os.path.join(self.temp_dir, dataset.TRANSFORMED_METADATA_DIR)) - dataset._validate_tfrecord_dir(self.temp_dir) + os.path.join(self.temp_dir, dataset_loader.TRANSFORMED_METADATA_DIR)) + dataset_loader._validate_tfrecord_dir(self.temp_dir) def test_missing_tf_transform_dirs(self): """Check exception raised when both TFT transform directories missing.""" with self.assertRaises(FileNotFoundError): - dataset._validate_tfrecord_dir(self.temp_dir) + dataset_loader._validate_tfrecord_dir(self.temp_dir) + + def test_not_dir(self): + """Check exception raised when input is not a valid directory.""" + + input_dir = '/some/non-existent/dir' + with self.assertRaisesRegex(ValueError, 'Not a directory:'): + dataset_loader._validate_tfrecord_dir(input_dir) class LoadTest(unittest.TestCase): @@ -70,10 +78,10 @@ def setUp(self): def test_load_all_splits(self): """Test case where all TFRecord splits can be loaded.""" - dataset_dict = dataset.load(self.tfrecord_dir) + dataset_dict = dataset_loader.load(self.tfrecord_dir) self.assertEqual(len(dataset_dict), 3) self.assertCountEqual( - list(dataset_dict.keys()), schema.allowed_split_values[:-1]) + list(dataset_dict.keys()), types.SplitKey.allowed_values[:-1]) if __name__ == '__main__': diff --git a/tfrecorder/input_schema.py b/tfrecorder/input_schema.py new file mode 100644 index 0000000..b32bccf --- /dev/null +++ b/tfrecorder/input_schema.py @@ -0,0 +1,102 @@ +# Lint as: python3 + +# Copyright 2020 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Defines input types for TFRecorder's input schema.""" + +from typing import Dict + +import tensorflow as tf +import tensorflow_transform as tft +from tensorflow_transform.tf_metadata import dataset_metadata +from tensorflow_transform.tf_metadata import schema_utils + +from tfrecorder import types + + + +class Schema: + """Defines a TFRecorder input schema.""" + def __init__(self, schema_map: Dict[str, types.SupportedType]) -> None: + """Defines TFRecorder input schema. + + Args: + schema_map: An ordered dictionary that maps input columns to + TFRecorder supported types. + """ + self.split_key = None + self.image_uri_key = None + self.label_key = None + self.input_schema_map = schema_map + self.pre_tft_schema_map = {} + + for k, v in schema_map.items(): + if v == types.SplitKey: + self.split_key = k + if 'Label' in v.__name__: # Matches any label type + self.label_key = k + + if v == types.ImageUri: + self.image_uri_key = k + # if an image key is present, add image features to pre tft schema + self.pre_tft_schema_map['image_name'] = types.ImageSupportString + self.pre_tft_schema_map['image'] = types.ImageSupportString + self.pre_tft_schema_map['image_height'] = types.ImageDim + self.pre_tft_schema_map['image_width'] = types.ImageDim + self.pre_tft_schema_map['image_channels'] = types.ImageDim + else: + self.pre_tft_schema_map[k] = schema_map[k] + + if not self.split_key: + raise AttributeError("Schema must contain a split key.") + + @staticmethod + def _get_feature_spec( + schema_map: Dict[str, types.SupportedType] + ) -> Dict[str, tf.io.FixedLenFeature]: + """Gets map of column names to tf.io.FixedLenFeatures for TFT.""" + return {k: v.feature_spec for k, v in schema_map.items()} + + @staticmethod + def _get_metadata( + feature_spec: Dict[str, tf.io.FixedLenFeature] + ) -> types.BeamDatasetMetadata: + """Gets DatasetMetadata.""" + return dataset_metadata.DatasetMetadata( + schema_utils.schema_from_feature_spec(feature_spec)) + + def get_pre_tft_metadata(self) -> types.BeamDatasetMetadata: + """Gets pre TFT metadata, used by TFT external to this class.""" + feature_spec = self._get_feature_spec(self.pre_tft_schema_map) + return self._get_metadata(feature_spec) + + def get_input_coder(self) -> tft.coders.CsvCoder: + """Gets input schema TFT CSV Coder.""" + feature_spec = self._get_feature_spec(self.input_schema_map) + metadata = self._get_metadata(feature_spec) + return tft.coders.CsvCoder(list(self.input_schema_map.keys()), + metadata.schema) + + def get_input_keys(self): + """Returns keys for input_schema_map as list.""" + return self.input_schema_map.keys() + +# Built in / Default schema map. 
+image_csv_schema_map = { + 'split': types.SplitKey, + 'image_uri': types.ImageUri, + 'label': types.StringLabel} + +IMAGE_CSV_SCHEMA = Schema(image_csv_schema_map) diff --git a/tfrecorder/schema_test.py b/tfrecorder/input_schema_test.py similarity index 58% rename from tfrecorder/schema_test.py rename to tfrecorder/input_schema_test.py index c0d2d5a..f2b9eec 100644 --- a/tfrecorder/schema_test.py +++ b/tfrecorder/input_schema_test.py @@ -19,42 +19,47 @@ import unittest import tensorflow_transform as tft -from tfrecorder import schema +from tfrecorder import input_schema -class SchemaTest(unittest.TestCase): +class InputSchemaTest(unittest.TestCase): """Tests for type module.""" - def test_valid_get_tft_coder(self): - """Tests a valid call on get_tft_coder.""" - columns = ['split', 'image_uri', 'label'] - converter = schema.get_tft_coder(columns, schema.image_csv_schema) + def setUp(self): + self.schema = input_schema.Schema(input_schema.image_csv_schema_map) + + def test_valid_get_input_coder(self): + """Tests a valid call on get_input_coder.""" + converter = self.schema.get_input_coder() self.assertIsInstance(converter, tft.coders.CsvCoder) def test_valid_get_key(self): """Tests a valid split key.""" - key = schema.get_key(schema.SplitKeyType, schema.image_csv_schema) - self.assertEqual(key, 'split') + self.assertEqual(self.schema.split_key, 'split') def test_no_get_split_key(self): """Tests no split key present.""" - test_schema = dict() - for k, v in schema.image_csv_schema.items(): + test_schema_map = dict() + for k, v in input_schema.IMAGE_CSV_SCHEMA.input_schema_map.items(): # Brute force copy because OG is a FrozenOrderedDict. if k != 'split': - test_schema[k] = v + test_schema_map[k] = v - key = schema.get_key(schema.SplitKeyType, test_schema) - self.assertIsNone(key) + with self.assertRaises(AttributeError): + _ = input_schema.Schema(test_schema_map) def test_get_raw_metadata(self): """Tests a valid call to get_raw_metadata.""" - columns = ['split', 'image_uri', 'label'] - raw_metadata = schema.get_raw_metadata(columns, schema.image_csv_schema) + pre_tft_metadata = self.schema.get_pre_tft_metadata() self.assertIsInstance( - raw_metadata, + pre_tft_metadata, tft.tf_metadata.dataset_metadata.DatasetMetadata) + def test_get_input_keys(self): + """"Tests get_input_keys() function.""" + schema = input_schema.IMAGE_CSV_SCHEMA + self.assertEqual(schema.input_schema_map.keys(), schema.get_input_keys()) + if __name__ == '__main__': unittest.main() diff --git a/tfrecorder/schema.py b/tfrecorder/schema.py deleted file mode 100644 index 13ad3fd..0000000 --- a/tfrecorder/schema.py +++ /dev/null @@ -1,187 +0,0 @@ -# Lint as: python3 - -# Copyright 2020 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Defines input types for TFRecorder's input schema.""" - -import collections -from typing import Dict, List, Union - -import frozendict -import tensorflow as tf -import tensorflow_transform as tft -from tensorflow_transform.tf_metadata import dataset_metadata -from tensorflow_transform.tf_metadata import schema_utils - -# TODO(mikebernico): Refactor types into data classes -# All supported types will be based on SupportedType. -SupportedType = collections.namedtuple( - 'tfrecordInputType', - ['type_name', 'feature_spec', 'allowed_values']) - -# Supported type definitions here. -ImageUriType = SupportedType( - type_name='image_uri', - feature_spec=tf.io.FixedLenFeature([], tf.string), - allowed_values=None) - -# Note: split_key is an immutable type and these allowed values cannot change. -allowed_split_values = ['TRAIN', 'VALIDATION', 'TEST', 'DISCARD'] -SplitKeyType = SupportedType( - type_name='split_key', - feature_spec=tf.io.FixedLenFeature([], tf.string), - allowed_values=allowed_split_values) - -#TODO(mikebernico): Implement in preprocess_fn -IntegerInputType = SupportedType( - type_name='integer_input', - feature_spec=tf.io.FixedLenFeature([], tf.int64), - allowed_values=None) - -#TODO(mikebernico): Implement in preprocess_fn -FloatInputType = SupportedType( - type_name='float_input', - feature_spec=tf.io.FixedLenFeature([], tf.float64), - allowed_values=None) - -#TODO(mikebernico): Implement in preprocess_fn -CategoricalInputType = SupportedType( - type_name='categorical_input', - feature_spec=tf.io.FixedLenFeature([], tf.string), - allowed_values=None) - -IntegerLabelType = SupportedType( - type_name='integer_label', - feature_spec=tf.io.FixedLenFeature([], tf.int64), - allowed_values=None) - -StringLabelType = SupportedType( - type_name='string_label', - feature_spec=tf.io.FixedLenFeature([], tf.string), - allowed_values=None) - -ImageSupportStringType = SupportedType( - type_name='image_support_string', - feature_spec=tf.io.FixedLenFeature([], tf.string), - allowed_values=None) - -ImageSupportIntType = SupportedType( - type_name='image_support_int', - feature_spec=tf.io.FixedLenFeature([], tf.int64), - allowed_values=None) - -# TODO(mikebernico): Refactor schema_map to a container class. -# Default schema supports the legacy image_csv format. -SchemaMap = Dict[str, SupportedType] - -image_csv_schema = frozendict.FrozenOrderedDict({ - 'split': SplitKeyType, - 'image_uri': ImageUriType, - 'label': StringLabelType}) - - -def get_raw_schema_map( - schema_map: Dict[str, collections.namedtuple] - ) -> Dict[str, collections.namedtuple]: - """Converts a schema to a raw (pre TFT / post image extraction) schema.""" - raw_schema = {} - for k, v in schema_map.items(): - if v.type_name == 'image_uri': - raw_schema['image_name'] = ImageSupportStringType - raw_schema['image'] = ImageSupportStringType - raw_schema['image_height'] = ImageSupportIntType - raw_schema['image_width'] = ImageSupportIntType - raw_schema['image_channels'] = ImageSupportIntType - else: - raw_schema[k] = schema_map[k] - return raw_schema - - -def get_tft_coder( - columns: List[str], - schema_map: Dict[str, collections.namedtuple] - ) -> tft.coders.CsvCoder: - """Gets a TFT CSV Coder. - - Args: - columns: Ordered DataFrame column names, from df.column. - schema_map: Schema map used to infer the schema. 
- - Returns: - tft.coders.CsvCoder - """ - feature_spec = {} - # Because the DF column name order may not match the feature_spec order - # This maps existing column names to their feature spec (required part of - # namedtuple) - for col in columns: - feature_spec[col] = schema_map[col].feature_spec - - metadata = dataset_metadata.DatasetMetadata( - schema_utils.schema_from_feature_spec(feature_spec)) - - return tft.coders.CsvCoder(columns, metadata.schema) - - -def get_key( - type_: SupportedType, - schema_map: Dict[str, collections.namedtuple]) -> Union[str, None]: - """Gets first instance of key of type 'type_name' from schema map. - - Returns key name if present, otherwise returns None. - """ - #TODO(mikebernico): Fix so that multiples of a key type work in future. - for k, v in schema_map.items(): - if v.type_name == type_.type_name: - return k - return None - - -def get_raw_feature_spec(columns: List[str], - schema_map: Dict[str, collections.namedtuple] - ) -> Dict[str, tf.io.FixedLenFeature]: - """Gets RAW (pre TFT) feature spec.""" - - feature_spec = dict() - - # Because the DF column name order may not match the feature_spec order - # this maps existing column names to their feature spec (req part of - # namedtuple) - for col in columns: - if schema_map[col].type_name == 'image_uri': - # Modify feature_spec for extracted image, don't include image_uri. - # TODO(mikebernico) This only works in the case where the input has - # ONLY 1 image. Generalize to multiple images someday? - feature_spec['image_name'] = tf.io.FixedLenFeature([], tf.string) - feature_spec['image'] = tf.io.FixedLenFeature([], tf.string) - feature_spec['image_height'] = tf.io.FixedLenFeature([], tf.int64) - feature_spec['image_width'] = tf.io.FixedLenFeature([], tf.int64) - feature_spec['image_channels'] = tf.io.FixedLenFeature([], tf.int64) - else: - # Copy feature as-is. - feature_spec[col] = schema_map[col].feature_spec - return feature_spec - - -def get_raw_metadata(columns: List[str], - schema_map: Dict[str, collections.namedtuple] - ) -> dataset_metadata.DatasetMetadata: - """Returns metadata prior to TF Transform preprocessing - - Note: takes base schema_map as input, not raw_schema_map. 
- """ - feature_spec = get_raw_feature_spec(columns, schema_map) - return dataset_metadata.DatasetMetadata( - schema_utils.schema_from_feature_spec(feature_spec)) diff --git a/tfrecorder/test_data/data.csv b/tfrecorder/test_data/data.csv index dfaf143..9991050 100644 --- a/tfrecorder/test_data/data.csv +++ b/tfrecorder/test_data/data.csv @@ -1,7 +1,7 @@ split,image_uri,label -TRAIN,tfrecorder/test_data/images/cat/cat-640x853-1.jpg,cat -VALIDATION,tfrecorder/test_data/images/cat/cat-800x600-2.jpg,cat -TEST,tfrecorder/test_data/images/cat/cat-800x600-3.jpg,cat -TRAIN,tfrecorder/test_data/images/goat/goat-640x640-1.jpg,goat -VALIDATION,tfrecorder/test_data/images/goat/goat-320x320-2.jpg,goat -TEST,tfrecorder/test_data/images/goat/goat-640x427-3.jpg,goat \ No newline at end of file +TEST,tfrecorder/test_data/images/TEST/cat/cat-800x600-3.jpg,cat +TEST,tfrecorder/test_data/images/TEST/goat/goat-640x427-3.jpg,goat +TRAIN,tfrecorder/test_data/images/TRAIN/cat/cat-640x853-1.jpg,cat +TRAIN,tfrecorder/test_data/images/TRAIN/goat/goat-640x640-1.jpg,goat +VALIDATION,tfrecorder/test_data/images/VALIDATION/cat/cat-800x600-2.jpg,cat +VALIDATION,tfrecorder/test_data/images/VALIDATION/goat/goat-320x320-2.jpg,goat diff --git a/tfrecorder/test_data/images/cat/cat-800x600-3.jpg b/tfrecorder/test_data/images/TEST/cat/cat-800x600-3.jpg similarity index 100% rename from tfrecorder/test_data/images/cat/cat-800x600-3.jpg rename to tfrecorder/test_data/images/TEST/cat/cat-800x600-3.jpg diff --git a/tfrecorder/test_data/images/goat/goat-640x427-3.jpg b/tfrecorder/test_data/images/TEST/goat/goat-640x427-3.jpg similarity index 100% rename from tfrecorder/test_data/images/goat/goat-640x427-3.jpg rename to tfrecorder/test_data/images/TEST/goat/goat-640x427-3.jpg diff --git a/tfrecorder/test_data/images/cat/cat-640x853-1.jpg b/tfrecorder/test_data/images/TRAIN/cat/cat-640x853-1.jpg similarity index 100% rename from tfrecorder/test_data/images/cat/cat-640x853-1.jpg rename to tfrecorder/test_data/images/TRAIN/cat/cat-640x853-1.jpg diff --git a/tfrecorder/test_data/images/goat/goat-640x640-1.jpg b/tfrecorder/test_data/images/TRAIN/goat/goat-640x640-1.jpg similarity index 100% rename from tfrecorder/test_data/images/goat/goat-640x640-1.jpg rename to tfrecorder/test_data/images/TRAIN/goat/goat-640x640-1.jpg diff --git a/tfrecorder/test_data/images/cat/cat-800x600-2.jpg b/tfrecorder/test_data/images/VALIDATION/cat/cat-800x600-2.jpg similarity index 100% rename from tfrecorder/test_data/images/cat/cat-800x600-2.jpg rename to tfrecorder/test_data/images/VALIDATION/cat/cat-800x600-2.jpg diff --git a/tfrecorder/test_data/images/goat/goat-320x320-2.jpg b/tfrecorder/test_data/images/VALIDATION/goat/goat-320x320-2.jpg similarity index 100% rename from tfrecorder/test_data/images/goat/goat-320x320-2.jpg rename to tfrecorder/test_data/images/VALIDATION/goat/goat-320x320-2.jpg diff --git a/tfrecorder/test_utils.py b/tfrecorder/test_utils.py index 88b8f8f..8472c49 100644 --- a/tfrecorder/test_utils.py +++ b/tfrecorder/test_utils.py @@ -26,7 +26,7 @@ from apache_beam.testing import test_pipeline import pandas as pd -from tfrecorder import schema +from tfrecorder import input_schema TEST_DIR = 'tfrecorder/test_data' @@ -44,12 +44,12 @@ def get_test_data() -> Dict[str, List[Any]]: return get_test_df().to_dict(orient='list') -def get_raw_feature_df() -> pd.DataFrame: - """Returns test dataframe having raw feature spec schema.""" +def get_pre_tft_feature_df() -> pd.DataFrame: + """Returns test dataframe having pre-TF Transform feature spec 
schema.""" df = get_test_df() - my_raw_schema = schema.get_raw_schema_map(schema.image_csv_schema) - image_key = schema.get_key(schema.ImageUriType, schema.image_csv_schema) + schema = input_schema.Schema(input_schema.image_csv_schema_map) + image_key = schema.image_uri_key df.drop([image_key], axis=1, inplace=True) df['image_name'] = 'image_name' df['image'] = 'image' @@ -59,8 +59,7 @@ def get_raw_feature_df() -> pd.DataFrame: df['image_height'] = '48' df['image_width'] = '48' df['image_channels'] = '3' - df = df[my_raw_schema.keys()] - + df = df[schema.pre_tft_schema_map.keys()] return df diff --git a/tfrecorder/types.py b/tfrecorder/types.py index e203d5d..17b8d36 100644 --- a/tfrecorder/types.py +++ b/tfrecorder/types.py @@ -16,12 +16,76 @@ """Custom types.""" -from typing import Tuple +import dataclasses +from typing import Tuple, List, Any +import tensorflow as tf from apache_beam.pvalue import PCollection from tensorflow_transform import beam as tft_beam - BeamDatasetMetadata = tft_beam.tft_beam_io.beam_metadata_io.BeamDatasetMetadata TransformedMetadata = BeamDatasetMetadata TransformFn = Tuple[PCollection, TransformedMetadata] + + +@dataclasses.dataclass +class SupportedType: + """Base type for TFRecorder Types.""" + feature_spec: tf.io.FixedLenFeature + allowed_values: List[Any] + + +@dataclasses.dataclass +class ImageUri(SupportedType): + """Supports image uri columns.""" + feature_spec = tf.io.FixedLenFeature([], tf.string) + allowed_values = [] + + +@dataclasses.dataclass +class SplitKey(SupportedType): + """Supports split key columns.""" + feature_spec = tf.io.FixedLenFeature([], tf.string) + allowed_values = ['TRAIN', 'VALIDATION', 'TEST', 'DISCARD'] + + +@dataclasses.dataclass +class IntegerInput(SupportedType): + """Supports integer columns.""" + feature_spec = tf.io.FixedLenFeature([], tf.int64) + allowed_values = [] + + +@dataclasses.dataclass +class FloatInput(SupportedType): + """Supports float columns.""" + feature_spec = tf.io.FixedLenFeature([], tf.float32) + allowed_values = [] + + +#TODO(mikebernico): Implement in preprocess_fn +@dataclasses.dataclass +class StringInput(SupportedType): + """Supports string input columns.""" + feature_spec = tf.io.FixedLenFeature([], tf.string) + allowed_values = [] + + +@dataclasses.dataclass +class IntegerLabel(IntegerInput): + """Supports integer labels.""" + + +@dataclasses.dataclass +class StringLabel(StringInput): + """Supports string labels.""" + + +@dataclasses.dataclass +class ImageSupportString(StringInput): + """Supports generated image bytestrings.""" + + +@dataclasses.dataclass +class ImageDim(IntegerInput): + """Supports generated image ints (height, width, channels).""" diff --git a/tfrecorder/check.py b/tfrecorder/utils.py similarity index 60% rename from tfrecorder/check.py rename to tfrecorder/utils.py index ff9f715..0e032f1 100644 --- a/tfrecorder/check.py +++ b/tfrecorder/utils.py @@ -14,18 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Utilities for checking content of TFRecord files.""" +"""Miscellaneous utility functions.""" -from typing import Dict, Optional, Sequence, Union +from datetime import datetime +from typing import Dict import csv import os import tensorflow as tf -import tensorflow_transform as tft from tfrecorder import beam_image -from tfrecorder import common +from tfrecorder import constants +from tfrecorder import dataset_loader _OUT_IMAGE_TEMPLATE = 'image_{:0>3d}.png' @@ -36,27 +37,6 @@ def _stringify(scalar: tf.Tensor) -> str: val = scalar.numpy() return val.decode('utf-8') if isinstance(val, bytes) else str(val) -# pylint: disable=too-many-locals -# TODO(cezequiel): deprecate in favor of `dataset.load`. -def _read_tfrecords( - file_pattern: Union[str, Sequence[str]], - tft_output_dir: Optional[str] = None, - compression_type: str = 'GZIP') -> tf.data.Dataset: - """Reads TFRecords files and outputs a TensorFlow Dataset. - - Currently supports Image CSV format only. - """ - - files = tf.io.gfile.glob(file_pattern) - - if not tft_output_dir: - tft_output_dir = os.path.dirname(file_pattern) - tf_transform_output = tft.TFTransformOutput(tft_output_dir) - feature_spec = tf_transform_output.transformed_feature_spec() - dataset = tf.data.TFRecordDataset(files, compression_type) - return dataset.map(lambda x: tf.io.parse_single_example( - x, feature_spec)) - def _save_image_from_record(record: Dict[str, tf.Tensor], outfile: str): """Extracts image data from parsed TFRecord and saves it to a file.""" @@ -68,21 +48,32 @@ def _save_image_from_record(record: Dict[str, tf.Tensor], outfile: str): image.save(outfile) -def check_tfrecords( - file_pattern: str, +def inspect( + tfrecord_dir: str, + split: str = 'TRAIN', num_records: int = 1, - output_dir: str = 'output', - compression_type: str = 'GZIP'): - """Reads TFRecord files and outputs decoded contents to a temp directory.""" + output_dir: str = 'output'): + """Prints contents of TFRecord files generated by TFRecorder. + + Args: + tfrecord_dir: TFRecord directory. + split: Dataset split (see `schema.allowed_split_values`). + num_records: Number of records to output. + output_dir: Directory to dump read records. - dataset = _read_tfrecords(file_pattern, compression_type=compression_type) + Raises: + `ValueError` when data for a given `split` could not be loaded. 
+ """ + + dataset = dataset_loader.load(tfrecord_dir).get(split) + if not dataset: + raise ValueError(f'Could not load data for {split}') data_dir = os.path.join( - output_dir, 'check-tfrecords-' + common.get_timestamp()) + output_dir, 'check-tfrecords-' + get_timestamp()) os.makedirs(data_dir) - csv_file = os.path.join(data_dir, 'data.csv') - with open(csv_file, 'wt') as f: + with open(os.path.join(data_dir, 'data.csv'), 'wt') as f: writer = csv.writer(f) # Write CSV header @@ -108,3 +99,21 @@ def check_tfrecords( print('Output written to {}'.format(data_dir)) return data_dir + + +def get_timestamp() -> str: + """Returns current date and time as formatted string.""" + return datetime.now().strftime('%Y%m%d-%H%M%S') + + +def copy_logfile_to_gcs(logfile: str, output_dir: str): + """Copies a logfile from local to gcs storage.""" + try: + with open(logfile, 'r') as log_reader: + out_log = os.path.join(output_dir, constants.LOGFILE) + with tf.io.gfile.GFile(out_log, 'w') as gcs_logfile: + log = log_reader.read() + gcs_logfile.write(log) + except FileNotFoundError as e: + raise FileNotFoundError("Unable to copy log file {} to gcs.".format( + e.filename)) from e diff --git a/tfrecorder/check_test.py b/tfrecorder/utils_test.py similarity index 60% rename from tfrecorder/check_test.py rename to tfrecorder/utils_test.py index feeeb6e..77331e4 100644 --- a/tfrecorder/check_test.py +++ b/tfrecorder/utils_test.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests `check.py`.""" +"""Tests `utils.py`.""" import functools import os @@ -27,49 +27,15 @@ import tensorflow as tf from tfrecorder import beam_image -from tfrecorder import check +from tfrecorder import constants +from tfrecorder import utils from tfrecorder import test_utils -from tfrecorder import schema +from tfrecorder import input_schema +from tfrecorder import dataset_loader # pylint: disable=protected-access -class ReadTFRecordsTest(unittest.TestCase): - """Tests `_read_tfrecords`.""" - - def setUp(self): - self.tfrecords_dir = os.path.join(test_utils.TEST_DIR, 'sample_tfrecords') - - def test_valid_compressed_gzip(self): - """Tests valid case using GZIP compression.""" - - # Use list of file pattern strings to maintain train, validation, test - # order. 
- file_pattern = [ - os.path.join(self.tfrecords_dir, '{}*.tfrecord.gz'.format(f)) - for f in ['train, validation, test']] - - compression_type = 'GZIP' - actual = check._read_tfrecords( - file_pattern, self.tfrecords_dir, compression_type) - - expected_csv = os.path.join(test_utils.TEST_DIR, 'data.csv') - expected = tf.data.experimental.make_csv_dataset( - expected_csv, batch_size=1, label_name=None, num_epochs=1, - shuffle=False) - - for a, e in zip(actual, expected): - self.assertCountEqual(a.keys(), schema.image_csv_schema) - for key in schema.image_csv_schema: - self.assertEqual(a[key], e[key]) - - def test_error_invalid_file_pattern(self): - """Tests error case where file pattern is invalid.""" - - file_pattern = 'gs://path/to/memes/folder' - with self.assertRaises(tf.errors.OpError): - check._read_tfrecords(file_pattern) - class CheckTFRecordsTest(unittest.TestCase): """Tests `check_tfrecords`.""" @@ -85,7 +51,8 @@ def setUp(self): image_channels) data = test_utils.get_test_data() - image_uri_key = schema.get_key(schema.ImageUriType, schema.image_csv_schema) + schema = input_schema.IMAGE_CSV_SCHEMA + image_uri_key = schema.image_uri_key num_records = len(data[image_uri_key]) image_uris = data.pop(image_uri_key) data['image_name'] = [os.path.split(uri)[-1] for uri in image_uris] @@ -96,21 +63,23 @@ def setUp(self): 'image_width': [image_width] * num_records, 'image_channels': [image_channels] * num_records, }) + self.tfrecord_dir = 'gs://path/to/tfrecords/dir' + self.split = 'TRAIN' self.num_records = num_records self.data = data self.dataset = tf.data.Dataset.from_tensor_slices(self.data) - @mock.patch.object(check, '_read_tfrecords', autospec=True) + @mock.patch.object(dataset_loader, 'load', autospec=True) def test_valid_records(self, mock_fn): """Tests valid case on reading multiple records.""" - file_pattern = 'gs://path/to/tfrecords/*' - mock_fn.return_value = self.dataset + mock_fn.return_value = {self.split: self.dataset} num_records = len(self.data['image']) with tempfile.TemporaryDirectory(dir='/tmp') as dir_: - actual_dir = check.check_tfrecords( - file_pattern, num_records=num_records, output_dir=dir_) + actual_dir = utils.inspect( + self.tfrecord_dir, split=self.split, num_records=num_records, + output_dir=dir_) self.assertTrue('check-tfrecords-' in actual_dir) actual_csv = os.path.join(actual_dir, 'data.csv') @@ -129,6 +98,39 @@ def test_valid_records(self, mock_fn): expected_image_files = self.data['image_name'] self.assertCountEqual(actual_image_files, expected_image_files) + @mock.patch.object(dataset_loader, 'load', autospec=True) + def test_no_data_for_split(self, mock_fn): + """Check exception raised when data could not be loaded given `split`.""" + + mock_fn.return_value = {} + with self.assertRaisesRegex(ValueError, 'Could not load data for'): + utils.inspect(self.tfrecord_dir, split='UNSUPPORTED') + if __name__ == '__main__': unittest.main() + + +class CopyLogTest(unittest.TestCase): + """Misc tests for _copy_logfile_to_gcs.""" + + def test_valid_copy(self): + """Test valid file copy.""" + with tempfile.TemporaryDirectory() as tmpdirname: + text = 'log test log test' + infile = os.path.join(tmpdirname, 'foo.log') + with open(infile, 'w') as f: + f.write(text) + utils.copy_logfile_to_gcs(infile, tmpdirname) + + outfile = os.path.join(tmpdirname, constants.LOGFILE) + with open(outfile, 'r') as f: + data = f.read() + self.assertEqual(text, data) + + def test_invalid_copy(self): + """Test invalid file copy.""" + with tempfile.TemporaryDirectory() as tmpdirname: + 
infile = os.path.join(tmpdirname, 'foo.txt') + with self.assertRaises(FileNotFoundError): + utils.copy_logfile_to_gcs(infile, tmpdirname)
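
The hunks above rename `client` to `converter`, move dataset loading into `dataset_loader`, replace the dict-based schema with the `input_schema.Schema` class built on the `types` dataclasses, and fold the old `check` module into `utils.inspect`. The snippet below is a minimal usage sketch pieced together from the signatures and tests in this patch, not a definitive reference: the CSV path and output directories are placeholders, the modules are imported directly because the updated `tfrecorder/__init__.py` re-exports are not shown here, and the `schema` keyword argument is inferred from the `schema_map` -> `schema` rename visible in the `convert` hunk.

    from tfrecorder import converter
    from tfrecorder import dataset_loader
    from tfrecorder import input_schema
    from tfrecorder import types
    from tfrecorder import utils

    # Optional custom input schema: maps column names to the dataclasses in
    # `types` and must contain a split column. When `schema` is omitted, the
    # default appears to be input_schema.IMAGE_CSV_SCHEMA (split, image_uri,
    # label).
    schema = input_schema.Schema({
        'split': types.SplitKey,
        'image_uri': types.ImageUri,
        'label': types.IntegerLabel,
    })

    # Convert a CSV (or DataFrame, or image directory) to TFRecords locally.
    job_result = converter.convert(
        '/path/to/data.csv',      # placeholder source
        schema=schema,            # keyword name inferred from this patch
        runner='DirectRunner',
        output_dir='/tmp/tfrecords')
    tfrecord_dir = job_result['tfrecord_dir']

    # Load the written splits as tf.data.Datasets, or do both steps in one call.
    datasets = dataset_loader.load(tfrecord_dir)
    datasets = converter.convert_and_load(
        '/path/to/data.csv',
        runner='DirectRunner',
        output_dir='/tmp/tfrecords')
    train_ds = datasets['TRAIN']

    # Dump a few decoded records (and any images) to a directory for inspection.
    utils.inspect(tfrecord_dir, split='TRAIN', num_records=2,
                  output_dir='/tmp/inspect')

With `DataflowRunner`, the same `convert` call additionally requires `project`, `region`, and `tfrecorder_wheel`, and the returned dict carries `dataflow_url` instead of `metrics`; in both cases it includes `job_id` and `tfrecord_dir`.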