Clarify need for proper header when using create_tfrecords. Fixes #47. (#48)
google · Oct 22, 2020 · 747dabe · 747dabe
1 parent d083ef2
commit 747dabe
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 6 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -29,5 +29,6 @@ Please delete options that are not relevant.
 - [ ] I have made relevant changes to the documentation, if needed
 - [ ] My changes generate no new warnings
 - [ ] I have corrected any misspellings in my code
+- [ ] (For feature) I made sure that I'm merging to the `dev` branch
 - [ ] (For hotfix/release) I have updated the package version number in `setup.py` (i.e. [MAJOR.MINOR.PATCH](https://semver.org/))
-- [ ] (For hotfix/release) I have updated RELEASE.md with notes regarding the changes.
+- [ ] (For hotfix/release) I have updated RELEASE.md with notes regarding the changes
diff --git a/README.md b/README.md
@@ -44,15 +44,26 @@ pip install tfrecorder
 
 ### Generating TFRecords
 
+You can generate TFRecords from a Pandas DataFrame, CSV file or
+a directory containing images.
+
 #### From Pandas DataFrame
 
+TFRecorder has an accessor which enables creation of TFRecord files through
+the Pandas DataFrame object.
+
+Make sure the DataFrame contains a header identifying each of the columns.
+In particular, the `split` column needs to be specified so that TFRecorder
+knows how to split the data into train, test and validation sets.
+
 ##### Running on a local machine
 
 ```python
 import pandas as pd
 import tfrecorder
 
-df = pd.read_csv(...)
+csv_file = '/path/to/images.csv'
+df = pd.read_csv(csv_file, names=['split', 'image_uri', 'label'])
 df.tensorflow.to_tfr(output_dir='/my/output/path')
 ```
 
@@ -92,7 +103,6 @@ df.tensorflow.to_tfr(
     tfrecorder_wheel='/path/to/my/tfrecorder.whl')
 ```
 
-
 #### From CSV
 
 Using Python interpreter:

diff --git a/RELEASE.md b/RELEASE.md
@@ -1,3 +1,8 @@
+# Hotfix 1.1.3
+
+* Adds note regarding DataFrame header specification in README.md.
+* Adds more informative error message when input is missing expected column
+
 # Release 1.1.1
 * Adds feature to load dataset from TFRecords generated by TFRecorder.
 

diff --git a/tfrecorder/client.py b/tfrecorder/client.py
@@ -42,7 +42,8 @@ def _validate_data(df: pd.DataFrame,
     _ = value # TODO(mikebernico) Implement type checking.
     if key not in df.columns:
       raise AttributeError(
-          'DataFrame does not contain column {} listed in schema'.format(key))
+          f'DataFrame does not contain expected column: {key}. '
+          f'Ensure header matches schema keys: {list(schema_map.keys())}.')
 
 def _validate_runner(
     runner: str,

diff --git a/tfrecorder/client_test.py b/tfrecorder/client_test.py
@@ -17,6 +17,7 @@
 """Tests for client."""
 
 import os
+import re
 from typing import List
 
 import csv
@@ -124,9 +125,12 @@ def test_missing_label(self):
 
   def test_missing_split(self):
     """Tests missing split column."""
-    with self.assertRaises(AttributeError):
+    split_key = 'split'
+    schema_keys = re.escape(str(list(self.test_schema_map.keys())))
+    regex = fr'^.+column: {split_key}.+keys: {schema_keys}.$'
+    with self.assertRaisesRegex(AttributeError, regex):
       df2 = self.test_df.copy()
-      df2.drop('split', inplace=True, axis=1)
+      df2.drop(split_key, inplace=True, axis=1)
       client._validate_data(df2, schema.image_csv_schema)
 
   def test_valid_runner(self):