csvflatten: renamed --eor to --separator; added -N/--newline-sep option

dannguyen · Dec 24, 2020 · d7e0121 · d7e0121
1 parent 1e33aca
commit d7e0121
Show file tree

Hide file tree

Showing 7 changed files with 335 additions and 101 deletions.
diff --git a/TODOS.md b/TODOS.md
@@ -9,9 +9,9 @@ while working on data project, wondered that:
 
 - csvflatten 
     - [x] --prettify should be default
-    - [ ] option to replace record separator with empty row
+    - [x] option to replace record separator with empty row
 - csvnorm
-    - [ ] should have the --max-length option, not csvflatten
+    - [ ] should have the --max-length option, similar to csvflatten
 
 
 **thoughts 2020-11-24**

diff --git a/csvmedkit/utils/csvflatten.py b/csvmedkit/utils/csvflatten.py
@@ -10,7 +10,7 @@
     Optional as OptionalType,
 )
 
-DEFAULT_EOR_MARKER = "~"
+DEFAULT_EOR_MARKER = "="
 DEFAULT_MAX_LENGTH = 50
 FLAT_COLUMN_NAMES = (
     "field",
@@ -58,7 +58,12 @@ def add_arguments(self):
             "--rec-id",
             dest="rec_ids_mode",
             action="store_true",
-            help="""Include a `_recid_` column for each row, for easier tracking the 0-based index of each record""",
+            help="""Include a `recid` column at the beginning of each row, corresponding to the
+            numerical index of a flatten record.
+
+            (Using this option disables the default record separator, but you can still set --separator
+            to a custom marker)
+            """,
         )
 
         self.argparser.add_argument(
@@ -67,16 +72,26 @@ def add_arguments(self):
             dest="label_chunks_mode",
             action="store_true",
             help="""When a long value is split into multiple "chunks", the `field` (i.e. first column) is left blank after the first chunk.
-                    Setting the --chunk-labels flag will fill the `field` column with: "field~n", where `n` indicates the n-th chunk of a chopped value""",
+                    Setting the --label-chunks flag will fill the `field` column with: "field~n", where `n` indicates the n-th chunk of a chopped value""",
+        )
+
+        self.argparser.add_argument(
+            "-N",
+            "--newline-sep",
+            dest="newline_separator",
+            action="store_true",
+            help="""Separate each flattened record with a blank newline. Cannot be used with -S/--separator""",
         )
 
         self.argparser.add_argument(
-            "-E",
-            "--eor",
-            dest="end_of_record_marker",
+            "-S",
+            "--separator",
+            dest="record_separator",
+            metavar="TEXT_MARKER",
             type=str,
-            help="""end of record; When flattening multiple records, separate each records with a row w/ fieldname of [marker]. Set to '' or 'none' to disable. By default,
-                    the EOR marker is a series of tildes (~~~~~). However, this setting defaults to 'none' if `-R/--rowid` mode is true. """,
+            help=f"""Separate each flatten record with a blank row in which the `field` column contains a
+                text marker. The default marker is a series of "{DEFAULT_EOR_MARKER}". Set this option to "" or "none"
+                to disable.""",
         )
 
     @property
@@ -92,27 +107,35 @@ def chunkpattern(self) -> CallableType:
         return _cp
 
     @property
-    def end_of_record_marker(self) -> OptionalType[str]:
+    def record_separator(self) -> OptionalType[str]:
         """
         preconditions:
             - self.max_column_name_length
             - self.rec_ids_mode
         """
         marker: OptionalType[str]
-
-        _eor = self.args.end_of_record_marker
-        if _eor == "none" or _eor == "":
-            marker = None
-        elif _eor:  # use default
-            marker = _eor
+        if self.args.newline_separator:
+            # empty string effectively makes a blank row
+            marker = ""
         else:
-            # disable by default, if we're in rowid mode
-            if self.rec_ids_mode:
+            argval = self.args.record_separator
+            if argval == "none" or argval == "":
+                # user explicitly disables it
                 marker = None
+            elif argval:
+                # user specified *something*
+                marker = argval
             else:
-                marker = "".join(
-                    DEFAULT_EOR_MARKER for i in range(self.max_column_name_length)
-                )
+                # user did not set option
+                if self.rec_ids_mode:
+                    # if we're using rec_ids, record separation is disabled by default
+                    marker = None
+                else:
+                    # this is the default record separator
+                    marker = "".join(
+                        DEFAULT_EOR_MARKER for i in range(self.max_column_name_length)
+                    )
+
         return marker
 
     @property
@@ -133,6 +156,9 @@ def read_input(self):
         self._read_input_done = True
 
     def main(self):
+        if self.args.newline_separator and self.args.record_separator:
+            self.argparser.error("Cannot set both -N/--newline-sep and -S/--separator.")
+
         if self.additional_input_expected():
             self.argparser.error("You must provide an input file or piped data.")
 
@@ -184,10 +210,10 @@ def main(self):
                 else []
             )
 
-            if self.end_of_record_marker and row_idx > 0:
-                # print out a end-of-record marker
-                eor_row = [None] if self.rec_ids_mode else []
-                outrows.append(eor_row + [self.end_of_record_marker, None])
+            if self.record_separator is not None and row_idx > 0:
+                # print out a record-separator
+                sep = [None] if self.rec_ids_mode else []
+                outrows.append(sep + [self.record_separator, None])
 
             for col_idx, colname in enumerate(self.i_column_names):
                 # value_lines = row[col_idx].strip().splitlines()

diff --git a/docs/utils/csvflatten/index.rst b/docs/utils/csvflatten/index.rst
@@ -27,7 +27,7 @@ For example, given this ``data.csv``:
     | price       | 1.50                                                  |
     | description | An apple is an edible fruit produced by an apple tree |
     |             | (Malus domestica)                                     |
-    | ~~~~~~~~~~~ |                                                       |
+    | =========== |                                                       |
     | id          | 002                                                   |
     | product     | oranges                                               |
     | price       | 2.25                                                  |

diff --git a/docs/utils/csvflatten/options.rstinc b/docs/utils/csvflatten/options.rstinc
@@ -7,30 +7,30 @@ Options and flags
 TK: change the other option examples to use flatfruit.csv
 
 
-``-c, --csv``
-------------------
+-c, --csv
+---------
 
 Print output in CSV format, instead of "prettified" Markdown tabular format.
 
 Maybe you want "flattened" output, but something that you can paste into a spreadsheet:
 
-    .. code-block:: shell
+.. code-block:: shell
 
-        $ csvflatten -c data.csv
+    $ csvflatten -c data.csv
 
 
-    .. code-block:: text
+.. code-block:: text
 
-        field,value
-        id,001
-        product,apples
-        price,1.50
-        description,An apple is an edible fruit produced by an apple tree (Malus domestica)
-        ~~~~~~~~~~~,
-        id,002
-        product,oranges
-        price,2.25
-        description,An orange is a type of citrus fruit that people often eat. Oranges are a very good source of vitamin C.
+    field,value
+    id,001
+    product,apples
+    price,1.50
+    description,An apple is an edible fruit produced by an apple tree (Malus domestica)
+    ===========,
+    id,002
+    product,oranges
+    price,2.25
+    description,An orange is a type of citrus fruit that people often eat. Oranges are a very good source of vitamin C.
 
 
 Here's what it looks like in a spreadsheet:
@@ -52,7 +52,7 @@ Specify a max character length for field values; values that exceed this length
     code,alfa
     blob,01234
     ,56789
-    ~~~~~,
+    =====,
     code,beta
     blob,ABCDE
     ,FGHIJ
@@ -67,7 +67,7 @@ Or combining with the ``-P/--prettify`` option::
     | code  | alfa  |
     | blob  | 01234 |
     |       | 56789 |
-    | ~~~~~ |       |
+    | ===== |       |
     | code  | beta  |
     | blob  | ABCDE |
     |       | FGHIJ |
@@ -81,7 +81,7 @@ The default behavior is as follows:
 
 
 
--B/--chunk-labels
+-B/--label-chunks
 -----------------
 
 When a value is chopped into chunks across multiple rows, by default, the  ``field`` (i.e. first column) is filled in for the value's *first* row, then left blank for its subsequent rows::
@@ -95,7 +95,7 @@ When a value is chopped into chunks across multiple rows, by default, the  ``fie
     |       | of love |
 
 
-If the ``--chunk-labels`` flag is set, each subsequent ``field`` will be filled with an incremental label, e.g.::
+If the ``--label-chunks`` flag is set, each subsequent ``field`` will be filled with an incremental label, e.g.::
 
     |  field   |  value  |
     +----------+---------+
@@ -106,24 +106,37 @@ If the ``--chunk-labels`` flag is set, each subsequent ``field`` will be filled
 
 
 
--E/--eor [END_OF_RECORD_MARKER (string)]
-----------------------------------------
+-S/--separator [TEXT_MARKER (string)]
+-------------------------------------
+
+By default, each record is separated by having a string of *equals signs* in ``field``, e.g.:
 
-By default, each record is separated by having a string of *tildes* in ``field``, e.g.::
 
 
+.. code-block:: text
+
     | field |  value  |
     +-------+---------+
     | id    | 001     |
     | title | this is |
     |       | a story |
     |       | of love |
-    | ~~~~~ |         |
+    | ===== |         |
     | id    | 002     |
     | title | Book 2  |
 
 
-Set ``'none'`` to disable::
+    $ csvflatten -S 'NEW-RECORD' data.csv
+
+
+Set ``'none'`` to disable:
+
+
+.. code-block:: shell
+
+    $ csvflatten -S none data.csv
+
+.. code-block:: text
 
     | field |  value  |
     +-------+---------+
@@ -134,10 +147,16 @@ Set ``'none'`` to disable::
     | id    | 002     |
     | title | Book 2  |
 
-Or to a value of your choosing::
+Or to a value of your choosing:
+
+.. code-block:: shell
+
+    $ csvflatten -S 'NEW-RECORD' data.csv
+
 
 
-    $ csvflatten -E 'NEW-RECORD' data.csv
+
+.. code-block:: text
 
     |   field    |  value  |
     +------------+---------+
@@ -150,21 +169,45 @@ Or to a value of your choosing::
     | title      | Book 2  |
 
 
+-N/--newline-sep
+----------------
+
+Separate each flattened record with a blank newline. Cannot be used with ``-S/--separator``.
+
+.. code-block:: text
+
+    | field |  value  |
+    +-------+---------+
+    | id    | 001     |
+    | title | this is |
+    |       | a story |
+    |       | of love |
+    |       |         |
+    | id    | 002     |
+    | title | Book 2  |
+
 
 -R/--rec-id
 -----------
 
-Include a ``_recid_`` column that contains the 0-based index for the respective record::
+Include a ``recid`` column that contains the 0-based index for the respective record:
+
+.. code-block:: shell
+
+    $ csvflatten -R data.csv
+
+
+.. code-block:: text
 
 
-    | _recid_ | field |  value  |
-    +---------+-------+---------+
-    |       0 | id    | 001     |
-    |       0 | title | this is |
-    |       0 |       | a story |
-    |       0 |       | of love |
-    |       1 | id    | 002     |
-    |       1 | title | Book 2  |
+    | recid  | field |  value  |
+    +--------+-------+---------+
+    |      0 | id    | 001     |
+    |      0 | title | this is |
+    |      0 |       | a story |
+    |      0 |       | of love |
+    |      1 | id    | 002     |
+    |      1 | title | Book 2  |
 
 
 Note that ``--rec-id`` by default disables the end-of-record separator.

diff --git a/docs/utils/csvflatten/usage.rstinc b/docs/utils/csvflatten/usage.rstinc
@@ -26,7 +26,7 @@ The "flattened" view of its 2 records would look like this:
    id,001
    product,apples
    price,1.50
-   ~~~~~,
+   "\=====",""
    id,002
    product,oranges
    price,2.25
@@ -51,25 +51,25 @@ Given data file :ref:`hamlet.csv <example-data-hamlet-csv>`:
     | scene     | 5                                              |
     | speaker   | Horatio                                        |
     | lines     | Propose the oath, my lord.                     |
-    | ~~~~~~~~~ |                                                |
+    | ========= |                                                |
     | act       | 1                                              |
     | scene     | 5                                              |
     | speaker   | Hamlet                                         |
     | lines     | Never to speak of this that you have seen,     |
     |           | Swear by my sword.                             |
-    | ~~~~~~~~~ |                                                |
+    | ========= |                                                |
     | act       | 1                                              |
     | scene     | 5                                              |
     | speaker   | Ghost                                          |
     | lines     | [Beneath] Swear.                               |
-    | ~~~~~~~~~ |                                                |
+    | ========= |                                                |
     | act       | 3                                              |
     | scene     | 4                                              |
     | speaker   | Gertrude                                       |
     | lines     | O, speak to me no more;                        |
     |           | These words, like daggers, enter in mine ears; |
     |           | No more, sweet Hamlet!                         |
-    | ~~~~~~~~~ |                                                |
+    | ========= |                                                |
     | act       | 4                                              |
     | scene     | 7                                              |
     | speaker   | Laertes                                        |