Skip to content

Commit

Permalink
ENH: addurls: Support tsv input
Browse files Browse the repository at this point in the history
To read tab-separated input, the core functional change needed is
specifying tab as the delimiter to csv.reader().  The rest is a matter
of updating the documentation and the handling of the input_type
argument.

Closes #4839.
  • Loading branch information
kyleam committed Aug 28, 2020
1 parent 9dbe588 commit 5d6e4fd
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 20 deletions.
45 changes: 27 additions & 18 deletions datalad/plugin/addurls.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,17 +254,19 @@ def fmt_to_name(format_string, num_to_name):
return name


INPUT_TYPES = ["ext", "csv", "json"]
INPUT_TYPES = ["ext", "csv", "tsv", "json"]


def _read(stream, input_type):
if input_type == "csv":
if input_type in ["csv", "tsv"]:
import csv
csvrows = csv.reader(stream)
csvrows = csv.reader(stream,
delimiter="\t" if input_type == "tsv" else ",")
try:
headers = next(csvrows)
except StopIteration:
raise ValueError("Failed to read CSV rows from {}".format(stream))
raise ValueError("Failed to read {} rows from {}"
.format(input_type.upper(), stream))
lgr.debug("Taking %s fields from first line as headers: %s",
len(headers), headers)
idx_map = dict(enumerate(headers))
Expand Down Expand Up @@ -438,7 +440,7 @@ def extract(stream, input_type, url_format="{0}", filename_format="{1}",
----------
stream : file object
Items used to construct the file names and URLs.
input_type : {'csv', 'json'}
input_type : {'csv', 'tsv', 'json'}
All other parameters match those described in `AddUrls`.
Expand Down Expand Up @@ -587,10 +589,11 @@ class Addurls(Interface):
*Format specification*
Several arguments take format strings. These are similar to normal Python
format strings where the names from `URL-FILE` (column names for a CSV or
properties for JSON) are available as placeholders. If `URL-FILE` is a CSV
file, a positional index can also be used (i.e., "{0}" for the first
column). Note that a placeholder cannot contain a ':' or '!'.
format strings where the names from `URL-FILE` (column names for a comma-
or tab-separated file or properties for JSON) are available as
placeholders. If `URL-FILE` is a CSV or TSV file, a positional index can
also be used (i.e., "{0}" for the first column). Note that a placeholder
cannot contain a ':' or '!'.
In addition, the `FILENAME-FORMAT` argument has a few special
placeholders.
Expand Down Expand Up @@ -682,10 +685,11 @@ class Addurls(Interface):
metavar="URL-FILE",
doc="""A file that contains URLs or information that can be used to
construct URLs. Depending on the value of --input-type, this
should be a CSV file (with a header as the first row) or a JSON
file (structured as a list of objects with string values). If '-',
read from standard input, taking the content as JSON when
--input-type is at its default value of 'ext'."""),
should be a comma- or tab-separated file (with a header as the
first row) or a JSON file (structured as a list of objects with
string values). If '-', read from standard input, taking the
content as JSON when --input-type is at its default value of
'ext'."""),
urlformat=Parameter(
args=("urlformat",),
metavar="URL-FORMAT",
Expand All @@ -705,10 +709,10 @@ class Addurls(Interface):
input_type=Parameter(
args=("-t", "--input-type"),
metavar="TYPE",
doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
file. The default value, "ext", means to consider `URL-FILE` as a
JSON file if it ends with ".json". Otherwise, treat it as a CSV
file.""",
doc="""Whether `URL-FILE` should be considered a CSV file, TSV
file, or JSON file. The default value, "ext", means to consider
`URL-FILE` as a JSON file if it ends with ".json" or a TSV file if
it ends with ".tsv". Otherwise, treat it as a CSV file.""",
constraints=EnsureChoice(*INPUT_TYPES)),
exclude_autometa=Parameter(
args=("-x", "--exclude_autometa"),
Expand Down Expand Up @@ -809,7 +813,12 @@ def __call__(dataset, urlfile, urlformat, filenameformat,
input_type = "json"
else:
extension = os.path.splitext(url_file)[1]
input_type = "json" if extension == ".json" else "csv"
if extension == ".json":
input_type = "json"
elif extension == ".tsv":
input_type = "tsv"
else:
input_type = "csv"

fd = sys.stdin if url_file == "-" else open(url_file)
try:
Expand Down
7 changes: 5 additions & 2 deletions datalad/plugin/tests/test_addurls.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,9 @@ def check_extract_csv_json_equal(input_type):
eq_(json_output, csv_output)


def test_extract_csv_json_equal():
def test_extract_csv_tsv_json_equal():
yield check_extract_csv_json_equal, "csv"
yield check_extract_csv_json_equal, "tsv"


def test_extract_wrong_input_type():
Expand Down Expand Up @@ -667,10 +668,11 @@ def test_addurls_invalid_input(self, path):
assert_in("Failed to read", str(exc.exception))

@with_tree({"in.csv": "url,name,subdir",
"in.tsv": "url\tname\tsubdir",
"in.json": "[]"})
def test_addurls_no_rows(self, path):
ds = Dataset(path).create(force=True)
for fname in ["in.csv", "in.json"]:
for fname in ["in.csv", "in.tsv", "in.json"]:
with swallow_logs(new_level=logging.WARNING) as cml:
assert_in_results(
ds.addurls(fname, "{url}", "{name}"),
Expand Down Expand Up @@ -706,6 +708,7 @@ def make_delim_text(delim):
[row.format(**rec) for rec in json.loads(json_text)])

yield make_test(make_delim_text(","), "csv", "csv,csv input type")
yield make_test(make_delim_text("\t"), "tsv", "tsv,tsv input type")

@with_tempfile(mkdir=True)
def test_addurls_stdin_input_command_line(self, path):
Expand Down

0 comments on commit 5d6e4fd

Please sign in to comment.