Skip to content

Commit

Permalink
ENH: addurls: Support tsv input
Browse files Browse the repository at this point in the history
To read tab-separated input, the core functional change needed is
specifying tab as the delimiter to csv.reader().  The rest is a matter
of updating the documentation and the handling of the input_type
argument.

Closes #4839.
  • Loading branch information
kyleam committed Aug 28, 2020
1 parent 9dbe588 commit 5d6e4fd
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 20 deletions.
45 changes: 27 additions & 18 deletions datalad/plugin/addurls.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,17 +254,19 @@ def fmt_to_name(format_string, num_to_name):
return name


INPUT_TYPES = ["ext", "csv", "json"]
INPUT_TYPES = ["ext", "csv", "tsv", "json"]


def _read(stream, input_type):
if input_type == "csv":
if input_type in ["csv", "tsv"]:
import csv
csvrows = csv.reader(stream)
csvrows = csv.reader(stream,
delimiter="\t" if input_type == "tsv" else ",")
try:
headers = next(csvrows)
except StopIteration:
raise ValueError("Failed to read CSV rows from {}".format(stream))
raise ValueError("Failed to read {} rows from {}"
.format(input_type.upper(), stream))
lgr.debug("Taking %s fields from first line as headers: %s",
len(headers), headers)
idx_map = dict(enumerate(headers))
Expand Down Expand Up @@ -438,7 +440,7 @@ def extract(stream, input_type, url_format="{0}", filename_format="{1}",
----------
stream : file object
Items used to construct the file names and URLs.
input_type : {'csv', 'json'}
input_type : {'csv', 'tsv', 'json'}
All other parameters match those described in `AddUrls`.
Expand Down Expand Up @@ -587,10 +589,11 @@ class Addurls(Interface):
*Format specification*
Several arguments take format strings. These are similar to normal Python
format strings where the names from `URL-FILE` (column names for a CSV or
properties for JSON) are available as placeholders. If `URL-FILE` is a CSV
file, a positional index can also be used (i.e., "{0}" for the first
column). Note that a placeholder cannot contain a ':' or '!'.
format strings where the names from `URL-FILE` (column names for a comma-
or tab-separated file or properties for JSON) are available as
placeholders. If `URL-FILE` is a CSV or TSV file, a positional index can
also be used (i.e., "{0}" for the first column). Note that a placeholder
cannot contain a ':' or '!'.
In addition, the `FILENAME-FORMAT` argument has a few special
placeholders.
Expand Down Expand Up @@ -682,10 +685,11 @@ class Addurls(Interface):
metavar="URL-FILE",
doc="""A file that contains URLs or information that can be used to
construct URLs. Depending on the value of --input-type, this
should be a CSV file (with a header as the first row) or a JSON
file (structured as a list of objects with string values). If '-',
read from standard input, taking the content as JSON when
--input-type is at its default value of 'ext'."""),
should be a comma- or tab-separated file (with a header as the
first row) or a JSON file (structured as a list of objects with
string values). If '-', read from standard input, taking the
content as JSON when --input-type is at its default value of
'ext'."""),
urlformat=Parameter(
args=("urlformat",),
metavar="URL-FORMAT",
Expand All @@ -705,10 +709,10 @@ class Addurls(Interface):
input_type=Parameter(
args=("-t", "--input-type"),
metavar="TYPE",
doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
file. The default value, "ext", means to consider `URL-FILE` as a
JSON file if it ends with ".json". Otherwise, treat it as a CSV
file.""",
doc="""Whether `URL-FILE` should be considered a CSV file, TSV
file, or JSON file. The default value, "ext", means to consider
`URL-FILE` as a JSON file if it ends with ".json" or a TSV file if
it ends with ".tsv". Otherwise, treat it as a CSV file.""",
constraints=EnsureChoice(*INPUT_TYPES)),
exclude_autometa=Parameter(
args=("-x", "--exclude_autometa"),
Expand Down Expand Up @@ -809,7 +813,12 @@ def __call__(dataset, urlfile, urlformat, filenameformat,
input_type = "json"
else:
extension = os.path.splitext(url_file)[1]
input_type = "json" if extension == ".json" else "csv"
if extension == ".json":
input_type = "json"
elif extension == ".tsv":
input_type = "tsv"
else:
input_type = "csv"

fd = sys.stdin if url_file == "-" else open(url_file)
try:
Expand Down
7 changes: 5 additions & 2 deletions datalad/plugin/tests/test_addurls.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,9 @@ def check_extract_csv_json_equal(input_type):
eq_(json_output, csv_output)


def test_extract_csv_json_equal():
def test_extract_csv_tsv_json_equal():
yield check_extract_csv_json_equal, "csv"
yield check_extract_csv_json_equal, "tsv"


def test_extract_wrong_input_type():
Expand Down Expand Up @@ -667,10 +668,11 @@ def test_addurls_invalid_input(self, path):
assert_in("Failed to read", str(exc.exception))

@with_tree({"in.csv": "url,name,subdir",
"in.tsv": "url\tname\tsubdir",
"in.json": "[]"})
def test_addurls_no_rows(self, path):
ds = Dataset(path).create(force=True)
for fname in ["in.csv", "in.json"]:
for fname in ["in.csv", "in.tsv", "in.json"]:
with swallow_logs(new_level=logging.WARNING) as cml:
assert_in_results(
ds.addurls(fname, "{url}", "{name}"),
Expand Down Expand Up @@ -706,6 +708,7 @@ def make_delim_text(delim):
[row.format(**rec) for rec in json.loads(json_text)])

yield make_test(make_delim_text(","), "csv", "csv,csv input type")
yield make_test(make_delim_text("\t"), "tsv", "tsv,tsv input type")

@with_tempfile(mkdir=True)
def test_addurls_stdin_input_command_line(self, path):
Expand Down

0 comments on commit 5d6e4fd

Please sign in to comment.