ENH: addurls: Add argument for setting metadata (via git-annex)
kyleam committed Feb 20, 2018
1 parent 49de6ea commit 037e19d
Showing 2 changed files with 90 additions and 13 deletions.
78 changes: 69 additions & 9 deletions datalad/plugin/addurls.py
@@ -60,7 +60,36 @@ def get_value(self, key, args, kwargs):
            key, args, kwargs)


-def extract(stream, input_type, filename_format, url_format):
def clean_meta_args(args):
    """Prepare formatted metadata arguments to be passed to git-annex.

    Parameters
    ----------
    args : iterable of str
        Formatted metadata arguments for 'git-annex metadata --set'.

    Returns
    -------
    Generator that yields processed arguments (str).
    """
    for arg in args:
        parts = [x.strip() for x in arg.split("=", 1)]
        if len(parts) == 2:
            if not parts[0]:
                raise ValueError("Empty field name")
            field, value = parts
        else:
            field = "tag"
            value = parts[0]

        if not value:
            # The `url_file` may have an empty value.
            continue

        yield field + "=" + value
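
# For example, a hypothetical call (mirroring the tests below):
#
#   >>> list(clean_meta_args(["location=foo", " atag ", "empty="]))
#   ['location=foo', 'tag=atag']
#
# A plain value becomes a tag, entries with empty values are skipped,
# and an empty field name such as "=foo" raises ValueError.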


def extract(stream, input_type, filename_format, url_format, meta):
    """Extract and format information from `url_file`.

    Parameters
@@ -98,21 +127,26 @@ def extract(stream, input_type, filename_format, url_format):
    fmt = Formatter(colidx_to_name)
    format_filename = partial(fmt.format, filename_format)
    format_url = partial(fmt.format, url_format)
    # Unlike `filename_format` and `url_format`, `meta` is a list
    # because meta may be given multiple times on the command line.
    formats_meta = [partial(fmt.format, m) for m in meta]

    for row in rows:
        url = format_url(row)
        filename = format_filename(row)

        meta_args = list(clean_meta_args(fmt(row) for fmt in formats_meta))

        subpaths = []
        if "//" in filename:
            for part in filename.split("//")[:-1]:
                subpaths.append(os.path.join(*(subpaths + [part])))
            filename = filename.replace("//", os.path.sep)
-        yield filename, url, subpaths
        yield filename, url, meta_args, subpaths
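
# Illustrative output for one row (taken from the test data below): with
# filename_format "{age_group}//{now_dead}//{name}.csv", url_format
# "{name}_{debut_season}.com", and meta ["group={age_group}"], the row
# {"name": "will", "debut_season": 1, "age_group": "kid", "now_dead": "no"}
# yields ("kid/no/will.csv", "will_1.com", ["group=kid"], ["kid", "kid/no"]).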


def dlplugin(dataset=None, url_file=None, input_type="ext",
-            url_format="{0}", filename_format="{1}",
             url_format="{0}", filename_format="{1}", meta=None,
             message=None, dry_run=False, fast=False):
"""Create and update a dataset from a list of URLs.
@@ -148,6 +182,14 @@ def dlplugin(dataset=None, url_file=None, input_type="ext",
        may contain directories. The separator "//" can be used to
        indicate that the left-side directory should be created as a
        new subdataset.
    meta : str, optional
        A format string that specifies metadata. It should be
        structured as "<field>=<value>". The same placeholders from
        `url_format` can be used. As an example, "location={3}" would
        mean that the value for the "location" metadata field should
        be set to the value of the fourth column. A plain value is
        shorthand for "tag=<value>". This option can be given
        multiple times.
    message : str, optional
        Use this message when committing the URL additions.
    dry_run : bool, optional
@@ -197,9 +239,12 @@ def dlplugin(dataset=None, url_file=None, input_type="ext",
    from datalad.interface.results import get_status_dict
    import datalad.plugin.addurls as me
    from datalad.support.annexrepo import AnnexRepo
    from datalad.utils import assure_list

    lgr = logging.getLogger("datalad.plugin.addurls")

    meta = assure_list(meta)

    if url_file is None:
        # `url_file` is not a required argument in `dlplugin` because
        # the argument before it, `dataset`, needs to be optional to
@@ -215,12 +260,13 @@ def dlplugin(dataset=None, url_file=None, input_type="ext",
input_type = "json" if extension == ".json" else "csv"

    with open(url_file) as fd:
-        info = me.extract(fd, input_type, filename_format, url_format)
        info = me.extract(fd, input_type, filename_format, url_format, meta)

    if dry_run:
-        for fname, url, _ in info:
        for fname, url, meta, _ in info:
            lgr.info("Would download %s to %s",
                     url, os.path.join(dataset.path, fname))
            lgr.info("Metadata: %s", meta)
        yield get_status_dict(action="addurls",
                              ds=dataset,
                              status="ok",
@@ -240,8 +286,9 @@ def dlplugin(dataset=None, url_file=None, input_type="ext",
    annex_options = ["--fast"] if fast else []

    seen_subpaths = set()
-    to_add = []
-    for fname, url, subpaths in info:
    files_to_add = []
    meta_to_add = []
    for fname, url, meta, subpaths in info:
        for spath in subpaths:
            if spath not in seen_subpaths:
                if os.path.exists(spath):
@@ -275,13 +322,26 @@ def dlplugin(dataset=None, url_file=None, input_type="ext",
                              path=os.path.join(ds_current.path, fname),
                              status="ok")

-        to_add.append(fname)
        files_to_add.append(fname)
        meta_to_add.append((ds_current, ds_filename, meta))

    msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

-    for r in dataset.add(to_add, message=msg):
    for r in dataset.add(files_to_add, message=msg):
        yield r

    for ds, fname, meta in meta_to_add:
        lgr.debug("Adding metadata to %s in %s", fname, ds.path)
        for arg in meta:
            ds.repo._run_annex_command("metadata",
                                       annex_options=["--set", arg, fname])
        yield get_status_dict(action="addurls-metadata",
                              ds=ds,
                              type="file",
                              path=os.path.join(ds.path, fname),
                              message="added metadata",
                              status="ok")
25 changes: 21 additions & 4 deletions datalad/plugin/tests/test_addurls.py
@@ -62,6 +62,18 @@ def test_formatter_placeholder_nonpermitted_chars():
fmt.format, "{key:<5}", {"key:<5": "value0"})


def test_clean_meta_args():
    for args, expect in [(["", "field="], []),
                         ([" field=yes "], ["field=yes"]),
                         ([" atag "], ["tag=atag"]),
                         (["field= value="], ["field=value="])]:
        assert list(addurls.clean_meta_args(args)) == expect

    assert_raises(ValueError,
                  list,
                  addurls.clean_meta_args(["=value"]))


ST_DATA = {"header": ["name", "debut_season", "age_group", "now_dead"],
"rows": [{"name": "will", "debut_season": 1,
"age_group": "kid", "now_dead": "no"},
@@ -81,16 +93,20 @@ def json_stream(data):


def test_extract():
-    fnames, urls, subpaths = zip(*addurls.extract(
    fnames, urls, meta, subpaths = zip(*addurls.extract(
        json_stream(ST_DATA["rows"]), "json",
        "{age_group}//{now_dead}//{name}.csv",
-        "{name}_{debut_season}.com"))
        "{name}_{debut_season}.com",
        ["group={age_group}"]))

    assert urls == ("will_1.com", "bob_2.com", "scott_1.com", "max_2.com")

    assert fnames == ("kid/no/will.csv", "adult/yes/bob.csv",
                      "adult/no/scott.csv", "kid/no/max.csv")

    assert meta == (["group=kid"], ["group=adult"],
                    ["group=adult"], ["group=kid"])

    assert subpaths == (["kid", "kid/no"], ["adult", "adult/yes"],
                        ["adult", "adult/no"], ["kid", "kid/no"])

@@ -102,7 +118,8 @@ def test_extract_csv_json_equal():
                    for row in ST_DATA["rows"])

args = ["{age_group}//{now_dead}//{name}.csv",
"{name}_{debut_season}.com"]
"{name}_{debut_season}.com",
["group={age_group}"]]

json_output = addurls.extract(json_stream(ST_DATA["rows"]), "json", *args)
csv_output = addurls.extract(csv_rows, "csv", *args)
@@ -114,4 +131,4 @@ def test_extract_csv_json_equal():
def test_extract_wrong_input_type():
    assert_raises(ValueError,
                  list,
-                  addurls.extract(None, "not_csv_or_json", None, None))
                  addurls.extract(None, "not_csv_or_json", None, None, None))
