Skip to content

Commit

Permalink
RF+BF: centralize aggregation of dataset locations
Browse files Browse the repository at this point in the history
plus quite a few fixes
  • Loading branch information
mih committed Sep 5, 2016
1 parent 20fd941 commit 5f670c2
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 21 deletions.
39 changes: 35 additions & 4 deletions datalad/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""Metadata handling (parsing, storing, querying)"""


from six import string_types
from os.path import join as opj, exists, relpath
from importlib import import_module
from datalad.distribution.dataset import Dataset
Expand Down Expand Up @@ -146,6 +147,13 @@ def _get_implicit_metadata(ds, ds_identifier=None, subdatasets=None):
return meta


def is_implicit_metadata(meta):
    """Test whether a meta data record is datalad's own implicit meta data.

    Detection is based on the record's standard-conformance statement
    pointing at the datalad metadata documentation.
    """
    conforms_to = meta.get('dcterms:conformsTo', '')
    # non-string conformance statements (e.g. lists) are never ours
    if not isinstance(conforms_to, string_types):
        return False
    return conforms_to.startswith(
        'http://docs.datalad.org/metadata.html#v')


def _simplify_meta_data_structure(meta):
# get a list of terms from any possible source
if isinstance(meta, list) and len(meta) == 1:
Expand All @@ -167,6 +175,26 @@ def _simplify_meta_data_structure(meta):
return meta


def _adjust_subdataset_location(meta, subds_relpath):
    """Prefix location info of subdataset parts with the subdataset's path.

    Modifies the implicit meta data records in `meta` in-place: any
    'location' value found underneath a 'dcterms:hasPart' entry is
    re-expressed relative to the superdataset by joining it with
    `subds_relpath`. Non-implicit meta data records are left untouched.

    Parameters
    ----------
    meta : list
      Meta data records (dicts) obtained for a subdataset.
    subds_relpath : str
      Path of the subdataset, relative to its superdataset.
    """
    # find implicit meta data for all contained subdatasets
    for m in meta:
        # skip non-implicit; only our own records carry location info
        # in this layout
        if not is_implicit_metadata(m):
            continue
        if 'dcterms:hasPart' not in m:
            continue
        # prefix all subdataset location information with the relpath of
        # this subdataset
        parts = m['dcterms:hasPart']
        if not isinstance(parts, list):
            parts = [parts]
        for p in parts:
            if 'location' not in p:
                continue
            # key presence was just checked, no default needed
            loc = p['location']
            # do not prefix when the location already is the subdataset
            # path itself (avoid duplicating the prefix)
            if loc != subds_relpath:
                p['location'] = opj(subds_relpath, loc)


# XXX might become its own command
def get_metadata(ds, guess_type=False, ignore_subdatasets=False,
ignore_cache=False):
Expand Down Expand Up @@ -220,10 +248,12 @@ def get_metadata(ds, guess_type=False, ignore_subdatasets=False,
subds_path = relpath(subds.path, ds.path)
if ignore_cache and subds.is_installed():
# simply pull meta data from actual subdataset and go to next part
meta.extend(
get_metadata(subds, guess_type=guess_type,
ignore_subdatasets=False,
ignore_cache=True))
subds_meta = get_metadata(
subds, guess_type=guess_type,
ignore_subdatasets=False,
ignore_cache=True)
_adjust_subdataset_location(subds_meta, subds_path)
meta.extend(subds_meta)
continue

# we need to look for any aggregated meta data
Expand All @@ -242,6 +272,7 @@ def get_metadata(ds, guess_type=False, ignore_subdatasets=False,
# compact/flatten at the end. However assuming a single context
# we can cheat.
subds_meta = _simplify_meta_data_structure(subds_meta)
_adjust_subdataset_location(subds_meta, subds_path)

# make sure we have a meaningful @id for any subdataset in hasPart,
# regardless of whether it is installed or not. This is needed to
Expand Down
19 changes: 2 additions & 17 deletions datalad/metadata/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@
from ..support.constraints import EnsureNone
from ..log import lgr
from . import get_metadata, get_native_metadata, metadata_filename, \
metadata_basepath
metadata_basepath, is_implicit_metadata
from datalad.support.json_py import dump as jsondump
from datalad.support.dsconfig import ConfigManager
from six import string_types


def _store_json(path, meta):
Expand Down Expand Up @@ -145,25 +144,11 @@ def __call__(dataset, guess_native_type=False, save=False, recursive=False,
# find implicit meta data for all contained subdatasets
for m in subds_meta:
# skip non-implicit
std_spec = m.get('dcterms:conformsTo', '')
if not (isinstance(std_spec, string_types)
and std_spec.startswith('http://docs.datalad.org/metadata.html#v')):
if not is_implicit_metadata(m):
continue
if m.get('@id', None) == subds.id:
# register relation to dataset being aggregated into
m['dcterms:isPartOf'] = dataset.id
# prefix all subdataset location information with the relpath of this
# subdataset
if 'dcterms:hasPart' in m:
parts = m['dcterms:hasPart']
if not isinstance(parts, list):
parts = [parts]
for p in parts:
if not 'location' in p:
continue
loc = p.get('location', subds_relpath)
if loc != subds_relpath:
p['location'] = opj(subds_relpath, loc)
_store_json(
opj(metapath, subds_relpath),
subds_meta)
Expand Down

0 comments on commit 5f670c2

Please sign in to comment.