Merge pull request #6518 from christian-monch/enh-ng-metadata-search
Support next generation metadata in search
yarikoptic committed Jul 6, 2022
2 parents 7d615d3 + b7628b0 commit 65ef546
Showing 4 changed files with 389 additions and 34 deletions.
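The user-visible effect of this change is a new `metadata_source` parameter on `datalad search` (exposed as the `--metadata-source` command-line option; see the `search.py` diff below). A minimal usage sketch via the Python API — the query string and dataset path are assumed for illustration, not part of this diff:

    from datalad.api import search

    # restrict the search to new-generation (metalad >= 0.3.0) metadata;
    # with metadata_source=None (the default) all known sources are queried
    for res in search('photometry', dataset='.', metadata_source='gen4'):
        print(res['path'])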
2 changes: 1 addition & 1 deletion .travis.yml
@@ -73,7 +73,7 @@ matrix:
      env:
        - PYTEST_SELECTION_OP=not
        - DATALAD_SSH_MULTIPLEX__CONNECTIONS=0
        - _DL_ANNEX_INSTALL_SCENARIO="miniconda --batch git-annex=8.20210310 -m conda"
        - _DL_ANNEX_INSTALL_SCENARIO="miniconda --batch git-annex=10.20220525 -m conda"
    - python: 3.7
      env:
        - PYTEST_SELECTION_OP=""
170 changes: 166 additions & 4 deletions datalad/metadata/metadata.py
@@ -19,6 +19,12 @@
from collections import (
    OrderedDict,
)
from pathlib import Path
from typing import (
    Dict,
    List,
    Optional,
)

from datalad import cfg
from datalad.interface.annotate_paths import _minimal_annotate_paths
@@ -70,6 +76,21 @@
from datalad.log import log_progress
from datalad.core.local.status import get_paths_by_ds

# Check availability of new-generation metadata
try:
    from datalad_metalad.dump import Dump
    from datalad_metalad.exceptions import NoMetadataStoreFound
    next_generation_metadata_available = True
except ImportError:
    class NoMetadataStoreFound(Exception):
        pass

    class Dump:
        def __call__(self, *args, **kwargs):
            return []
    next_generation_metadata_available = False


lgr = logging.getLogger('datalad.metadata.metadata')

aggregate_layout_version = 1
@@ -172,11 +193,11 @@ def _get_containingds_from_agginfo(info, rpath):
    return dspath


def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
def legacy_query_aggregated_metadata(reporton, ds, aps, recursive=False,
                                     **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    Query paths (`annotated_paths`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.
@@ -185,7 +206,7 @@ def query_aggregated_metadata(reporton, ds, aps, recursive=False,
    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
    reporton : {None, 'none', 'datasets', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
@@ -1025,3 +1046,144 @@ def custom_result_renderer(res, **kwargs):
            if meta else ' -' if 'metadata' in res else ' aggregated',
            tags='' if 'tag' not in meta else ' [{}]'.format(
                ','.join(ensure_list(meta['tag'])))))


def gen4_query_aggregated_metadata(reporton: str,
                                   ds: Dataset,
                                   aps: List[Dict],
                                   recursive: bool = False,
                                   **kwargs):
"""Query metadata in a metadata store
Query paths (`aps["path"]`) have to be contained in the poth of the ds.
This requirement is due to the colling conventions of the legacy
implementation.
This function doesn't cache anything, hence the caller must
make sure to only call this once per dataset to avoid waste.
Parameters
----------
reporton : {None, 'none', 'datasets', 'files', 'all'}
If `None`, reporting will be based on the `type` property of the
incoming annotated paths.
ds : Dataset
Dataset to query
aps : list
Sequence of annotated paths to query metadata for.
recursive : bool
Whether or not to report metadata underneath all query paths
recursively.
**kwargs
Any other argument will be passed on to the query result dictionary.
Returns
-------
generator
Of result dictionaries.
"""

    annotated_paths = aps
    dataset = ds

    matching_types = {
        None: None,
        "files": ("file",),
        "datasets": ("dataset",),
        "all": ("dataset", "file")
    }[reporton]

    for annotated_path in annotated_paths:
        relative_path = Path(annotated_path["path"]).relative_to(dataset.pathobj)
        if matching_types is None:
            matching_types = (annotated_path["type"],)

        try:
            for dump_result in Dump()(dataset=dataset.pathobj,
                                      path=str(relative_path),
                                      recursive=recursive,
                                      result_renderer="disabled",
                                      return_type="generator"):

                if dump_result["status"] != "ok":
                    continue

                metadata = dump_result["metadata"]
                if metadata["type"] not in matching_types:
                    continue

                yield {
                    **kwargs,
                    "status": "ok",
                    "type": metadata["type"],
                    "path": str(dump_result["path"]),
                    "dsid": metadata["dataset_id"],
                    "refcommit": metadata["dataset_version"],
                    "metadata": {
                        metadata["extractor_name"]: metadata["extracted_metadata"]
                    }
                }
        except NoMetadataStoreFound:
            yield {
                **kwargs,
                'path': str(ds.pathobj / relative_path),
                'status': 'impossible',
                'message': f'Dataset at {ds.pathobj} does not contain gen4 '
                           f'metadata',
                'type': matching_types
            }

    return None
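For orientation, a record yielded by this generator would look roughly like the following — all values are assumed placeholders, shaped by the `yield` above:

    {
        'status': 'ok',
        'type': 'dataset',                  # filtered via `matching_types`
        'path': '/tmp/ds',                  # the queried (sub-)path
        'dsid': '84069fa5-...',             # metalad's `dataset_id`
        'refcommit': '0123abc...',          # metalad's `dataset_version`
        'metadata': {
            'metalad_core': {...},          # keyed by extractor name
        },
    }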


def query_aggregated_metadata(reporton: str,
                              ds: Dataset,
                              aps: List[Dict],
                              recursive: bool = False,
                              metadata_source: Optional[str] = None,
                              **kwargs):
"""Query legacy and NG-metadata stored in a dataset or its metadata store
Parameters
----------
reporton : {None, 'none', 'datasets', 'files', 'all'}
If `None`, reporting will be based on the `type` property of the
incoming annotated paths.
ds : Dataset
Dataset to query
aps : list
Sequence of annotated paths to query metadata for.
recursive : bool
Whether or not to report metadata underneath all query paths
recursively.
metadata_source : Optional[str]
Metadata source that should be used. If set to "legacy", only metadata
prior metalad version 0.3.0 will be queried, if set to "gen4", only
metadata of metalad version 0.3.0 and beyond will be queried, if set
to 'None', all known metadata will be queried.
**kwargs
Any other argument will be passed on to the query result dictionary.
Returns
-------
generator
Of result dictionaries.
"""

    if metadata_source in (None, "legacy"):
        yield from legacy_query_aggregated_metadata(
            reporton=reporton,
            ds=ds,
            aps=aps,
            recursive=recursive,
            **kwargs
        )

    if metadata_source in (None, "gen4") and next_generation_metadata_available:
        yield from gen4_query_aggregated_metadata(
            reporton=reporton,
            ds=ds,
            aps=aps,
            recursive=recursive,
            **kwargs
        )
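A short consumption sketch of the combined dispatcher above — the dataset path is assumed, and the call mirrors how `search.py` invokes it below:

    from datalad.distribution.dataset import Dataset
    from datalad.metadata.metadata import query_aggregated_metadata

    ds = Dataset('/tmp/ds')
    # metadata_source=None yields legacy results first, then gen4 results
    # (the latter only if datalad-metalad is installed)
    for res in query_aggregated_metadata(
            reporton='datasets',
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            recursive=True,
            metadata_source=None):
        print(res['path'], sorted(res.get('metadata', {})))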
80 changes: 51 additions & 29 deletions datalad/metadata/search.py
@@ -11,7 +11,6 @@

__docformat__ = 'restructuredtext'

import collections
import logging
from datalad.log import log_progress
lgr = logging.getLogger('datalad.metadata.search')
@@ -26,18 +25,29 @@
from time import time

from datalad import cfg
from datalad.interface.base import Interface
from datalad.interface.base import build_doc
from datalad.interface.utils import eval_results
from datalad.consts import SEARCH_INDEX_DOTGITDIR
from datalad.distribution.dataset import Dataset
from datalad.distribution.dataset import datasetmethod, EnsureDataset, \
    require_dataset
from datalad.support.gitrepo import GitRepo
from datalad.distribution.dataset import (
    datasetmethod,
    EnsureDataset,
    require_dataset,
)
from datalad.dochelpers import single_or_plural
from datalad.interface.base import (
    Interface,
    build_doc,
)
from datalad.interface.utils import eval_results
from datalad.support.constraints import (
    EnsureInt,
    EnsureNone,
)
from datalad.support.exceptions import (
    CapturedException,
    NoDatasetFound,
)
from datalad.support.param import Parameter
from datalad.support.constraints import EnsureNone
from datalad.support.constraints import EnsureInt

from datalad.consts import SEARCH_INDEX_DOTGITDIR
from datalad.ui import ui
from datalad.utils import (
    as_unicode,
    ensure_list,
@@ -46,12 +56,6 @@
    shortened_repr,
    unicode_srctypes,
)
from datalad.support.exceptions import (
    CapturedException,
    NoDatasetFound,
)
from datalad.ui import ui
from datalad.dochelpers import single_or_plural
from datalad.metadata.metadata import query_aggregated_metadata

# TODO: consider using plain as_unicode, without restricting
@@ -309,8 +313,9 @@ def _search_from_virgin_install(dataset, query):


class _Search(object):
    def __init__(self, ds, **kwargs):
    def __init__(self, ds, metadata_source=None, **kwargs):
        self.ds = ds
        self.metadata_source = metadata_source
        self.documenttype = self.ds.config.obtain(
            'datalad.search.index-{}-documenttype'.format(self._mode_label),
            default=self._default_documenttype)
@@ -345,8 +350,8 @@ def get_nohits_msg(self):


class _WhooshSearch(_Search):
    def __init__(self, ds, force_reindex=False, **kwargs):
        super(_WhooshSearch, self).__init__(ds, **kwargs)
    def __init__(self, ds, metadata_source=None, force_reindex=False, **kwargs):
        super(_WhooshSearch, self).__init__(ds, metadata_source, **kwargs)

        self.idx_obj = None
        # where does the bunny have the eggs?
@@ -505,7 +510,8 @@ def _mk_search_index(self, force_reindex):
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
                recursive=True,
                metadata_source=self.metadata_source):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
@@ -624,7 +630,8 @@ def __call__(self, query, max_nresults=None, force_reindex=False, full_record=False):
                    ds=self.ds,
                    aps=annotated_hits,
                    # never recursive, we have direct hits already
                    recursive=False):
                    recursive=False,
                    metadata_source=self.metadata_source):
                res.update(
                    refds=self.ds.path,
                    action='search',
@@ -728,7 +735,8 @@ def _mk_schema(self, dsinfo):
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
                recursive=True,
                metadata_source=self.metadata_source):
            meta = res.get('metadata', {})
            # no stringification of values for speed, we do not need/use the
            # actual values at this point, only the keys
@@ -765,8 +773,8 @@ class _EGrepCSSearch(_Search):
    _mode_label = 'egrepcs'
    _default_documenttype = 'datasets'

    def __init__(self, ds, **kwargs):
        super(_EGrepCSSearch, self).__init__(ds, **kwargs)
    def __init__(self, ds, metadata_source=None, **kwargs):
        super(_EGrepCSSearch, self).__init__(ds, metadata_source, **kwargs)
        self._queried_keys = None  # to be memoized by get_query

    # If there were custom "per-search engine" options, we could expose
@@ -785,7 +793,8 @@ def __call__(self, query, max_nresults=None, consider_ucn=False, full_record=True):
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
                recursive=True,
                metadata_source=self.metadata_source):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
@@ -913,7 +922,8 @@ def __init__(self):
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
                recursive=True,
                metadata_source=self.metadata_source):
            meta = res.get('metadata', {})
            # inject a few basic properties into the dict
            # analog to what the other modes do in their index
@@ -1304,6 +1314,16 @@ class Search(Interface):
doc="""if given, the formal query that was generated from the given
query string is shown, but not actually executed. This is mostly useful
for debugging purposes."""),
metadata_source=Parameter(
args=('--metadata-source',),
choices=('legacy', 'gen4'),
default=None,
doc="""if given, defines which metadata source will be used to
search. 'legacy' will limit search to metadata in the old format,
i.e. stored in '$DATASET/.datalad/metadata'. 'gen4' will limit
search to metadata stored by the git-backend of
'datalad-metadata-model'. If not given, metadata from all supported
sources will be included in search.""")
)

    @staticmethod
@@ -1317,7 +1337,8 @@ def __call__(query=None,
                 mode=None,
                 full_record=False,
                 show_keys=None,
                 show_query=False):
                 show_query=False,
                 metadata_source=None):
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
@@ -1347,7 +1368,8 @@ def __call__(query=None,
            raise ValueError(
                'unknown search mode "{}"'.format(mode))

        searcher = searcher(ds, force_reindex=force_reindex)
        searcher = searcher(
            ds, metadata_source=metadata_source, force_reindex=force_reindex)

        if show_keys:
            searcher.show_keys(show_keys, regexes=query)
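One behavioral detail that follows from the `metadata.py` changes above: a missing gen4 metadata store does not abort a query. `gen4_query_aggregated_metadata()` catches `NoMetadataStoreFound` and yields an 'impossible' result per annotated path instead, so `search` degrades gracefully. A consumer sketch, assuming `datalad-metalad` is installed but the dataset (path assumed) has no gen4 store:

    from datalad.distribution.dataset import Dataset
    from datalad.metadata.metadata import query_aggregated_metadata

    ds = Dataset('/tmp/ds-without-gen4-store')
    for res in query_aggregated_metadata(
            reporton='all',
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            metadata_source='gen4'):
        if res['status'] == 'impossible':
            # e.g. "Dataset at /tmp/ds-without-gen4-store does not
            # contain gen4 metadata"
            print(res['message'])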
