Skip to content

Commit

Permalink
Merge pull request #361 from datalad/issue-356
Browse files Browse the repository at this point in the history
ENH: Allow `DatasetMetadataExtractor.get_required_content()` to yield a generator
  • Loading branch information
christian-monch committed Mar 7, 2023
2 parents 4d926fb + 58c51e5 commit c741d49
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 18 deletions.
22 changes: 17 additions & 5 deletions datalad_metalad/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,11 +458,23 @@ def perform_dataset_metadata_extraction(ep: ExtractionArguments,
}

# Let the extractor get the files it requires
if extractor.get_required_content() is False:
yield {
"status": "impossible",
**result_template
}
# Handle both return possibilities of bool and Generator
res = extractor.get_required_content()
if isinstance(res, bool):
if res is False:
yield {
"status": "impossible",
**result_template
}
return
else:
failure_count = 0
for r in res:
if r["status"] in ("error", "impossible"):
failure_count += 1
yield r
if failure_count > 0:
return

# Process results
result = extractor.extract(None)
Expand Down
30 changes: 21 additions & 9 deletions datalad_metalad/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
import enum
from typing import (
Any,
IO,
Dict,
Generator,
IO,
List,
Optional,
Union,
Expand Down Expand Up @@ -158,17 +159,28 @@ def __init__(self,
self.ref_commit = ref_commit
self.parameter = parameter or {}

def get_required_content(self) -> bool:
"""
Let the extractor get the content that it needs locally.
The default implementation is to do nothing.
def get_required_content(self) -> Union[bool, Generator]:
"""Let the extractor get the content that it needs locally.
The default implementation is to do nothing and return True.
Extractors that override this function can either return a boolean
(True/False) value OR yield DataLad result records.
Returns
-------
True if all required content could be fetched, False
otherwise. If False is returned, the extractor
infrastructure will signal an error and the extractor's
extract method will not be called.
bool
True if all required content could be fetched, False
otherwise. If False is returned, the extractor
infrastructure will signal an error and the extractor's
extract method will not be called.
Yields
------
dict
DataLad result records. If a result record is yielded
with a failure 'status' (i.e. equal to 'impossible' or
'error') the extractor infrastructure will signal an error
and the extractor's extract method will not be called.
"""
return True

Expand Down
1 change: 1 addition & 0 deletions datalad_metalad/tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ def __init__(self, dataset, ref_commit, parameter):

# Test stub: sets ``required_content_called`` to True (presumably so the
# test can verify that this hook was invoked) and returns True.
def get_required_content(self):
self.required_content_called = True
return True

# Test stub: returns a fixed UUID built from the integer 10.
def get_id(self) -> UUID:
return UUID(int=10)
Expand Down
47 changes: 43 additions & 4 deletions docs/source/user_guide/writing-extractors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,55 @@ This function is used in dataset-level extractors only.
It will be called by MetaLad prior to metadata extraction.
Its purpose is to allow the extractor to ensure that content that is required for metadata extraction is present
(relevant, for example, if some of the files to be inspected may be annexed).
The function should return ``True`` if it has obtained the required content, or confirmed its presence.
If it returns ``False``, metadata extraction will not proceed.

The function should either return a boolean value (``True`` or ``False``) or act as a generator that yields
`DataLad result records`_. In the case of a boolean value, the function should return ``True`` if
it has obtained the required content, or confirmed its presence. If it returns ``False``,
metadata extraction will not proceed. Alternatively, yielding result records provides extractors with
the capability to signal more expressive messages or errors. If a result record is yielded with a failure
status (i.e. with ``status`` equal to ``impossible`` or ``error``) metadata extraction will not proceed.

This function can be a place to call ``dataset.get()``.
It is advisable to disable result rendering (``result_renderer="disabled"``), because during metadata extraction, users will typically want to redirect standard output to a file or another command.
It is advisable to disable result rendering (``result_renderer="disabled"``), because during metadata
extraction, users will typically want to redirect standard output to a file or another command.

Example::
Example 1::

def get_required_content(self) -> bool:
    """Fetch ``CITATION.cff`` and report whether it is available locally.

    Returns
    -------
    bool
        True if ``get()`` produced at least one result record and every
        record has the status ``ok`` or ``notneeded``, False otherwise.
    """
    # ``on_failure="ignore"`` makes ``get()`` return failed records instead
    # of raising, so that a failure can be reported by returning False.
    records = self.dataset.get(
        "CITATION.cff",
        result_renderer="disabled",
        on_failure="ignore",
    )
    # Check all records, not only the first one: a single ``get()`` call
    # may report on more than one item.
    return bool(records) and all(
        record["status"] in ("ok", "notneeded") for record in records
    )

Example 2::

from typing import Generator
def get_required_content(self) -> Generator:
    """Fetch ``CITATION.cff`` and pass every result record on to MetaLad.

    Yields
    ------
    dict
        Each DataLad result record produced by ``get()``.
    """
    # ``get()`` returns a list of result records; ``yield from`` emits them
    # one by one instead of yielding the whole list as a single item.
    # ``on_failure="ignore"`` keeps failed records in that list instead of
    # raising, so they can be yielded and evaluated by the caller.
    yield from self.dataset.get(
        "CITATION.cff",
        result_renderer="disabled",
        on_failure="ignore",
    )

Example 3::

from typing import Generator
def get_required_content(self) -> Generator:
    """Fetch ``CITATION.cff`` and yield one summarizing result record.

    Yields
    ------
    dict
        A single result record for the dataset. Its ``status`` is
        ``error`` if any record returned by ``get()`` has the status
        ``error`` or ``impossible``, and ``ok`` otherwise.
    """
    # ``on_failure='ignore'`` makes ``get()`` return failed records instead
    # of raising, so that they can be counted below.
    result = self.dataset.get(
        'CITATION.cff',
        result_renderer='disabled',
        on_failure='ignore',
    )
    failure_count = sum(
        1 for r in result if r['status'] in ('error', 'impossible')
    )
    result_dict = dict(
        path=self.dataset.path,
        type='dataset',
    )
    if failure_count > 0:
        result_dict.update({
            'status': 'error',
            'message': 'could not retrieve required content',
        })
    else:
        result_dict.update({
            'status': 'ok',
            'message': 'required content retrieved',
        })
    yield result_dict

``is_content_required()``
-------------------------

Expand Down Expand Up @@ -241,3 +278,5 @@ For example, a list of files with a given extension (including those in subfolde

files = list(self.dataset.repo.call_git_items_(["ls-files", "*.xyz"]))


.. _DataLad result records: https://docs.datalad.org/en/stable/design/result_records.html

0 comments on commit c741d49

Please sign in to comment.