Skip to content

Commit

Permalink
remove redefinition of legacy extractor base-class
Browse files Browse the repository at this point in the history
This commit removes the definition of the legacy
extractor base class `BaseMetadataExtractor` from
`datalad_metalad.extractors.base`. It is now imported from
`datalad_deprecated.metadata.extractors.base` and
augmented with a generation ID.

This should fix mixups with identical class names
from different packages.
  • Loading branch information
christian-monch committed Jan 23, 2024
1 parent 9d44e78 commit 0892fa7
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 55 deletions.
2 changes: 2 additions & 0 deletions datalad_metalad/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,8 @@ def legacy_get_file_info(dataset: Dataset,

def legacy_extract_file(ea: ExtractionArguments) -> Iterable[dict]:

import sys
print(repr(ea), file=sys.stderr)
if issubclass(ea.extractor_class, MetadataExtractor):

# Call metalad legacy extractor with a single status record.
Expand Down
62 changes: 7 additions & 55 deletions datalad_metalad/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
from uuid import UUID

from datalad.distribution.dataset import Dataset
# XXX this is the legacy-legacy interface, keep around for a bit more and then
# remove
from datalad_deprecated.metadata.extractors.base import BaseMetadataExtractor


# Add a generation identifier to the old extractor base class. The class is
# imported from `datalad_deprecated` rather than defined in this module, so
# the attribute is set on the imported class object itself.
BaseMetadataExtractor.__generation__ = 2


@dataclasses.dataclass
Expand Down Expand Up @@ -350,58 +357,3 @@ def get_state(self, dataset):
object instance is passed via the method's `dataset` argument.
"""
return {}


# XXX this is the legacy-legacy interface, keep around for a bit more and then
# remove
class BaseMetadataExtractor:
    """Base class of the legacy (generation 2) metadata extractor interface.

    Subclasses provide the actual metadata by implementing
    `_get_dataset_metadata()` and `_get_content_metadata()`. Both raise
    `NotImplementedError` in this base class. `get_metadata()` combines
    the two.
    """

    # Generation identifier of this extractor interface
    __generation__ = 2

    NEEDS_CONTENT = True  # majority of the extractors need data content

    def __init__(self, ds, paths):
        """
        Parameters
        ----------
        ds : dataset instance
          Dataset to extract metadata from.
        paths : list
          Paths to investigate when extracting content metadata
        """

        self.ds = ds
        self.paths = paths

    def get_metadata(self, dataset=True, content=True):
        """Return dataset metadata and content metadata.

        Parameters
        ----------
        dataset : bool
          If true, call `_get_dataset_metadata()` and return its result as
          the first element, otherwise the first element is None.
        content : bool
          If true, call `_get_content_metadata()` and return a generator
          over its `(key, value)` pairs as the second element, otherwise
          the second element is None.

        Returns
        -------
        dict or None, generator or None
          Dataset metadata dict, and a generator of `(key, value)` pairs
          taken from `_get_content_metadata()`. Each return value is None
          if the respective kind of metadata was not requested.
        """
        # default implementation
        # NOTE: the dataset part is evaluated first. The call to
        # `_get_content_metadata()` happens right here (not lazily), only
        # the iteration over its result is deferred to the consumer.
        return (
            self._get_dataset_metadata() if dataset else None,
            ((k, v) for k, v in self._get_content_metadata())
            if content else None,
        )

    def _get_dataset_metadata(self):
        """
        Returns
        -------
        dict
          keys and values are arbitrary

        Raises
        ------
        NotImplementedError
          Always, in this base class; subclasses have to override.
        """
        raise NotImplementedError

    def _get_content_metadata(self):
        """Get ALL metadata for all dataset content.

        Possibly limited to the paths given to the extractor.

        Returns
        -------
        generator((location, metadata_dict))

        Raises
        ------
        NotImplementedError
          Always, in this base class; subclasses have to override.
        """
        raise NotImplementedError

0 comments on commit 0892fa7

Please sign in to comment.