Showing with 28 additions and 11 deletions.
  1. +13 −4 datalad/crawler/nodes/annex.py
  2. +8 −4 datalad/crawler/pipeline.py
  3. +7 −3 datalad/crawler/pipelines/tests/test_openfmri_collection.py
@@ -71,7 +71,7 @@ class initiate_dataset(object):
"""
def __init__(self, template, dataset_name=None,
path=None, branch=None, backend=None,
template_func=None,
template_func=None, template_kwargs=None,
data_fields=[], add_fields={}, existing=None):
"""
Parameters
@@ -80,7 +80,11 @@ def __init__(self, template, dataset_name=None,
Which template (probably matching the superdataset name) to use.
TODO: refer to specs of template that it might understand some
arguments encoded, such as #func=custom_pipeline
dataset_name : str
template_func : str, optional
Explicitly specify the function name within template module
template_kwargs : dict, optional
Keyword arguments to pass into the `template_func`.
dataset_name : str, optional
Name of the dataset. If None, reacts on 'dataset_name' in data
path : str, optional
Path were to initiate the dataset. If not specified, would use
@@ -104,6 +108,8 @@ def __init__(self, template, dataset_name=None,
# TODO: add_fields might not be flexible enough for storing more elaborate
# configurations for e.g. "basic" template
self.template = template
self.template_func = template_func
self.template_kwargs = template_kwargs
self.dataset_name = dataset_name
self.data_fields = data_fields
self.add_fields = add_fields
@@ -136,13 +142,16 @@ def _initiate_dataset(self, path, name):
return repo

def _save_crawl_config(self, dataset_path, data):
    """Store the crawler pipeline configuration for an initiated dataset.

    Assembles the template keyword arguments and delegates to
    `initiate_pipeline_config`, committing the resulting config file.

    Parameters
    ----------
    dataset_path : str
      Path to the dataset under which the crawl config is saved.
    data : dict
      Data node from the pipeline; the fields listed in
      `self.data_fields` are copied from it into the template kwargs.
    """
    # Copy so later .update() calls cannot mutate the shared
    # self.template_kwargs dict across invocations
    kwargs = dict(self.template_kwargs or {})
    # update with those from data
    kwargs.update({f: data[f] for f in self.data_fields})
    # additional options given as a dictionary
    kwargs.update(self.add_fields)
    return initiate_pipeline_config(
        template=self.template,
        template_func=self.template_func,
        template_kwargs=kwargs,
        path=dataset_path,
        commit=True
    )

@@ -317,9 +317,10 @@ def _compare_dicts(d1, d2):
return added, changed, removed, maybe_changed


def initiate_pipeline_config(template, path=curdir, kwargs=None, commit=False):
def initiate_pipeline_config(template, template_func=None, template_kwargs=None,
path=curdir, commit=False):
"""
TODO
Create (and optionally commit) a crawler pipeline configuration file
under `path`, recording the template, its function, and its keyword
arguments.
"""
lgr.debug("Creating crawler configuration for template %s under %s",
template, path)
@@ -334,7 +335,10 @@ def initiate_pipeline_config(template, path=curdir, kwargs=None, commit=False):
cfg_.add_section(CRAWLER_PIPELINE_SECTION)

cfg_.set(CRAWLER_PIPELINE_SECTION, 'template', template)
for k, v in (kwargs or {}).items():
if template_func:
cfg_.set(CRAWLER_PIPELINE_SECTION, 'func', template_func)

for k, v in (template_kwargs or {}).items():
cfg_.set(CRAWLER_PIPELINE_SECTION, "_" + k, str(v))

with open(crawl_config, 'w') as f:
@@ -465,7 +469,7 @@ def load_pipeline_from_config(path):
func = pipeline1
_kwarg1 = 1
which would instantial pipeline from standard.py module by calling
which would instantiate a pipeline from standard.py module by calling
`standard.pipeline1` with `_kwarg1='1'`. This definition is identical to
[crawl:pipeline]
@@ -30,8 +30,10 @@
from ....tests.utils import ok_file_has_content
from ....tests.utils import ok_file_under_git
from ....distribution.dataset import Dataset
from ....consts import CRAWLER_META_CONFIG_PATH

from datalad.api import crawl
from ..openfmri import superdataset_pipeline as ofcpipeline

from logging import getLogger
@@ -56,13 +58,15 @@ def test_openfmri_superdataset_pipeline1(ind, topurl, outd):
list(initiate_dataset(
template="openfmri",
template_func="superdataset_pipeline",
template_kwargs={'url': topurl},
path=outd,
)())

with chpwd(outd):
pipeline = ofcpipeline(url=topurl)
out = run_pipeline(pipeline)
eq_(out, [{'datalad_stats': ActivityStats()}])
crawl()
#pipeline = ofcpipeline(url=topurl)
#out = run_pipeline(pipeline)
#eq_(out, [{'datalad_stats': ActivityStats()}])

# TODO: replace below command with the one listing subdatasets
subdatasets = ['ds000001', 'ds000002']