Skip to content

Commit

Permalink
Merge pull request #359 from yarikoptic/enh-default-md5e
Browse files Browse the repository at this point in the history
By default initiate handle's backend to be MD5E
  • Loading branch information
yarikoptic committed Feb 27, 2016
2 parents bcbab2b + 8aeb9fa commit e8c1007
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 23 deletions.
40 changes: 33 additions & 7 deletions datalad/crawler/nodes/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
from ...utils import rmtree, updated
from ...utils import lmtime
from ...utils import find_files
from ...utils import auto_repr
from ...tests.utils import put_file_under_git

from ...downloaders.providers import Providers
from ...support.configparserinc import SafeConfigParserWithIncludes
Expand All @@ -54,12 +56,14 @@
_call = _runner.call
_run = _runner.run


# TODO: make use of datalad_stats
@auto_repr
class initiate_handle(object):
"""Action to initiate a handle following one of the known templates
"""
def __init__(self, template, handle_name=None, collection_name=None,
path=None, branch=None,
path=None, branch=None, backend=None,
data_fields=[], add_fields={}, existing=None):
"""
Parameters
Expand All @@ -75,6 +79,10 @@ def __init__(self, template, handle_name=None, collection_name=None,
default path for all new handles (DATALAD_CRAWL_COLLECTIONSPATH)
branch : str, optional
Which branch to initialize
backend : str, optional
Supported by git-annex backend. By default (if None specified),
it is MD5E backend to improve compatibility with filesystems
having a relatively small limit for a maximum path size
data_fields : list or tuple of str, optional
Additional fields from data to store into configuration for
the handle crawling options -- would be passed into the corresponding
Expand All @@ -95,6 +103,8 @@ def __init__(self, template, handle_name=None, collection_name=None,
self.existing = existing
self.path = path
self.branch = branch
# TODO: backend -> backends (https://github.com/datalad/datalad/issues/358)
self.backend = backend

def _initiate_handle(self, path, name):
lgr.info("Initiating handle %s" % name)
Expand All @@ -106,11 +116,17 @@ def _initiate_handle(self, path, name):
git_repo.git_checkout(self.branch, options="--orphan")
# TODO: RF whenever create becomes a dedicated factory/method
# and/or branch becomes an option for the "creator"
return HandleRepo(
path,
direct=cfg.getboolean('crawl', 'init direct', default=False),
name=name,
create=True)
backend = self.backend or cfg.get('crawl', 'default backend', default='MD5E')
repo = HandleRepo(
path,
direct=cfg.getboolean('crawl', 'init direct', default=False),
name=name,
backend=backend,
create=True)
# TODO: centralize
if backend:
put_file_under_git(path, '.gitattributes', '* annex.backend=%s' % backend, annexed=False)
return repo

def _save_crawl_config(self, handle_path, name, data):
lgr.debug("Creating handle configuration for %s" % name)
Expand Down Expand Up @@ -614,6 +630,15 @@ def _precommit(self):
self.repo.precommit() # so that all batched annexes stop
if self._statusdb:
self._statusdb.save()
# there is something to commit and backends was set but no .gitattributes yet
path = self.repo.path
if self.repo.dirty and not exists(opj(path, '.gitattributes')):
backends = self.repo.default_backends
if backends:
# then record default backend into the .gitattributes
put_file_under_git(path, '.gitattributes', '* annex.backend=%s' % backends[0],
annexed=False)


# At least use repo._git_custom_command
def _commit(self, msg=None, options=[]):
Expand Down Expand Up @@ -950,7 +975,8 @@ def __call__(self_, data):
statusdb = self._statusdb
obsolete = statusdb.get_obsolete()
if obsolete:
lgr.info('Removing %d obsolete files' % len(obsolete))
files_str = ": " + ', '.join(obsolete) if len(obsolete) < 10 else ""
lgr.info('Removing %d obsolete files%s' % (len(obsolete), files_str))
stats = data.get('datalad_stats', None)
_call(self.repo.git_remove, obsolete)
if stats:
Expand Down
19 changes: 18 additions & 1 deletion datalad/crawler/nodes/tests/test_annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@
from ....tests.utils import ok_file_under_git
from ....tests.utils import ok_file_has_content
from ....tests.utils import assert_cwd_unchanged
from ....tests.utils import put_file_under_git
from ...pipeline import load_pipeline_from_config
from ....consts import CRAWLER_META_CONFIG_PATH, DATALAD_SPECIAL_REMOTE, ARCHIVES_SPECIAL_REMOTE
from ....support.stats import ActivityStats
from ....support.annexrepo import AnnexRepo

@with_tempfile(mkdir=True)
def test_initialize_handle(path):
@with_tempfile()
def test_initialize_handle(path, path2):
handle_path = opj(path, 'test')
datas = list(initiate_handle('template', 'testhandle', path=handle_path)())
assert_equal(len(datas), 1)
Expand All @@ -33,6 +36,20 @@ def test_initialize_handle(path):
crawl_cfg = opj(handle_path, CRAWLER_META_CONFIG_PATH)
ok_(exists, crawl_cfg)
pipeline = load_pipeline_from_config(crawl_cfg)

# by default we should initiate to MD5E backend
fname = 'test.dat'
f = opj(handle_path, fname)
annex = put_file_under_git(f, content="test", annexed=True)
eq_(annex.get_file_backend(f), 'MD5E')

# and even if we clone it -- nope -- since persistence is set by Annexificator,
# so we don't need to explicitly commit it just in master since that might
# not be the branch we will end up working in
annex2 = AnnexRepo(path2, url=handle_path)
annex3 = put_file_under_git(path2, 'test2.dat', content="test2", annexed=True)
eq_(annex3.get_file_backend('test2.dat'), 'MD5E')

raise SkipTest("TODO much more")


Expand Down
1 change: 1 addition & 0 deletions datalad/crawler/pipelines/tests/test_openfmri.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def hexsha(l):
'./.datalad/crawl/versions/incoming.json',
'./README.txt', './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv'}
target_incoming_files = {
'.gitattributes', # we marked default backend right in the incoming
'README.txt', 'changelog.txt',
'ds666-beh_R1.0.1.tar.gz', 'ds666_R1.0.0.tar.gz', 'ds666_R1.0.1.tar.gz', 'ds666_R2.0.0.tar.gz',
'.datalad/crawl/statuses/incoming.json',
Expand Down
11 changes: 11 additions & 0 deletions datalad/support/annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,17 @@ def get_file_backend(self, files):

return [self.get_file_key(f).split('-')[0] for f in files]

@property
def default_backends(self):
    """Default git-annex backends recorded in the repository configuration.

    Looks up the ``backends`` option of the ``annex`` section through
    ``self.repo.config_reader()``.

    Returns
    -------
    list of str or None
      The configured value split on whitespace, or None if the option
      is not set or its value is empty.
    """
    try:
        configured = self.repo.config_reader().get_value("annex", "backends")
    except NoOptionError:
        # the option is absent from the configuration
        return None
    # an empty value is reported the same way as an unset option
    return configured.split() if configured else None

def annex_fsck(self):
self._run_annex_command('fsck')

Expand Down
17 changes: 16 additions & 1 deletion datalad/tests/test_annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,4 +778,19 @@ def test_AnnexRepo_addurl_to_file_batched(sitepath, siteurl, dst):

raise SkipTest("TODO: more, e.g. add with a custom backend")
# TODO: also with different modes (relaxed, fast)
# TODO: verify that file is added with that backend and that we got a new batched process
# TODO: verify that file is added with that backend and that we got a new batched process


@with_tempfile(mkdir=True)
def test_annex_backends(path):
    """Check `AnnexRepo.default_backends` without and with an explicit backend"""
    # no backend requested at creation => nothing to report
    annex = AnnexRepo(path)
    eq_(annex.default_backends, None)

    # start over from scratch at the same location
    rmtree(path)

    # backend requested explicitly at creation
    annex = AnnexRepo(path, backend='MD5E')
    eq_(annex.default_backends, ['MD5E'])

    # and the setting persists for a new instance pointing to the same path
    annex = AnnexRepo(path)
    eq_(annex.default_backends, ['MD5E'])
53 changes: 40 additions & 13 deletions datalad/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,48 @@ def ok_file_under_git(path, filename=None, annexed=False):
If relative path provided, then test from current directory
"""
annex, file_repo_path, filename, path, repo = _prep_file_under_git(path, filename)

assert(file_repo_path in repo.get_indexed_files()) # file is known to Git

if annex:
try:
# operates on relative to curdir path
repo.get_file_key(opj(path, filename))
in_annex = True
except FileNotInAnnexError as e:
in_annex = False
else:
in_annex = False

assert(annexed == in_annex)

def put_file_under_git(path, filename=None, content=None, annexed=False):
    """Create a file with given content and add it to git or git-annex

    Parameters
    ----------
    path : str
      Passed together with `filename` to `_prep_file_under_git`, which
      resolves the repository and the final path/filename pair.
    filename : str, optional
      Name of the file to create.
    content : str, optional
      What to write into the file. An empty file is written if None.
    annexed : bool, optional
      If True the file is added to the annex (an `AnnexRepo` instance is
      created for the repository path if the resolved repository is not one
      already), otherwise it is added directly to git.

    Returns
    -------
    The repository instance which was used to add the file.
    """
    _annex, _file_repo_path, filename, path, repo = \
        _prep_file_under_git(path, filename)

    text = "" if content is None else content
    with open(opj(path, filename), 'w') as out:
        out.write(text)

    if not annexed:
        repo.git_add(filename)
    else:
        if not isinstance(repo, AnnexRepo):
            # need annex functionality -- re-open the same path as an annex
            repo = AnnexRepo(repo.path)
        repo.add_to_annex(filename)

    # verify that the file ended up where it was requested to be
    ok_file_under_git(path, filename, annexed)
    return repo

def _prep_file_under_git(path, filename):
"""Get instance of the repository for the given filename
Helper to be used by few functions
"""
if filename is None:
# path provides the path and the name
path, filename = pathsplit(path)

try:
# if succeeds when must not (not `annexed`) -- fail
repo = get_repo_instance(path, class_=AnnexRepo)
Expand All @@ -186,19 +224,8 @@ def ok_file_under_git(path, filename=None, annexed=False):
# path to the file within the repository
file_repo_dir = os.path.relpath(path, repo.path)
file_repo_path = filename if file_repo_dir == curdir else opj(file_repo_dir, filename)
assert(file_repo_path in repo.get_indexed_files()) # file is known to Git

if annex:
try:
# operates on relative to curdir path
repo.get_file_key(opj(path, filename))
in_annex = True
except FileNotInAnnexError as e:
in_annex = False
else:
in_annex = False
return annex, file_repo_path, filename, path, repo

assert(annexed == in_annex)

#
# Helpers to test symlinks
Expand Down
2 changes: 1 addition & 1 deletion datalad/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def sorted_files(dout):
if not '.git' in r], []))

from os.path import sep as dirsep
_VCS_REGEX = '%s\.(git|svn|bzr|hg)(?:%s|$)' % (dirsep, dirsep)
_VCS_REGEX = '%s\.(git|gitattributes|svn|bzr|hg)(?:%s|$)' % (dirsep, dirsep)

def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False):
"""Generator to find files matching regex
Expand Down

0 comments on commit e8c1007

Please sign in to comment.