Skip to content

Commit

Permalink
Merge pull request #327 from yarikoptic/enh-whereis
Browse files Browse the repository at this point in the history
ENH: annex_whereis(output="full") returns a dict with a dict of remotes
  • Loading branch information
yarikoptic committed Dec 21, 2015
2 parents 2bde97e + ca066ba commit 7fee574
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 23 deletions.
10 changes: 6 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ script:
after_success:
- coveralls

after_failure:
- if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route add -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi
- DATALAD_LOGLEVEL=DEBUG $NOSE_WRAPPER `which nosetests` -s -v --with-doctest --with-cov --cover-package datalad --logging-level=DEBUG
- if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route del -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi
# Re-running the tests with DEBUG logging after a failure only makes the failing output
# more difficult to comprehend. Enable only when necessary for a particular debugging session.
#after_failure:
# - if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route add -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi
# - DATALAD_LOGLEVEL=DEBUG $NOSE_WRAPPER `which nosetests` -s -v --with-doctest --with-cov --cover-package datalad --logging-level=DEBUG
# - if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route del -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi
77 changes: 65 additions & 12 deletions datalad/support/annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""

import re
from os import linesep
from os.path import join as opj, exists, relpath
import logging
Expand Down Expand Up @@ -602,25 +603,68 @@ def annex_drop(self, files, options=None):

self._run_annex_command('drop', annex_options=options + files)


# TODO: a dedicated unit-test
def _whereis_json_to_dict(self, j):
    """Convert a json record returned by `annex whereis --json` to our dict representation

    Parameters
    ----------
    j: dict
      A single decoded json record.  Keys read here: 'whereis' (iterable of
      dicts each providing 'description', 'here' and 'uuid'), optional
      'success' (if present, must be True), optional 'note' (newline
      separated records) and 'file' (used only for a debug message).

    Returns
    -------
    dict
      Keys are remote descriptions, values are dicts with 'here' and 'uuid'.
      If a remote described as 'web' is present, its dict also gets a 'urls'
      list, filled from the records found in 'note'.

    Raises
    ------
    AssertionError
      If the record is marked as not successful, or if a note refers to a
      known remote other than 'web'.
    """
    assert (j.get('success', True) is True)
    # process 'whereis' containing list of remotes
    remotes = {
        remote['description']: {'here': remote['here'], 'uuid': remote['uuid']}
        for remote in j.get('whereis')
    }
    if 'web' in remotes:
        # will be filled with the urls found in 'note'
        remotes['web']['urls'] = []
    # process 'note' which would contain urls for 'web' remote
    note = j.get('note', '')
    if note:
        for note_record in filter(bool, note.split('\n')):
            # format remote: url ?
            note_split = note_record.split(':', 1)
            if len(note_split) != 2:
                lgr.debug("Skipping note record %r for file %s", note_record, j['file'])
                continue
            remote, url = (field.strip() for field in note_split)
            if remote not in remotes:
                lgr.warning("Remote %r not found among remotes %s. Skipping", remote, remotes.keys())
                # Actually skip, as the message promises: otherwise we would
                # fall through to the assert/append below and blow up (e.g.
                # with a KeyError if there is no 'web' entry to append to)
                continue
            assert remote == 'web', "ATM can understand only notes for web remote"
            remotes['web']['urls'].append(url)
    return remotes


# TODO: reconsider having any magic at all and maybe just return a list/dict always
@normalize_paths
def annex_whereis(self, files):
def annex_whereis(self, files, output='remotes'):
"""Lists repositories that have actual content of file(s).
Parameters
----------
files: list of str
files to look for
output: {'remotes', 'full'}, optional
If 'remotes', a list of remotes returned per each file. If full,
per each file a dictionary returned with "web" also containing
'urls' list with all the urls for that file
Returns
-------
list of list of unicode
Contains a list of descriptions per each input file,
describing the remote for each remote, which was found by
git-annex whereis, like:
list of list of unicode or dict
if output == 'remotes', contains a list of descriptions per
each input file, describing the remote for each remote, which
was found by git-annex whereis, like:
u'me@mycomputer:~/where/my/repo/is [origin]' or
u'web' or
u'me@mycomputer:~/some/other/clone'
if output == 'full', returns a dictionary with filenames as keys
and values a detailed record, e.g.
{'web': {
'uuid': '00000000-0000-0000-0000-000000000001',
'here': False,
'urls': ['http://127.0.0.1:43442/about.txt', 'http://example.com/someurl']
}}
"""

try:
Expand All @@ -635,13 +679,22 @@ def annex_whereis(self, files):
else:
raise e

json_objects = [json.loads(line)
for line in out.splitlines() if line.startswith('{')]

return [
[remote.get('description') for remote in item.get('whereis')]
if item.get('success') else []
for item in json_objects]
json_objects = (json.loads(line)
for line in out.splitlines() if line.startswith('{'))

if output == 'remotes':
return [
[remote.get('description') for remote in j.get('whereis')]
if j.get('success') else []
for j in json_objects]
return out
elif output == 'full':
# TODO: we might want to optimize storage since many remotes entries will be the
# same so we could just reuse them instead of brewing copies
return {j['file']: self._whereis_json_to_dict(j)
for j in json_objects}
else:
raise ValueError("Unknown value output=%r. Known are remotes and full" % output)

def get_annexed_files(self):
"""Get a list of files in annex
Expand Down
13 changes: 11 additions & 2 deletions datalad/support/gitrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,17 @@ def newfunc(self, files, *args, **kwargs):
# was requested or it was not a single file
return result
elif single_file:
assert(len(result) == 1)
return result[0]
if len(result) != 1:
# Magic doesn't apply
return result
elif isinstance(result, (list, tuple)):
return result[0]
elif isinstance(result, dict) and tuple(result)[0] == files_new[0]:
# assume that returned dictionary has files as keys.
return tuple(result.values())[0]
else:
# no magic can apply
return result
else:
return RuntimeError("should have not got here... check logic")

Expand Down
65 changes: 60 additions & 5 deletions datalad/tests/test_annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,15 +241,21 @@ def test_AnnexRepo_annex_add_to_git(src, dst):
assert_in(filename, ar.get_indexed_files())


@with_testrepos('.*annex.*', flavors=local_testrepo_flavors)
@with_tree(tree=(('about.txt', 'Lots of abouts'),))
@with_tree(tree=(('about.txt', 'Lots of abouts'),
('about2.txt', 'more abouts'),
('d', {'sub.txt': 'more stuff'})))
@serve_path_via_http()
@with_tempfile
def test_AnnexRepo_web_remote(src, sitepath, siteurl, dst):
def test_AnnexRepo_web_remote(sitepath, siteurl, dst):

ar = AnnexRepo(dst, src)
ar = AnnexRepo(dst, create=True)
testurl = urljoin(siteurl, 'about.txt')
testfile = '%s_about.txt' % urlsplit(testurl).netloc.split(':')[0]
testurl2 = urljoin(siteurl, 'about2.txt')
testurl3 = urljoin(siteurl, 'd', 'sub.txt')
url_file_prefix = urlsplit(testurl).netloc.split(':')[0]
testfile = '%s_about.txt' % url_file_prefix
testfile2 = '%s_about2.txt' % url_file_prefix
testfile3 = opj('d', 'sub.txt')

# get the file from remote
with swallow_outputs() as cmo:
Expand All @@ -259,6 +265,14 @@ def test_AnnexRepo_web_remote(src, sitepath, siteurl, dst):
assert_equal(len(l), 2)
assert_true(ar.file_has_content(testfile))

# output='full'
lfull = ar.annex_whereis(testfile, output='full')
assert_equal(set(lfull), set(l)) # the same entries
non_web_remote = l[1-l.index('web')]
assert_not_in('urls', lfull[non_web_remote])
assert_equal(lfull['web']['uuid'], '00000000-0000-0000-0000-000000000001')
assert_equal(lfull['web']['urls'], [testurl])

# remove the remote
ar.annex_rmurl(testfile, testurl)
l = ar.annex_whereis(testfile)
Expand Down Expand Up @@ -292,6 +306,47 @@ def test_AnnexRepo_web_remote(src, sitepath, siteurl, dst):
assert_in('web', l)
assert_equal(len(l), 1)
assert_false(ar.file_has_content(testfile))
lfull = ar.annex_whereis(testfile, output='full')
assert_not_in(non_web_remote, lfull) # not present -- so not even listed

# multiple files/urls
# get the file from remote
with swallow_outputs() as cmo:
ar.annex_addurls([testurl2])

# TODO: if we ask for whereis on all files, we should get for all files
lall = ar.annex_whereis('.')
assert_equal(len(lall), 2)
for e in lall:
assert(isinstance(e, list))
# but we don't know which one for which file. need a 'full' one for that
lall_full = ar.annex_whereis('.', output='full')
assert_true(ar.file_has_content(testfile2))
assert_true(lall_full[testfile2][non_web_remote]['here'])
assert_equal(set(lall_full), {testfile, testfile2})

# add a bogus 2nd url to testfile

someurl = "http://example.com/someurl"
ar.annex_addurl_to_file(testfile, someurl, options=['--relaxed'])
lfull = ar.annex_whereis(testfile, output='full')
assert_equal(set(lfull['web']['urls']), {testurl, someurl})

# and now test with a file in subdirectory
subdir = opj(dst, 'd')
os.mkdir(subdir)
with swallow_outputs() as cmo:
ar.annex_addurl_to_file(testfile3, url=testurl3)
assert_equal(set(ar.annex_whereis(testfile3)), {'web', non_web_remote})
assert_equal(set(ar.annex_whereis(testfile3, output='full').keys()), {'web', non_web_remote})

# which would work even if we cd to that subdir
with chpwd(subdir):
assert_equal(set(ar.annex_whereis('sub.txt')), {'web', non_web_remote})
assert_equal(set(ar.annex_whereis('sub.txt', output='full').keys()), {'web', non_web_remote})




@with_testrepos('.*annex.*', flavors=['local', 'network'])
@with_tempfile
Expand Down

0 comments on commit 7fee574

Please sign in to comment.