-
Notifications
You must be signed in to change notification settings - Fork 111
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #329 from yarikoptic/rf-crawler-new-design
ENH: further work on the crawler new design
- Loading branch information
Showing
32 changed files
with
1,245 additions
and
256 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 noet: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""File-based "DB" which uses file modification times to deduce if new version is available | ||
""" | ||
|
||
import os | ||
from os.path import join as opj, exists, lexists, islink, realpath | ||
|
||
from ...dochelpers import exc_str | ||
from ...support.status import FileStatus | ||
from ...support.exceptions import CommandError | ||
from ...utils import auto_repr | ||
from ...utils import swallow_logs | ||
|
||
import logging | ||
lgr = logging.getLogger('datalad.crawler.dbs') | ||
|
||
__docformat__ = 'restructuredtext' | ||
|
||
|
||
@auto_repr
class AnnexFileAttributesDB(object):
    """File-based "DB" reporting size and mtime of files in an annex repository.

    No records are stored: every query inspects the file on disk and asks
    annex for the size, falling back to the filesystem when annex has no
    information about the file.
    """

    def __init__(self, annex, track_queried=True):
        """
        Parameters
        ----------
        annex : AnnexRepo
          Annex repository which will be consulted on the size and full path
        track_queried : bool, optional
          If True, the full path of every file passed to `get` is recorded
          and made available via `queried_filepaths`
        """
        self.annex = annex
        self._track_queried = track_queried
        # which file paths were referred
        self._queried_filepaths = set()

    @property
    def track_queried(self):
        """Whether queried file paths are being recorded"""
        return self._track_queried

    @property
    def queried_filepaths(self):
        """Set of full paths which were passed to `get` while tracking was on"""
        return self._queried_filepaths

    # TODO: think if default should be provided
    def get(self, fpath):
        """Given a file (under annex) relative path, return its status record

        annex information about size etc might be used if load is not available

        Parameters
        ----------
        fpath: str
          Path (relative to the top of the repo) of the file to get stats of

        Returns
        -------
        FileStatus
          Record with `size` and `mtime` of the file

        Raises
        ------
        AssertionError
          If nothing (not even a dangling symlink) exists at the path
        """
        filepath = opj(self.annex.path, fpath)
        if self._track_queried:
            self._queried_filepaths.add(filepath)

        assert lexists(filepath)  # or check and return None?

        # I wish I could just test using filesystem stats but that would not
        # be reliable, and also file might not even be here.
        # File might be under git, not annex so then we would need to assess size
        filestat = os.lstat(filepath)
        try:
            with swallow_logs():
                info = self.annex.annex_info(fpath)
            size = info['size']
        except CommandError as exc:
            # must be under git or a plain file
            lgr.debug("File %s must be not under annex, since info failed: %s",
                      filepath, exc_str(exc))
            size = filestat.st_size

        # deduce mtime from the file or a content which it points to. Take the oldest (I wonder
        # if it would bite ;) XXX)
        mtime = filestat.st_mtime

        # Test the full path: the repo-relative `fpath` would be resolved
        # against the current working directory, so the symlink would go
        # unnoticed whenever we are not at the top of the repository
        if islink(filepath):
            filepath_ = realpath(filepath)  # symlinked to
            if exists(filepath_):
                mtime_ = os.stat(filepath_).st_mtime
                mtime = min(mtime_, mtime)

        return FileStatus(
            size=size,
            mtime=mtime
        )

    def is_different(self, fpath, status, url=None):
        """Return True if current status of the file at fpath differs from `status`

        Parameters
        ----------
        fpath: str
          Path (relative to the top of the repo) of the file to check
        status: FileStatus
          Status to compare the current status of the file against
        url: str, optional
          Not used at the moment (see TODO below)
        """
        # TODO: make use of URL -- we should validate that url is among those associated
        #       with the file
        old_status = self.get(fpath)
        return old_status != status
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 noet: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 noet: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
|
||
import os | ||
from os.path import join as opj | ||
from ..files import AnnexFileAttributesDB | ||
|
||
from ....tests.utils import with_tree | ||
from ....tests.utils import assert_equal | ||
from ....tests.utils import assert_false | ||
from ....tests.utils import chpwd | ||
from ....support.annexrepo import AnnexRepo | ||
|
||
@with_tree(
    tree={
        'file1.txt': 'load1',
        '2git': 'load',
        'd': {'file2.txt': 'load2'},
    }
)
def test_AnnexFileAttributesDB(path):
    """Exercise get/is_different on an annexed, a git-tracked and a nested file"""
    annexed_abspath = opj(path, 'file1.txt')
    nested_relpath = opj('d', 'file2.txt')
    nested_abspath = opj(path, nested_relpath)

    # The DB consults annex for information, hence the file has to be
    # added and committed before it gets queried
    annex = AnnexRepo(path, create=True)
    annex.annex_add('file1.txt')
    annex.git_commit("initial commit")

    db = AnnexFileAttributesDB(annex=annex)
    initial_status = db.get('file1.txt')
    assert initial_status.size

    # a repeated query must produce an equal record
    repeated_status = db.get('file1.txt')
    assert_equal(initial_status, repeated_status)
    assert_false(db.is_different('file1.txt', initial_status))

    # There is no unlock yet, so the annexed file cannot be changed in place:
    # remove it and create a new one under the same name
    os.unlink(annexed_abspath)
    with open(annexed_abspath, 'a') as fobj:
        fobj.write('+')
    assert db.is_different('file1.txt', initial_status)

    # status must be obtainable both before and after a file gets into git
    untracked_status = db.get('2git')
    annex.git_add('2git')
    annex.git_commit("added 2git")
    assert_equal(db.get('2git'), untracked_status)

    # path relative to the top directory and absolute path must agree
    relpath_status = db.get(nested_relpath)
    abspath_status = db.get(nested_abspath)
    assert_equal(relpath_status, abspath_status)
    # TODO? what about relative to curdir??
    #with chpwd(opj(path, 'd')):
    #    status2_dir = db.get('./file2.txt')
    #    assert_equal(status2, status2_dir)
Oops, something went wrong.