Skip to content

Commit

Permalink
Merge pull request #329 from yarikoptic/rf-crawler-new-design
Browse files Browse the repository at this point in the history
ENH: further work on the crawler new design
  • Loading branch information
bpoldrack committed Jan 14, 2016
2 parents 2f53949 + 2afbe29 commit 9cf93b7
Show file tree
Hide file tree
Showing 32 changed files with 1,245 additions and 256 deletions.
103 changes: 103 additions & 0 deletions datalad/crawler/dbs/files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""File-based "DB" which uses file modification times to deduce if new version is available
"""

import os
from os.path import join as opj, exists, lexists, islink, realpath

from ...dochelpers import exc_str
from ...support.status import FileStatus
from ...support.exceptions import CommandError
from ...utils import auto_repr
from ...utils import swallow_logs

import logging
lgr = logging.getLogger('datalad.crawler.dbs')

__docformat__ = 'restructuredtext'


@auto_repr
class AnnexFileAttributesDB(object):
    """File-attributes "DB" backed by an annex repository and the filesystem.

    Status records (size and mtime) are computed on demand from
    `annex.annex_info` and `os.lstat`/`os.stat`; nothing is persisted by this
    class itself.  Optionally, it keeps track of which file paths were queried.
    """

    def __init__(self, annex, track_queried=True):
        """
        Parameters
        ----------
        annex : AnnexRepo
          Annex repository which will be consulted on the size and full path
        track_queried : bool, optional
          If True, the full path of every file passed to `get` is recorded
          and made available via `queried_filepaths`
        """
        self.annex = annex
        # which file paths were referred
        self._track_queried = track_queried
        self._queried_filepaths = set()

    @property
    def track_queried(self):
        """Whether paths passed to `get` are being recorded"""
        return self._track_queried

    @property
    def queried_filepaths(self):
        """Set of full paths which were queried so far (the internal set, not a copy)"""
        return self._queried_filepaths

    # TODO: think if default should be provided
    def get(self, fpath):
        """Given a file (under annex) relative path, return its status record

        Size is taken from annex information; if annex cannot provide
        information about the file (e.g. it is under git or is a plain file),
        size reported by the filesystem is used instead.

        Parameters
        ----------
        fpath: str
          Path (relative to the top of the repo) of the file to get stats of

        Returns
        -------
        FileStatus
          Record with `size` and `mtime` of the file.  For a symlink with
          existing target, `mtime` is the older of the link's and the
          target's modification times.

        Raises
        ------
        AssertionError
          If the file does not exist (not even as a broken symlink)
        """
        filepath = opj(self.annex.path, fpath)
        if self._track_queried:
            self._queried_filepaths.add(filepath)

        # or check and return None?
        assert lexists(filepath), "File %s does not exist" % filepath

        # I wish I could just test using filesystem stats but that would not
        # be reliable, and also file might not even be here.
        # File might be under git, not annex so then we would need to assess size
        filestat = os.lstat(filepath)
        try:
            with swallow_logs():
                info = self.annex.annex_info(fpath)
        except CommandError as exc:
            # must be under git or a plain file
            lgr.debug(
                "File %s must be not under annex, since info failed: %s",
                filepath, exc_str(exc))
            size = filestat.st_size
        else:
            size = info['size']

        # deduce mtime from the file or a content which it points to. Take the oldest (I wonder
        # if it would bite ;) XXX)
        mtime = filestat.st_mtime

        # Must test the full path: the relative fpath would be resolved against
        # the current working directory, which is not necessarily the repo root
        if islink(filepath):
            filepath_ = realpath(filepath)  # symlinked to
            if exists(filepath_):
                mtime_ = os.stat(filepath_).st_mtime
                mtime = min(mtime_, mtime)

        return FileStatus(
            size=size,
            mtime=mtime
        )

    def is_different(self, fpath, status, url=None):
        """Return True if current status of the file differs from `status`

        Parameters
        ----------
        fpath: str
          Path (relative to the top of the repo) of the file to check
        status: FileStatus
          Status to compare against the one obtained via `get`
        url: str, optional
          Not used at the moment
        """
        # TODO: make use of URL -- we should validate that url is among those associated
        #       with the file
        old_status = self.get(fpath)
        return old_status != status
8 changes: 8 additions & 0 deletions datalad/crawler/dbs/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
64 changes: 64 additions & 0 deletions datalad/crawler/dbs/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##

import os
from os.path import join as opj
from ..files import AnnexFileAttributesDB

from ....tests.utils import with_tree
from ....tests.utils import assert_equal
from ....tests.utils import assert_false
from ....tests.utils import chpwd
from ....support.annexrepo import AnnexRepo

@with_tree(
    tree={
        'file1.txt': 'load1',
        '2git': 'load',
        'd': {
            'file2.txt': 'load2',
        },
    }
)
def test_AnnexFileAttributesDB(path):
    # Exercise AnnexFileAttributesDB.get/is_different against a fresh annex
    # repository populated with the tree above.
    annexed_abs = opj(path, 'file1.txt')
    nested_rel = opj('d', 'file2.txt')
    nested_abs = opj(path, nested_rel)

    repo = AnnexRepo(path, create=True)
    # The DB consults annex for information, hence the file has to be
    # added and committed before it is queried
    repo.annex_add('file1.txt')
    repo.git_commit("initial commit")
    db = AnnexFileAttributesDB(annex=repo)

    # size must be known (and non-zero) for the annexed file
    initial_status = db.get('file1.txt')
    assert initial_status.size

    # querying again without any change must give an equal record
    repeated_status = db.get('file1.txt')
    assert_equal(initial_status, repeated_status)
    assert_false(db.is_different('file1.txt', initial_status))

    # The file is under annex and there is no unlock yet, so it cannot be
    # modified in place: remove it and create a new one under the same name
    os.unlink(annexed_abs)
    with open(annexed_abs, 'a') as out:
        out.write('+')
    assert db.is_different('file1.txt', initial_status)

    # status must be obtainable for a file both before and after it gets
    # into git, and must stay the same
    untracked_status = db.get('2git')
    repo.git_add('2git')
    repo.git_commit("added 2git")
    assert_equal(db.get('2git'), untracked_status)

    # path relative to the top directory and absolute path must give the
    # same status
    status_by_rel = db.get(nested_rel)
    status_by_abs = db.get(nested_abs)
    assert_equal(status_by_rel, status_by_abs)
    # TODO? what about relative to curdir??
    #with chpwd(opj(path, 'd')):
    #    status2_dir = db.get('./file2.txt')
    #    assert_equal(status2, status2_dir)

0 comments on commit 9cf93b7

Please sign in to comment.