Skip to content

Commit

Permalink
Merge branch 'master' of github.com:data-workspaces/data-workspaces-core
Browse files Browse the repository at this point in the history
  • Loading branch information
jfischer committed Apr 14, 2019
2 parents f324873 + fb89aa5 commit 25bdb30
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 22 deletions.
15 changes: 9 additions & 6 deletions dataworkspaces/dws.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def init(ctx, hostname, name, create_resources, git_fat_remote,
@click.option('--workspace-dir', type=WORKSPACE_PARAM, default=DWS_PATHDIR)
@click.pass_context
def add(ctx, workspace_dir):
"""Add a data collection to the workspace"""
"""Add a data collection to the workspace as a resource.
Possible types of resources are ``git``, ``local-files``, or ``rclone``; these are subcommands of add."""
ns = ctx.obj
if workspace_dir is None:
if ns.batch:
Expand All @@ -222,17 +223,19 @@ def add(ctx, workspace_dir):
@click.option('--role', type=ROLE_PARAM)
@click.option('--name', type=str, default=None,
help="Short name for this resource")
@click.option('--compute-hash', is_flag=True, default=False,
help="Compute hashes for all files. If this option is not set, we use a lightweight comparison of file sizes only.")
@click.argument('path', type=DIRECTORY_PARAM)
@click.pass_context
def local_files(ctx, role, name, path):
"""Local file directory (not managed by git)"""
def local_files(ctx, role, name, path, compute_hash):
"""Add a local file directory (not managed by git) to the workspace. Subcommand of ``add``"""
ns = ctx.obj
if role is None:
if ns.batch:
raise BatchModeError("--role")
else:
role = click.prompt("Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults", type=ROLE_PARAM)
add_command('file', role, name, ns.workspace_dir, ns.batch, ns.verbose, path)
add_command('file', role, name, ns.workspace_dir, ns.batch, ns.verbose, path, compute_hash)

add.add_command(local_files)

Expand All @@ -248,7 +251,7 @@ def local_files(ctx, role, name, path):
@click.argument('dest', type=str) # Currently, dest is required. Later: make dest optional and use the same path as remote?
@click.pass_context
def rclone(ctx, role, name, config, compute_hash, source, dest):
"""rclone-d repository"""
"""Add an rclone-d repository as a resource to the workspace. Subcommand of ``add``"""
ns = ctx.obj
if role is None:
if ns.batch:
Expand All @@ -272,7 +275,7 @@ def rclone(ctx, role, name, config, compute_hash, source, dest):
@click.argument('path', type=str)
@click.pass_context
def git(ctx, role, name, branch, path):
"""Local git repository"""
"""Add a local git repository as a resource. Subcommand of ``add``"""
ns = ctx.obj
if role is None:
if ns.batch:
Expand Down
29 changes: 23 additions & 6 deletions dataworkspaces/resources/hashtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ def compute_hash(tmpname):
sha1.update(data)
return sha1.hexdigest()

def compute_size(fname):
    """Return the size in bytes of the file at ``fname``.

    This is the lightweight per-file "signature" function that can be passed
    as ``hash_fun`` in place of ``compute_hash`` when only file sizes are
    compared.

    Raises:
        FileNotFoundError: if ``fname`` does not exist. This is a subclass of
            ``Exception`` and carries the same message as before, so callers
            catching ``Exception`` keep working.

    NOTE(review): this returns an int while ``compute_hash`` returns a hex
    string; confirm that the comparison against stored values in
    ``check_hashes`` uses a compatible type.
    """
    # Stat directly instead of exists()-then-stat(): one system call and no
    # window in which the file can vanish between the check and the use.
    try:
        return os.stat(fname).st_size
    except FileNotFoundError:
        raise FileNotFoundError("File %s does not exist" % fname) from None

BLOB = 'blob'
TREE = 'tree'
Expand Down Expand Up @@ -141,7 +146,7 @@ def __reversed__(self):
HashTree._map_id_to_type[TREE] = HashTree


def generate_hashes(path_where_hashes_are_stored, local_dir, ignore=[],
def generate_hashes(path_where_hashes_are_stored, local_dir, ignore=[], hash_fun=compute_hash,
add_to_git=True):
"""traverse a directory tree rooted at :local_dir: and construct the tree hashes
in the directory :path_where_hashes_are_stored:
Expand All @@ -154,11 +159,11 @@ def generate_hashes(path_where_hashes_are_stored, local_dir, ignore=[],

t = HashTree(path_where_hashes_are_stored, root, add_to_git=add_to_git)
for f in files:
print(f)
sha = compute_hash(os.path.join(root, f))
# print(f)
sha = hash_fun(os.path.join(root, f))
t.add(f, BLOB, sha)
for dir in dirs:
print(dir)
# print(dir)
if dir in ignore:
continue
dirsha = hashtbl[os.path.join(root, dir)]
Expand All @@ -176,7 +181,7 @@ def _get_next_element(dl, startindex, ignore):
return d, index
return None, -1

def check_hashes(roothash, basedir_where_hashes_are_stored, local_dir, ignore=[]):
def check_hashes(roothash, basedir_where_hashes_are_stored, local_dir, ignore=[], hash_fun=compute_hash):
"""Traverse a directory tree rooted at :local_dir: and check that the files
match the hashes kept in :basedir_where_hashes_are_stored: and that no new
files have been added.
Expand Down Expand Up @@ -217,7 +222,7 @@ def check_hashes(roothash, basedir_where_hashes_are_stored, local_dir, ignore=[]
if f != name:
print("File mismatch:", f, " and (hash says)", name)
return False
sha = compute_hash(os.path.join(root, f))
sha = hash_fun(os.path.join(root, f))
if sha != h:
print("Hash mismatch for file: ", f, ":", sha, " and (hash says)", h)
return False
Expand All @@ -234,6 +239,18 @@ def check_hashes(roothash, basedir_where_hashes_are_stored, local_dir, ignore=[]
fd.close()
return True

def generate_sha_signature(rsrcdir, localpath, ignore=[]):
    """Generate the hash tree for ``localpath`` using content hashes.

    Thin wrapper over ``generate_hashes`` that selects ``compute_hash`` as the
    per-file function. ``rsrcdir`` is where the hashes are stored and
    ``ignore`` is forwarded unchanged. Returns whatever ``generate_hashes``
    returns.
    """
    signature = generate_hashes(rsrcdir, localpath,
                                ignore=ignore, hash_fun=compute_hash)
    return signature

def check_sha_signature(hashval, rsrcdir, localpath, ignore=[]):
    """Check ``localpath`` against the stored hash tree using content hashes.

    Thin wrapper over ``check_hashes`` that selects ``compute_hash`` as the
    per-file function.

    Args:
        hashval: root hash of the stored tree to compare against.
        rsrcdir: base directory where the hashes are stored.
        localpath: root of the directory tree to verify.
        ignore: directory names to skip, forwarded unchanged.

    Returns whatever ``check_hashes`` returns.
    """
    # The second parameter was previously misspelled ``rsrdir`` while the body
    # referenced ``rsrcdir``, so every call failed with NameError.
    return check_hashes(hashval, rsrcdir, localpath, ignore=ignore, hash_fun=compute_hash)

def generate_size_signature(rsrcdir, localpath, ignore=[]):
    """Generate the hash tree for ``localpath`` using file sizes only.

    Thin wrapper over ``generate_hashes`` that selects ``compute_size`` as the
    per-file function. ``rsrcdir`` is where the hashes are stored and
    ``ignore`` is forwarded unchanged. Returns whatever ``generate_hashes``
    returns.
    """
    signature = generate_hashes(rsrcdir, localpath,
                                ignore=ignore, hash_fun=compute_size)
    return signature

def check_size_signature(hashval, rsrcdir, localpath, ignore=[]):
    """Check ``localpath`` against the stored hash tree using file sizes only.

    Thin wrapper over ``check_hashes`` that selects ``compute_size`` as the
    per-file function.

    Args:
        hashval: root hash of the stored tree to compare against.
        rsrcdir: base directory where the hashes are stored.
        localpath: root of the directory tree to verify.
        ignore: directory names to skip, forwarded unchanged.

    Returns whatever ``check_hashes`` returns.
    """
    # This function was previously (mis)named ``check_sha_signature``, which
    # silently replaced the SHA-based checker with a size-based one and left
    # ``check_size_signature`` undefined. Its second parameter was also
    # misspelled ``rsrdir`` while the body referenced ``rsrcdir``.
    return check_hashes(hashval, rsrcdir, localpath, ignore=ignore, hash_fun=compute_size)

def test_walk(base):
for root, dir, files in os.walk(base, topdown=True):
print(root, "\t", dir, "\t", files)
Expand Down
28 changes: 20 additions & 8 deletions dataworkspaces/resources/local_file_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@
LOCAL_FILE = 'file'

class LocalFileResource(Resource):
def __init__(self, name, role, workspace_dir, local_path, ignore=None, compute_hash=False):
    """Create a resource for a local file directory.

    Args:
        name: short name of the resource.
        role: role of the resource within the workspace.
        workspace_dir: root directory of the workspace.
        local_path: directory whose files this resource tracks.
        ignore: optional list of directory names to skip when hashing;
            defaults to a fresh empty list.
        compute_hash: if True, signatures use content hashes; otherwise
            the size-based signature functions are used.
    """
    super().__init__(LOCAL_FILE, name, role, workspace_dir)
    self.local_path = local_path
    # A ``[]`` default in the signature would be a single list shared by every
    # instance created without ``ignore``; build a new one per instance.
    self.ignore = [] if ignore is None else ignore
    self.compute_hash = compute_hash
    # NOTE(review): self.role, self.name and self.workspace_dir are presumably
    # set by the base-class initializer above -- confirm.
    self.rsrcdir_relative = '.dataworkspace/' + LOCAL_FILE + '/' + self.role + '/' + self.name
    self.rsrcdir = os.path.abspath(self.workspace_dir + '/' + self.rsrcdir_relative)

def to_json(self):
    """Return the base-class JSON dict extended with this resource's fields.

    Adds ``local_path`` and ``compute_hash`` to what the parent's
    ``to_json`` returns.
    """
    payload = super().to_json()
    for key in ('local_path', 'compute_hash'):
        payload[key] = getattr(self, key)
    return payload

def get_local_path_if_any(self):
Expand Down Expand Up @@ -70,7 +72,10 @@ def results_move_current_files(self, rel_dest_root, exclude_files,

def snapshot(self):
# rsrcdir = os.path.abspath(self.workspace_dir + 'resources/' + self.role + '/' + self.name)
h = hashtree.generate_hashes(self.rsrcdir, self.local_path, ignore=self.ignore)
if self.compute_hash:
h = hashtree.generate_sha_signature(self.rsrcdir, self.local_path, ignore=self.ignore)
else:
h = hashtree.generate_size_signature(self.rsrcdir, self.local_path, ignore=self.ignore)
assert os.path.exists(os.path.join(self.rsrcdir, h))
if is_git_staging_dirty(self.workspace_dir, subdir=self.rsrcdir_relative):
call_subprocess([GIT_EXE_PATH, 'commit', '-m',
Expand All @@ -91,8 +96,13 @@ def add_results_file(self, temp_path, rel_dest_path):
os.rename(temp_path, abs_dest_path)

def restore_prechecks(self, hashval):
    """Verify that the local files are compatible with the snapshot ``hashval``.

    Uses the SHA-based checker when ``self.compute_hash`` is set and the
    size-based checker otherwise.

    Raises:
        ConfigurationError: if the selected checker reports a mismatch.
    """
    print("IN RESTORE")
    # Only the selected attribute is looked up, as in an if/else statement.
    checker = (hashtree.check_sha_signature if self.compute_hash
               else hashtree.check_size_signature)
    if checker(hashval, self.rsrcdir, self.local_path, ignore=self.ignore):
        return
    print("ERROR IN RESTORE")
    raise ConfigurationError("Local file structure not compatible with saved hash")

def restore(self, hashval):
Expand All @@ -103,23 +113,25 @@ def __str__(self):

class LocalFileFactory(ResourceFactory):
    """Factory that builds ``LocalFileResource`` objects from the command
    line or from parsed ``resources.json`` entries."""

    def from_command_line(self, role, name, workspace_dir, batch, verbose,
                          local_path, compute_hash=False):
        """Instantiate a resource object from the add command's arguments"""
        return LocalFileResource(name, role, workspace_dir, local_path,
                                 compute_hash=compute_hash)

    def from_json(self, json_data, local_params, workspace_dir, batch, verbose):
        """Instantiate a resource object from the parsed resources.json file"""
        assert json_data['resource_type']==LOCAL_FILE
        # Entries written before the ``compute_hash`` option existed have no
        # such key; fall back to the option's default instead of a KeyError.
        return LocalFileResource(json_data['name'],
                                 json_data['role'], workspace_dir, json_data['local_path'],
                                 compute_hash=json_data.get('compute_hash', False))

    def from_json_remote(self, json_data, workspace_dir, batch, verbose):
        """Instantiate a resource object from the parsed resources.json file"""
        assert json_data['resource_type']==LOCAL_FILE
        # XXX need to convert local path to be stored in local params
        # Same fallback as from_json for entries lacking ``compute_hash``.
        return LocalFileResource(json_data['name'],
                                 json_data['role'], workspace_dir, json_data['local_path'],
                                 compute_hash=json_data.get('compute_hash', False))

    def suggest_name(self, local_path, *args):
        """Suggest a resource name: the final component of ``local_path``.

        Extra positional arguments are accepted and ignored.
        """
        return os.path.basename(local_path)

2 changes: 1 addition & 1 deletion docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ help:
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
7 changes: 6 additions & 1 deletion tests/testcli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,14 @@ echo 'File 3' > local_files/dir/f3
cp $TESTSDIR/data.csv $WORKDIR/local_files/data.csv
ls

# create a local directory for storing intermediate data
# create two local directories for storing intermediate data
cd $WORKDIR
mkdir workspace

cd $WORKDIR
mkdir hashed_workspace
echo "FILE DATA" > hashed_workspace/f1

# create a git repo for storing results
cd $REMOTE
git init --bare results_git.git
Expand All @@ -160,6 +164,7 @@ run dws $ARGS add git --role=code --name=code-git ./code
run dws $ARGS add rclone --role=source-data --name=code-local localfs:./local_files my_local_files
echo "local_files/" >> .gitignore
run dws $ARGS add local-files --role=intermediate-data --name=workspace ./workspace
run dws $ARGS add local-files --role=results --compute-hash --name=hspace ./hashed_workspace
run dws $ARGS add git --role=results --name=results-git ./results_git

# Add a git subdirectory resource
Expand Down

0 comments on commit 25bdb30

Please sign in to comment.