Skip to content

Commit

Permalink
add git-lfs support
Browse files Browse the repository at this point in the history
  • Loading branch information
jfischer committed Mar 8, 2020
1 parent 2a6cae2 commit a152a24
Show file tree
Hide file tree
Showing 8 changed files with 357 additions and 10 deletions.
11 changes: 10 additions & 1 deletion dataworkspaces/backends/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
validate_git_fat_in_path_if_needed, \
run_git_fat_pull_if_needed, validate_git_fat_in_path, run_git_fat_push_if_needed,\
setup_git_fat_for_repo, is_a_git_fat_repo
from dataworkspaces.utils.git_lfs_utils import \
init_git_lfs, is_a_git_lfs_repo, ensure_git_lfs_configured_if_needed
from dataworkspaces.utils.file_utils import safe_rename, get_subpath_from_absolute
from dataworkspaces.utils.param_utils import HOSTNAME, init_scratch_directory,\
clone_scratch_directory, get_scratch_directory, SCRATCH_DIRECTORY,\
Expand Down Expand Up @@ -503,7 +505,8 @@ def init_workspace(workspace_name:str, dws_version:str, # type: ignore
git_fat_remote:Optional[str]=None,
git_fat_user:Optional[str]=None,
git_fat_port:Optional[int]=None,
git_fat_attributes:Optional[str]=None) -> ws.Workspace:
git_fat_attributes:Optional[str]=None,
git_lfs_attributes:Optional[str]=None) -> ws.Workspace:
if not exists(workspace_dir):
raise ConfigurationError("Directory for new workspace '%s' does not exist"%
workspace_dir)
Expand Down Expand Up @@ -562,6 +565,10 @@ def init_workspace(workspace_name:str, dws_version:str, # type: ignore
if git_fat_remote is not None:
setup_git_fat_for_repo(workspace_dir, git_fat_remote, git_fat_user,
git_fat_port, git_fat_attributes, verbose)
if git_lfs_attributes or is_a_git_lfs_repo(workspace_dir):
if git_fat_remote:
raise ConfigurationError("Cannot have both git-lfs and git-fat for the same repo.")
init_git_lfs(workspace_dir, git_lfs_attributes, verbose=verbose)
return Workspace(workspace_dir, batch, verbose)

@staticmethod
Expand Down Expand Up @@ -655,6 +662,8 @@ def clone_workspace(local_params:JSONDict, batch:bool, verbose:bool, *args) -> w
# pull the objects referenced by the current head
git_fat.run_git_fat(python2_exe, ['pull'], cwd=directory,
verbose=verbose)
ensure_git_lfs_configured_if_needed(directory, verbose=verbose)

except:
if isdir(initial_path):
shutil.rmtree(initial_path)
Expand Down
6 changes: 5 additions & 1 deletion dataworkspaces/commands/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,25 @@
import click

from dataworkspaces.workspace import init_workspace, RESOURCE_ROLE_CHOICES
from dataworkspaces.errors import ConfigurationError


def init_command(name:str, hostname:str, create_resources:List[str],
scratch_dir:Optional[str]=None,
git_fat_remote:Optional[str]=None, git_fat_user:Optional[str]=None,
git_fat_port:Optional[int]=None,
git_fat_attributes:Optional[str]=None,
git_lfs_attributes:Optional[str]=None,
batch:bool=False, verbose:bool=False):
if git_fat_attributes and git_lfs_attributes:
raise ConfigurationError("Cannot specify git-fat and git-lfs for the same repository")
workspace_dir=abspath(expanduser(os.curdir))
if scratch_dir is None:
scratch_dir = join(workspace_dir, 'scratch')
workspace = init_workspace('dataworkspaces.backends.git', #TODO: remove hardcoding
name, hostname, batch, verbose, scratch_dir, workspace_dir,
git_fat_remote, git_fat_user, git_fat_port,
git_fat_attributes)
git_fat_attributes, git_lfs_attributes)

if len(create_resources)>0:
click.echo("Will now create sub-directory resources for "+
Expand Down
14 changes: 10 additions & 4 deletions dataworkspaces/commands/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,21 @@ def merge_snapshot_metadata(old, new, batch):
message = new.message
else:
message = old.message
if old.tags==new.tags and old.message==new.message:
raise ConfigurationError("No differences from previous snapshot, doing nothing.")
click.echo("Snapshot %s already exists, updating metadata..."% old.hashval)
return SnapshotMetadata(
hashval=old.hashval,
tags=tags,
message=message,
hostname=old.hostname,
hostname=new.hostname,
timestamp=old.timestamp,
relative_detination_path=old.relative_destination_path,
metric_name=old.metric_name,
metric_value=old.metric_value,
relative_destination_path=new.relative_destination_path,
# The restore hash may have changed, even if the content did not.
# E.g., in the git subdirectory, the restore hash reflects the hash of the overall
# repo rather than just the subdirectory.
restore_hashes=new.restore_hashes,
metrics=new.metrics,
updated_timestamp=new.timestamp
)

Expand Down
8 changes: 7 additions & 1 deletion dataworkspaces/dws.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,15 @@ def convert(self, value, param, ctx):
help="Comma-separated list of file patterns to manage under git-fat."+
" For example --git-fat-attributes='*.gz,*.zip'. If you do not specify"+
" here, you can always add the .gitattributes file later.")
@click.option('--git-lfs-attributes', type=str, default=None,
help="Comma-separated list of file patterns to manage under git-lfs."+
" For example --git-lfs-attributes='*.gz,*.zip'. If you do not specify"+
" here, you can always add the .gitattributes file later.")
@click.argument('name', default=CURR_DIRNAME)
@click.pass_context
def init(ctx, hostname, name, create_resources, scratch_directory,
git_fat_remote, git_fat_user, git_fat_port, git_fat_attributes):
git_fat_remote, git_fat_user, git_fat_port, git_fat_attributes,
git_lfs_attributes):
"""Initialize a new workspace"""
if hostname is None:
if not ctx.obj.batch:
Expand All @@ -225,6 +230,7 @@ def init(ctx, hostname, name, create_resources, scratch_directory,
init_command(name, hostname, create_resources, scratch_directory,
git_fat_remote,
git_fat_user, git_fat_port, git_fat_attributes,
git_lfs_attributes,
batch=ctx.obj.batch, verbose=ctx.obj.verbose)


Expand Down
13 changes: 11 additions & 2 deletions dataworkspaces/resources/git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import stat
import click
import json
from typing import Set, Pattern, Union, Optional
from typing import Set, Pattern, Union, Optional, Tuple

from dataworkspaces.errors import ConfigurationError, InternalError
from dataworkspaces.utils.subprocess_utils import \
Expand All @@ -27,6 +27,8 @@
is_a_git_fat_repo,\
has_git_fat_been_initialized, validate_git_fat_in_path,\
validate_git_fat_in_path_if_needed
from dataworkspaces.utils.git_lfs_utils import \
ensure_git_lfs_configured_if_needed
from dataworkspaces.workspace import Resource, ResourceFactory, ResourceRoles,\
RESOURCE_ROLE_PURPOSES, LocalStateResourceMixin, FileResourceMixin,\
SnapshotResourceMixin, JSONDict, JSONList, Workspace
Expand Down Expand Up @@ -405,6 +407,7 @@ def from_command_line(self, role, name, workspace,
verbose=workspace.verbose):
raise ConfigurationError("%s is a git repository, but also part of the parent workspace's repo"%(local_path))
validate_git_fat_in_path_if_needed(local_path)
ensure_git_lfs_configured_if_needed(local_path, verbose=workspace.verbose)
remote_origin = get_remote_origin(local_path, verbose=workspace.verbose)
(current, others) = get_branch_info(local_path, workspace.verbose)
if branch!=current and branch not in others:
Expand Down Expand Up @@ -475,6 +478,7 @@ def clone(self, params, workspace):
cmd = [GIT_EXE_PATH, 'pull', 'origin', 'master']
call_subprocess(cmd, local_path, workspace.verbose)
switch_git_branch_if_needed(local_path, branch, workspace.verbose, ok_if_not_present=True)
ensure_git_lfs_configured_if_needed(local_path, verbose=workspace.verbose)
if is_a_git_fat_repo(local_path) and not has_git_fat_been_initialized(local_path):
import dataworkspaces.third_party.git_fat as git_fat
python2_exe = git_fat.find_python2_exe()
Expand Down Expand Up @@ -643,7 +647,9 @@ def add_results_file(self, data, rel_dest_path) -> None:
def snapshot_precheck(self):
validate_git_fat_in_path_if_needed(self.workspace_dir)

def snapshot(self):
def snapshot(self) -> Tuple[Optional[str], Optional[str]]:
"""Returns (cmopare_hash, restore_hash)
"""
# Todo: handle tags
commit_changes_in_repo_subdir(self.workspace_dir, self.relative_path, 'autocommit ahead of snapshot',
verbose=self.workspace.verbose)
Expand Down Expand Up @@ -734,6 +740,7 @@ def from_command_line(self, role, name, workspace,
lpr = realpath(local_path)
workspace_dir = _get_workspace_dir_for_git_backend(workspace)
validate_git_fat_in_path_if_needed(workspace_dir)
ensure_git_lfs_configured_if_needed(workspace_dir, verbose=workspace.verbose)
wdr = realpath(workspace_dir)
if not lpr.startswith(wdr):
raise ConfigurationError("Git subdirectories can only be used as resources when under the workspace repo.")
Expand Down Expand Up @@ -774,6 +781,8 @@ def clone(self, params, workspace):
raise ConfigurationError("Git subdirectory resources are only supported with the Git workspace backend.")
workspace_dir = workspace.get_workspace_local_path_if_any()
assert workspace_dir is not None
# this should be redundant, as we already intialized for the parent repo, but run just in case.
ensure_git_lfs_configured_if_needed(workspace_dir, verbose=workspace.verbose)
local_path = join(workspace_dir, relative_path)
if not exists(local_path):
# this subdirectory most have been created in the remote
Expand Down
122 changes: 122 additions & 0 deletions dataworkspaces/utils/git_lfs_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Utilities for git repos that use git-lfs for support of large
files.
"""
import os
from os.path import join, exists
from typing import Optional
from pathlib import Path

import click

from dataworkspaces.utils.git_utils import \
is_git_repo, git_add, git_commit
from dataworkspaces.utils.subprocess_utils import find_exe, call_subprocess

def _does_attributes_file_reference_lfs(fpath) -> bool:
    """Return True if the .gitattributes file at ``fpath`` references lfs.

    A line counts as an lfs reference when any attribute after the leading
    pattern ends with ``=lfs`` (e.g. ``filter=lfs`` or ``diff=lfs``).
    Blank lines, lines holding only a pattern, and comment lines (first
    non-blank character is ``#``) are ignored.
    """
    with open(fpath, 'r') as f:
        for line in f:
            words = line.split()
            if len(words) < 2:
                continue
            if words[0].startswith('#'):
                # git ignores comment lines, so a commented-out rule such as
                # "# *.zip filter=lfs" must not mark the repo as using lfs.
                continue
            # start with index 1 to skip the pattern
            if any(attr.endswith('=lfs') for attr in words[1:]):
                return True
    return False

def is_a_git_lfs_repo(repo_dir:str, recursive:bool=True) -> bool:
    """Return True if a .gitattributes file in the repo references git-lfs.

    When ``recursive`` is true, every directory of the working tree under
    ``repo_dir`` is checked; otherwise only ``repo_dir`` itself is checked.
    ``repo_dir`` must be a git repository (enforced with an assertion).
    """
    assert is_git_repo(repo_dir), "%s is not a git repo" % repo_dir
    if not recursive:
        # just check the root
        fname = join(repo_dir, '.gitattributes')
        return exists(fname) and _does_attributes_file_reference_lfs(fname)
    # os.walk yields (dirpath, dirnames, filenames), in that order.
    for (dirpath, dirnames, _filenames) in os.walk(repo_dir):
        # Prune git's own metadata directory in place: git never reads a
        # .gitattributes file from inside .git, and walking the object
        # store is wasted work.
        if '.git' in dirnames:
            dirnames.remove('.git')
        fname = join(dirpath, '.gitattributes')
        if exists(fname) and _does_attributes_file_reference_lfs(fname):
            return True
    return False


def is_git_lfs_installed_for_user(home_dir:Optional[str]=None) -> bool:
    """Return True if the user's ~/.gitconfig contains a ``[filter "lfs"]`` section.

    ``home_dir`` is the directory expected to hold ``.gitconfig``. When it is
    omitted, the current user's home directory is looked up at call time
    (not at import time), so a home directory that changes after this module
    was imported is still honored.
    Returns False when the .gitconfig file does not exist.
    """
    if home_dir is None:
        home_dir = str(Path.home())
    git_config = join(home_dir, '.gitconfig')
    if not exists(git_config):
        return False
    with open(git_config, 'r') as f:
        # The section header is matched exactly, ignoring surrounding whitespace.
        return any(line.strip() == '[filter "lfs"]' for line in f)

# Utility functions for issue #12 - if a repo is git-lfs enabled and git-lfs is not in the path,
# git will fail silently when it invokes the lfs filter (e.g. during git add). We explicitly check
# that the executable is in the path in situations where we will call git as a subprocess.

# Message handed to find_exe when looking up the git-lfs executable.
GIT_LFS_ERRMSG = (
    "git-lfs does not seem to be installed on your system. Install it or, if it is already installed, make sure that it is in your PATH."
)

def find_git_lfs_in_path() -> str:
    """Locate the git-lfs executable, assuming we already know that this
    is a git-lfs repo.
    If the executable is not found, a configuration error is thrown. The
    explicit check is needed because git itself will not return an error
    return code if a filter (e.g. git-lfs) is not found.
    Returns the path to the git-lfs executable.
    """
    lfs_exe_path = find_exe('git-lfs', GIT_LFS_ERRMSG,
                            additional_search_locations=[])
    return lfs_exe_path


def ensure_git_lfs_installed_for_user(lfs_exe, verbose:bool=False) -> bool:
    """Make sure git-lfs has been installed for the current user.

    Runs ``<lfs_exe> install`` from the user's home directory when the
    user-level check reports that lfs is not yet installed.
    Returns True if the install had to be run, False if it was already in place.
    """
    already_installed = is_git_lfs_installed_for_user()
    if already_installed:
        if verbose:
            click.echo("git-lfs is installed for this user")
        return False
    click.echo("Git lfs not installed for your account, installing...")
    home = str(Path.home())
    call_subprocess([lfs_exe, 'install'], cwd=home, verbose=verbose)
    return True

def init_git_lfs(workspace_dir:str, git_lfs_attributes:Optional[str]=None,
                 verbose:bool=False):
    """Called during dws init if the repo has references to lfs in its
    .gitattributes files or if the user requested lfs file wildcards via
    git_lfs_attributes.

    ``git_lfs_attributes`` is a comma-separated list of file patterns
    (e.g. ``'*.gz, *.zip'``). Whitespace around each pattern is dropped and
    empty entries are skipped. When at least one pattern remains, each is
    registered with ``git lfs track`` and the resulting .gitattributes file
    is added and committed.
    """
    lfs_exe = find_git_lfs_in_path()
    ensure_git_lfs_installed_for_user(lfs_exe, verbose=verbose)
    if not git_lfs_attributes:
        return
    # Normalize the user-supplied list: "*.gz, *.zip," must not produce the
    # patterns " *.zip" or "".
    patterns = [raw.strip() for raw in git_lfs_attributes.split(',')]
    patterns = [pattern for pattern in patterns if pattern]
    if not patterns:
        # Nothing was tracked, so there is nothing to add or commit.
        return
    for pattern in patterns:
        call_subprocess([lfs_exe, 'track', pattern], cwd=workspace_dir, verbose=verbose)
    git_add(workspace_dir, ['.gitattributes'], verbose=verbose)
    git_commit(workspace_dir, 'added git-lfs attributes', verbose=verbose)


def ensure_git_lfs_configured_if_needed(repo_dir:str, verbose:bool=False) -> None:
    """Prepare git-lfs for ``repo_dir`` when the repo uses it; otherwise do nothing.

    For an lfs repo this 1) validates that git-lfs is in the path and
    2) runs git-lfs install for the user, if needed.
    If the repo uses git-lfs but the executable cannot be found, a
    configuration error is thrown. The explicit check is needed because git
    itself will not return an error return code if a filter (e.g. git-lfs)
    is not found.
    """
    if is_a_git_lfs_repo(repo_dir, recursive=True):
        lfs_exe = find_exe('git-lfs', GIT_LFS_ERRMSG,
                           additional_search_locations=[])
        installed_now = ensure_git_lfs_installed_for_user(lfs_exe, verbose=verbose)
        if installed_now:
            # The user wasn't configured for git-lfs when the repo was
            # cloned, so the large files must be downloaded explicitly.
            for lfs_subcommand in ('fetch', 'checkout'):
                call_subprocess([lfs_exe, lfs_subcommand], cwd=repo_dir, verbose=verbose)






2 changes: 1 addition & 1 deletion tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ DATAWORKSPACES:=$(shell cd ../dataworkspaces; pwd)
help:
@echo targets are: test clean mypy pyflakes check help install-rclone-deb

UNIT_TESTS=test_git_utils test_file_utils test_move_results test_snapshots test_push_pull test_local_files_resource test_hashtree test_lineage_utils test_git_fat_integration test_lineage test_jupyter_kit test_sklearn_kit test_api test_wrapper_utils test_tensorflow test_scratch_dir
UNIT_TESTS=test_git_utils test_file_utils test_move_results test_snapshots test_push_pull test_local_files_resource test_hashtree test_lineage_utils test_git_fat_integration test_git_lfs test_lineage test_jupyter_kit test_sklearn_kit test_api test_wrapper_utils test_tensorflow test_scratch_dir

MYPY_KITS=scikit_learn.py jupyter.py tensorflow.py wrapper_utils.py

Expand Down

0 comments on commit a152a24

Please sign in to comment.