Skip to content

Commit

Permalink
Merge pull request #6932 from yarikoptic/bf-long-CLI
Browse files Browse the repository at this point in the history
ENH: GitRepo -- make use of --pathspec-from-file where possible and chunking needed
  • Loading branch information
yarikoptic committed Aug 20, 2022
2 parents 80aa7b5 + a2b00af commit 85102f6
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 34 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,13 @@ matrix:
env:
- PYTEST_SELECTION_OP=not
- DATALAD_SSH_MULTIPLEX__CONNECTIONS=0
- DATALAD_RUNTIME_PATHSPEC__FROM__FILE=always
- _DL_ANNEX_INSTALL_SCENARIO="miniconda --python-match minor --batch git-annex=10.20220525 -m conda"
- python: 3.7
env:
- PYTEST_SELECTION_OP=""
- DATALAD_SSH_MULTIPLEX__CONNECTIONS=0
- DATALAD_RUNTIME_PATHSPEC__FROM__FILE=always
- _DL_ANNEX_INSTALL_SCENARIO="miniconda --python-match minor --batch git-annex=8.20210310 -m conda"
# To test https://github.com/datalad/datalad/pull/4342 fix in case of no "not" for pytest.
# From our testing in that PR seems to have no effect, but kept around since should not hurt.
Expand Down Expand Up @@ -267,7 +269,7 @@ script:
# Run tests
# Note: adding --log-cli-level=INFO would result in DATALAD_LOG_TARGET=/dev/null being not
# in effect, dumping too many logs.
- http_proxy=
- set -x; http_proxy=
PATH=$PWD/../tools/coverage-bin:$PATH
$PYTEST_WRAPPER python
-m pytest "${PYTEST_OPTS[@]}"
Expand Down
35 changes: 30 additions & 5 deletions datalad/dataset/gitrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from locale import getpreferredencoding
from os import environ
from os.path import lexists
from typing import Optional
from weakref import (
finalize,
WeakValueDictionary
Expand Down Expand Up @@ -271,8 +272,10 @@ def add_fake_dates_to_env(self, env=None):

def _generator_call_git(self,
args,
*,
files=None,
env=None,
pathspec_from_file: Optional[bool]=False,
sep=None):
"""
Call git, yield stdout and stderr lines when available. Output lines
Expand Down Expand Up @@ -330,7 +333,9 @@ def pipe_data_received(self, fd, data):
cmd,
files,
protocol=GeneratorStdOutErrCapture,
env=env)
env=env,
pathspec_from_file=pathspec_from_file,
)
else:
generator = self._git_runner.run(
cmd,
Expand Down Expand Up @@ -360,6 +365,7 @@ def _call_git(self,
expect_stderr=False,
expect_fail=False,
env=None,
pathspec_from_file: Optional[bool] = False,
read_only=False):
"""Allows for calling arbitrary commands.
Expand All @@ -386,7 +392,9 @@ def _call_git(self,

for file_no, line in self._generator_call_git(args,
files=files,
env=env):
env=env,
pathspec_from_file=pathspec_from_file,
):
output[file_no].append(line)

for line in output[STDERR_FILENO]:
Expand All @@ -397,7 +405,10 @@ def _call_git(self,
"".join(output[STDERR_FILENO]))

def call_git(self, args, files=None,
expect_stderr=False, expect_fail=False, read_only=False):
expect_stderr=False, expect_fail=False,
env=None,
pathspec_from_file: Optional[bool] = False,
read_only=False):
"""Call git and return standard output.
Parameters
Expand All @@ -414,6 +425,10 @@ def call_git(self, args, files=None,
expect_fail : bool, optional
A non-zero exit is expected and should not be elevated above the
DEBUG level.
pathspec_from_file : bool, optional
Could be set to True for a `git` command which supports
--pathspec-from-file and --pathspec-file-nul options. Then pathspecs
would be passed through a temporary file.
read_only : bool, optional
By setting this to True, the caller indicates that the command does
not write to the repository, which lets this function skip some
Expand All @@ -435,6 +450,8 @@ def call_git(self, args, files=None,
files,
expect_stderr=expect_stderr,
expect_fail=expect_fail,
env=env,
pathspec_from_file=pathspec_from_file,
read_only=read_only,
keep_ends=True))

Expand All @@ -444,6 +461,7 @@ def call_git_items_(self,
expect_stderr=False,
expect_fail=False,
env=None,
pathspec_from_file: Optional[bool] = False,
read_only=False,
sep=None,
keep_ends=False):
Expand Down Expand Up @@ -495,6 +513,7 @@ def call_git_items_(self,
args,
files=files,
env=env,
pathspec_from_file=pathspec_from_file,
sep=sep):
if file_no == STDOUT_FILENO:
if keep_ends is True:
Expand All @@ -511,7 +530,9 @@ def call_git_items_(self,
for line in stderr_lines:
lgr.log(stderr_log_level, "stderr| " + line.strip("\n"))

def call_git_oneline(self, args, files=None, expect_stderr=False, read_only=False):
def call_git_oneline(self, args, files=None, expect_stderr=False,
pathspec_from_file: Optional[bool] = False,
read_only=False):
"""Call git for a single line of output.
All other parameters match those described for `call_git`.
Expand All @@ -523,14 +544,17 @@ def call_git_oneline(self, args, files=None, expect_stderr=False, read_only=Fals
"""
lines = list(self.call_git_items_(args, files=files,
expect_stderr=expect_stderr,
pathspec_from_file=pathspec_from_file,
read_only=read_only))
if len(lines) > 1:
raise AssertionError(
"Expected {} to return single line, but it returned {}"
.format(["git"] + args, lines))
return lines[0]

def call_git_success(self, args, files=None, expect_stderr=False, read_only=False):
def call_git_success(self, args, files=None, expect_stderr=False,
pathspec_from_file: Optional[bool] = False,
read_only=False):
"""Call git and return true if the call exit code of 0.
All parameters match those described for `call_git`.
Expand All @@ -542,6 +566,7 @@ def call_git_success(self, args, files=None, expect_stderr=False, read_only=Fals
try:
self._call_git(
args, files, expect_fail=True, expect_stderr=expect_stderr,
pathspec_from_file=pathspec_from_file,
read_only=read_only)

except CommandError:
Expand Down
11 changes: 11 additions & 0 deletions datalad/interface/common_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,17 @@ def get_default_ssh():
'type': EnsureInt(),
'default': 1,
},
'datalad.runtime.pathspec-from-file': {
'ui': ('question', {
'title': 'Provide list of files to git commands via --pathspec-from-file',
'text': "Instructs when DataLad will provide list of paths to 'git' commands which "
"support --pathspec-from-file option via some temporary file. If set to "
"'multi-chunk' it will be done only if multiple invocations of the command "
"on chunks of files list is needed. If set to 'always', DataLad will always "
"use --pathspec-from-file."}),
'type': EnsureChoice('multi-chunk', 'always'),
'default': 'multi-chunk',
},
'datalad.runtime.raiseonerror': {
'ui': ('question', {
'title': 'Error behavior',
Expand Down
44 changes: 41 additions & 3 deletions datalad/runner/gitrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@
import os
import os.path as op

from datalad.dochelpers import borrowdoc
from datalad.utils import generate_file_chunks
from typing import Optional

from datalad.dochelpers import borrowdoc
from datalad.utils import (
generate_file_chunks,
make_tempfile,
)
from .runner import (
GeneratorMixIn,
WitlessRunner,
Expand Down Expand Up @@ -121,6 +125,9 @@ class GitWitlessRunner(WitlessRunner, GitRunnerBase):
See GitRunnerBase it mixes in for more details
"""

# Behavior option to load up from config upon demand
_CFG_PATHSPEC_FROM_FILE = None

@borrowdoc(WitlessRunner)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -139,14 +146,35 @@ def _get_adjusted_env(self, env=None, cwd=None, copy=True):
def _get_chunked_results(self,
cmd,
files,
*,
protocol=None,
cwd=None,
env=None,
pathspec_from_file: Optional[bool] = False,
**kwargs):

assert isinstance(cmd, list)

file_chunks = generate_file_chunks(files, cmd)
if self._CFG_PATHSPEC_FROM_FILE is None:
from datalad import cfg # avoid circular import
GitWitlessRunner._CFG_PATHSPEC_FROM_FILE = cfg.obtain('datalad.runtime.pathspec-from-file')
assert GitWitlessRunner._CFG_PATHSPEC_FROM_FILE in ('multi-chunk', 'always')
file_chunks = list(generate_file_chunks(files, cmd))

if pathspec_from_file and (len(file_chunks) > 1 or GitWitlessRunner._CFG_PATHSPEC_FROM_FILE == 'always'):
# if git supports pathspec---from-file and we need multiple chunks to do,
# just use --pathspec-from-file
with make_tempfile(content=b'\x00'.join(f.encode() for f in files)) as tf:
cmd += ['--pathspec-file-nul', f'--pathspec-from-file={tf}']
yield self.run(
cmd=cmd,
protocol=protocol,
cwd=cwd,
env=env,
**kwargs)
return

# "classical" chunking
for i, file_chunk in enumerate(file_chunks):
# do not pollute with message when there only ever is a single chunk
if len(file_chunk) < len(files):
Expand All @@ -163,9 +191,11 @@ def _get_chunked_results(self,
def run_on_filelist_chunks(self,
cmd,
files,
*,
protocol=None,
cwd=None,
env=None,
pathspec_from_file: Optional[bool] = False,
**kwargs):
"""
Run a git-style command multiple times if `files` is too long,
Expand All @@ -192,6 +222,10 @@ def run_on_filelist_chunks(self,
This must be a complete environment definition, no values
from the current environment will be inherited. Overrides
any `env` given to the constructor.
pathspec_from_file : bool, optional
Could be set to True for a `git` command which supports
--pathspec-from-file and --pathspec-file-nul options. Then pathspecs
would be passed through a temporary file.
kwargs :
Passed to the Protocol class constructor.
Expand Down Expand Up @@ -224,6 +258,7 @@ def run_on_filelist_chunks(self,
protocol=protocol,
cwd=cwd,
env=env,
pathspec_from_file=pathspec_from_file,
**kwargs):
if results is None:
results = res
Expand All @@ -235,9 +270,11 @@ def run_on_filelist_chunks(self,
def run_on_filelist_chunks_items_(self,
cmd,
files,
*,
protocol=None,
cwd=None,
env=None,
pathspec_from_file: Optional[bool] = False,
**kwargs):
"""
Run a git-style command multiple times if `files` is too long,
Expand All @@ -264,5 +301,6 @@ def run_on_filelist_chunks_items_(self,
protocol=protocol,
cwd=cwd,
env=env,
pathspec_from_file=pathspec_from_file,
**kwargs):
yield from chunk_generator
4 changes: 2 additions & 2 deletions datalad/runner/tests/test_gitrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_gitrunner_generator():
generator = git_runner.run_on_filelist_chunks_items_(
["a", "b", "c"],
["f1.txt", "f2.txt"],
TestGeneratorProtocol)
protocol=TestGeneratorProtocol)
with patch.object(git_runner, "_get_chunked_results") as get_mock:
get_mock.return_value = (range(2), range(3))
assert_equal(tuple(generator), (0, 1, 0, 1, 2))
Expand All @@ -38,5 +38,5 @@ def test_gitrunner_list():
result = git_runner.run_on_filelist_chunks(
["a", "b", "c"],
["f1.txt", "f2.txt"],
StdOutErrCapture)
protocol=StdOutErrCapture)
assert_equal(result, {"a": 4, "b": 6})
39 changes: 16 additions & 23 deletions datalad/support/gitrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,7 @@ def add_(self, files, git=True, git_options=None, update=False):
ensure_list(git_options) +
to_options(update=update) + ['--verbose'],
files=files,
pathspec_from_file=True,
read_only=False,
)
# get all the entries
Expand Down Expand Up @@ -1315,7 +1316,7 @@ def remove(self, files, recursive=False, **kwargs):
return [
line.strip()[4:-1]
for line in self.call_git_items_(
['rm'] + to_options(**kwargs), files=files)
['rm'] + to_options(**kwargs), files=files, pathspec_from_file=True)
]

def precommit(self):
Expand Down Expand Up @@ -1373,7 +1374,7 @@ def commit(self, msg=None, options=None, _datalad_msg=False, careless=True,
self.precommit()

# assemble commandline
cmd = self._git_cmd_prefix + ['commit']
cmd = ['commit']
options = ensure_list(options)

if date:
Expand Down Expand Up @@ -1406,30 +1407,22 @@ def commit(self, msg=None, options=None, _datalad_msg=False, careless=True,

lgr.debug("Committing via direct call of git: %s", cmd)

file_chunks = generate_file_chunks(files, cmd) if files else [[]]

# store pre-commit state to be able to check if anything was committed
prev_sha = self.get_hexsha()

# Old code was doing clever --amend'ing of chunked series of commits manually
# here, but with pathspec_from_file it is no longer needed.
# store pre-commit state to be able to check if anything was committed
try:
for i, chunk in enumerate(file_chunks):
cur_cmd = cmd.copy()
# if this is an explicit dry-run, there is no point in
# amending, because no commit was ever made
# otherwise, amend the first commit, and prevent
# leaving multiple commits behind
if i > 0 and '--dry-run' not in cmd:
if '--amend' not in cmd:
cur_cmd.append('--amend')
if '--no-edit' not in cmd:
cur_cmd.append('--no-edit')
cur_cmd += ['--'] + chunk
self._git_runner.run(
cur_cmd,
protocol=StdOutErrCapture,
stdin=None,
# Note: call_git operates via joining call_git_items_ and that one wipes out
# .stdout from exception and collects/repopulates stderr only. Let's use
# _call_git which returns both outputs and collects/re-populates both stdout
# **and** stderr
_ = self._call_git(
cmd,
files=files,
env=env,
)
pathspec_from_file=True,
)
except CommandError as e:
# real errors first
if "did not match any file(s) known to git" in e.stderr:
Expand All @@ -1453,7 +1446,6 @@ def commit(self, msg=None, options=None, _datalad_msg=False, careless=True,
lgr.debug("no changes added to commit in %s. Ignored.", self)
else:
raise

if orig_msg \
or '--dry-run' in cmd \
or prev_sha == self.get_hexsha() \
Expand Down Expand Up @@ -3571,6 +3563,7 @@ def _save_add(self, files, git_opts=None):
['-c', 'annex.largefiles=nothing', 'add'] +
ensure_list(git_opts) + ['--verbose'],
files=list(files.keys()),
pathspec_from_file=True,
)
# get all the entries
for r in self._process_git_get_output(*add_out):
Expand Down

0 comments on commit 85102f6

Please sign in to comment.