Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved gitattribute handling #2744

Merged
merged 12 commits into from Aug 8, 2018
52 changes: 32 additions & 20 deletions datalad/distribution/create.py
Expand Up @@ -15,6 +15,7 @@
import uuid

from os import listdir
import os.path as op
from os.path import isdir
from os.path import join as opj

Expand Down Expand Up @@ -342,16 +343,14 @@ def __call__(
)

if text_no_annex:
git_attributes_file = opj(tbds.path, '.gitattributes')
with open(git_attributes_file, 'a') as f:
f.write('* annex.largefiles=(not(mimetype=text/*))\n')
# TODO just use add_to_git and avoid separate commit
tbrepo.add([git_attributes_file], git=True)
tbrepo.commit(
"Instructed annex to add text files to git",
_datalad_msg=True,
files=[git_attributes_file]
)
attrs = tbrepo.get_gitattributes('.')
# some basic protection against useless duplication
# on rerun with --force
if not attrs.get('.', {}).get('annex.largefiles', None) == '(not(mimetype=text/*))':
tbrepo.set_gitattributes([
# XXX shouldn't this be '**' to have the desired effect?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, a single '*' works recursively there.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK.

('*', {'annex.largefiles': '(not(mimetype=text/*))'})])
add_to_git.append('.gitattributes')

if native_metadata_type is not None:
if not isinstance(native_metadata_type, list):
Expand Down Expand Up @@ -380,18 +379,31 @@ def __call__(
add_to_git.append('.datalad')

# make sure that v6 annex repos never commit content under .datalad
with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
# TODO this will need adjusting, when annex'ed aggregate metadata
# comes around
gitattr.write('config annex.largefiles=nothing\n')
gitattr.write('metadata/aggregate* annex.largefiles=nothing\n')
gitattr.write('metadata/objects/** annex.largefiles=({})\n'.format(
cfg.obtain('datalad.metadata.create-aggregate-annex-limit')))
attrs_cfg = (
('config', 'annex.largefiles', 'nothing'),
('metadata/aggregate*', 'annex.largefiles', 'nothing'),
('metadata/objects/**', 'annex.largefiles',
'({})'.format(cfg.obtain(
'datalad.metadata.create-aggregate-annex-limit'))))
attrs = tbds.repo.get_gitattributes(
[op.join('.datalad', i[0]) for i in attrs_cfg])
set_attrs = []
for p, k, v in attrs_cfg:
if not attrs.get(
op.join('.datalad', p), {}).get(k, None) == v:
set_attrs.append((p, {k: v}))
if set_attrs:
tbds.repo.set_gitattributes(
set_attrs,
attrfile=op.join('.datalad', '.gitattributes'))
add_to_git.append('.datalad')

# prevent git annex from ever annexing .git* stuff (gh-1597)
with open(opj(tbds.path, '.gitattributes'), 'a') as gitattr:
gitattr.write('**/.git* annex.largefiles=nothing\n')
add_to_git.append('.gitattributes')
attrs = tbds.repo.get_gitattributes('.git')
if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
tbds.repo.set_gitattributes([
('**/.git*', {'annex.largefiles': 'nothing'})])
add_to_git.append('.gitattributes')

# save everything, we need to do this now and cannot merge with the
# call below, because we may need to add this subdataset to a parent
Expand Down
10 changes: 5 additions & 5 deletions datalad/plugin/no_annex.py
Expand Up @@ -131,11 +131,11 @@ def __call__(dataset, pattern, ref_dir='.', makedirs=False):
message='target directory for `no_annex` does not exist (consider makedirs=True)')
return

gitattr_file = opj(gitattr_dir, '.gitattributes')
with open(gitattr_file, 'a') as fp:
for p in pattern:
fp.write('{} annex.largefiles=nothing\n'.format(p))
yield dict(res_kwargs, status='ok')
gitattr_file = opj(ref_dir, '.gitattributes')
dataset.repo.set_gitattributes(
[(p, {'annex.largefiles': 'nothing'}) for p in pattern],
attrfile=gitattr_file)
yield dict(res_kwargs, status='ok')

for r in dataset.add(
gitattr_file,
Expand Down
2 changes: 1 addition & 1 deletion datalad/support/annexrepo.py
Expand Up @@ -307,7 +307,7 @@ def set_default_backend(self, backend, persistent=True, commit=True):
# do, if there is sth in that setting already
if persistent:
# could be set in .gitattributes or $GIT_DIR/info/attributes
if 'annex.backend' in self.get_git_attributes():
if 'annex.backend' in self.get_gitattributes('.')['.']:
lgr.debug(
"Not (re)setting backend since seems already set in git attributes"
)
Expand Down
87 changes: 73 additions & 14 deletions datalad/support/gitrepo.py
Expand Up @@ -17,6 +17,7 @@
import shlex
import time
import os
import os.path as op
from os import linesep
from os.path import join as opj
from os.path import exists
Expand Down Expand Up @@ -2319,28 +2320,86 @@ def get_deleted_files(self):
for f in self.repo.git.diff('--raw', '--name-status', '--staged').split('\n')
if f.split('\t')[0] == 'D']

def get_git_attributes(self):
"""Check git attribute for the current repository (not per-file support for now)
def get_gitattributes(self, path, index_only=False):
"""Query gitattributes for one or more paths

Parameters
----------
all_: bool
Adds --all to git check-attr call
path: path or list
Path(s) to query. Paths may be relative or absolute.
index_only: bool
Flag whether to consider only gitattribute setting that are reflected
in the repository index, not just in the work tree content.

Returns
-------
dict:
attribute: value pairs
Each key is a queried path, each value is a dictionary with attribute
name and value items. Attribute values are either True or False,
for set and unset attributes, or are the literal attribute value.
"""
out, err = self._git_custom_command(["."], ["git", "check-attr", "--all"])
assert not err, "no stderr output is expected"
out_split = [
# splitting by : would leave leading space(s)
[e.lstrip(' ') for e in l.split(':', 2)]
for l in out.split('\n') if l
]
assert all(o[0] == '.' for o in out_split) # for paranoid
return dict(o[1:] for o in out_split)
path = assure_list(path)
cmd = ["git", "check-attr", "-z", "--all"]
if index_only:
cmd.append('--cached')
stdout, stderr = self._git_custom_command(path, cmd)
# make sure we have one entry for each query path to
# simplify work with the result
attributes = {_normalize_path(self.path, p): {} for p in path}
attr = []
for item in stdout.split('\0'):
attr.append(item)
if len(attr) < 3:
continue
# we have a full record
p, name, value = attr
attrs = attributes[p]
attrs[name] = \
True if value == 'set' else False if value == 'unset' else value
# done, reset item
attr = []
return attributes

def set_gitattributes(self, attrs, attrfile='.gitattributes'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this one should be named add_gitattributes (or even append_) since that is what it does -- it appends, and thus potentially augments existing or it doesn't really "reset" or modify already existing ones, so it cannot remove previously set (besides to reset to an empty value I guess).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thus potentially augments existing or it doesn't really "reset" or modify already existing ones

Is it true that a later value can augment a previous one? IIRC a later value in a .gitattributes file overrides the previous one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kyleam but it is done on a per attribute basis.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yarikoptic I think neither add nor append indicate the right semantics. It appends attribute spec lines, but whether that results in a set, add, or unset behavior is determined by a complex interplay of interpreting all gitattribute specs from multiple locations.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kyleam but it is done on a per attribute basis.

I'm confused. I thought "existing ones" in @yarikoptic's comment referred to attributes. Perhaps I should have read it to mean a spec line.

"""Set gitattributes

Parameters
----------
attrs : list
Each item is a 2-tuple, where the first element is a path pattern,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feels that it is a use-case to use a dict (if needed ordered) not a list of 2-tuples.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would work in the general case, because it is possible to set attributes on a single paths on multiple lines.

and the second element is a dictionary with attribute key/value
pairs. The attribute dictionary must use the same semantics as those
returned by `get_gitattributes()`. Path patterns can use absolute paths,
in which case they will be normalized relative to the directory
that contains the target .gitattribute file (see `attrfile`).
attrfile: path
Path relative to the repository root of the .gitattributes file the
attributes shall be set in.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just curious if you know about a use-case where having it as an option would come handy already with git? i.e. if there is another file which might follow this syntax to be used for storing some attrs?

Copy link
Contributor

@kyleam kyleam Aug 8, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a use-case where having it as an option would come handy already with git?

Any .gitattributes file in a subdirectory? We could assume the base name .gitattributes and change this argument to take a directory, though the current approach seems straightforward enough.

We don't use it at the moment, but there is also $GITDIR/info/attributes, though taking that relative to the repo root isn't convenient for that use case since $GITDIR isn't necessarily "root/.git".

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, it isn't necessarily always .gitattributes, hence I went this way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks

"""
git_attributes_file = op.join(self.path, attrfile)
attrdir = op.dirname(git_attributes_file)
if not op.exists(attrdir):
os.makedirs(attrdir)
with open(git_attributes_file, 'a') as f:
for pattern, attr in sorted(attrs, key=lambda x: x[0]):
# normalize the pattern relative to the target .gitattributes file
npath = _normalize_path(
op.join(self.path, op.dirname(attrfile)), pattern)
attrline = u''
if npath.count(' '):
# quote patterns with spaces
attrline += u'"{}"'.format(npath)
else:
attrline += npath
for a in sorted(attr):
val = attr[a]
if val is True:
attrline += ' {}'.format(a)
elif val is False:
attrline += ' -{}'.format(a)
else:
attrline += ' {}={}'.format(a, val)
f.write('{}\n'.format(attrline))


# TODO
Expand Down
4 changes: 2 additions & 2 deletions datalad/support/tests/test_annexrepo.py
Expand Up @@ -579,7 +579,7 @@ def test_AnnexRepo_backend_option(path, url):
ar = AnnexRepo(path, backend='MD5')

# backend recorded in .gitattributes
eq_(ar.get_git_attributes()['annex.backend'], 'MD5')
eq_(ar.get_gitattributes('.')['.']['annex.backend'], 'MD5')

ar.add('firstfile', backend='SHA1')
ar.add('secondfile')
Expand Down Expand Up @@ -2283,4 +2283,4 @@ def test_get_size_from_perc_complete():
eq_(f(0, 0), 0)
eq_(f(0, '0'), 0)
eq_(f(100, '0'), 0) # we do not know better
eq_(f(1, '1'), 100)
eq_(f(1, '1'), 100)