Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RF: get_modules_ - avoid O(N^2) if paths provided via use of get_limited_paths #7220

Merged
merged 10 commits into from Dec 23, 2022
Merged
2 changes: 1 addition & 1 deletion changelog.d/pr-7189.md
@@ -1,3 +1,3 @@
### 🏎 Performance

- Reimplement `get_submodules_()` without `get_content_info()` for substantial performance boosts especially for large datasets with few subdatasets. Note: might result for slightly slower performance when many subdataset paths are provided (e.g. via glob) to the function explicitly. Originally proposed in [PR #6942](https://github.com/datalad/datalad/pull/6942) by [@mih](https://github.com/mih), fixing [#6940](https://github.com/datalad/datalad/issues/6940). [PR #7189](https://github.com/datalad/datalad/pull/7189) (by [@adswa](https://github.com/adswa))
- Reimplement `get_submodules_()` without `get_content_info()` for substantial performance boosts especially for large datasets with few subdatasets. Originally proposed in [PR #6942](https://github.com/datalad/datalad/pull/6942) by [@mih](https://github.com/mih), fixing [#6940](https://github.com/datalad/datalad/issues/6940). [PR #7189](https://github.com/datalad/datalad/pull/7189) (by [@adswa](https://github.com/adswa)). Complemented with [PR #7220](https://github.com/datalad/datalad/pull/7220) (by [@yarikoptic](https://github.com/yarikoptic)) to avoid `O(N^2)` (instead of `O(N*log(N))` performance in some cases.
50 changes: 25 additions & 25 deletions datalad/support/gitrepo.py
Expand Up @@ -92,7 +92,7 @@
PathRI,
is_ssh,
)
from .path import get_parent_paths
from .path import get_parent_paths, get_limited_paths

# shortcuts
_curdirsep = curdir + sep
Expand Down Expand Up @@ -2388,34 +2388,34 @@ def get_submodules_(self, paths=None):
# below
return

# taken from 3.9's pathlib, can be removed once the minimally
# supported python version
def is_relative_to(self, *other):
"""Return True if the path is relative to another path or False.
"""
try:
self.relative_to(*other)
return True
except ValueError:
return False

posix_mod_paths = [m.relative_to(self.pathobj).as_posix() for m in modinfo]
yarikoptic marked this conversation as resolved.
Show resolved Hide resolved
if paths:
# ease comparison
paths = [self.pathobj / p for p in paths]
# constrain the report by the given paths

modinfo = {
# modpath is absolute
modpath: modprobs
for modpath, modprobs in modinfo.items()
# is_relative_to() also match equal paths
if (any(is_relative_to(p, modpath) for p in paths) or
any(is_relative_to(modpath, p) for p in paths))
}
# harmonize them into relative to the repository
posix_paths = []
for path in paths:
path = ut.PurePath(path)
if path.is_absolute():
try:
path = path.relative_to(self.pathobj)
except ValueError as exc:
lgr.debug(
"Path %s it not relative to %s, skipping since nothing should match it: %s",
yarikoptic marked this conversation as resolved.
Show resolved Hide resolved
path, self.pathobj, CapturedException(exc)
)
continue
posix_paths.append(path.as_posix())

# constrain the report by the given paths, make sure all paths are POSIX
posix_mod_paths = list(get_limited_paths(
posix_mod_paths,
posix_paths,
include_within_path=True,
))

for r in self.call_git_items_(
['ls-files', '--stage', '-z'],
sep='\0',
files=[str(p.relative_to(self.pathobj)) for p in modinfo.keys()],
files=posix_mod_paths,
read_only=True,
keep_ends=True,
):
Expand Down
61 changes: 59 additions & 2 deletions datalad/support/path.py
Expand Up @@ -10,16 +10,20 @@

One of the reasons is also to robustify operation with unicode filenames
"""
from __future__ import annotations

# TODO: RF and move all paths related functions from datalad.utils in here
import os
import os.path as op

# to not pollute API importing as _
from collections import defaultdict as _defaultdict

from functools import wraps
from itertools import dropwhile
from pathlib import (
Path,
PurePosixPath,
)
from typing import Generator

from ..utils import (
ensure_bytes,
Expand Down Expand Up @@ -207,6 +211,59 @@ def get_parent_paths(paths, parents, only_with_parents=False, *, sep='/'):
return res


def get_limited_paths(paths: list[str|Path], limits: list[str|Path], *, include_within_path: bool = False) \
-> Generator[str, None, None]:
"""Given list of relative POSIX paths (or Path objects), select the ones within limits (also relative and POSIX).

In case of include_with_path=True, if limit points to some path under a 'path' within 'paths',
that path would be returned as well.

Yields
-------
list of paths, sorted (so order is not preserved), which reside under 'limits' paths or path within 'limits' is
yarikoptic marked this conversation as resolved.
Show resolved Hide resolved
under that path.
"""
# do conversion and sanity checks, O(N)
def _harmonize_paths(l: list) -> list:
ps = []
for p in l:
if isinstance(p, str):
p = PurePosixPath(p)
if p.is_absolute():
raise ValueError(f"Got absolute path {p}, expected relative")
if p.parts and p.parts[0] == '..':
raise ValueError(f"Path {p} leads outside")
ps.append(p.parts) # store parts
return sorted(ps) # O(N * log(N))

paths_parts = _harmonize_paths(paths)
limits_parts = _harmonize_paths(limits)

# we will pretty much "scroll" through sorted paths and limits at the same time
for path_parts in paths_parts:
while limits_parts:
limit_parts = limits_parts[0]
l = min(len(path_parts), len(limit_parts))
# if common part is "greater" in the path -- we can go to the next "limit"
if limit_parts[:l] < path_parts[:l]:
# get to the next one
limits_parts = limits_parts[1:]
adswa marked this conversation as resolved.
Show resolved Hide resolved
else:
break # otherwise -- consider this one!
else:
# no limiting path left - the other paths cannot be the selected ones
break
if include_within_path:
# if one identical or subpath of another one -- their parts match in the beginning
# and we will just reuse that 'l'
pass
else:
# if all components of the limit match, for that we also add len(path_parts) check below
l = len(limit_parts)
if len(path_parts) >= l and (path_parts[:l] == limit_parts[:l]):
yield '/'.join(path_parts)


def _get_parent_paths_check(path, sep, asep):
"""A little helper for get_parent_paths"""
if isabs(path) or path.startswith(pardir + sep) or path.startswith(curdir + sep):
Expand Down
10 changes: 10 additions & 0 deletions datalad/support/tests/test_gitrepo.py
Expand Up @@ -964,6 +964,16 @@ def test_GitRepo_get_submodules(path=None):
for s in repo.get_submodules(paths=["s_abc"])],
["s_abc"])

# Pointing to a path within submodule should include it too
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_abc/unrelated"])],
["s_abc"])

# top level should list all submodules
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=[repo.path])],
["s_abc", "s_xyz"])

# Limit by non-existing/non-matching path
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_unknown"])],
Expand Down
33 changes: 33 additions & 0 deletions datalad/support/tests/test_path.py
Expand Up @@ -8,6 +8,7 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##

import os
from pathlib import PurePosixPath

from ...tests.utils_pytest import (
SkipTest,
Expand All @@ -24,6 +25,7 @@
abspath,
curdir,
get_parent_paths,
get_limited_paths,
robust_abspath,
split_ext,
)
Expand Down Expand Up @@ -114,3 +116,34 @@ def _p(path):

# and we get the deepest parent
eq_(gpp(_pp(['a/b/file', 'a/b/file2']), _pp(['a', 'a/b'])), _pp(['a/b']))


def test_get_limited_paths():
# just to avoid typing all the same
def glp(*args, **kwargs):
return list(get_limited_paths(*args, **kwargs))

assert glp(['a', 'b'], ['a', 'c']) == ['a']
assert glp(['a', 'b'], ['b']) == ['b']
assert glp(['a', 'b'], ['c']) == []

assert glp(['a', 'b'], ['a/b', 'c']) == [] # a is not subpath of a/b
assert glp(['a', 'b'], ['a/b', 'c'], include_within_path=True) == ['a'] # a is not subpath of a/b

# all paths returned due to '.', and order is sorted
paths = ['a', 'b', '1/2/3', 'abc']
paths_sorted = sorted(paths)
assert glp(paths, ['.']) == paths_sorted
assert glp(paths, paths_sorted) == paths_sorted
assert glp(paths, paths_sorted, include_within_path=True) == paths_sorted
# we can take a mix of str and Path
assert glp([PurePosixPath(paths[0])] + paths[1:], ['.']) == paths_sorted


# nothing within empty "limits" matches -- so no paths yielded
assert glp(paths, []) == []

assert_raises(ValueError, glp, ['/a'], [])
assert_raises(ValueError, glp, [PurePosixPath('/a')], [])
assert_raises(ValueError, glp, ['a'], ['/a'])
assert_raises(ValueError, glp, ['../a'], ['a'])
3 changes: 2 additions & 1 deletion tox.ini
Expand Up @@ -39,7 +39,8 @@ commands =
datalad/cmd.py \
datalad/downloaders/providers.py \
datalad/runner \
datalad/support/collections.py
datalad/support/collections.py \
datalad/support/path.py

[testenv:venv]
commands = {posargs}
Expand Down