Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RF: get_modules_ - avoid O(N^2) if paths provided via use of get_limited_paths #7220

Merged
merged 10 commits into from Dec 23, 2022
Merged
6 changes: 6 additions & 0 deletions benchmarks/api.py
Expand Up @@ -86,6 +86,12 @@ def time_ls_recursive_long_all(self):
def time_subdatasets(self):
self.ds.subdatasets()

def time_subdatasets_with_all_paths_recursive(self):
# to see if we do not get O(N^2) performance
subdatasets = self.ds.subdatasets(recursive=True, result_xfm='relpaths')
subdatasets2 = self.ds.subdatasets(path=subdatasets, recursive=True, result_xfm='relpaths')
assert subdatasets == subdatasets2

def time_subdatasets_recursive(self):
self.ds.subdatasets(recursive=True)

Expand Down
2 changes: 1 addition & 1 deletion changelog.d/pr-7189.md
@@ -1,3 +1,3 @@
### 🏎 Performance

- Reimplement `get_submodules_()` without `get_content_info()` for substantial performance boosts especially for large datasets with few subdatasets. Note: might result for slightly slower performance when many subdataset paths are provided (e.g. via glob) to the function explicitly. Originally proposed in [PR #6942](https://github.com/datalad/datalad/pull/6942) by [@mih](https://github.com/mih), fixing [#6940](https://github.com/datalad/datalad/issues/6940). [PR #7189](https://github.com/datalad/datalad/pull/7189) (by [@adswa](https://github.com/adswa))
- Reimplement `get_submodules_()` without `get_content_info()` for substantial performance boosts especially for large datasets with few subdatasets. Originally proposed in [PR #6942](https://github.com/datalad/datalad/pull/6942) by [@mih](https://github.com/mih), fixing [#6940](https://github.com/datalad/datalad/issues/6940). [PR #7189](https://github.com/datalad/datalad/pull/7189) (by [@adswa](https://github.com/adswa)). Complemented with [PR #7220](https://github.com/datalad/datalad/pull/7220) (by [@yarikoptic](https://github.com/yarikoptic)) to avoid `O(N^2)` (instead of `O(N*log(N))` performance in some cases.
58 changes: 30 additions & 28 deletions datalad/support/gitrepo.py
Expand Up @@ -92,7 +92,7 @@
PathRI,
is_ssh,
)
from .path import get_parent_paths
from .path import get_parent_paths, get_filtered_paths_

# shortcuts
_curdirsep = curdir + sep
Expand Down Expand Up @@ -2354,7 +2354,8 @@ def _parse_gitmodules(self):
modprops = {'gitmodule_{}'.format(k): v
for k, v in props.items()
if not (k.startswith('__') or k == 'path')}
modpath = self.pathobj / PurePosixPath(props['path'])
# Keep as PurePosixPath for possible normalization of / in the path etc
modpath = PurePosixPath(props['path'])
modprops['gitmodule_name'] = name
out[modpath] = modprops
return out
Expand Down Expand Up @@ -2388,34 +2389,34 @@ def get_submodules_(self, paths=None):
# below
return

# taken from 3.9's pathlib, can be removed once the minimally
# supported python version
def is_relative_to(self, *other):
"""Return True if the path is relative to another path or False.
"""
try:
self.relative_to(*other)
return True
except ValueError:
return False

posix_mod_paths = [m.as_posix() for m in modinfo]
if paths:
# ease comparison
paths = [self.pathobj / p for p in paths]
# constrain the report by the given paths

modinfo = {
# modpath is absolute
modpath: modprobs
for modpath, modprobs in modinfo.items()
# is_relative_to() also match equal paths
if (any(is_relative_to(p, modpath) for p in paths) or
any(is_relative_to(modpath, p) for p in paths))
}
# harmonize them into relative to the repository
posix_paths = []
for path in paths:
path = ut.PurePath(path)
if path.is_absolute():
try:
path = path.relative_to(self.pathobj)
except ValueError as exc:
lgr.debug(
"Path %s it not underneath %s, skipping since nothing should match it: %s",
path, self.pathobj, CapturedException(exc)
)
continue
posix_paths.append(path.as_posix())

# constrain the report by the given paths, make sure all paths are POSIX
posix_mod_paths = list(get_filtered_paths_(
posix_mod_paths,
posix_paths,
include_within_path=True,
))

for r in self.call_git_items_(
['ls-files', '--stage', '-z'],
sep='\0',
files=[str(p.relative_to(self.pathobj)) for p in modinfo.keys()],
files=posix_mod_paths,
read_only=True,
keep_ends=True,
):
Expand All @@ -2435,9 +2436,10 @@ def is_relative_to(self, *other):
# there is either a stage 2 or stage 0, never both
continue
# remove the expected line separator from the path
path = self.pathobj / PurePosixPath(rpath[:-1])
rpath = rpath[:-1]
path = PurePosixPath(rpath)
yield dict(
path=path,
path=self.pathobj / rpath, # full path returned here
type='dataset',
gitshasum=gitsha,
**modinfo.get(path, {})
Expand Down
68 changes: 66 additions & 2 deletions datalad/support/path.py
Expand Up @@ -10,16 +10,20 @@

One of the reasons is also to robustify operation with unicode filenames
"""
from __future__ import annotations

# TODO: RF and move all paths related functions from datalad.utils in here
import os
import os.path as op

# to not pollute API importing as _
from collections import defaultdict as _defaultdict

from functools import wraps
from itertools import dropwhile
from pathlib import (
Path,
PurePosixPath,
)
from typing import Generator

from ..utils import (
ensure_bytes,
Expand Down Expand Up @@ -207,6 +211,66 @@ def get_parent_paths(paths, parents, only_with_parents=False, *, sep='/'):
return res


def get_filtered_paths_(paths: list[str|Path], filter_paths: list[str | Path],
*, include_within_path: bool = False) \
-> Generator[str, None, None]:
"""Among paths (or Path objects) select the ones within filter_paths.

All `paths` and `filter_paths` must be relative and POSIX.

In case of `include_with_path=True`, if a `filter_path` points to some path
under a `path` within `paths`, that path would be returned as well, e.g.
`path` 'submod' would be returned if there is a `filter_path` 'submod/subsub/file'.

Complexity is O(N*log(N)), where N is the largest of the lengths of `paths`
or `filter_paths`.

Yields
------
paths, sorted (so order is not preserved), which reside under 'filter_paths' or
path within 'filter_paths' is under that path.
"""
# do conversion and sanity checks, O(N)
def _harmonize_paths(l: list) -> list:
ps = []
for p in l:
if isinstance(p, str):
p = PurePosixPath(p)
if p.is_absolute():
raise ValueError(f"Got absolute path {p}, expected relative")
if p.parts and p.parts[0] == '..':
raise ValueError(f"Path {p} leads outside")
ps.append(p.parts) # store parts
return sorted(ps) # O(N * log(N))

paths_parts = _harmonize_paths(paths)
filter_paths_parts = _harmonize_paths(filter_paths)

# we will pretty much "scroll" through sorted paths and filter_paths at the same time
for path_parts in paths_parts:
while filter_paths_parts:
filter_path_parts = filter_paths_parts[0]
l = min(len(path_parts), len(filter_path_parts))
# if common part is "greater" in the path -- we can go to the next "filter"
if filter_path_parts[:l] < path_parts[:l]:
# get to the next one
filter_paths_parts = filter_paths_parts[1:]
else:
break # otherwise -- consider this one!
else:
# no filter path left - the other paths cannot be the selected ones
break
if include_within_path:
# if one identical or subpath of another one -- their parts match in the beginning
# and we will just reuse that 'l'
pass
else:
# if all components of the filter match, for that we also add len(path_parts) check below
l = len(filter_path_parts)
if len(path_parts) >= l and (path_parts[:l] == filter_path_parts[:l]):
yield '/'.join(path_parts)


def _get_parent_paths_check(path, sep, asep):
"""A little helper for get_parent_paths"""
if isabs(path) or path.startswith(pardir + sep) or path.startswith(curdir + sep):
Expand Down
10 changes: 10 additions & 0 deletions datalad/support/tests/test_gitrepo.py
Expand Up @@ -964,6 +964,16 @@ def test_GitRepo_get_submodules(path=None):
for s in repo.get_submodules(paths=["s_abc"])],
["s_abc"])

# Pointing to a path within submodule should include it too
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_abc/unrelated"])],
["s_abc"])

# top level should list all submodules
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=[repo.path])],
["s_abc", "s_xyz"])

# Limit by non-existing/non-matching path
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_unknown"])],
Expand Down
33 changes: 33 additions & 0 deletions datalad/support/tests/test_path.py
Expand Up @@ -8,6 +8,7 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##

import os
from pathlib import PurePosixPath

from ...tests.utils_pytest import (
SkipTest,
Expand All @@ -24,6 +25,7 @@
abspath,
curdir,
get_parent_paths,
get_filtered_paths_,
robust_abspath,
split_ext,
)
Expand Down Expand Up @@ -114,3 +116,34 @@ def _p(path):

# and we get the deepest parent
eq_(gpp(_pp(['a/b/file', 'a/b/file2']), _pp(['a', 'a/b'])), _pp(['a/b']))


def test_get_filtered_paths_():
# just to avoid typing all the same
def gfp(*args, **kwargs):
return list(get_filtered_paths_(*args, **kwargs))

assert gfp(['a', 'b'], ['a', 'c']) == ['a']
assert gfp(['a', 'b'], ['b']) == ['b']
assert gfp(['a', 'b'], ['c']) == []

assert gfp(['a', 'b'], ['a/b', 'c']) == [] # a is not subpath of a/b
assert gfp(['a', 'b'], ['a/b', 'c'], include_within_path=True) == ['a'] # a is not subpath of a/b

# all paths returned due to '.', and order is sorted
paths = ['a', 'b', '1/2/3', 'abc']
paths_sorted = sorted(paths)
assert gfp(paths, ['.']) == paths_sorted
assert gfp(paths, paths_sorted) == paths_sorted
assert gfp(paths, paths_sorted, include_within_path=True) == paths_sorted
# we can take a mix of str and Path
assert gfp([PurePosixPath(paths[0])] + paths[1:], ['.']) == paths_sorted


# nothing within empty "filter_paths" matches -- so no paths yielded
assert gfp(paths, []) == []

assert_raises(ValueError, gfp, ['/a'], [])
assert_raises(ValueError, gfp, [PurePosixPath('/a')], [])
assert_raises(ValueError, gfp, ['a'], ['/a'])
assert_raises(ValueError, gfp, ['../a'], ['a'])
3 changes: 2 additions & 1 deletion tox.ini
Expand Up @@ -39,7 +39,8 @@ commands =
datalad/cmd.py \
datalad/downloaders/providers.py \
datalad/runner \
datalad/support/collections.py
datalad/support/collections.py \
datalad/support/path.py

[testenv:venv]
commands = {posargs}
Expand Down