Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support backing up Zarrs #134

Merged
merged 21 commits into the base branch
May 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ jobs:
if: matrix.python-version == '3.9'
run: sudo apt-get update && sudo apt-get install -y libhdf5-dev

- name: Configure Git
run: |
git config --global user.email "github@test.test"
git config --global user.name "GitHub Almighty"

- name: Run tests with coverage
if: matrix.noxenv == 'test'
run: nox -e test -- --cov-report=xml
Expand Down
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
files: ^tools/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
rev: v4.1.0
hooks:
- id: check-added-large-files
- id: check-json
Expand All @@ -11,12 +11,12 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/psf/black
rev: 21.9b0
rev: 22.3.0
hooks:
- id: black

- repo: https://github.com/PyCQA/isort
rev: 5.9.3
rev: 5.10.1
hooks:
- id: isort

Expand Down
10 changes: 10 additions & 0 deletions tools/backups2datalad-cron
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ PS4='> '
ds=$(dirname "$0")
ds=$(dirname "$ds")

zarrds=/mnt/backup/dandi/dandizarrs

source ~/.bashrc-miniconda
conda activate dandisets

Expand All @@ -28,8 +30,11 @@ eval python -m tools.backups2datalad \
-J 5 \
--target "$ds" \
update-from-backup \
--zarr-target "$zarrds" \
--backup-remote dandi-dandisets-dropbox \
--zarr-backup-remote dandi-dandizarrs-dropbox \
--gh-org dandisets \
--zarr-gh-org dandizarrs \
-e '000108$' \
"$*" 2>&1 | grep -v 'nothing to save, working tree clean'

Expand All @@ -39,6 +44,11 @@ python -m tools.backups2datalad \
--target "$ds" \
populate dandi-dandisets-dropbox "$@"

python -m tools.backups2datalad \
-l DEBUG \
-J 5 \
populate-zarrs --zarr-target "$zarrds" dandi-dandizarrs-dropbox

git pull # so we possibly merge changes on the server
datalad push -J 5
) 2>|.git/tmp/stderr
Expand Down
7 changes: 4 additions & 3 deletions tools/backups2datalad.req.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# Python ~= 3.8
# git-annex >= 8.20210903
# git-annex >= 10.20220222
Copy link
Member

@yarikoptic yarikoptic May 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

upgrading in dandisets env: git-annex 8.20211028-alldep_h27987b5_100 --> 10.20220322-alldep_hc98582e_100

async_generator ~= 1.10; python_version < '3.10'
boto3
click >= 8.0.1
click-loglevel ~= 0.2
dandi >= 0.36.0
dandischema
datalad
ghrepo ~= 0.1
httpx ~= 0.20.0
httpx ~= 0.22.0
humanize
identify ~= 2.0
morecontext ~= 0.1
packaging
PyGithub ~= 1.53
trio ~= 0.19.0
trio ~= 0.20.0
6 changes: 0 additions & 6 deletions tools/backups2datalad/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,3 @@
- parallelize across datasets or may be better files (would that be possible within
dataset?) using DataLad's #5022 ConsumerProducer?
"""

import logging

DEFAULT_BRANCH = "draft"

log = logging.getLogger("backups2datalad")
136 changes: 94 additions & 42 deletions tools/backups2datalad/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
from dandi.dandiapi import DandiAPIClient
from datalad.api import Dataset

from . import log
from .datasetter import DandiDatasetter
from .util import Config, pdb_excepthook
from .util import Config, log, pdb_excepthook


@click.group()
Expand Down Expand Up @@ -120,25 +119,44 @@ def main(
metavar="REGEX",
type=re.compile,
)
@click.option("--gh-org", help="GitHub organization to create repositories under")
@click.option(
"--gh-org", help="GitHub organization to create Dandiset repositories under"
)
@click.option(
"--tags/--no-tags",
default=True,
help="Enable/disable creation of tags for releases [default: enabled]",
)
@click.option("--zarr-backup-remote", help="Name of the rclone remote to push Zarrs to")
@click.option(
"--zarr-gh-org", help="GitHub organization to create Zarr repositories under"
)
@click.option(
"-Z",
"--zarr-target",
type=click.Path(file_okay=False, path_type=Path),
required=True,
)
@click.argument("dandisets", nargs=-1)
@click.pass_obj
def update_from_backup(
    datasetter: DandiDatasetter,
    dandisets: Sequence[str],
    backup_remote: Optional[str],
    zarr_backup_remote: Optional[str],
    gh_org: Optional[str],
    zarr_gh_org: Optional[str],
    zarr_target: Path,
    exclude: Optional[re.Pattern[str]],
    tags: bool,
) -> None:
    """Copy the CLI options into the datasetter's config and run the backup update.

    Fix: the scraped diff contained both the pre-change call
    (``update_from_backup(dandisets, exclude=exclude, gh_org=gh_org)``) and the
    post-change call on adjacent lines; only the post-change form — where
    ``gh_org`` and the Zarr options travel via ``datasetter.config`` — is kept.
    """
    # All per-run options are stashed on the shared config object rather than
    # threaded through the call, so downstream code reads a single source of truth.
    datasetter.config.backup_remote = backup_remote
    datasetter.config.zarr_backup_remote = zarr_backup_remote
    datasetter.config.enable_tags = tags
    datasetter.config.gh_org = gh_org
    datasetter.config.zarr_gh_org = zarr_gh_org
    datasetter.config.zarr_target = zarr_target
    datasetter.update_from_backup(dandisets, exclude=exclude)


@main.command()
Expand Down Expand Up @@ -187,7 +205,7 @@ def release(
dataset.push(to="github", jobs=datasetter.config.jobs)


@main.command()
@main.command("populate")
@click.option(
"-e",
"--exclude",
Expand All @@ -198,7 +216,7 @@ def release(
@click.argument("backup_remote")
@click.argument("dandisets", nargs=-1)
@click.pass_obj
def populate(
def populate_cmd(
datasetter: DandiDatasetter,
dandisets: Sequence[str],
backup_remote: str,
Expand All @@ -215,45 +233,79 @@ def populate(
elif not Dataset(p).is_installed():
log.info("Dataset %s is not installed; skipping", p.name)
else:
log.info("Downloading assets for Dandiset %s", p.name)
subprocess.run(
[
"git-annex",
"get",
"-c",
"annex.retry=3",
"--jobs",
str(datasetter.config.jobs),
"--from=web",
"--not",
"--in",
backup_remote,
"--and",
"--not",
"--in",
"here",
],
check=True,
cwd=p,
)
log.info("Moving assets for Dandiset %s to backup remote", p.name)
subprocess.run(
[
"git-annex",
"move",
"-c",
"annex.retry=3",
"--jobs",
str(datasetter.config.jobs),
"--to",
backup_remote,
],
check=True,
cwd=p,
)
populate(p, backup_remote, f"Dandiset {p.name}", datasetter.config.jobs)
else:
log.debug("Skipping non-Dandiset node %s", p.name)


@main.command()
@click.option(
    "-Z",
    "--zarr-target",
    type=click.Path(file_okay=False, path_type=Path),
    required=True,
)
@click.argument("backup_remote")
@click.argument("zarrs", nargs=-1)
@click.pass_obj
def populate_zarrs(
    datasetter: DandiDatasetter,
    zarr_target: Path,
    zarrs: Sequence[str],
    backup_remote: str,
) -> None:
    """Download files for the given Zarr datasets and move them to *backup_remote*.

    With no ZARRS arguments, every directory under ``--zarr-target`` is processed.
    """
    # Explicit names select specific Zarrs; otherwise scan the whole target dir.
    candidates = (
        [zarr_target / z for z in zarrs] if zarrs else list(zarr_target.iterdir())
    )
    for node in candidates:
        # Guard clauses: skip anything that isn't a Zarr dataset directory.
        if not node.is_dir() or node.name in (".git", ".datalad"):
            log.debug("Skipping non-Zarr node %s", node.name)
            continue
        if not Dataset(node).is_installed():
            log.info("Zarr %s is not installed; skipping", node.name)
            continue
        populate(node, backup_remote, f"Zarr {node.name}", datasetter.config.jobs)


def populate(dirpath: Path, backup_remote: str, desc: str, jobs: int) -> None:
    """Fetch annexed files for *desc* from the web and move them to *backup_remote*.

    Runs ``git-annex get`` for keys present on the web but absent from both the
    backup remote and the local repo, then ``git-annex move`` to push local copies
    to the backup remote.  Both commands run with ``check=True`` in *dirpath*.
    """
    # Options shared by both git-annex invocations.
    common = ["-c", "annex.retry=3", "--jobs", str(jobs)]

    log.info("Downloading files for %s", desc)
    get_cmd = ["git-annex", "get", *common]
    # Only fetch what is neither on the backup remote nor already here.
    get_cmd += ["--from=web", "--not", "--in", backup_remote]
    get_cmd += ["--and", "--not", "--in", "here"]
    subprocess.run(get_cmd, check=True, cwd=dirpath)

    log.info("Moving files for %s to backup remote", desc)
    move_cmd = ["git-annex", "move", *common, "--to", backup_remote]
    subprocess.run(move_cmd, check=True, cwd=dirpath)


if __name__ == "__main__":
main()
33 changes: 27 additions & 6 deletions tools/backups2datalad/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@

import trio

from . import log
from .util import TextProcess, format_errors, open_git_annex
from .util import TextProcess, format_errors, log, open_git_annex


@dataclass
class AsyncAnnex(trio.abc.AsyncResource):
repo: Path
nursery: trio.Nursery
digest_type: str = "SHA256"
pfromkey: Optional[TextProcess] = None
pexaminekey: Optional[TextProcess] = None
pwhereis: Optional[TextProcess] = None
Expand All @@ -30,6 +31,7 @@ async def from_key(self, key: str, path: str) -> None:
async with self.locks["fromkey"]:
if self.pfromkey is None:
self.pfromkey = await open_git_annex(
self.nursery,
"fromkey",
"--force",
"--batch",
Expand All @@ -49,16 +51,19 @@ async def from_key(self, key: str, path: str) -> None:
)
### TODO: Raise an exception?

async def mkkey(self, filename: str, size: int, digest: str) -> str:
    """Return a git-annex key for a file of the given size and digest.

    Fix: the scraped diff interleaved the pre-change lines (SHA256-only
    signature, ``--migrate-to-backend=SHA256E``, and the old ``send`` call)
    with the post-change ones; only the post-change form, parameterized on
    ``self.digest_type`` (default ``"SHA256"``), is kept.
    """
    async with self.locks["examinekey"]:
        # Lazily start a single long-lived `git annex examinekey --batch` process.
        if self.pexaminekey is None:
            self.pexaminekey = await open_git_annex(
                self.nursery,
                "examinekey",
                "--batch",
                f"--migrate-to-backend={self.digest_type}E",
                path=self.repo,
            )
        # examinekey batch input: "<BACKEND>-s<size>--<digest> <filename>";
        # "(unknown)" stands in for the filename here.
        await self.pexaminekey.send(
            f"{self.digest_type}-s{size}--{digest} (unknown)\n"
        )
        ### TODO: Do something if readline() returns "" (signalling EOF)
        return (await self.pexaminekey.readline()).strip()

Expand All @@ -67,6 +72,7 @@ async def get_key_remotes(self, key: str) -> Optional[List[str]]:
async with self.locks["whereis"]:
if self.pwhereis is None:
self.pwhereis = await open_git_annex(
self.nursery,
"whereis",
"--batch-keys",
"--json",
async def register_url(self, key: str, url: str) -> None:
    """Register *url* as a web location for annex *key* via `git annex registerurl`.

    Fix: the scraped diff interleaved the pre-change single-line process
    invocation with the post-change one; only the post-change form — batch
    mode with ``--json --json-error-messages`` so failures can be detected —
    is kept.
    """
    async with self.locks["registerurl"]:
        # Lazily start one long-lived `git annex registerurl --batch` process.
        if self.pregisterurl is None:
            self.pregisterurl = await open_git_annex(
                self.nursery,
                "registerurl",
                "--batch",
                "--json",
                "--json-error-messages",
                path=self.repo,
            )
        await self.pregisterurl.send(f"{key} {url}\n")
        ### TODO: Do something if readline() returns "" (signalling EOF)
        r = json.loads(await self.pregisterurl.readline())
        if not r["success"]:
            log.error(
                "`git annex registerurl %s %s` call failed:%s",
                key,
                url,
                format_errors(r["error-messages"]),
            )
            ### TODO: Raise an exception?