Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differential tarball tool #9

Merged
merged 20 commits into from Feb 27, 2020
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
201 changes: 201 additions & 0 deletions conda_mirror/diff_tar.py
@@ -0,0 +1,201 @@
"""
Implementation of the conda-diff-tar command, a tool which allows creating
differential tarballs of a (usually mirrored) conda repository. The resulting
tarball can be used to update a copy of the mirror on a remote (air-gapped)
system, without having to copy the entire conda repository.
"""
import os
scopatz marked this conversation as resolved.
Show resolved Hide resolved
import sys
import json
import hashlib
import tarfile
from os.path import abspath, isdir, join, relpath


# Location of the "reference file" written by write_reference() and read by
# read_reference().  The path is relative, so it is resolved against the
# current working directory of the process.
REFERENCE_PATH = './reference.json'


def md5_file(path):
    """
    Compute the MD5 digest of the file located at `path`.

    The file is opened in binary mode and consumed in chunks of 262144 bytes,
    so its size does not determine the memory used.  The digest is returned
    as a hexadecimal string.
    """
    digest = hashlib.new('md5')
    with open(path, 'rb') as stream:
        # iter() with a sentinel stops once read() returns b'' (end of file)
        for block in iter(lambda: stream.read(262144), b''):
            digest.update(block)
    return digest.hexdigest()


def find_repos(mirror_dir):
    """
    Walk `mirror_dir` recursively and yield the path of every directory
    (including `mirror_dir` itself) which holds both a repodata.json and a
    repodata.json.bz2 file.
    """
    required = ('repodata.json', 'repodata.json.bz2')
    for dirpath, _dirnames, filenames in os.walk(mirror_dir):
        if all(name in filenames for name in required):
            yield dirpath


def all_repodata(mirror_dir):
    """
    Collect the package lists of all repositories below `mirror_dir`.

    Returns a dictionary whose keys are the repository sub-directories
    yielded by find_repos() and whose values are the content of the
    'packages' field of the repodata.json file in that sub-directory.
    """
    result = {}
    for repo_path in find_repos(mirror_dir):
        repodata_path = join(repo_path, 'repodata.json')
        with open(repodata_path) as fi:
            repodata = json.load(fi)
        result[repo_path] = repodata['packages']
    return result


def verify_all_repos(mirror_dir):
    """
    Verify the MD5 sums of all conda packages listed in all repodata.json
    files below `mirror_dir`.

    Problems are reported on stdout, one line per package:

    * "MD5 mismatch: <path>" when the checksum of the file on disk differs
      from the 'md5' value recorded in repodata.json
    * "Missing file: <path>" when a listed package does not exist on disk

    Nothing is returned.
    """
    for repo_path, index in all_repodata(mirror_dir).items():
        for fn, info in index.items():
            path = join(repo_path, fn)
            try:
                actual = md5_file(path)
            except FileNotFoundError:
                # a package which is listed but absent is exactly the kind
                # of problem verification should report, so keep going
                # instead of aborting the whole run with a traceback
                print('Missing file: %s' % path)
                continue
            if info['md5'] != actual:
                print('MD5 mismatch: %s' % path)


def write_reference(mirror_dir):
    """
    Create the "reference file" at REFERENCE_PATH.

    The file is the JSON serialization (indented, keys sorted) of the
    dictionary returned by all_repodata() for `mirror_dir`, that is the
    package lists of all repodata.json files.
    """
    text = json.dumps(all_repodata(mirror_dir), indent=2, sort_keys=True)
    # terminate the file with a newline
    if text[-1:] != '\n':
        text = text + '\n'
    with open(REFERENCE_PATH, 'w') as fo:
        fo.write(text)


def read_reference():
    """
    Load the "reference file" (REFERENCE_PATH) and return its content as a
    dictionary.

    When the file does not exist, the program exits with an error message
    pointing the user to the --reference option.
    """
    try:
        fi = open(REFERENCE_PATH)
    except FileNotFoundError:
        sys.exit(
            "Error: no such file: %s\n"
            "Please use the --reference option before creating a "
            "differential tarball.\n" % REFERENCE_PATH)
    with fi:
        return json.load(fi)
ilanschnell marked this conversation as resolved.
Show resolved Hide resolved


def get_updates(mirror_dir):
    """
    Iterate the files which need to go into the differential tarball.

    The state recorded in the "reference file" is compared to the current
    state of the repository (all the repodata.json files below
    `mirror_dir`).  For every repository whose package list differs from
    the reference, repodata.json and repodata.json.bz2 are yielded; in
    addition every package whose 'md5' differs from the reference (or which
    is not in the reference at all) is yielded.  All paths are relative to
    `mirror_dir`.
    """
    def relative(repo_path, fn):
        # path of `fn` inside the repository, relative to the mirror root
        return relpath(join(repo_path, fn), mirror_dir)

    reference = read_reference()
    current = all_repodata(mirror_dir)
    for repo_path, packages in current.items():
        old_packages = reference.get(repo_path, {})
        if old_packages != packages:
            yield from (relative(repo_path, fn)
                        for fn in ('repodata.json', 'repodata.json.bz2'))
        for fn, info in packages.items():
            old_md5 = old_packages.get(fn, {}).get('md5')
            if old_md5 != info['md5']:
                yield relative(repo_path, fn)


def tar_repo(mirror_dir, outfile='update.tar', verbose=False):
    """
    Write the so-called differential tarball, see get_updates().

    `mirror_dir` is the root of the repository, `outfile` the path of the
    (uncompressed) tar archive to create.  Each file yielded by
    get_updates() is stored under its path relative to `mirror_dir`.  When
    `verbose` is true, every added file and the final archive name are
    printed to stdout.
    """
    # the context manager closes the archive even when adding a member
    # fails or get_updates() exits, so the file handle is never leaked
    with tarfile.open(outfile, 'w') as t:
        for f in get_updates(mirror_dir):
            if verbose:
                print('adding: %s' % f)
            t.add(join(mirror_dir, f), f)
    if verbose:
        print("Wrote: %s" % outfile)
ilanschnell marked this conversation as resolved.
Show resolved Hide resolved


def main():
    """
    Entry point of the conda-diff-tar command.

    Parses the command line (sys.argv) and runs the first requested action
    in this order of precedence: --create, --verify, --show, --reference.
    With --version only the version is printed.  Exits with an error when no
    REPOSITORY is given or when it is not a directory.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='create "differential" tarballs of a conda repository')

    parser.add_argument('repo_dir',
                        nargs='?',
                        action="store",
                        metavar='REPOSITORY',
                        help="path to repository directory")

    # all remaining options are plain on/off switches: (names, help text)
    switches = [
        (('--create',), "create differential tarball"),
        (('--reference',), "create a reference point file"),
        (('--show',), "show the files in respect to the latest reference "
                      "point file (which would be included in the "
                      "differential tarball)"),
        (('--verify',), "verify the mirror repository and exit"),
        (('-v', '--verbose'), None),
        (('--version',), "print version and exit"),
    ]
    for names, help_text in switches:
        parser.add_argument(*names, action="store_true", help=help_text)

    opts = parser.parse_args()

    if opts.version:
        from conda_mirror import __version__
        print('conda-mirror: %s' % __version__)
        return

    if not opts.repo_dir:
        parser.error('exactly one REPOSITORY is required, try -h')

    mirror_dir = abspath(opts.repo_dir)
    if not isdir(mirror_dir):
        sys.exit("No such directory: %r" % mirror_dir)

    # only the first matching action is executed
    if opts.create:
        tar_repo(mirror_dir, verbose=opts.verbose)
        return

    if opts.verify:
        verify_all_repos(mirror_dir)
        return

    if opts.show:
        for path in get_updates(mirror_dir):
            print(path)
        return

    if opts.reference:
        write_reference(mirror_dir)
        return

    print("Nothing done.")


# run the command line interface when this file is executed as a script
if __name__ == '__main__':
    main()
63 changes: 63 additions & 0 deletions diff-tar.md
@@ -0,0 +1,63 @@
Create differential tarballs
============================

This tool allows you to create differential tarballs of a (usually
mirrored) conda repository. The resulting tarball can be used to update
a copy of the mirror on a remote (air-gapped) system, without having to
copy the entire conda repository. The workflow is as follows:

1. we assume that the remote and local repository are in sync
2. create a `reference.json` file of the local repository
3. update the local repository using `conda-mirror` or some other tool
4. create the "differential" tarball
5. move the differential tarball to the remote machine, and unpack it
6. now that the remote repository is up-to-date, we should create a new
`reference.json` on the local machine. That is, go back to step 2


Notes:
------

The file `reference.json` is a collection of all `repodata.json`
files (`linux-64`, `win-32`, `noarch`, etc.) of the local repository.
It is created in order to compare a future state of the repository to the
state of the repository when `reference.json` was created.

The differential tarball contains files which either have been updated (such
as `repodata.json`) or new files (new conda packages). It is meant to be
unpacked on top of the existing mirror on the remote machine by:

cd <repository>
tar xf update.tar
# or by using tar's -C option from any directory
tar xf update.tar -C <repository>


Example:
--------

In this example we assume that a conda mirror is located in `./repo`.
Create `reference.json`:

conda-diff-tar --reference ./repo

Show the files which have changed with respect to the latest reference point
file (and which would therefore be included in the differential tarball).
Since we just created the reference file, we don't expect any output:

conda-diff-tar --show ./repo

Now, we can update the mirror:

conda-mirror --upstream-channel conda-forge --target-directory ./repo ...

Create the actual differential tarball:

$ conda-diff-tar --create ./repo
Wrote: update.tar
$ tar tf update.tar
noarch/repodata.json
noarch/repodata.json.bz2
noarch/ablog-0.9.2-py_0.tar.bz2
noarch/aws-amicleaner-0.2.2-py_0.tar.bz2
...
3 changes: 2 additions & 1 deletion setup.py
Expand Up @@ -27,7 +27,8 @@
],
entry_points={
"console_scripts": [
'conda-mirror = conda_mirror.conda_mirror:cli'
'conda-mirror = conda_mirror.conda_mirror:cli',
'conda-diff-tar = conda_mirror.diff_tar:main',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be a subcommand of conda-mirror, rather than a separate command?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure, I realized that just adding the new functionality as options wouldn't make too much sense. Having a sub command is of course possible too, for example conda-mirror diff-tar.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xhochy @mariusvniekerk - any opinions?

]
}
)
109 changes: 109 additions & 0 deletions test/test_diff_tar.py
@@ -0,0 +1,109 @@
import os
ilanschnell marked this conversation as resolved.
Show resolved Hide resolved
import sys
import json
import shutil
import tempfile
from os.path import isfile, join

import pytest

import conda_mirror.diff_tar as dt


# MD5 digest of zero bytes of input; create_test_repo() records it as the
# checksum of the (empty) package file it creates
EMPTY_MD5 = 'd41d8cd98f00b204e9800998ecf8427e'


@pytest.fixture
def tmpdir():
    """
    Provide a fresh temporary directory to a test and remove it afterwards.

    As a side effect, dt.mirror_dir is pointed at <tmpdir>/repo and
    dt.REFERENCE_PATH at <tmpdir>/reference.json, so the tests never touch
    files outside of the temporary directory through these two paths.
    """
    workdir = tempfile.mkdtemp()
    dt.REFERENCE_PATH = join(workdir, 'reference.json')
    dt.mirror_dir = join(workdir, 'repo')
    yield workdir
    shutil.rmtree(workdir)


def test_md5_file(tmpdir):
    """md5_file() returns the expected digest for a small two-byte file."""
    path = join(tmpdir, 'testfile')
    with open(path, 'wb') as fo:
        fo.write(b'A\n')
    expected = 'bf072e9119077b4e76437a93986787ef'
    assert dt.md5_file(path) == expected


def create_test_repo(subdirname='linux-64'):
    """
    Create a minimal repository sub-directory below dt.mirror_dir.

    The sub-directory gets a repodata.json listing the single package
    'a-1.0-0.tar.bz2' with the MD5 of an empty file, plus an empty
    repodata.json.bz2 and an empty package file.
    """
    subdir = join(dt.mirror_dir, subdirname)
    os.makedirs(subdir)

    repodata = {'packages': {'a-1.0-0.tar.bz2': {'md5': EMPTY_MD5}}}
    with open(join(subdir, 'repodata.json'), 'w') as fo:
        json.dump(repodata, fo)

    # the remaining files only need to exist, so they are left empty
    for fn in ('repodata.json.bz2', 'a-1.0-0.tar.bz2'):
        open(join(subdir, fn), 'wb').close()


def test_find_repos(tmpdir):
    """find_repos() yields exactly the one repository that was created."""
    create_test_repo()
    found = list(dt.find_repos(dt.mirror_dir))
    expected = [join(dt.mirror_dir, 'linux-64')]
    assert found == expected


def test_all_repodata_repos(tmpdir):
    """all_repodata() maps the repository path to its package list."""
    create_test_repo()
    repodata = dt.all_repodata(dt.mirror_dir)
    packages = repodata[join(dt.mirror_dir, 'linux-64')]
    assert packages['a-1.0-0.tar.bz2']['md5'] == EMPTY_MD5


def test_verify_all_repos(tmpdir):
    """Smoke test: verify_all_repos() runs on a freshly created test repo."""
    create_test_repo()
    dt.verify_all_repos(dt.mirror_dir)


def test_write_and_read_reference(tmpdir):
    """A reference file written by write_reference() can be read back."""
    create_test_repo()
    dt.write_reference(join(tmpdir, 'repo'))
    reference = dt.read_reference()
    packages = reference[join(dt.mirror_dir, 'linux-64')]
    assert packages['a-1.0-0.tar.bz2']['md5'] == EMPTY_MD5


def test_get_updates(tmpdir):
    """
    get_updates() yields nothing right after the reference was written, and
    yields all files of a repository sub-directory added afterwards.
    """
    create_test_repo()
    dt.write_reference(join(tmpdir, 'repo'))
    assert list(dt.get_updates(dt.mirror_dir)) == []

    create_test_repo('win-32')
    lst = sorted(dt.get_updates(dt.mirror_dir))
    # get_updates() builds its paths with os.path.relpath(), so the expected
    # values use the platform's separator instead of a hard-coded '/'
    assert lst == [join('win-32', 'a-1.0-0.tar.bz2'),
                   join('win-32', 'repodata.json'),
                   join('win-32', 'repodata.json.bz2')]


def test_tar_repo(tmpdir):
    """tar_repo() creates the requested tarball for an updated repository."""
    create_test_repo()
    dt.write_reference(dt.mirror_dir)
    # add a second sub-directory so that there is something to pack
    create_test_repo('win-32')
    tarball = join(tmpdir, 'up.tar')
    dt.tar_repo(dt.mirror_dir, tarball)
    assert isfile(tarball)


def run_with_args(args):
    """
    Run dt.main() as if `args` (a list of strings) had been given on the
    command line of conda-diff-tar.

    sys.argv is restored afterwards, also when main() raises or exits, so a
    failing invocation cannot leak its arguments into later tests.
    """
    old_args = list(sys.argv)
    sys.argv = ['conda-diff-tar'] + args
    try:
        dt.main()
    finally:
        sys.argv = old_args


def test_version():
    """Smoke test: the --version option runs without a REPOSITORY argument."""
    run_with_args(['--version'])


def test_misc(tmpdir):
    """
    Exercise every command line action of conda-diff-tar once.

    The test runs with the temporary directory as working directory, because
    --create writes update.tar into the current directory; this way the
    tarball is removed together with the temporary directory instead of
    being left behind wherever the tests were started.
    """
    old_cwd = os.getcwd()
    os.chdir(tmpdir)
    try:
        create_test_repo()
        run_with_args(['--reference', dt.mirror_dir])
        assert isfile(dt.REFERENCE_PATH)
        create_test_repo('win-32')
        run_with_args(['--show', dt.mirror_dir])
        run_with_args(['--create', '--verbose', dt.mirror_dir])
        run_with_args(['--verify', dt.mirror_dir])
        run_with_args([dt.mirror_dir])  # do nothing
    finally:
        # leave the directory again before the fixture removes it
        os.chdir(old_cwd)