forked from vericast/conda-mirror
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from ilanschnell/diff-tar
Differential tarball tool
- Loading branch information
Showing
4 changed files
with
389 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
""" | ||
Implementation of the conda-diff-tar command, a tool which allows creating
differential tarballs of a (usually mirrored) conda repository.  The resulting
tarball can be used to update a copy of the mirror on a remote (air-gapped)
system, without having to copy the entire conda repository.
""" | ||
import os | ||
import sys | ||
import json | ||
import hashlib | ||
import tarfile | ||
from os.path import abspath, isdir, join, relpath | ||
|
||
|
||
# Location of the "reference file" written by write_reference() and read back
# by read_reference(); relative to the current working directory.
REFERENCE_PATH = "./reference.json"
|
||
|
||
class NoReferenceError(FileNotFoundError):
    """Raised by read_reference() when the reference file does not exist."""
|
||
|
||
def md5_file(path):
    """
    Compute the MD5 checksum of the file located at `path`.

    The file is read in binary mode in 256 KiB pieces, so it is never loaded
    into memory as a whole.  The digest is returned as a hexadecimal string.
    """
    digest = hashlib.new('md5')
    with open(path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for block in iter(lambda: stream.read(262144), b''):
            digest.update(block)
    return digest.hexdigest()
|
||
|
||
def find_repos(mirror_dir):
    """
    Walk `mirror_dir` recursively and yield the path of every directory that
    holds both a repodata.json and a repodata.json.bz2 file.
    """
    required = {'repodata.json', 'repodata.json.bz2'}
    for dirpath, _, filenames in os.walk(mirror_dir):
        if required.issubset(filenames):
            yield dirpath
|
||
|
||
def all_repodata(mirror_dir):
    """
    Collect the package listings of every repository found below
    `mirror_dir`.

    Returns a dictionary whose keys are the repository sub-directory paths
    (as yielded by find_repos()) and whose values are the 'packages' field
    of the corresponding repodata.json file.
    """
    packages_by_repo = {}
    for repo_dir in find_repos(mirror_dir):
        repodata_path = join(repo_dir, 'repodata.json')
        with open(repodata_path) as stream:
            repodata = json.load(stream)
        packages_by_repo[repo_dir] = repodata['packages']
    return packages_by_repo
|
||
|
||
def verify_all_repos(mirror_dir):
    """
    Verify the MD5 sum of every conda package listed in any repodata.json
    file below `mirror_dir`.

    Problems are reported on stdout, one line per package:

    * ``Missing file: <path>`` when a listed package is not on disk
    * ``MD5 mismatch: <path>`` when the checksum on disk differs from the
      'md5' value recorded in repodata.json

    Nothing is printed for packages which verify correctly.
    """
    for repo_path, index in all_repodata(mirror_dir).items():
        for fn, info in index.items():
            path = join(repo_path, fn)
            expected = info['md5']
            try:
                actual = md5_file(path)
            except FileNotFoundError:
                # a listed but absent package is a verification failure,
                # not a reason to abort checking the remaining packages
                print('Missing file: %s' % path)
                continue
            if expected != actual:
                print('MD5 mismatch: %s' % path)
|
||
|
||
def write_reference(mirror_dir):
    """
    Create the "reference file" at REFERENCE_PATH.

    The file holds the result of all_repodata(mirror_dir), serialized as
    indented JSON with sorted keys and terminated by a newline.
    """
    text = json.dumps(all_repodata(mirror_dir), indent=2, sort_keys=True)
    # guarantee a trailing newline
    terminated = text if text.endswith('\n') else text + '\n'
    with open(REFERENCE_PATH, 'w') as stream:
        stream.write(terminated)
|
||
|
||
def read_reference():
    """
    Load the "reference file" at REFERENCE_PATH and return its content as a
    dictionary.

    Raises NoReferenceError (chained to the underlying FileNotFoundError)
    when the reference file does not exist.
    """
    try:
        with open(REFERENCE_PATH) as fi:
            return json.load(fi)
    except FileNotFoundError as e:
        # explicit chaining marks the original error as the direct cause
        raise NoReferenceError(e) from e
|
||
|
||
def get_updates(mirror_dir):
    """
    Iterate over the files which have to go into the differential tarball.

    The "reference file" is compared against the current state of all
    repodata.json files below `mirror_dir`.  For every repository whose
    package listing differs from the reference, repodata.json and
    repodata.json.bz2 are yielded, followed by each package whose 'md5'
    differs from (or is absent in) the reference.  All yielded paths are
    relative to `mirror_dir`.
    """
    reference = read_reference()
    current = all_repodata(mirror_dir)

    def relative(repo_dir, name):
        return relpath(join(repo_dir, name), mirror_dir)

    for repo_dir, packages in current.items():
        known = reference.get(repo_dir, {})
        if known != packages:
            yield relative(repo_dir, 'repodata.json')
            yield relative(repo_dir, 'repodata.json.bz2')
        for name, meta in packages.items():
            if known.get(name, {}).get('md5') != meta['md5']:
                yield relative(repo_dir, name)
|
||
|
||
def tar_repo(mirror_dir, outfile='update.tar', verbose=False):
    """
    Write the so-called differential tarball, see get_updates().

    mirror_dir: path of the repository the updated files are taken from
    outfile:    path of the (uncompressed) tar archive to write
    verbose:    when true, print each added file and the final archive name

    Raises NoReferenceError (propagated from get_updates()) when there is no
    reference file; in that case `outfile` is not touched.
    """
    # get_updates() is a generator, so a missing reference file would only be
    # noticed on first iteration.  Resolve the list before opening `outfile`,
    # otherwise a failure leaves behind an empty tarball (and clobbers any
    # previous one).
    updates = list(get_updates(mirror_dir))
    # the context manager closes the archive even if adding a file fails
    with tarfile.open(outfile, 'w') as t:
        for f in updates:
            if verbose:
                print('adding: %s' % f)
            t.add(join(mirror_dir, f), f)
    if verbose:
        print("Wrote: %s" % outfile)
|
||
|
||
def _build_parser():
    """Construct the argument parser of the conda-diff-tar command."""
    import argparse

    parser = argparse.ArgumentParser(
        description='create "differential" tarballs of a conda repository')

    parser.add_argument('repo_dir',
                        nargs='?',
                        action="store",
                        metavar='REPOSITORY',
                        help="path to repository directory")

    # plain on/off switches selecting the action to perform
    switches = (
        ('--create', "create differential tarball"),
        ('--reference', "create a reference point file"),
        ('--show', "show the files in respect to the latest reference "
                   "point file (which would be included in the "
                   "differential tarball)"),
        ('--verify', "verify the mirror repository and exit"),
    )
    for flag, text in switches:
        parser.add_argument(flag, action="store_true", help=text)

    parser.add_argument('-v', '--verbose',
                        action="store_true")

    parser.add_argument('--version',
                        action="store_true",
                        help="print version and exit")
    return parser


def main():
    """
    Entry point of the conda-diff-tar command.

    Parses sys.argv and performs the first requested action, in this order
    of precedence: --create, --verify, --show, --reference.  Without any of
    these, "Nothing done." is printed.  --version is handled before the
    REPOSITORY argument is checked.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.version:
        from conda_mirror import __version__
        print('conda-mirror: %s' % __version__)
        return

    if not args.repo_dir:
        parser.error('exactly one REPOSITORY is required, try -h')

    mirror_dir = abspath(args.repo_dir)
    if not isdir(mirror_dir):
        sys.exit("No such directory: %r" % mirror_dir)

    try:
        if args.create:
            tar_repo(mirror_dir, verbose=args.verbose)
        elif args.verify:
            verify_all_repos(mirror_dir)
        elif args.show:
            for path in get_updates(mirror_dir):
                print(path)
        elif args.reference:
            write_reference(mirror_dir)
        else:
            print("Nothing done.")
    except NoReferenceError:
        sys.exit(
            "Error: no such file: %s\n"
            "Please use the --reference option before creating a "
            "differential tarball." % REFERENCE_PATH)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
Create differential tarballs | ||
============================ | ||
|
||
This tool allows you to create differential tarballs of a (usually
mirrored) conda repository.  The resulting tarball can be used to update
a copy of the mirror on a remote (air-gapped) system, without having to
copy the entire conda repository.  The workflow is as follows:
|
||
1. we assume that the remote and local repository are in sync
2. create a `reference.json` file of the local repository
3. update the local repository using `conda-mirror` or some other tool
4. create the "differential" tarball
5. move the differential tarball to the remote machine, and unpack it
6. now that the remote repository is up-to-date, we should create a new
   `reference.json` on the local machine.  That is, repeat step 2
|
||
|
||
Notes: | ||
------ | ||
|
||
The file `reference.json` is a collection of all `repodata.json`
files (`linux-64`, `win-32`, `noarch`, etc.) of the local repository.
It is created in order to compare a future state of the repository to the
state of the repository when `reference.json` was created.
|
||
The differential tarball contains files which have either been updated (such
as `repodata.json`) or are new (new conda packages).  It is meant to be
unpacked on top of the existing mirror on the remote machine by:
|
||
cd <repository> | ||
tar xf update.tar | ||
# or by using tar's -C option from any directory
tar xf update.tar -C <repository> | ||
|
||
|
||
Example: | ||
-------- | ||
|
||
In this example we assume that a conda mirror is located in `./repo`. | ||
Create `reference.json`: | ||
|
||
conda-diff-tar --reference ./repo | ||
|
||
Show the files with respect to the latest reference point file (which would be
included in the differential tarball).  Since we just created the reference
file, we don't expect any output:
|
||
conda-diff-tar --show ./repo | ||
|
||
Now, we can update the mirror: | ||
|
||
conda-mirror --upstream-channel conda-forge --target-directory ./repo ... | ||
|
||
Create the actual differential tarball: | ||
|
||
$ conda-diff-tar --create ./repo | ||
Wrote: update.tar | ||
$ tar tf update.tar | ||
noarch/repodata.json | ||
noarch/repodata.json.bz2 | ||
noarch/ablog-0.9.2-py_0.tar.bz2 | ||
noarch/aws-amicleaner-0.2.2-py_0.tar.bz2 | ||
... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import os | ||
import sys | ||
import json | ||
import shutil | ||
import tempfile | ||
from os.path import isfile, join | ||
|
||
import pytest | ||
|
||
import conda_mirror.diff_tar as dt | ||
|
||
|
||
# MD5 digest of zero bytes of input; create_test_repo() writes empty package
# files, so this is the checksum recorded in the generated repodata.json.
EMPTY_MD5 = "d41d8cd98f00b204e9800998ecf8427e"
|
||
|
||
@pytest.fixture
def tmpdir():
    """
    Provide a fresh temporary directory and point the diff_tar module at it.

    While the test runs, dt.mirror_dir is <tmpdir>/repo and
    dt.REFERENCE_PATH is <tmpdir>/reference.json.  On teardown the module
    state is put back and the directory is removed, so that later tests never
    see paths into a deleted directory.
    """
    saved_reference_path = dt.REFERENCE_PATH
    tmpdir = tempfile.mkdtemp()
    dt.mirror_dir = join(tmpdir, 'repo')
    dt.REFERENCE_PATH = join(tmpdir, 'reference.json')
    try:
        yield tmpdir
    finally:
        dt.REFERENCE_PATH = saved_reference_path
        # mirror_dir is a test-only attribute set above; drop it again
        del dt.mirror_dir
        shutil.rmtree(tmpdir)
|
||
|
||
def test_md5_file(tmpdir):
    """md5_file() returns the known hex digest of a two-byte file."""
    path = join(tmpdir, 'testfile')
    with open(path, 'wb') as stream:
        stream.write(b'A\n')
    expected = 'bf072e9119077b4e76437a93986787ef'
    assert dt.md5_file(path) == expected
|
||
|
||
def create_test_repo(subdirname='linux-64'):
    """
    Create a minimal repository in dt.mirror_dir/<subdirname>.

    It consists of a repodata.json listing the single package
    'a-1.0-0.tar.bz2' (with the MD5 of an empty file), plus an empty
    repodata.json.bz2 and an empty package file.
    """
    repo = join(dt.mirror_dir, subdirname)
    os.makedirs(repo)
    repodata = {'packages': {'a-1.0-0.tar.bz2': {'md5': EMPTY_MD5}}}
    with open(join(repo, 'repodata.json'), 'w') as stream:
        stream.write(json.dumps(repodata))
    for name in ('repodata.json.bz2', 'a-1.0-0.tar.bz2'):
        # only the existence of these (empty) files matters
        open(join(repo, name), 'wb').close()
|
||
|
||
def test_find_repos(tmpdir):
    """find_repos() yields exactly the one repository that was created."""
    create_test_repo()
    found = list(dt.find_repos(dt.mirror_dir))
    expected = [join(dt.mirror_dir, 'linux-64')]
    assert found == expected
|
||
|
||
def test_all_repodata_repos(tmpdir):
    """all_repodata() exposes the package listing keyed by repo path."""
    create_test_repo()
    repodata = dt.all_repodata(dt.mirror_dir)
    packages = repodata[join(dt.mirror_dir, 'linux-64')]
    assert packages['a-1.0-0.tar.bz2']['md5'] == EMPTY_MD5
|
||
|
||
def test_verify_all_repos(tmpdir, capsys):
    """
    verify_all_repos() reports problems on stdout only; for an intact
    repository it must therefore print nothing.
    """
    create_test_repo()
    dt.verify_all_repos(dt.mirror_dir)
    assert capsys.readouterr().out == ''
|
||
|
||
def test_read_no_reference(tmpdir):
    """read_reference() raises NoReferenceError without a reference file."""
    # the fixture points REFERENCE_PATH into the fresh (empty) tmpdir, so
    # the reference file cannot exist yet
    expected_error = dt.NoReferenceError
    with pytest.raises(expected_error):
        dt.read_reference()
|
||
|
||
def test_write_and_read_reference(tmpdir):
    """A reference written by write_reference() is read back unchanged."""
    create_test_repo()
    repo_root = join(tmpdir, 'repo')
    dt.write_reference(repo_root)
    reference = dt.read_reference()
    packages = reference[join(dt.mirror_dir, 'linux-64')]
    assert packages['a-1.0-0.tar.bz2']['md5'] == EMPTY_MD5
|
||
|
||
def test_get_updates(tmpdir):
    """
    get_updates() yields nothing right after the reference was written, and
    all files of a repository which was added afterwards.
    """
    create_test_repo()
    dt.write_reference(join(tmpdir, 'repo'))
    unchanged = list(dt.get_updates(dt.mirror_dir))
    assert unchanged == []

    create_test_repo('win-32')
    expected = [
        'win-32/a-1.0-0.tar.bz2',
        'win-32/repodata.json',
        'win-32/repodata.json.bz2',
    ]
    assert sorted(dt.get_updates(dt.mirror_dir)) == expected
|
||
|
||
def test_tar_repo(tmpdir):
    """
    tar_repo() writes a tarball which contains exactly the files of the
    repository added after the reference was taken.
    """
    import tarfile

    create_test_repo()
    tarball = join(tmpdir, 'up.tar')
    dt.write_reference(dt.mirror_dir)
    create_test_repo('win-32')
    dt.tar_repo(dt.mirror_dir, tarball)
    assert isfile(tarball)
    # check the archive content, not merely its existence
    with tarfile.open(tarball) as t:
        names = sorted(t.getnames())
    assert names == ['win-32/a-1.0-0.tar.bz2',
                     'win-32/repodata.json',
                     'win-32/repodata.json.bz2']
|
||
|
||
def run_with_args(args):
    """
    Run dt.main() as if 'conda-diff-tar' had been invoked with the command
    line arguments `args` (a list of strings).

    sys.argv is restored afterwards, also when main() raises (for instance
    SystemExit), so that a failing invocation cannot affect later tests.
    """
    old_args = list(sys.argv)
    sys.argv = ['conda-diff-tar'] + args
    try:
        dt.main()
    finally:
        sys.argv = old_args
|
||
|
||
def test_version():
    """The --version option runs without a REPOSITORY argument."""
    cli_args = ['--version']
    run_with_args(cli_args)
|
||
|
||
def test_misc(tmpdir, monkeypatch):
    """
    Smoke test of the command line interface: --reference, --show, --create,
    --verify and an invocation without any action.
    """
    # `--create` writes 'update.tar' relative to the current directory; run
    # inside the temporary directory so the checkout is not polluted
    monkeypatch.chdir(tmpdir)
    create_test_repo()
    run_with_args(['--reference', dt.mirror_dir])
    assert isfile(dt.REFERENCE_PATH)
    create_test_repo('win-32')
    run_with_args(['--show', dt.mirror_dir])
    run_with_args(['--create', '--verbose', dt.mirror_dir])
    assert isfile(join(tmpdir, 'update.tar'))
    run_with_args(['--verify', dt.mirror_dir])
    run_with_args([dt.mirror_dir])  # do nothing