From 5f250b47a750d6f9f7dbe8b77c9bcd1c6fdd31ec Mon Sep 17 00:00:00 2001
From: Christian Ocker
Date: Wed, 6 Oct 2021 11:54:46 +0200
Subject: [PATCH] Add support for reading from TAR archives to flatdata-py (#182)

Signed-off-by: Christian Ocker
---
 flatdata-py/README.md                         |   6 +
 .../lib/tar_archive_resource_storage.py       | 130 ++++++++++++++++++
 .../tests/test_tar_resource_storage.py        |  39 ++++++
 3 files changed, 175 insertions(+)
 create mode 100644 flatdata-py/flatdata/lib/tar_archive_resource_storage.py
 create mode 100644 flatdata-py/tests/test_tar_resource_storage.py

diff --git a/flatdata-py/README.md b/flatdata-py/README.md
index f0549a88..9d194c1d 100644
--- a/flatdata-py/README.md
+++ b/flatdata-py/README.md
@@ -4,6 +4,12 @@
 
 Python 3 implementation of [flatdata](https://github.com/heremaps/flatdata).
 
+## Running the tests
+
+```sh
+python3 -m nose
+```
+
 ## Basic usage
 
 Once you have [created a flatdata schema file](../README.md#creating-a-schema), you can generate a Python module to read your existing `flatdata` archive:
diff --git a/flatdata-py/flatdata/lib/tar_archive_resource_storage.py b/flatdata-py/flatdata/lib/tar_archive_resource_storage.py
new file mode 100644
index 00000000..cfad76ac
--- /dev/null
+++ b/flatdata-py/flatdata/lib/tar_archive_resource_storage.py
@@ -0,0 +1,130 @@
+'''
+ Copyright (c) 2021 HERE Europe B.V.
+ See the LICENSE file in the root of this project for license details.
+'''
+
+import tarfile
+
+from .errors import CorruptResourceError
+from .errors import MissingResourceError
+from .file_resource_storage import FileResourceStorage
+
+
+class TarArchiveResourceStorage:
+    """
+    Resource storage based on a memory-mapped TAR archive.
+
+    Regular files are served as sections of the memory map, directories as
+    nested storages that share the same map and entry tables.
+    """
+
+    def __init__(self, tar_map, file_entries, dir_entries, sub_path):
+        self.tar_map = tar_map
+        self.file_entries = file_entries
+        self.dir_entries = dir_entries
+        self.sub_path = sub_path
+
+    @classmethod
+    def create(cls, tar_path, sub_path=""):
+        """
+        Index the uncompressed TAR archive at tar_path and memory-map it.
+
+        Raises CorruptResourceError if the archive contains sparse files.
+        """
+        file_entries = {}
+        dir_entries = set()
+        # "r:" only accepts uncompressed archives, so data offsets map 1:1 to the file.
+        with tarfile.open(tar_path, "r:") as tar:
+            for member in tar:
+                name = member.name
+                if name.startswith("./"):
+                    name = name[2:]
+                # issparse() also catches sparse members stored with PAX headers.
+                if member.type == tarfile.GNUTYPE_SPARSE or member.issparse():
+                    raise CorruptResourceError("Sparse files are not supported")
+                if member.isreg():
+                    file_entries[name] = (member.offset_data, member.size)
+                elif member.isdir():
+                    dir_entries.add(name)
+
+        # Map the archive only after indexing, so that a rejected archive is never mapped.
+        tar_map = FileResourceStorage.memory_map(tar_path)
+        return cls(tar_map, file_entries, dir_entries, sub_path)
+
+    def get(self, key, is_optional=False):
+        """
+        Return the resource stored under key: a MemoryMapSection for a file,
+        a nested storage for a directory, or None if it is missing and optional.
+
+        Raises MissingResourceError if the key is missing and not optional.
+        """
+        path = self._path(key)
+        if path in self.file_entries:
+            offset, length = self.file_entries[path]
+            return MemoryMapSection(self.tar_map, offset, length)
+
+        if path in self.dir_entries:
+            return TarArchiveResourceStorage(self.tar_map, self.file_entries,
+                                             self.dir_entries, path)
+
+        if not is_optional:
+            raise MissingResourceError(key)
+        return None
+
+    def _path(self, key):
+        """Return key prefixed with the sub path of this storage, if any."""
+        if not self.sub_path:
+            return key
+        return self.sub_path + '/' + key
+
+
+class MemoryMapSection:
+    """
+    Represent a slice of a memory mapped file.
+    Keeps track of its position, as to emulate pointing to a dedicated file.
+    """
+
+    def __init__(self, inner, offset, length):
+        self.inner = inner
+        self.offset = offset
+        self.length = length
+        self.pos = 0
+
+    def __len__(self):
+        return self.size()
+
+    def __getitem__(self, key):
+        """
+        Index or slice relative to the start of the section.
+
+        Negative indices and bounds count from the end of the section, so an
+        access can never reach data outside of it.
+        """
+        if isinstance(key, slice):
+            start, stop, step = key.indices(self.length)
+            if step < 0:
+                # A stop of -1 means "before the first byte" and cannot be shifted
+                # into an absolute index, so reverse within a copy of the section.
+                return self.inner[self.offset:self.offset + self.length][key]
+            return self.inner[self.offset + start:self.offset + stop:step]
+
+        index = key + self.length if key < 0 else key
+        if not 0 <= index < self.length:
+            raise IndexError('index out of range')
+        return self.inner[self.offset + index]
+
+    def read(self, n=None):
+        """
+        Read up to n bytes from the current position, or the rest of the
+        section if n is None or negative.
+        """
+        remaining = max(self.length - self.pos, 0)
+        if n is None or n < 0:
+            n = remaining
+        self.inner.seek(self.offset + self.pos)
+        data = self.inner.read(min(n, remaining))
+        self.pos += len(data)
+        return data
+
+    def size(self):
+        return min(self.length, self.inner.size() - self.offset)
diff --git a/flatdata-py/tests/test_tar_resource_storage.py b/flatdata-py/tests/test_tar_resource_storage.py
new file mode 100644
index 00000000..fcb40e0d
--- /dev/null
+++ b/flatdata-py/tests/test_tar_resource_storage.py
@@ -0,0 +1,39 @@
+from common import *
+from flatdata.generator.engine import Engine
+from flatdata.lib.tar_archive_resource_storage import TarArchiveResourceStorage
+
+import tarfile
+import tempfile
+import os
+from nose.tools import eq_, assert_is_instance
+
+
+def check_signed_struct(s):
+    eq_(-0x1, s.a)
+    eq_(0x01234567, s.b)
+    eq_(-0x28, s.c)
+    eq_(0, s.d)
+
+
+def test_tar_resource_storage():
+    module = Engine(INSTANCE_TEST_SCHEMA).render_python_module()
+    valid_data = {
+        "Archive.archive": ARCHIVE_SIGNATURE_PAYLOAD,
+        "Archive.archive.schema": module.backward_compatibility_Archive.schema().encode(),
+        "resource": RESOURCE_PAYLOAD,
+        "resource.schema": module.backward_compatibility_Archive.resource_schema('resource').encode()
+    }
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        archive_path = os.path.join(tmpdir, "archive.tar")
+        # arcname stores each member under its bare key, so the test does not
+        # have to change the process-wide working directory.
+        with tarfile.open(archive_path, "w") as tar:
+            for key, value in valid_data.items():
+                member_path = os.path.join(tmpdir, key)
+                with open(member_path, "wb") as file:
+                    file.write(value)
+                tar.add(member_path, arcname=key)
+
+        archive = module.backward_compatibility_Archive(TarArchiveResourceStorage.create(archive_path))
+        check_signed_struct(archive.resource)