From 7e13fda9086ef923aad09fa42adec6e0d77ce010 Mon Sep 17 00:00:00 2001 From: Adam Kariv Date: Sun, 18 Nov 2018 09:12:36 +0200 Subject: [PATCH] Add support for parsing XML files (#51) * Add support for parsing XML files * Add sample.xml test * lint --- data/sample.xml | 14 +++++++++ dataflows/processors/load.py | 59 ++++++++++++++++++++++++++++++++++++ setup.py | 1 + tests/test_lib.py | 13 +++++++- 4 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 data/sample.xml diff --git a/data/sample.xml b/data/sample.xml new file mode 100644 index 0000000..b617958 --- /dev/null +++ b/data/sample.xml @@ -0,0 +1,14 @@ + + + The Fellowship of the Ring + 1954 + + + The Two Towers + 1954 + + + The Return of the King + 1955 + + \ No newline at end of file diff --git a/dataflows/processors/load.py b/dataflows/processors/load.py index 13cb12e..d7bdbda 100644 --- a/dataflows/processors/load.py +++ b/dataflows/processors/load.py @@ -1,11 +1,68 @@ import os from datapackage import Package, Resource +from tabulator.parser import Parser +from tabulator.helpers import reset_stream from .. 
class XMLParser(Parser):
    """Tabulator parser that turns a simple XML document into table rows.

    The document is converted with xmljson's Parker convention. The value
    stored under the first child tag of the root element is taken as the
    collection of records; every record's child tags become the keys of one
    extended row ``(row_number, keys, values)``.
    """

    # This parser accepts no parser-specific options.
    options = []

    def __init__(self, loader, force_parse, **options):
        """Store the loader; the stream itself is only created by ``open``."""
        self.__loader = loader
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None

    def open(self, source, encoding=None):
        """Load ``source`` through the loader and prepare row iteration."""
        self.close()
        self.__chars = self.__loader.load(source, encoding=encoding)
        self.__encoding = getattr(self.__chars, 'encoding', encoding)
        if self.__encoding:
            # str.lower() returns a new string; the result has to be kept,
            # otherwise the encoding is never actually normalised.
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        """Close the underlying stream if one is currently open."""
        if not self.closed:
            self.__chars.close()

    def reset(self):
        """Rewind the stream and start a fresh row iterator."""
        reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def closed(self):
        """True when no stream was opened yet or the stream is closed."""
        return self.__chars is None or self.__chars.closed

    @property
    def encoding(self):
        """Encoding reported by the stream (lower-cased), or None."""
        return self.__encoding

    @property
    def extended_rows(self):
        """Iterator of ``(row_number, keys, values)`` tuples."""
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        """Lazily parse the stream and yield one extended row per record."""
        # Imported here so that xmljson is only required when XML is parsed.
        from xml.etree.ElementTree import parse
        from xmljson import parker

        parsed = parker.data(parse(self.__chars).getroot())
        if not isinstance(parsed, dict):
            # A root element without child elements is reduced by Parker to
            # its text (or None); there are no records to emit.
            return

        # The records live under the first child tag of the root element.
        records = next(iter(parsed.values()), [])
        if isinstance(records, dict):
            # Parker only builds a list for repeated tags; a document with a
            # single record yields the bare mapping, which must not be
            # iterated key by key.
            records = [records]

        for row_number, row in enumerate(records, start=1):
            # keys()/values() stay aligned and, unlike unpacking
            # zip(*row.items()), also work for an empty mapping.
            yield (row_number, list(row.keys()), list(row.values()))
def test_load_xml():
    """Loading data/sample.xml yields one row per record with cast values."""
    from dataflows import Flow, load

    expected_rows = [
        {'publication-year': 1954, 'title': 'The Fellowship of the Ring'},
        {'publication-year': 1954, 'title': 'The Two Towers'},
        {'publication-year': 1955, 'title': 'The Return of the King'}
    ]

    xml_flow = Flow(load('data/sample.xml'))
    rows_by_resource, _package, _stats = xml_flow.results()

    # Only the first (and only) resource is checked.
    assert rows_by_resource[0] == expected_rows