Skip to content

Commit

Permalink
Add support for parsing XML files (#51)
Browse files Browse the repository at this point in the history
* Add support for parsing XML files

* Add sample.xml test

* lint
  • Loading branch information
akariv committed Nov 18, 2018
1 parent 605fe37 commit 7e13fda
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 1 deletion.
14 changes: 14 additions & 0 deletions data/sample.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<library>
<book>
<title>The Fellowship of the Ring</title>
<publication-year>1954</publication-year>
</book>
<book>
<title>The Two Towers</title>
<publication-year>1954</publication-year>
</book>
<book>
<title>The Return of the King</title>
<publication-year>1955</publication-year>
</book>
</library>
59 changes: 59 additions & 0 deletions dataflows/processors/load.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,68 @@
import os

from datapackage import Package, Resource
from tabulator.parser import Parser
from tabulator.helpers import reset_stream
from .. import DataStreamProcessor
from ..base.schema_validator import schema_validator
from ..helpers.resource_matcher import ResourceMatcher


class XMLParser(Parser):
    """Tabulator parser that turns a simple XML document into rows.

    The document is converted with xmljson's Parker convention. The value of
    the root element's first child tag is treated as the collection of
    records, and each record is emitted as one extended row of the form
    ``(row_number, keys, values)``.
    """

    # No parser-specific options are declared.
    options = []

    def __init__(self, loader, force_parse, **options):
        """Store the loader and initialise an empty, closed state.

        Args:
            loader: object whose ``load(source, encoding=...)`` returns the
                stream to parse.
            force_parse: stored on the instance; not consulted by this class.
            **options: accepted for interface compatibility and ignored.
        """
        self.__loader = loader
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None

    def open(self, source, encoding=None):
        """Load ``source`` through the loader and prepare row iteration.

        Any previously opened stream is closed first. The reported encoding
        is taken from the loaded stream when it exposes one, otherwise from
        the ``encoding`` argument, and is normalised to lower case.
        """
        self.close()
        self.__chars = self.__loader.load(source, encoding=encoding)
        self.__encoding = getattr(self.__chars, 'encoding', encoding)
        if self.__encoding:
            # str.lower() returns a new string; the result must be kept.
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        """Close the underlying stream if one is currently open."""
        if not self.closed:
            self.__chars.close()

    def reset(self):
        """Reset the stream and start a fresh row generator."""
        reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def closed(self):
        """True when no stream has been opened or the stream is closed."""
        return self.__chars is None or self.__chars.closed

    @property
    def encoding(self):
        """Encoding recorded by the last ``open`` call (or None)."""
        return self.__encoding

    @property
    def extended_rows(self):
        """Generator of ``(row_number, keys, values)`` tuples (or None)."""
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        """Yield one ``(row_number, keys, values)`` tuple per record.

        Row numbers start at 1. A root element without child elements
        yields no rows.
        """
        # Imported lazily so the XML dependencies are only needed when an
        # XML source is actually parsed.
        from xml.etree.ElementTree import parse
        from xmljson import parker

        parsed = parker.data(parse(self.__chars).getroot())
        # A root with no child elements is converted to a scalar (or None)
        # rather than a mapping, so there is nothing to iterate.
        if not isinstance(parsed, dict):
            return
        elements = list(parsed.values())
        if not elements:
            return
        elements = elements[0]
        # Parker only produces a list when a tag repeats; a single record
        # arrives as a bare mapping and must be wrapped so that the loop
        # iterates over records rather than over that record's keys.
        if isinstance(elements, dict):
            elements = [elements]
        for row_number, row in enumerate(elements, start=1):
            # keys()/values() stay aligned and, unlike zip(*items), also
            # work for a record with no fields.
            yield (row_number, list(row.keys()), list(row.values()))


class load(DataStreamProcessor):

def __init__(self, load_source, name=None, resources=None, validate=False, strip=True, **options):
Expand Down Expand Up @@ -50,6 +107,8 @@ def process_datapackage(self, dp: Package):
descriptor['format'] = self.options.get('format')
if 'encoding' in self.options:
descriptor['encoding'] = self.options['encoding']
if descriptor['format'] == 'xml' or self.load_source.endswith('.xml'):
self.options.setdefault('custom_parsers', {})['xml'] = XMLParser
self.options.setdefault('ignore_blank_headers', True)
self.options.setdefault('headers', 1)
self.res = Resource(descriptor,
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def read(*paths):
'inquirer',
'tabulate',
'tableschema-sql',
'xmljson',
]
SPEEDUP_REQUIRES = [
'plyvel',
Expand Down
13 changes: 12 additions & 1 deletion tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,15 @@ def func(row):
{'a': 3, 'b': 4},
{'a': 5, 'b': 6}
]
assert len(dp.resources[0].schema.fields) == 2
assert len(dp.resources[0].schema.fields) == 2

def test_load_xml():
    """Load data/sample.xml and compare the first resource's rows."""
    from dataflows import Flow, load

    # One dict per <book> element; publication years are expected as ints.
    expected_books = [
        {'publication-year': 1954, 'title': 'The Fellowship of the Ring'},
        {'publication-year': 1954, 'title': 'The Two Towers'},
        {'publication-year': 1955, 'title': 'The Return of the King'},
    ]

    xml_flow = Flow(load('data/sample.xml'))
    resources, _package, _stats = xml_flow.results()

    first_resource_rows = resources[0]
    assert first_resource_rows == expected_books

0 comments on commit 7e13fda

Please sign in to comment.