Permalink
Browse files

Adding support for XML namespaces.

  • Loading branch information...
1 parent 95d471e commit 2efd7e3c796f7692a6f4bff4693ae3a17095d62a @martinblech martinblech committed Aug 25, 2013
Showing with 141 additions and 8 deletions.
  1. +46 −1 README.md
  2. +1 −1 tests/test_dicttoxml.py
  3. +65 −0 tests/test_xmltodict.py
  4. +29 −6 xmltodict.py
View
@@ -27,7 +27,50 @@ u'complex'
u'element as well'
```
-It's very fast ([Expat](http://docs.python.org/library/pyexpat.html)-based) and has a streaming mode with a small memory footprint, suitable for big XML dumps like [Discogs](http://discogs.com/data/) or [Wikipedia](http://dumps.wikimedia.org/):
+## Namespace support
+
+By default, `xmltodict` does no XML namespace processing (it just treats namespace declarations as regular node attributes), but passing `process_namespaces=True` will make it expand namespaces for you:
+
+```python
+>>> xml = """
+... <root xmlns="http://defaultns.com/"
+... xmlns:a="http://a.com/"
+... xmlns:b="http://b.com/">
+... <x>1</x>
+... <a:y>2</a:y>
+... <b:z>3</b:z>
+... </root>
+... """
+>>> xmltodict.parse(xml, process_namespaces=True) == {
+... 'http://defaultns.com/:root': {
+... 'http://defaultns.com/:x': '1',
+... 'http://a.com/:y': '2',
+... 'http://b.com/:z': '3',
+... }
+... }
+True
+```
+
+It also lets you collapse certain namespaces to shorthand prefixes, or skip them altogether:
+
+```python
+>>> namespaces = {
+... 'http://defaultns.com/': None, # skip this namespace
+... 'http://a.com/': 'ns_a', # collapse "http://a.com/" -> "ns_a"
+... }
+>>> xmltodict.parse(xml, process_namespaces=True, namespaces=namespaces) == {
+... 'root': {
+... 'x': '1',
+... 'ns_a:y': '2',
+... 'http://b.com/:z': '3',
+... },
+... }
+True
+```
+
+## Streaming mode
+
+`xmltodict` is very fast ([Expat](http://docs.python.org/library/pyexpat.html)-based) and has a streaming mode with a small memory footprint, suitable for big XML dumps like [Discogs](http://discogs.com/data/) or [Wikipedia](http://dumps.wikimedia.org/):
```python
>>> def handle_artist(_, artist):
@@ -77,6 +120,8 @@ $ cat enwiki.dicts.gz | gunzip | script2.py
...
```
+## Roundtripping
+
You can also convert in the other direction, using the `unparse()` function:
```python
View
@@ -79,7 +79,7 @@ def p(key, value):
self.assertEqual(_strip(unparse(obj, preprocessor=p)),
'<a><c>2</c></a>')
- if 'OrderedDict' in dir(collections):
+ if hasattr(collections, 'OrderedDict'):
def test_attr_order_roundtrip(self):
xml = '<root a="1" b="2" c="3"></root>'
self.assertEqual(xml, _strip(unparse(parse(xml))))
View
@@ -154,3 +154,68 @@ def test_encoded_string(self):
xml = '<a>%s</a>' % value
self.assertEqual(parse(xml),
parse(xml.encode('utf-8')))
+
+ def test_namespace_support(self):
+ xml = """
+ <root xmlns="http://defaultns.com/"
+ xmlns:a="http://a.com/"
+ xmlns:b="http://b.com/">
+ <x>1</x>
+ <a:y>2</a:y>
+ <b:z>3</b:z>
+ </root>
+ """
+ d = {
+ 'http://defaultns.com/:root': {
+ 'http://defaultns.com/:x': '1',
+ 'http://a.com/:y': '2',
+ 'http://b.com/:z': '3',
+ }
+ }
+ self.assertEqual(parse(xml, process_namespaces=True), d)
+
+ def test_namespace_collapse(self):
+ xml = """
+ <root xmlns="http://defaultns.com/"
+ xmlns:a="http://a.com/"
+ xmlns:b="http://b.com/">
+ <x>1</x>
+ <a:y>2</a:y>
+ <b:z>3</b:z>
+ </root>
+ """
+ namespaces = {
+ 'http://defaultns.com/': None,
+ 'http://a.com/': 'ns_a',
+ }
+ d = {
+ 'root': {
+ 'x': '1',
+ 'ns_a:y': '2',
+ 'http://b.com/:z': '3',
+ },
+ }
+ self.assertEqual(
+ parse(xml, process_namespaces=True, namespaces=namespaces), d)
+
+ def test_namespace_ignore(self):
+ xml = """
+ <root xmlns="http://defaultns.com/"
+ xmlns:a="http://a.com/"
+ xmlns:b="http://b.com/">
+ <x>1</x>
+ <a:y>2</a:y>
+ <b:z>3</b:z>
+ </root>
+ """
+ d = {
+ 'root': {
+ '@xmlns': 'http://defaultns.com/',
+ '@xmlns:a': 'http://a.com/',
+ '@xmlns:b': 'http://b.com/',
+ 'x': '1',
+ 'a:y': '2',
+ 'b:z': '3',
+ },
+ }
+ self.assertEqual(parse(xml), d)
View
@@ -45,7 +45,9 @@ def __init__(self,
cdata_separator='',
postprocessor=None,
dict_constructor=OrderedDict,
- strip_whitespace=True):
+ strip_whitespace=True,
+ namespace_separator=':',
+ namespaces=None):
self.path = []
self.stack = []
self.data = None
@@ -60,8 +62,24 @@ def __init__(self,
self.postprocessor = postprocessor
self.dict_constructor = dict_constructor
self.strip_whitespace = strip_whitespace
+ self.namespace_separator = namespace_separator
+ self.namespaces = namespaces
+
+ def _build_name(self, full_name):
+ if not self.namespaces:
+ return full_name
+ i = full_name.rfind(self.namespace_separator)
+ if i == -1:
+ return full_name
+ namespace, name = full_name[:i], full_name[i+1:]
+ short_namespace = self.namespaces.get(namespace, namespace)
+ if not short_namespace:
+ return name
+ else:
+ return self.namespace_separator.join((short_namespace, name))
- def startElement(self, name, attrs):
+ def startElement(self, full_name, attrs):
+ name = self._build_name(full_name)
attrs = self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
self.path.append((name, attrs or None))
if len(self.path) > self.item_depth:
@@ -75,7 +93,8 @@ def startElement(self, name, attrs):
self.item = attrs or None
self.data = None
- def endElement(self, name):
+ def endElement(self, full_name):
+ name = self._build_name(full_name)
if len(self.path) == self.item_depth:
item = self.item
if item is None:
@@ -124,7 +143,8 @@ def push_data(self, item, key, data):
item[key] = data
return item
-def parse(xml_input, encoding='utf-8', expat=expat, *args, **kwargs):
+def parse(xml_input, encoding='utf-8', expat=expat, process_namespaces=False,
+ namespace_separator=':', **kwargs):
"""Parse the given XML input and convert it into a dictionary.
`xml_input` can either be a `string` or a file-like object.
@@ -191,8 +211,11 @@ def parse(xml_input, encoding='utf-8', expat=expat, *args, **kwargs):
OrderedDict([(u'a', u'hello')])
"""
- handler = _DictSAXHandler(*args, **kwargs)
- parser = expat.ParserCreate()
+ handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs)
+ parser = expat.ParserCreate(
+ encoding,
+ namespace_separator if process_namespaces else None
+ )
parser.ordered_attributes = True
parser.StartElementHandler = handler.startElement
parser.EndElementHandler = handler.endElement

0 comments on commit 2efd7e3

Please sign in to comment.