Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Allow removing the XHTML namespace or all namespaces

  • Loading branch information...
commit a8ba163087d601570a19297b5e95c2b9356a88dc 1 parent de4d555
@gawel authored
Showing with 68 additions and 6 deletions.
  1. +41 −6 pyquery/pyquery.py
  2. +27 −0 pyquery/test.py
View
47 pyquery/pyquery.py
@@ -221,12 +221,10 @@ def __init__(self, *args, **kwargs):
# select nodes
if elements and selector is not no_default:
xpath = self._css_to_xpath(selector)
- results = [tag.xpath(xpath, namespaces=namespaces) \
- for tag in elements]
- # Flatten the results
- elements = []
- for r in results:
- elements.extend(r)
+ results = []
+ for tag in elements:
+ results.extend(tag.xpath(xpath, namespaces=namespaces))
+ elements = results
list.__init__(self, elements)
@@ -276,6 +274,43 @@ def items(self, selector=None):
for elem in elems:
yield self.__class__(elem)
+ def xhtml_to_html(self):
+ """Remove xhtml namespace:
+
+ >>> doc = PyQuery(
+ ... '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
+ >>> doc
+ [<{http://www.w3.org/1999/xhtml}html>]
+ >>> doc.xhtml_to_html()
+ [<html>]
+ """
+ try:
+ root = self[0].getroottree()
+ except IndexError:
+ pass
+ else:
+ lxml.html.xhtml_to_html(root)
+ return self
+
+ def remove_namespaces(self):
+ """Remove all namespaces:
+
+ >>> doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
+ >>> doc
+ [<{http://example.com/foo}foo>]
+ >>> doc.remove_namespaces()
+ [<foo>]
+ """
+ try:
+ root = self[0].getroottree()
+ except IndexError:
+ pass
+ else:
+ for el in root.iter('{*}*'):
+ if el.tag.startswith('{'):
+ el.tag = el.tag.split('}', 1)[1]
+ return self
+
def __str__(self):
"""xml representation of current nodes::
View
27 pyquery/test.py
@@ -502,6 +502,13 @@ class TestXMLNamespace(unittest.TestCase):
<idiot>123</idiot>
</foo>'''
+ xhtml = '''
+ <html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+ <div>What</div>
+ </body>
+ </html>'''
+
def test_selector(self):
expected = 'What'
d = pq(b(self.xml), parser='xml')
@@ -522,6 +529,26 @@ def test_selector_html(self):
val = d.text()
self.assertEqual(repr(val), repr(expected))
+ def test_xhtml_namespace(self):
+ expected = 'What'
+ d = pq(b(self.xhtml), parser='xml')
+ d.xhtml_to_html()
+ val = d('div').text()
+ self.assertEqual(repr(val), repr(expected))
+
+ def test_xhtml_namespace_html_parser(self):
+ expected = 'What'
+ d = pq(self.xhtml, parser='html')
+ d.xhtml_to_html()
+ val = d('div').text()
+ self.assertEqual(repr(val), repr(expected))
+
+ def test_remove_namespaces(self):
+ expected = 'What'
+ d = pq(b(self.xml), parser='xml').remove_namespaces()
+ val = d('blah').text()
+ self.assertEqual(repr(val), repr(expected))
+
class TestWebScrapping(unittest.TestCase):
Please sign in to comment.
Something went wrong with that request. Please try again.