In [2]:
from lxml import etree

In [3]:
root = etree.Element('root')

In [4]:
root.tag

'root'

In [5]:
root.append(etree.Element('child1'))

In [6]:
child2 = etree.SubElement(root, 'child2')
child3 = etree.SubElement(root, 'child3')

In [7]:
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'


In [8]:
# an Element behaves like a list of its subelements

In [9]:
child = root[0]

In [10]:
child.tag

'child1'

In [11]:
len(root)

3

In [12]:
root.index(root[1])

1

In [13]:
for child in root:
    print(child.tag)

child1
child2
child3


In [14]:
root.insert(0, element=etree.Element('child0'))

In [15]:
root[:1]

[<Element child0 at 0x112e3cd08>]

In [16]:
root[-1:]

[<Element child3 at 0x112e3c308>]

In [17]:
len(etree.Element('root'))

0

In [18]:
etree.iselement(root)

True

In [19]:
len(root)

4

In [20]:
etree.tostring(root)

b'<root><child0/><child1/><child2/><child3/></root>'

In [21]:
# tounicode() is deprecated in lxml; tostring() with encoding='unicode'
# returns the same str serialisation.
etree.tostring(root, encoding='unicode')

'<root><child0/><child1/><child2/><child3/></root>'

In [22]:
for child in root:
    print(child.tag)

child0
child1
child2
child3


In [23]:
root[0] = root[-1]  # this moves the element in lxml.etree!

In [24]:
for child in root:
    print(child.tag)

child3
child1
child2


In [25]:
# in the original ElementTree, a single Element object can sit in any
# number of places in any number of trees, which allows for the same copy
# operation as with lists. The obvious drawback is that modifications to
# such an Element will apply to all places where it appears in a tree,
# which may or may not be intended

# the upside of this difference is that an Element in lxml.etree always has
# exactly one parent.

In [26]:
root is root[0].getparent()

True

In [27]:
from copy import deepcopy

In [28]:
element = etree.Element('neu')

In [29]:
element.append(deepcopy(root[1]))

In [30]:
element[0].tag

'child1'

In [31]:
print([x.tag for x in root])

['child3', 'child1', 'child2']


In [32]:
root[0] is root[1].getprevious()  # lxml.etree only!

True

In [33]:
root[1] is root[0].getnext()  # lxml.etree only!

True

In [34]:
# Elements carry their attributes as a dict

root = etree.Element('root', interesting='totally')

In [35]:
etree.tostring(root)

b'<root interesting="totally"/>'

In [36]:
root.get('interesting')

'totally'

In [37]:
print(root.get('h'))

None


In [38]:
root.set('hello', 'huhu')

In [39]:
root.get('hello')

'huhu'

In [40]:
etree.tostring(root)

b'<root interesting="totally" hello="huhu"/>'

In [41]:
root.keys()

['interesting', 'hello']

In [42]:
# Walk the attributes in key order and show every value twice:
# first as its repr (quoted), then as plain text.
for k, v in sorted(root.items()):
    print('{} = {!r}'.format(k, v))
    print('{} = {}'.format(k, v))

hello = 'huhu'
hello = huhu
interesting = 'totally'
interesting = totally


In [43]:
# any changes to the Element are reflected in attrib and vice versa

root.attrib

{'interesting': 'totally', 'hello': 'huhu'}

In [44]:
root.attrib['hello'] = 'guten tag'

In [45]:
etree.tostring(root)

b'<root interesting="totally" hello="guten tag"/>'

In [46]:
root.get('hello')

'guten tag'

In [47]:
root = etree.Element('root')

In [48]:
root.text = 'TEXT'

In [49]:
etree.tostring(root)

b'<root>TEXT</root>'

In [50]:
root.text

'TEXT'

In [51]:
html = etree.Element('html')

In [52]:
body = etree.SubElement(html, 'body')

In [53]:
body.text = 'TEXT'

In [54]:
etree.tostring(html)

b'<html><body>TEXT</body></html>'

In [55]:
br = etree.SubElement(body, 'br')

In [56]:
etree.tostring(html)

b'<html><body>TEXT<br/></body></html>'

In [57]:
br.tail = 'TAIL'  # tail!

In [58]:
etree.tostring(html)

b'<html><body>TEXT<br/>TAIL</body></html>'

In [59]:
etree.tostring(br)

b'<br/>TAIL'

In [60]:
etree.tostring(br, with_tail=False)  # lxml.etree only!

b'<br/>'

In [61]:
etree.tostring(html, method='text')

b'TEXTTAIL'

In [62]:
print(html.xpath('string()'))  # lxml.etree only!

TEXTTAIL


In [63]:
print(html.xpath('//text()'))  # lxml.etree only!

['TEXT', 'TAIL']


In [64]:
build_text_list = etree.XPath('//text()')  # lxml.etree only!

In [65]:
build_text_list(html)

['TEXT', 'TAIL']

In [66]:
texts = build_text_list(html)

In [67]:
texts[0]

'TEXT'

In [68]:
texts[0].getparent().tag

'body'

In [69]:
texts[0].is_text

True

In [70]:
texts[1].is_text

False

In [71]:
texts[1].is_tail

True

In [72]:
# While this works for the results of the text() function, lxml will **not**
# tell you the origin of a string value that was constructed by the XPath
# functions string() or concat()

In [73]:
stringify = etree.XPath('string()')
print(stringify(html))
print(stringify(html).getparent())

TEXTTAIL
None


In [74]:
# Tree iteration

root = etree.Element("root")
etree.SubElement(root, "child").text = "Child 1"
etree.SubElement(root, "child").text = "Child 2"
etree.SubElement(root, "another").text = "Child 3"

In [75]:
# tounicode() is deprecated in lxml; tostring() with encoding='unicode'
# yields the same pretty-printed str.
print(etree.tostring(root, encoding='unicode', pretty_print=True))

<root>
  <child>Child 1</child>
  <child>Child 2</child>
  <another>Child 3</another>
</root>



In [76]:
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'


In [77]:
for element in root.iter():
    print('%s - %s' % (element.tag, element.text))

root - None
child - Child 1
child - Child 2
another - Child 3


In [78]:
for element in root.iter('another', 'child'):
    print('%s - %s' % (element.tag, element.text))

child - Child 1
child - Child 2
another - Child 3


In [79]:
root.append(etree.Entity("#234"))
root.append(etree.Comment("some comment"))

In [80]:
# tounicode() is deprecated in lxml; tostring() with encoding='unicode'
# yields the same str serialisation.
print(etree.tostring(root, encoding='unicode', pretty_print=True))

<root><child>Child 1</child><child>Child 2</child><another>Child 3</another>&#234;<!--some comment--></root>



In [81]:
# Regular elements have a string tag. Anything else (here: the entity and
# the comment appended above) is reported as SPECIAL together with its tag.
for ele in root.iter():
    if not isinstance(ele.tag, str):
        print("SPECIAL: {} ({})- {}".format(ele, ele.tag, ele.text))
        continue
    print('{} - {}'.format(ele.tag, ele.text))

root - None
child - Child 1
child - Child 2
another - Child 3
SPECIAL: &#234; (<cyfunction Entity at 0x112def8e8>)- &#234;
SPECIAL: <!--some comment--> (<cyfunction Comment at 0x112def778>)- some comment


In [82]:
for ele in root.iter(tag=etree.Element):
    print('%s - %s' % (ele.tag, ele.text))

root - None
child - Child 1
child - Child 2
another - Child 3


In [83]:
for ele in root.iter(tag=etree.Entity):
    print(ele.tag, ele.text)

<cyfunction Entity at 0x112def8e8> &#234;


In [84]:
# passing the wildcard `*` will also yield all Element nodes (and only Elements)

for ele in root.iter('*'):
    print(ele.tag, ele.text)
    print(list(ele))

root None
[<Element child at 0x110a5db88>, <Element child at 0x110a64a08>, <Element another at 0x112e3c888>, &#234;, <!--some comment-->]
child Child 1
[]
child Child 2
[]
another Child 3
[]


In [85]:
root = etree.XML('<root><a><b/></a></root>')

In [86]:
etree.tostring(root)

b'<root><a><b/></a></root>'

In [87]:
print(etree.tostring(root, xml_declaration=True))

b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"


In [88]:
root = etree.XML('<html><head/><body><p>Hello<br/>World</p></body></html>')

In [89]:
etree.tostring(root)  # default to xml

b'<html><head/><body><p>Hello<br/>World</p></body></html>'

In [90]:
etree.tostring(root, method='html')

b'<html><head></head><body><p>Hello<br>World</p></body></html>'

In [91]:
print(etree.tostring(root, method='html', pretty_print=True).decode('ascii'))

<html>
<head></head>
<body><p>Hello<br>World</p></body>
</html>



In [92]:
etree.tostring(root, method='text')

b'HelloWorld'

In [93]:
# As for XML serialisation, the default encoding for plain text
# serialisation is ASCII

In [100]:
etree.tostring(root, method='text', encoding='utf8')

b'HelloWorld'

In [97]:
etree.tostring(root, method='text', encoding='unicode')

'HelloWorld'

In [None]:
# an ElementTree is mainly a document wrapper around a tree
# with a root node

In [101]:
# Parsing from strings and files

some_xml_data = "<root>data</root>"
root = etree.fromstring(some_xml_data)

In [102]:
root.tag

'root'

In [103]:
etree.tostring(root)

b'<root>data</root>'

In [104]:
root = etree.XML('<root>data</root>')

In [113]:
root = etree.HTML('<p>data</p>')
type(root)

# type is Element!

lxml.etree._Element

In [106]:
etree.tostring(root)

b'<html><body><p>data</p></body></html>'

In [107]:
# The parse() function is used to parse from files and file-like objects.

In [108]:
from io import BytesIO

In [109]:
some_file_or_file_like_obj = BytesIO(b"<root>data</root>")

In [110]:
tree = etree.parse(some_file_or_file_like_obj)

In [111]:
etree.tostring(tree)

b'<root>data</root>'

In [112]:
# Type is ElementTree!
type(tree)

# The reasoning behind this difference is that parse() returns
# a complete document from a file, while the string parsing
# functions are commonly used to parse XML fragments.

lxml.etree._ElementTree

In [114]:
# Parser objects

parser = etree.XMLParser(remove_blank_text=True)  # lxml.etree only

In [115]:
root = etree.XML("<root>  <a/>   <b>  </b>     </root>", parser)

In [116]:
etree.tostring(root)

b'<root><a/><b>  </b></root>'

In [117]:
# Incremental parsing

class DataSource(object):
    """Minimal file-like object that hands out an XML document in chunks.

    It exposes only ``read()``, which is what the ``etree.parse()`` call
    following this class relies on to pull the document piece by piece.
    """

    # Default chunks. Kept as a class attribute for backward compatibility,
    # but never mutated: every instance works on its own copy.
    data = [b"<roo", b"t><", b"a/", b"><", b"/root>"]

    def __init__(self):
        # Copy per instance. Popping from the shared class-level list would
        # drain it for good, leaving every later DataSource() empty.
        self.data = list(type(self).data)

    def read(self, request_size):
        """Return the next pending chunk, or ``b''`` once all are consumed.

        ``request_size`` is accepted for file-like compatibility but is
        ignored; each call returns exactly one stored chunk.
        """
        try:
            return self.data.pop(0)
        except IndexError:
            return b''

tree = etree.parse(DataSource())

In [118]:
etree.tostring(tree)

b'<root><a/></root>'

In [119]:
parser = etree.XMLParser()
parser.feed("<roo")
parser.feed("t><")
parser.feed("a/")
parser.feed("><")
parser.feed("/root>")

root = parser.close()

In [120]:
etree.tostring(root)

b'<root><a/></root>'

In [121]:
# after close(), the same parser can be reused

parser.feed('<root/>')
root = parser.close()
etree.tostring(root)

b'<root/>'

In [122]:
# Event-driven parsing

In [126]:
some_file_like = BytesIO(b"<root><a>data</a></root>")
for event, ele in etree.iterparse(some_file_like):
    print('%s, %4s, %s' % (event, ele.tag, ele.text))

end,    a, data
end, root, None


In [127]:
# By default, iterparse() only generates events when it is done
# parsing an element, but you can control this through the 
# events keyword argument

In [128]:
some_file_like = BytesIO(b"<root><a>data</a></root>")
for event, ele in etree.iterparse(some_file_like, events=('start', 'end')):
    print('%s, %4s, %s' % (event, ele.tag, ele.text))

start, root, None
start,    a, data
end,    a, data
end, root, None


In [130]:
# use .clear() to save memory
some_file_like = BytesIO(
    b"<root><a><b>data</b></a><a><b/></a></root>")
for event, ele in etree.iterparse(some_file_like):
    if ele.tag == 'b':
        print(ele.text)
    elif ele.tag == 'a':
        print('* clearning up the subtree')
        ele.clear()

data
* clearning up the subtree
None
* clearning up the subtree


In [131]:
# http://effbot.org/zone/element.htm

In [133]:
def iterparent(tree):
    """Yield a ``(parent, child)`` tuple for every parent/child link in ``tree``.

    ``tree`` is anything offering ``iter()`` over its elements (an Element
    or an ElementTree). Each visited element is paired in turn with each
    of its direct children; elements without children produce no tuples.
    """
    # iter() replaces getiterator(), which is deprecated in lxml and was
    # removed from xml.etree.ElementTree in Python 3.9.
    for parent in tree.iter():
        for child in parent:
            yield parent, child

for parent, child in iterparent(tree):
    pass
    # ... work on parent/child tuple

In [137]:
root = etree.XML('<root><a><b/></a></root>')
for p, c in iterparent(root):
    print(p, c)

<Element root at 0x112fdb048> <Element a at 0x112fdbb08>
<Element a at 0x112fdbb08> <Element b at 0x112fdbf88>


In [138]:
etree.dump(root)

<root>
  <a>
    <b/>
  </a>
</root>


In [None]:
# searching for subelements

# find(pattern)
# findtext(pattern)
# findall(pattern)

# getiterator => in depth-first order