In [1]:
from lxml import etree

In [2]:
# Sample document for the parsing and XPath cells that follow: a <data> root
# holding three <country> elements, each with rank/year/gdppc children and
# one or more empty <neighbor> elements that carry only attributes.
xmlstring = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
"""

### Parsing 

In [29]:
# fromstring() parses XML held in a Python string and hands back the root
# element directly (its tag is 'data' for this document).
# To read from a file instead: etree.parse('country_data.xml')
root = etree.fromstring(xmlstring)

In [30]:
# Every element exposes its tag name and a mapping of its attributes
# (empty here because <data> has none).
root.tag, root.attrib

('data', {})

In [31]:
# Iterating over an element yields its direct children one by one.
for country in root:
    print(country.tag, country.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [32]:
# Children are nested, and a specific child can be reached by position.
# Indices start at 0, so root[1] is the second <country>.
root[1].attrib

{'name': 'Singapore'}

In [33]:
# Indexing can be chained: root[1][0] is the first child of the second country.
# .text gives the element's text content as a string.
root[1][0].tag, root[1][0].text

('rank', '4')

In [34]:
# find() returns only the first element with tag 'country'
# (a single element, not a list)
root.find('country')

<Element country at 0x1071aba40>

In [35]:
# findall() returns a list with all elements tagged 'country'
root.findall('country')

[<Element country at 0x1071aba40>,
 <Element country at 0x1071b0d80>,
 <Element country at 0x1071b3dc0>]

Exercise: Get a list of the attributes for all elements with tag 'country'.

# Xpath examples

There are two useful functions, `findall()` and `xpath()`, depending on how you parse the XML.

In [36]:
# Parse xmlstring again so the XPath examples start from a freshly built root.
root = etree.fromstring(xmlstring)

In [37]:
# "." selects the current (context) node, so calling this on the root
# returns a one-item list containing the root element itself.
root.findall(".")

[<Element data at 0x107152640>]

In [38]:
# The same "." path evaluated through xpath() selects the same single element.
root.xpath(".")

[<Element data at 0x107152640>]

Both are very useful, and xpath is the more powerful of the two.

In [40]:
# "./country/neighbor": starting at the current node, step into each
# <country> child and then into each of its <neighbor> children,
# i.e. the neighbor grandchildren of root that sit under a country.
root.xpath("./country/neighbor")

[<Element neighbor at 0x1070db880>,
 <Element neighbor at 0x10709db40>,
 <Element neighbor at 0x1071dbc00>,
 <Element neighbor at 0x1071da7c0>,
 <Element neighbor at 0x1071dbf80>]

In [41]:
# Show the attribute mapping of every neighbor found under a country.
for neighbor in root.xpath("./country/neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [42]:
# "*" is a wildcard that matches an element of any name. Every child of
# root in this document is a <country>, so the result is the same as with
# the explicit "./country/neighbor" path.
for neighbor in root.xpath("./*/neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [43]:
# A leading "//" searches the whole document, so <neighbor> elements are
# matched wherever they appear, at any depth.
for neighbor in root.xpath("//neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [44]:
# ".//year" finds <year> elements at any depth below the current node.
root.xpath(".//year")

[<Element year at 0x1071bbe00>,
 <Element year at 0x1071276c0>,
 <Element year at 0x1071d8d00>]

In [45]:
# Appending /text() returns the text content as strings instead of elements.
root.xpath(".//year/text()")

['2008', '2011', '2011']

In [46]:
# "@name" selects an attribute; the result is the list of attribute values.
root.xpath("//neighbor/@name")

['Austria', 'Switzerland', 'Malaysia', 'Costa Rica', 'Colombia']

### Difference between "/" and "//"

In [47]:
# Variant of the countries document with one <neighbor> placed directly
# under <data>, to show which paths reach it and which do not.
# NOTE: this rebinds xmlstring, replacing the three-country document.
xmlstring = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <neighbor name="Colombia" direction="E"/>
</data>
"""

In [48]:
# Parse the variant document; root now refers to the new tree.
root = etree.fromstring(xmlstring)

In [49]:
# The path requires a <country> parent, so the <neighbor> that sits
# directly under <data> is not selected.
for neighbor in root.xpath("./country/neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}


In [50]:
# "./*/neighbor" looks exactly two levels below root (any child, then
# neighbor), so the <neighbor> one level down is skipped here as well.
for neighbor in root.xpath("./*/neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}


In [51]:
# "//" ignores depth, so this also picks up the <neighbor> that is a
# direct child of <data>.
for neighbor in root.xpath("//neighbor"):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


## filtering 

In [52]:
# Movie catalogue for the filtering examples: each <movie> has a lang
# attribute and title/director/genre/year children.
# NOTE: this rebinds xmlstring again.
xmlstring = """<?xml version="1.0"?>
<moviestore>
    <movie lang="en">
        <title>Inception</title>
        <director>Christopher Nolan</director>
        <genre>Science Fiction</genre>
        <year>2010</year>
    </movie>
    <movie lang="fr">
        <title>Le Fabuleux Destin d'Amélie Poulain</title>
        <director>Jean-Pierre Jeunet</director>
        <genre>Romantic Comedy</genre>
        <year>2001</year>
    </movie>
    <movie lang="en">
        <title>The Dark Knight</title>
        <director>Christopher Nolan</director>
        <genre>Action</genre>
        <year>2008</year>
    </movie>
</moviestore>"""

In [53]:
# Parse the movie catalogue; root is now the <moviestore> element.
root = etree.fromstring(xmlstring)

In [54]:
# A predicate in [...] filters the step it is attached to: keep only the
# <movie> elements that have a <director> child with text "Christopher Nolan".
root.xpath('./movie[director="Christopher Nolan"]')

[<Element movie at 0x10718be00>, <Element movie at 0x1071dbc40>]

In [55]:
# The path can continue after the predicate: from the matching movies,
# step into <title> and return its text.
root.xpath('./movie[director="Christopher Nolan"]/title/text()')

['Inception', 'The Dark Knight']

## filtering attributes with @

In [56]:
# Inside a predicate, @lang tests an attribute of the element being
# filtered: this keeps the movies whose lang attribute is "en" (English).
root.xpath('./movie[@lang="en"]')

[<Element movie at 0x10718be00>, <Element movie at 0x1071dbc40>]

In [33]:
# Exercise: print the titles of the movies in English

## match a tag anywhere

In [57]:
# Two differently named parents (<node1>, <node2>) that each contain a
# <nodename> child.
# NOTE: this rebinds xmlstring again.
xmlstring = """<?xml version="1.0"?>
<root>
  <node1>
    <nodename>Matched Value 1</nodename>
  </node1>
  <node2>
    <nodename>Matched Value 2</nodename>
  </node2>
</root>"""

In [58]:
# Parse the two-node document.
root = etree.fromstring(xmlstring)

In [59]:
# "//nodename" matches both <nodename> elements even though their parents
# have different tag names.
root.xpath('//nodename')

[<Element nodename at 0x1071a21c0>, <Element nodename at 0x1071a3d40>]

### More complex example

In [60]:
# Bookstore document: the lang attribute lives on <title>, not on <book>,
# so filtering books by language has to look at a child's attribute.
# NOTE: this rebinds xmlstring again.
xmlstring = """<?xml version="1.0"?>
<bookstore>
    <book>
        <title lang="en">Introduction to XPath</title>
        <author>John Doe</author>
        <price>29.99</price>
    </book>
    <book>
        <title lang="fr">XPath et XML</title>
        <author>Marie Dupont</author>
        <price>34.95</price>
    </book>
    <book>
        <title lang="en">XML Programming</title>
        <author>David Smith</author>
        <price>19.99</price>
    </book>
</bookstore>"""

In [61]:
# Parse the bookstore document
root = etree.fromstring(xmlstring)

# Authors of books written in English: the predicate title/@lang='en'
# tests an attribute of the book's <title> child, and the path then
# continues from the matching <book> to its <author> text.
root.xpath("./book[title/@lang='en']/author/text()")

['John Doe', 'David Smith']

### Filtering based on numerical values

In [6]:
# Product catalogue for the numeric comparisons: each <product> has
# name, price and rating children.
# NOTE: this rebinds xmlstring again.
xmlstring="""
<products>
    <product>
        <name>Phone</name>
        <price>500</price>
        <rating>4.5</rating>
    </product>
    <product>
        <name>Laptop</name>
        <price>1200</price>
        <rating>4.7</rating>
    </product>
    <product>
        <name>Headphones</name>
        <price>150</price>
        <rating>4.2</rating>
    </product>
    <product>
        <name>Tablet</name>
        <price>300</price>
        <rating>4.0</rating>
    </product>
</products>
"""

In [7]:
# Parse the product catalogue; root is now the <products> element.
root = etree.fromstring(xmlstring)

In [8]:
# Comparing a child element with a number uses the numeric value of its
# text: products priced above 300 (Phone and Laptop in this data).
root.xpath("//product[price > 300]")

[<Element product at 0x107fc3900>, <Element product at 0x11d9c0400>]

In [9]:
# Conditions can be combined with "and": products priced from 200 to 600
# inclusive (Phone and Tablet in this data).
root.xpath("//product[price >= 200 and price <= 600]")


[<Element product at 0x107fc3900>, <Element product at 0x11d9cf740>]