In [1]:
import requests
from lxml import html
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# sns.set() re-applies its own default context ("notebook"), so a separate
# set_context("talk") issued before it is silently undone. Passing both
# options in one call keeps the "talk" scaling together with the "white" style.
sns.set(context="talk", style="white")

# Practice with Beautiful Soup and XPath

We will use the simple plant catalog to show how to retrieve information from XML and XHTML documents.



## Tasks

We will carry out four tasks:

1. Retrieve the common names of all plants 
2. Retrieve the plants that grow in zone 4
3. Retrieve the common names of plants that grow in zone 4
4. Retrieve the prices of plants whose prices are listed in USD

# Beautiful Soup Examples

In [2]:
# Parse the catalog with the lxml XML parser.
# The context manager closes the file handle once parsing is done, and
# binary mode hands raw bytes to the parser so the document's encoding is
# detected from its content instead of the platform's default text encoding.
with open("plant_catalog.xml", "rb") as catalog_file:
    soup = BeautifulSoup(catalog_file, "lxml-xml")

In [3]:
# Show the parsed document; a bare last expression is rendered as the cell output.
soup

<?xml version="1.0" encoding="utf-8"?>
<catalog>
<plant>
<common>Bloodroot</common>
<botanical>Sanguinaria canadensis</botanical>
<zone>4</zone>
<light>Mostly Shady</light>
<price currency="USD">$2.44</price>
<availability>031599</availability>
</plant>
<plant>
<common>Columbine</common>
<botanical>Aquilegia canadensis</botanical>
<zone>3</zone>
<light>Mostly Shady</light>
<price currency="USD">$9.37</price>
<availability>030699</availability>
</plant>
<plant>
<common>Marsh Marigold</common>
<botanical>Caltha palustris</botanical>
<zone>4</zone>
<light>Mostly Sunny</light>
<price currency="CAD">$6.81</price>
<availability>051799</availability>
</plant>
</catalog>

In [4]:
# prettify() returns the document as an indented string; print() is used so
# the line breaks are rendered rather than shown as escape sequences.
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<catalog>
 <plant>
  <common>
   Bloodroot
  </common>
  <botanical>
   Sanguinaria canadensis
  </botanical>
  <zone>
   4
  </zone>
  <light>
   Mostly Shady
  </light>
  <price currency="USD">
   $2.44
  </price>
  <availability>
   031599
  </availability>
 </plant>
 <plant>
  <common>
   Columbine
  </common>
  <botanical>
   Aquilegia canadensis
  </botanical>
  <zone>
   3
  </zone>
  <light>
   Mostly Shady
  </light>
  <price currency="USD">
   $9.37
  </price>
  <availability>
   030699
  </availability>
 </plant>
 <plant>
  <common>
   Marsh Marigold
  </common>
  <botanical>
   Caltha palustris
  </botanical>
  <zone>
   4
  </zone>
  <light>
   Mostly Sunny
  </light>
  <price currency="CAD">
   $6.81
  </price>
  <availability>
   051799
  </availability>
 </plant>
</catalog>


### 1. Retrieve the common names of all plants 

In [5]:
# Navigate to the first <catalog> node and, inside it, the first <plant> node.
# Attribute-style access (soup.catalog.plant) is shorthand for these find() calls.
soup.find('catalog').find('plant')

<plant>
<common>Bloodroot</common>
<botanical>Sanguinaria canadensis</botanical>
<zone>4</zone>
<light>Mostly Shady</light>
<price currency="USD">$2.44</price>
<availability>031599</availability>
</plant>

We locate all common nodes with `find_all` and then from each we retrieve the text (string) content. 

In [6]:
# Collect every node named 'common' found anywhere in the document.
c_nodes = soup.find_all('common')

In [7]:
# Print the text content of each <common> node, one name per line.
for common_node in c_nodes:
    print(common_node.string)

Bloodroot
Columbine
Marsh Marigold


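Alternatively, we locate all plant nodes with `find_all` and then from each we travel to its common child and on to the text (string) content. Note that `p.common` returns only the first 'common' node found under the plant, so this assumes each plant has exactly one.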

In [8]:
# Walk plant by plant and read the text of each one's <common> node.
plants = soup.find_all('plant')
common_names = [plant.common.string for plant in plants]

common_names

['Bloodroot', 'Columbine', 'Marsh Marigold']

### 2. Retrieve the plants that grow in zone 4

The following call to `find_all` finds all zone nodes whose text content is "4". That is, it filters on nodes named 'zone' with text content "4". Then, for each of these zone nodes, we go up one step to its parent plant node (i.e., `z.parent`).

In [9]:
# Find the <zone> nodes whose text content is exactly "4".
# `string=` is the current name of this filter; the older `text=` spelling is
# deprecated in recent Beautiful Soup releases and emits a warning.
zone4 = soup.find_all('zone', string="4")

zone4

[<zone>4</zone>, <zone>4</zone>]

In [10]:
# The parent of each matching <zone> node is the enclosing <plant> node.
zone4_plants = [zone_node.parent for zone_node in zone4]

In [11]:
# Display the list of <plant> nodes collected from the zone-4 matches.
zone4_plants

[<plant>
 <common>Bloodroot</common>
 <botanical>Sanguinaria canadensis</botanical>
 <zone>4</zone>
 <light>Mostly Shady</light>
 <price currency="USD">$2.44</price>
 <availability>031599</availability>
 </plant>, <plant>
 <common>Marsh Marigold</common>
 <botanical>Caltha palustris</botanical>
 <zone>4</zone>
 <light>Mostly Sunny</light>
 <price currency="CAD">$6.81</price>
 <availability>051799</availability>
 </plant>]

### 3. Retrieve the common names of plants that grow in zone 4

We can use the zone nodes that we found in the previous step. 
For each of these nodes, we go up the tree to the parent, then down one level to the child named 'common' and on to its text content.

Note that this will be problematic if a plant node has more than one common node, because `z.parent.common` returns only the first one.

In [12]:
# Hop from each matching <zone> node up to its <plant>, then down to the
# plant's <common> node and its text.
zone4_names = [zone_node.parent.common.string for zone_node in zone4]

zone4_names

['Bloodroot', 'Marsh Marigold']

### 4. Retrieve the prices of plants whose prices are listed in USD

In addition to filtering on the node name in 'find_all', we can also include a filter on the value of an attribute. The following code adds the constraint that the price nodes must have a currency attribute with a value of "USD". 

In [13]:
# Filter <price> nodes on their 'currency' attribute; the attrs dictionary is
# the explicit form of passing the attribute as a keyword argument.
soup.find_all('price', attrs={"currency": "USD"})

[<price currency="USD">$2.44</price>, <price currency="USD">$9.37</price>]

In [14]:
# Select the <price> nodes quoted in US dollars, then collect their text.
us_price_nodes = soup.find_all('price', attrs={"currency": "USD"})
prices = [price_node.string for price_node in us_price_nodes]

prices

['$2.44', '$9.37']

# XPath Examples

In [15]:
from lxml import etree

# Parse the same catalog file with lxml so it can be queried with XPath.
# NOTE(review): etree.parse() returns the parsed tree object rather than the
# root element itself, despite the variable name; .xpath() is called on it
# directly in the cells below.
root = etree.parse("plant_catalog.xml")

### 1. Retrieve the common names of all plants 

Here are three ways to access this content.

+ from all common nodes, retrieve the text content

In [16]:
# '//common' matches <common> elements at any depth; text() yields their text.
root.xpath('//common/text()')

['Bloodroot', 'Columbine', 'Marsh Marigold']

+ From all plant nodes, go to the child nodes named 'common' and then to the text content.

In [17]:
# Start from every <plant> element and step to its <common> children.
root.xpath('//plant/common/text()')

['Bloodroot', 'Columbine', 'Marsh Marigold']

+ Beginning at the top, go one step to the catalog node, then to all children named 'plant', then to all children of the plant nodes that are named 'common', and on to the text content of these common nodes.

In [18]:
# Absolute path: each step names the element expected at that exact level.
root.xpath('/catalog/plant/common/text()')

['Bloodroot', 'Columbine', 'Marsh Marigold']

### 2. Retrieve the plants that grow in zone 4

Here are two approaches.

+ From all of the plant nodes, filter to those that have a zone child(ren) with text content of "4". 

In [19]:
# Keep only the <plant> elements that have a <zone> child whose text is "4".
zone4_plants = root.xpath('//plant[./zone/text()="4"]')

# These are XML elements, so serialize them with the XML serializer
# (etree.tostring) rather than lxml.html.tostring, which targets HTML output.
# encoding="unicode" returns a str directly, so no bytes-to-str decode is needed.
for plant_elem in zone4_plants:
    print(etree.tostring(plant_elem, pretty_print=True, encoding="unicode"))

<plant>
        <common>Bloodroot</common>
        <botanical>Sanguinaria canadensis</botanical>
        <zone>4</zone>
        <light>Mostly Shady</light>
        <price currency="USD">$2.44</price>
        <availability>031599</availability>
    </plant>
    

<plant>
        <common>Marsh Marigold</common>
        <botanical>Caltha palustris</botanical>
        <zone>4</zone>
        <light>Mostly Sunny</light>
        <price currency="CAD">$6.81</price>
        <availability>051799</availability>
    </plant>




Alternatively, 

+ From all of the zone nodes, filter to those that have text content of "4". Then go up one level to the parent of the zone node.

In [20]:
# Select the <zone> elements whose text is "4", then step up ('..') to the
# parent <plant> element of each.
zone4_plants = root.xpath('//zone[text()="4"]/..')

# These are XML elements, so serialize them with the XML serializer
# (etree.tostring) rather than lxml.html.tostring, which targets HTML output.
# encoding="unicode" returns a str directly, so no bytes-to-str decode is needed.
for plant_elem in zone4_plants:
    print(etree.tostring(plant_elem, pretty_print=True, encoding="unicode"))

<plant>
        <common>Bloodroot</common>
        <botanical>Sanguinaria canadensis</botanical>
        <zone>4</zone>
        <light>Mostly Shady</light>
        <price currency="USD">$2.44</price>
        <availability>031599</availability>
    </plant>
    

<plant>
        <common>Marsh Marigold</common>
        <botanical>Caltha palustris</botanical>
        <zone>4</zone>
        <light>Mostly Sunny</light>
        <price currency="CAD">$6.81</price>
        <availability>051799</availability>
    </plant>




### 3. Retrieve the common names of plants that grow in zone 4

Here are two ways to retrieve the common names.

+ Filter all plant nodes to keep only those with a zone child that has text content "4". Then from those plant nodes, go to the child named 'common' and on to the 'common' node's text content.

In [21]:
# The predicate in [...] filters the <plant> elements; the rest of the path
# then continues from the plants that passed the filter.
root.xpath('//plant[zone/text() = "4"]/common/text()')

['Bloodroot', 'Marsh Marigold']

+ Filter all of the common nodes to keep only those whose parent has a zone child with text content "4". Then from those common nodes get the text content.

In [22]:
# Inside the predicate, '..' steps up to the parent of the <common> element
# so that the parent's <zone> child can be tested.
root.xpath('//common[../zone/text() = "4"]/text()')

['Bloodroot', 'Marsh Marigold']

### 4. Retrieve the prices of plants whose prices are listed in USD

Here is one way to get the prices. 

Start with all price nodes, and filter them according to whether they have a 'currency' attribute with value "USD".

In [23]:
# '@currency' refers to the attribute named 'currency' on each <price> element.
root.xpath('//price[@currency = "USD"]/text()')

['$2.44', '$9.37']

Note that we can retrieve the values of an attribute as follows:

In [24]:
# Ending the path on an attribute step returns the attribute values themselves.
root.xpath('//price/@currency')

['USD', 'USD', 'CAD']