# CSS Selectors

In [1]:
from bs4 import BeautifulSoup

In [2]:
# read the HTML document; the context manager closes the file handle
# as soon as the text has been read instead of leaking it in the kernel
with open('../data_in/html.txt', 'r', encoding='utf-8') as html_file:
    html = html_file.read()
# make a soup object from the html, using Python's built-in parser
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# select() takes a CSS selector; a bare tag name matches every element
# with that tag, here 'title'
# it returns a list of the matching elements (even when there is only one)
soup.select("title")

[<title>The King's story</title>]

In [9]:
# a comma-separated selector list is a union: find the elements that have
# the class 'sister' OR the id 'link2' (not both conditions at once)
# limit the number of results to 2
soup.select('.sister, #link2', limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [10]:
# select_one() returns only the first element that matches the selector
# (a single element, not a list)
soup.select_one('.sister, #link2')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [11]:
# '#' is the id selector: find all the elements with the id 'link3'
# select() still returns a list, even for a single match
soup.select('#link3')

[<a class="sister" id="link3">Tillie</a>]

In [12]:
# a space is the descendant combinator: find all the 'b' elements that are
# anywhere inside the body
soup.select("body b") # children, grandchildren, etc.

[<b>The King's story</b>]

In [13]:
# '>' is the child combinator: find all the 'span' elements that are
# direct children of a 'p' element
soup.select("p > span") # children only

[<span>Meili</span>, <span class="brother">Eric</span>]

In [14]:
# no elements: '>' only matches direct children, and the 'b' element found
# by "body b" is nested deeper than one level below the body
soup.select("body > b") # no elements

[]

In [15]:
# every 'p' element that is the third 'p' among its siblings
soup.select("p:nth-of-type(3)")
# there must be no space between the tag and the pseudo-class:
# "p :nth-of-type(3)" would instead match descendants of a 'p'

[<p class="story">...</p>]

In [16]:
# every 'a' element that is the second 'a' among its siblings
# (siblings with other tags are not counted)
soup.select("a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [17]:
# every 'a' element that is the fourth child of its parent
# (unlike nth-of-type, siblings of every tag are counted)
soup.select("a:nth-child(4)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [18]:
# '*' matches any tag: the first child of each 'p' element, whatever its tag
soup.select("p > *:nth-child(1)")

[<b>The King's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [19]:
# every direct child of a 'p' element that is the first one of its own tag
# among its siblings (so one result per tag per 'p')
soup.select("p > *:nth-of-type(1)")

[<b>The King's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <span>Meili</span>]

In [20]:
# every 'span' element that is the second 'span' among its siblings
soup.select('span:nth-of-type(2)')

[<span class="brother">Eric</span>]

In [21]:
# tag and class combined with no space in between:
# the 'span' elements that have the class 'brother'
soup.select('span.brother')

[<span class="brother">Eric</span>]

In [22]:
# '+' is the adjacent sibling combinator: the 'span' elements whose
# immediately preceding sibling is an 'a' element
soup.select('a + span')

[<span>Meili</span>]

In [23]:
# '~' is the general sibling combinator: the 'span' elements that come
# after an 'a' sibling, not necessarily right after it
soup.select('a ~ span')

[<span>Meili</span>, <span class="brother">Eric</span>]

In [24]:
# combinators can be chained: inside a 'p' that is a direct child of the body,
# the 'a' elements that are immediately preceded by a 'span' sibling
soup.select('body > p > span + a')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# XML

In [25]:
import pandas as pd
from bs4 import BeautifulSoup

In [26]:
# read the xml file, create a dataframe
# (in the output below each teacher is one row; name, age and subject are columns)
pd.read_xml('../data_in/teachers.xml')

Unnamed: 0,name,age,subject
0,Alex Wong,35,Maths
1,Justin Li,29,Science
2,Kang Chen,32,History


In [27]:
# open the xml file, create a soup object
# the context manager closes the file handle once the text has been read
# NOTE(review): assumes the file is UTF-8 encoded, like html.txt — confirm;
# an explicit encoding avoids depending on the platform default
with open('../data_in/teachers.xml', 'r', encoding='utf-8') as xml_file:
    raw = xml_file.read()
# the 'xml' feature makes BeautifulSoup parse the text as XML instead of HTML
xmlsoup = BeautifulSoup(raw, 'xml')

In [28]:
# display the parsed XML document
xmlsoup

<?xml version="1.0" encoding="utf-8"?>
<teachers>
<teacher>
<name>Alex Wong</name>
<age>35</age>
<subject>Maths</subject>
</teacher>
<teacher>
<name>Justin Li</name>
<age>29</age>
<subject>Science</subject>
</teacher>
<teacher>
<name>Kang Chen</name>
<age>32</age>
<subject>History</subject>
</teacher>
</teachers>

In [29]:
# collect the text content of every <name> element in the document
[name_tag.get_text() for name_tag in xmlsoup.find_all('name')]

['Alex Wong', 'Justin Li', 'Kang Chen']

# XPath

In [30]:
from bs4 import BeautifulSoup
# import the submodule explicitly: a bare `import lxml` does not load
# lxml.etree, so `lxml.etree` would only resolve if some other package
# had already imported it
import lxml.etree
# BeautifulSoup by default doesn't support working with XPath.
# lxml supports XPath 1.0. It has a BeautifulSoup compatible mode,
# where it'll try and parse broken HTML the way Soup does.

In [32]:
# read the same HTML document that the CSS selector section uses;
# the context manager closes the file handle once the text has been read
with open('../data_in/html.txt', 'r', encoding='utf-8') as html_file:
    raw = html_file.read()
# name the parser explicitly: without it BeautifulSoup picks whichever parser
# is installed and emits a GuessedAtParserWarning; with lxml installed
# that pick is 'lxml', so the parse result stays the same
xmlsoup = BeautifulSoup(raw, 'lxml')
# serialise the soup back to a string and parse it with lxml,
# whose element tree supports XPath queries
dom = lxml.etree.HTML(str(xmlsoup))

In [3]:
# '//' matches at any depth; xpath() returns a list of elements,
# so take the first match and read its text
dom.xpath('//title')[0].text

"The King's story"

In [4]:
# select the 3rd a element of the 2nd p element under the body element
# (XPath positions start at 1, not 0; the trailing [0] is ordinary
# Python list indexing on the result)
dom.xpath('//body/p[2]/a[3]')[0].text

'Tillie'

In [5]:
# select all a elements with href attribute
# ([@href] is a predicate: a elements without an href are left out)
# the result is a list of lxml Element objects, not strings
dom.xpath('//a[@href]')

[<Element a at 0x103bca8c0>,
 <Element a at 0x103bcb300>,
 <Element a at 0x103bcb4c0>]

In [6]:
# read the href value of every a element that has one
[link.get('href') for link in dom.xpath('//a[@href]')]

['http://example.com/elsie', 'http://example.com/lacie', 'http://hku.hk/chao']