# Making a soup object

In [1]:
import re

from bs4 import BeautifulSoup

In [2]:
# Read the sample HTML document into a string. The `with` block closes the
# file handle as soon as the content has been read, instead of leaving it open.
with open('../data_in/html.txt', 'r', encoding='utf-8') as html_file:
    html = html_file.read()

In [73]:
# Show only the first 100 characters of the document as a quick sanity check.
preview = html[:100]
print(preview)

<html>
    <head><title>The King's story</title>
    </head>
    <body>
        <p class="title"><b>


In [6]:
# `markup` is the HTML content to parse: either a string or a file-like object.
# `features` selects the parser; 'html.parser' is the one used throughout this notebook.
soup = BeautifulSoup(markup=html, features='html.parser')

In [8]:
type(soup) # the parsed document is a bs4.BeautifulSoup object

bs4.BeautifulSoup

# Navigating the tree

In [74]:
# Accessing an element name as an attribute returns the first element with that name.
first_matches = [
    soup.title,
    soup.a,
    soup.p,
    type(soup.p),  # each of these is a Tag object
]
for match in first_matches:
    print(match)

<title>The King's story</title>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<p class="title"><b>The King's story</b></p>
<class 'bs4.element.Tag'>


In [13]:
# Keep a handle on the first <a> tag; the following cells navigate from it.
a_tag = soup.find('a')

In [35]:
a_tag.next_siblings
# This is a generator object: iterate over it in a loop to see the elements,
# or convert it to a list with list().
# Related navigation attributes include previous_siblings, next_sibling, previous_sibling, ...

<generator object PageElement.next_siblings at 0x000001CAFECCD780>

In [17]:
# Materialize the generator to see every sibling that follows the first <a> tag.
[sibling for sibling in a_tag.next_siblings]

[',\n            ',
 <span>Meili</span>,
 ', \n            ',
 <span class="brother">Eric</span>,
 '\n',
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 ',\n            ',
 <a class="sister" id="link3">Tillie</a>,
 ', and\n            ',
 <a class="brother" href="http://hku.hk/chao" id="link4">Chao</a>,
 ',and they lived at the bottom of a\n            well.']

In [18]:
a_tag.parent # .parent is the element that contains this tag; here it is the <p class="story"> tag

<p class="story">Once upon a time there were five siblings; and their names were:
            <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
            <span>Meili</span>, 
            <span class="brother">Eric</span>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
            <a class="sister" id="link3">Tillie</a>, and
            <a class="brother" href="http://hku.hk/chao" id="link4">Chao</a>,and they lived at the bottom of a
            well.</p>

In [19]:
# The .name attribute of a tag gives its element name; here, that of the parent tag.
parent_tag = a_tag.parent
parent_tag.name

'p'

# `find` one element

In [78]:
# Search by tag name.
found_title = soup.find('title')  # equivalent to soup.title
missing_tag = soup.find("MSBA")   # find() returns None when the tag is not found
print(found_title, missing_tag, sep='\n')

<title>The King's story</title>
None


In [15]:
# Search by CSS class. `class` is a reserved word in Python,
# so the keyword argument is spelled `class_`.
title_paragraph = soup.find(class_='title')
title_paragraph

<p class="title"><b>The King's story</b></p>

In [32]:
# Alternatively, pass the attribute filters as a dictionary through `attrs`.
class_filter = {'class': 'title'}
soup.find(attrs=class_filter)

<p class="title"><b>The King's story</b></p>

In [26]:
# Searches can be chained: find <head> first, then look for <title> inside it.
# Equivalent to soup.head.title.
head_tag = soup.find("head")
head_tag.find("title")

<title>The King's story</title>

In [33]:
soup.title.find_parent()
# Called without arguments, find_parent() here returns the <head> tag that encloses <title>.
# Related search methods include:
# find_parents, find_next_sibling, find_previous_sibling, find_next_siblings, find_previous_siblings

<head><title>The King's story</title>
</head>

# `find_all` (legacy alias `findAll`): find all matching elements, returned as a list

In [40]:
# Collect every <a> element in the document.
hrefs = soup.find_all(name='a')
hrefs  # the result is a list (of tags)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" id="link3">Tillie</a>,
 <a class="brother" href="http://hku.hk/chao" id="link4">Chao</a>]

In [80]:
# Every item in the result is a tag, so each one has an element name.
[anchor.name for anchor in hrefs]

['a', 'a', 'a', 'a']

In [41]:
# find_all() takes the same parameters as find().
sister_filter = {'class': 'sister'}
soup.find_all('a', attrs=sister_filter)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" id="link3">Tillie</a>]

In [42]:
# Filter on the id attribute; the match still comes back inside a list.
id_filter = {'id': 'link2'}
soup.find_all(attrs=id_filter)

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [43]:
# Passing True as the attribute value matches every tag that has an id attribute.
tags_with_id = soup.find_all(id=True)
tags_with_id

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" id="link3">Tillie</a>,
 <a class="brother" href="http://hku.hk/chao" id="link4">Chao</a>]

In [44]:
# search the document's text for the string 'Elsie'
# the result is a list of strings, not tags
soup.find_all(string = 'Elsie')

['Elsie']

In [46]:
soup.find_all("a", string = 'Elsie') # find all <a> tags whose string is 'Elsie'; the result contains tags, not strings

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [47]:
soup.find_all("a", limit = 2)    # limit caps the number of results: only the first 2 <a> tags are returned

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [49]:
# Attribute filters also accept a compiled regular expression:
# find all tags whose href contains 'hku'.
# `re` is imported with the other imports in the first cell, so the notebook's
# dependencies are declared in one place.
hku_pattern = re.compile(r'hku')
soup.find_all(href = hku_pattern)
# equivalent to soup.find_all(attrs = {'href': hku_pattern})

[<a class="brother" href="http://hku.hk/chao" id="link4">Chao</a>]

In [81]:
# Passing True as the attribute value matches every tag that has an href attribute.
tags_with_href = soup.find_all(href=True)
[tag.name for tag in tags_with_href]

['a', 'a', 'a']

In [53]:
# A list of names matches tags with any of those names: here, <span> or <b>.
wanted_names = ['span', 'b']
soup.find_all(wanted_names)

[<b>The King's story</b>,
 <span>Meili</span>,
 <span class="brother">Eric</span>]

# Extracting attribute values

In [54]:
soup.a.attrs # all attributes of the first <a> tag as a dictionary; note that 'class' holds a list

{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

In [66]:
# A tag can be indexed like a dictionary to read a single attribute value.
first_link = soup.a
(
    first_link['href'],
    first_link['class'],
    first_link['id'],
    first_link.get('msba', None),  # get() is the safe option: it allows a default value
)

('http://example.com/elsie', ['sister'], 'link1', None)

# Extracting text by `text`, `get_text()`, `string`

In [67]:
# Three ways of getting at the text inside a tag.
link_text = soup.a.text
paragraph_string = soup.p.string
span_text = soup.span.get_text(strip=True)  # strip removes leading and trailing whitespace
(link_text, paragraph_string, span_text)


('Elsie', "The King's story", 'Meili')

In [68]:
type(soup.p.string) # .string gives a NavigableString object

bs4.element.NavigableString

In [72]:
# The string can be used as a starting point to reach other tags, hence "navigable".
title_text = soup.p.string
title_text.find_parent()
# Related search methods include:
# find_parents, find_next_sibling, find_previous_sibling, find_next_siblings, find_previous_siblings

<b>The King's story</b>