# Beautiful Soup Quick Guide

In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [3]:
# One common task is extracting all the URLs found within a page’s <a> tags:
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [4]:
# Another common task is extracting all the text from a page:
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



## Kinds of objects

Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you’ll only ever have to deal with about four kinds of objects: 

* Tag
* NavigableString 
* BeautifulSoup
* Comment

### Tag
A Tag object corresponds to an XML or HTML tag in the original document:

In [5]:
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed (and no parser-guessing warning is raised).
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)

bs4.element.Tag

### Name
Every tag has a name, accessible as .name:

In [6]:
tag.name

'b'

In [7]:
tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

### Attributes
You can access a tag’s attributes by treating the tag like a dictionary:

In [8]:
tag['class']

['boldest']

In [9]:
# You can access that dictionary directly as .attrs:
tag.attrs

{'class': ['boldest']}

In [10]:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<blockquote another-attribute="1" class="boldest" id="verybold">Extremely bold</blockquote>

In [11]:
del tag['id']
del tag['another-attribute']
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [13]:
# Indexing a tag with an attribute it does not have raises KeyError.
# The error is the point of this cell, so catch and display it instead of
# letting it abort a Restart & Run All.
try:
    tag['id']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'id'

In [14]:
print(tag.get('id'))

None


### Multi-valued attributes
HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more. The most common multi-valued attribute is class (that is, a tag can have more than one CSS class). Others include rel, rev, accept-charset, headers, and accesskey. Beautiful Soup presents the value(s) of a multi-valued attribute as a list:

In [15]:
# Explicit parser keeps the example independent of the installed parsers.
css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
css_soup.p['class']

['body']

In [16]:
# Explicit parser keeps the example independent of the installed parsers.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

If an attribute looks like it has more than one value, but it’s not a multi-valued attribute as defined by any version of the HTML standard, Beautiful Soup will leave the attribute alone:

In [18]:
# Explicit parser keeps the example independent of the installed parsers.
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

When you turn a tag back into a string, multiple attribute values are consolidated:

In [20]:
# Explicit parser keeps the example independent of the installed parsers.
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
rel_soup.a['rel']

['index']

In [21]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


You can use **get_attribute_list** to get a value that’s always a list, whether or not it’s a multi-valued attribute:

In [22]:
id_soup.p.get_attribute_list('id')

['my id']

If you parse a document as XML, there are no multi-valued attributes:

In [23]:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

### NavigableString
A string corresponds to a bit of text within a tag. Beautiful Soup uses the NavigableString class to contain these bits of text:

In [24]:
tag.string

'Extremely bold'

You can’t edit a string in place, but you can replace one string with another, using replace_with():

In [25]:
tag.string.replace_with("No longer bold")
tag

<blockquote class="boldest">No longer bold</blockquote>

### BeautifulSoup
The BeautifulSoup object itself represents the document as a whole. For most purposes, you can treat it as a Tag object.

In [26]:
soup.name

'[document]'

Since the BeautifulSoup object doesn’t correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it’s useful to look at its .name, so it’s been given the special .name “[document]”:

### Comments and other special strings

In [28]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# Explicit parser keeps the example independent of the installed parsers.
soup = BeautifulSoup(markup, 'html.parser')
# The only child of <b> is the HTML comment, so .string returns it.
comment = soup.b.string
comment

'Hey, buddy. Want to buy a used parser?'

In [29]:
type(comment)

bs4.element.Comment

In [30]:
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


# Navigating the tree

In [73]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

## Going down
### Navigating using tag names


In [74]:
soup.head

<head><title>The Dormouse's story</title></head>

In [75]:
soup.title

<title>The Dormouse's story</title>

In [76]:
soup.body.b

<b>The Dormouse's story</b>

In [77]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [78]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### .contents and .children

In [79]:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

A tag’s children are available in a list called .contents:

In [80]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [81]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [82]:
title_tag.contents

["The Dormouse's story"]

The BeautifulSoup object itself has children. In this case, the <html> tag is a child of the BeautifulSoup object, alongside the leading newline string:

In [83]:
len(soup.contents)

2

In [84]:
soup.contents

['\n', <html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body></html>]

In [85]:
soup.contents[0]

'\n'

In [86]:
soup.contents[1]

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [87]:
soup.contents[1].name

'html'

Instead of getting them as a list, you can iterate over a tag’s children using the .children generator:

In [88]:
for child in title_tag.children:
    print(child)

The Dormouse's story


### .descendants
The .contents and .children attributes only consider a tag’s direct children. For instance, the <head> tag has a single direct child–the <title> tag:

In [None]:
head_tag.contents

But the title tag itself has a child: the string “The Dormouse’s story”. There’s a sense in which that string is also a child of the head tag. The .descendants attribute lets you iterate over all of a tag’s children, recursively: its direct children, the children of its direct children, and so on:

In [None]:
for child in head_tag.descendants:
    print(child)

In [None]:
len(list(soup.children))

In [None]:
len(list(soup.descendants))

### .string

If a tag has only one child, and that child is a NavigableString, the child is made available as .string:

In [None]:
title_tag.string

If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child:

In [None]:
head_tag.contents

In [None]:
head_tag.string

If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:

In [None]:
print(soup.html.string)

### .strings and .stripped_strings

If there’s more than one thing inside a tag, you can still look at just the strings. Use the .strings generator:

In [None]:
for string in soup.strings:
    print(repr(string))

These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead:

In [None]:
for string in soup.stripped_strings:
    print(repr(string))

## Going up

### .parent

In [None]:
title_tag = soup.title
title_tag

In [None]:
title_tag.parent

The title string itself has a parent: the title tag that contains it:

In [None]:
title_tag.string.parent

In [None]:
html_tag = soup.html
type(html_tag.parent)

In [None]:
print(soup.parent)

### .parents

In [None]:
link = soup.a
link

In [None]:
# Walk up from the link through every ancestor, printing each ancestor's
# tag name; a None ancestor (if one is ever yielded) is printed as-is.
for ancestor in link.parents:
    print(ancestor if ancestor is None else ancestor.name)

## Going sideways

In [None]:
# Well-formed markup: <b> and <c> are siblings inside <a>.
# Explicit parser keeps the example independent of the installed parsers.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify())

In [None]:
sibling_soup.b.next_sibling

In [None]:
sibling_soup.c.previous_sibling

In [None]:
print(sibling_soup.b.previous_sibling)

In [None]:
print(sibling_soup.c.next_sibling)

In [None]:
sibling_soup.b.string

In [None]:
print(sibling_soup.b.string.next_sibling)

In [None]:
link = soup.a
link

In [None]:
link.next_sibling

In [None]:
link.next_sibling.next_sibling

### .next_siblings and .previous_siblings

In [None]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

In [None]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

## Going back and forth

### .next_element and .previous_element


In [None]:
last_a_tag = soup.find("a", id="link3")
last_a_tag

In [None]:
last_a_tag.next_sibling

In [None]:
last_a_tag.next_element

In [None]:
last_a_tag.previous_element

In [None]:
last_a_tag.previous_element.next_element

### .next_elements and .previous_elements

In [None]:
for element in last_a_tag.next_elements:
    print(repr(element))

# Searching the tree


In [None]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [None]:
soup.find_all('b')

In [None]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

In [None]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

In [None]:
soup.find_all(["a", "b"])

In [None]:
for tag in soup.find_all(True):
    print(tag.name)

In [None]:
soup.find_all("title")

In [None]:
soup.find_all("p", "title")

In [None]:
soup.find_all("a")

In [None]:
soup.find_all(id="link2")

In [None]:
import re
soup.find(string=re.compile("sisters"))

In [None]:
soup.find_all(href=re.compile("elsie"))

In [None]:
soup.find_all(id=True)

In [None]:
soup.find_all(href=re.compile("elsie"), id='link1')

In [None]:
# Some attributes, like the data-* attributes in HTML 5, have names that can’t be used as the names of keyword arguments:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
# `data-foo` is not a valid Python identifier, so writing it directly as a
# keyword argument is a SyntaxError that would stop this whole cell from
# compiling. Compile the call from a string instead, so the error can be
# shown without aborting a Restart & Run All.
try:
    compile('data_soup.find_all(data-foo="value")', '<demo>', 'eval')
except SyntaxError as err:
    print("SyntaxError:", err.msg)

In [None]:
# Explicit parser keeps the example independent of the installed parsers.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
# Attribute names that are not valid keyword arguments go in the attrs dict.
data_soup.find_all(attrs={"data-foo": "value"})

You can’t use a keyword argument to search for HTML’s ‘name’ attribute, because Beautiful Soup uses the name argument to contain the name of the tag itself. Instead, you can give a value to ‘name’ in the attrs argument:

In [None]:
# Explicit parser keeps the example independent of the installed parsers.
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
# Here `name` is interpreted as the tag name to match, not the HTML attribute.
name_soup.find_all(name="email")

In [None]:
name_soup.find_all(attrs={"name": "email"})

## Searching by CSS class

In [None]:
soup.find_all("a", class_="sister")

In [None]:
soup.find_all(class_=re.compile("itl"))

In [None]:
# Explicit parser keeps the example independent of the installed parsers.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")

In [None]:
css_soup.find_all("p", class_="body")

In [None]:
css_soup.find_all("p", class_="body strikeout")

In [None]:
css_soup.find_all("p", class_="strikeout body")

In [None]:
css_soup.select("p.strikeout.body")

In [None]:
soup.find_all("a", attrs={"class": "sister"})

In [None]:
soup.find_all(string="Elsie")

In [None]:
soup.find_all(string=["Tillie", "Elsie", "Lacie"])

In [None]:
soup.find_all(string=re.compile("Dormouse"))

In [None]:
def is_the_only_string_within_a_tag(s):
    """Report whether `s` equals the `.string` of its own parent.

    Intended as a filter for `find_all(string=...)`: the comparison holds
    when the parent's `.string` is this very text.
    """
    parent_string = s.parent.string
    return s == parent_string

In [None]:
soup.find_all(string=is_the_only_string_within_a_tag)

In [None]:
soup.find_all("a", string="Elsie")

In [None]:
# NOTE(review): `text` appears to be the older spelling of the `string`
# argument used in the previous cell — confirm against the installed bs4 version.
soup.find_all("a", text="Elsie")

In [None]:
soup.find_all("a", limit=2)

In [None]:
soup.html.find_all("title")

In [None]:
soup.html.find_all("title", recursive=False)

In [None]:
soup.find_all("a")
soup("a")

In [None]:
soup.title.find_all(string=True)
soup.title(string=True)

In [None]:
soup.find_all('title', limit=1)
soup.find('title')

In [None]:
print(soup.find("nosuchtag"))

In [None]:
soup.head.title

In [None]:
soup.find("head").find("title")

In [None]:
a_string = soup.find(string="Lacie")
a_string

In [None]:
a_string.find_parents("a")

In [None]:
a_string.find_parent("p")

In [None]:
first_link = soup.a
first_link

In [None]:
first_link.find_next_siblings("a")

In [None]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")

In [None]:
last_link = soup.find("a", id="link3")
last_link

In [None]:
last_link.find_previous_siblings("a")

In [None]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")

In [None]:
first_link = soup.a
first_link

In [None]:
first_link.find_all_next(string=True)

In [None]:
first_link.find_next("p")

In [None]:
first_link = soup.a
first_link

In [None]:
first_link.find_all_previous("p")

In [None]:
first_link.find_previous("title")

## CSS selectors

In [None]:
soup.select("title")

In [None]:
soup.select("p:nth-of-type(3)")

In [None]:
soup.select("body a")

In [None]:
soup.select("html head title")

In [None]:
soup.select("head > title")

In [None]:
soup.select("p > a")

In [None]:
soup.select("p > a:nth-of-type(2)")

In [None]:
soup.select("p > #link1")

In [None]:
soup.select("body > a")

In [None]:
soup.select("#link1 ~ .sister")

In [None]:
soup.select("#link1 + .sister")

In [None]:
soup.select(".sister")

In [None]:
soup.select("[class~=sister]")

In [None]:
soup.select("#link1")

In [None]:
soup.select("a#link2")

In [None]:
soup.select("#link1,#link2")

In [None]:
soup.select('a[href]')

In [None]:
soup.select('a[href="http://example.com/elsie"]')

In [None]:
soup.select('a[href^="http://example.com/"]')

In [None]:
soup.select('a[href$="tillie"]')

In [None]:
soup.select('a[href*=".com/el"]')

In [None]:
soup.select_one(".sister")