Beautiful Soup is a Python library for pulling data out of HTML and XML files.

In [1]:
# Sample HTML document: part of the Dormouse's story from Alice in Wonderland.
# It is parsed into `soup` below and reused by the later examples.
# The closing </body> and </html> tags are omitted from the literal; the
# prettified parse tree printed below shows them supplied.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Parse the sample document with the 'html.parser' backend, then print the
# resulting parse tree with each tag and string on its own line.
soup = BeautifulSoup(html_doc, 'html.parser' )
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [4]:
# Access a tag by name as an attribute of the soup: the <title> element.
soup.title

<title>The Dormouse's story</title>

In [6]:
# .name is the name of the tag, as a string.
soup.title.name

'title'

In [8]:
# .string is the text contained in the <title> tag.
soup.title.string

"The Dormouse's story"

In [9]:
# Attribute-style access returns only the first <p> tag in the document.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [10]:
# Index a tag like a dict to read one of its attributes; the 'class'
# attribute of the first <p> comes back as a list of class names.
soup.p['class']

['title']

In [11]:
# First <a> tag in the document.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [12]:
# find_all('a') returns every <a> tag in the document, not just the first.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [13]:
# Extract the URL (the href attribute) of every <a> tag, one per line.
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [14]:
# Find the first tag whose id attribute is 'link3'.
soup.find(id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [15]:
# Extract all of the document's text, with the markup removed.
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



The four kinds of objects we deal with are:
Tag, NavigableString, BeautifulSoup, and Comment.

In [16]:
# A Tag object corresponds to a tag in the document; soup.a in the examples
# above is a Tag because <a> is a tag. Here `tag` is bound to the <title> Tag.
tag = soup.title
tag.name

'title'

In [17]:
# Assigning to .name renames the tag; the new name shows up when the tag is
# displayed.
tag.name= 'change'
tag

<change>The Dormouse's story</change>

In [28]:
# Restore the original tag name so the tag is displayed as <title> again.
# NOTE(review): this cell's execution count is [28], i.e. it was run after
# cells [18]-[24] below, so the saved outputs of cells [19] and [20] still
# show the tag as <change>. Re-run the notebook top to bottom to refresh them.
tag.name= 'title'
tag

<title>The Dormouse's story</title>

In [18]:
# .attrs exposes all attributes of a tag as a dictionary.
soup.a.attrs

{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link1'}

In [19]:
# Add an attribute to a tag by assigning to it like a dict key.
# NOTE(review): the saved output shows the tag named <change>; it was captured
# before the cell that sets the name back to 'title' (execution count [28])
# was run.
tag['insert'] = 1
tag

<change insert="1">The Dormouse's story</change>

In [20]:
# Delete an attribute from a tag with `del`, as with a dict key.
del tag['insert']
tag

<change>The Dormouse's story</change>

In [21]:
# A class attribute holding several space-separated values is returned as a
# list with one entry per class.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

In [22]:
# If an attribute looks like it has more than one value, but it is not a
# multi-valued attribute as defined by any version of the HTML standard
# (here: id), Beautiful Soup leaves it alone and returns a single string.
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [24]:
# If a document is parsed as XML there are no multi-valued attributes, so
# 'class' stays a single string.
# NOTE(review): per the Beautiful Soup docs the 'xml' parser option needs the
# lxml package installed - confirm it is available in this environment.
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [29]:
# The text inside a tag is a NavigableString, e.g. soup.title.string.
type(soup.title.string)

bs4.element.NavigableString

In [30]:
# Convert a NavigableString to a plain Python str.
unicode_string = str(soup.title.string)
type(unicode_string)
# Aside: to replace the text inside the tag instead, one could call
#   soup.title.string.replace_with("Replaced content")

str

If you want to use a NavigableString outside of Beautiful Soup, you should call str() on it (unicode() in Python 2), as in the cell above, to turn it into a normal Python string. If you don't, your string will carry around a reference to the entire Beautiful Soup parse tree, even when you're done using Beautiful Soup. This is a big waste of memory.

In [None]:
#The BeautifulSoup object represents the parsed document as a whole.
#A Comment object represents a comment found in the HTML or XML file.

In [31]:
# .contents gives a tag's direct children as a list; <head> has one child,
# the <title> tag.
head_tag = soup.head
head_tag.contents

[<title>The Dormouse's story</title>]

In [32]:
# The BeautifulSoup object itself has children; here the <html> tag is its child.
# A NavigableString does not have any contents of its own.
# Instead of working with the .contents list, iterate over .children.
for child in head_tag.children:
    print(child) 

<title>The Dormouse's story</title>


In [33]:
# .descendants iterates over all of a tag's children recursively: first the
# <title> tag, then the string inside it.
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [34]:
# The soup has a single direct child: the <html> tag.
len(list(soup.children))

1

In [35]:
# .descendants also counts children of children, recursively, so the total is
# much larger than the number of direct children.
len(list(soup.descendants))

26

In [36]:
# Iterating soup.strings (for string in soup.strings:) would include the
# extra whitespace between tags; .stripped_strings removes that whitespace.
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [37]:
# .parent is the element that directly contains a tag. The parent of <head>
# is <html>, so the whole <html> element is displayed.
head_tag.parent

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [38]:
# .parents iterates over every ancestor of a tag, from the nearest one up to
# the BeautifulSoup object itself, whose name prints as '[document]'.
link = soup.a
for parent in link.parents:
    print(parent.name)

p
body
html
[document]


In [39]:
# Small document with two sibling tags (<b> and <c>) inside one parent <a>,
# used by the sibling-navigation examples that follow.
# The markup is well-formed: every opening tag has exactly one closing tag.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify())

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>


In [40]:
# .next_sibling is the next element under the same parent: <c> follows <b>.
sibling_soup.b.next_sibling

<c>text2</c>

In [41]:
# .previous_sibling is the preceding element under the same parent: <b>
# comes before <c>.
sibling_soup.c.previous_sibling

<b>text1</b>

In [42]:
# You might expect the .next_sibling of the first <a> tag to be the second
# <a> tag. It is actually a string: the comma and newline that separate the
# first <a> tag from the second.
link = soup.a
link.next_sibling

',\n'

In [43]:
# .next_siblings iterates over every sibling that follows the first <a> tag,
# strings and tags alike.
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [44]:
# .next_element is the string inside the <a id="link3"> tag ('Tillie'),
# not the text that follows the closing </a>. Compare with .next_sibling in
# the next cell.
last_a_tag = soup.find("a", id="link3")
last_a_tag.next_element

'Tillie'

In [45]:
# .next_sibling, by contrast, is the text that follows the closing </a> tag.
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

In [46]:
# Passing a string matches tags with exactly that name: only <b>, not <body>.
soup.find_all('b')

[<b>The Dormouse's story</b>]

In [47]:
# Use a regular expression with find_all(): it matches every tag whose name
# starts with "b" (<body> and <b>).
# The loop variable is deliberately not called `tag`, so the notebook-level
# `tag` (bound to soup.title in an earlier cell) is not silently rebound.
import re
for matched_tag in soup.find_all(re.compile("^b")):
    print(matched_tag.name)

body
b


In [48]:
# Passing a list matches tags whose name is any item in the list.
# (soup.find_all(True) would instead return all the tags.)
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [49]:
# Search the document's strings rather than its tags: find() returns the
# first string that matches the regular expression.
# (`re` was already imported in an earlier cell; repeating it is harmless.)
import re
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [50]:
def has_class_but_no_id(tag):
    """Tell whether `tag` has a class attribute and lacks an id attribute.

    `tag` is any object exposing has_attr(name). The result is True only
    when 'class' is present and 'id' is absent.
    """
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
# A function can be passed as the filter; the result contains the tags for
# which it returns True (here the three <p> tags, which have a class but no id).
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [51]:
# To search for tags that match two or more CSS classes, use a CSS selector.
# The order of the classes in the selector does not have to match the order
# in the attribute ("strikeout.body" matches class="body strikeout").
css_soup.select("p.strikeout.body")

[<p class="body strikeout"></p>]

In [52]:
# With string= and a regular expression, find_all() returns the matching
# strings themselves rather than tags.
soup.find_all(string=re.compile("Dormouse"))

["The Dormouse's story", "The Dormouse's story"]

In [53]:
# limit= caps the number of results: only the first two <a> tags are returned.
soup.find_all("a", limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [None]:
#Consider find_parents() and find_parent(). Remember that find_all() and find() work their way down the tree, looking at a tag's descendants.
#These methods do the opposite: they work their way up the tree, looking at a tag's parents.

In [None]:
#The find_next_siblings() method returns all the siblings that match, 
#and find_next_sibling() only returns the first one
#use .next_siblings to iterate over the rest of an element’s siblings in the tree.
#Similarly, find_previous_siblings() and find_previous_sibling()
#Similarly, find_all_next() and find_next()
#Similarly, find_all_previous() and find_previous()

In [54]:
# CSS descendant selector: <title> tags beneath <head>, beneath <html>.
soup.select("html head title")

[<title>The Dormouse's story</title>]

In [55]:
# "p > a": <a> tags that are direct children of a <p> tag.
soup.select("p > a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [56]:
# "p > #link1": the element with id "link1" that is a direct child of a <p> tag.
soup.select("p > #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [57]:
# A comma-separated selector list finds the tags that match any of the
# selectors: here the elements with id "link1" or "link2".
soup.select("#link1,#link2")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [58]:
# Find tags by CSS class: every element with class "sister".
soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [59]:
# "~" selects later siblings: the .sister elements that come after #link1
# under the same parent.
soup.select("#link1 ~ .sister")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [60]:
# Find tags by attribute value: [href^="..."] matches <a> tags whose href
# starts with the given prefix.
soup.select('a[href^="http://example.com/"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]