In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample HTML document used by the parsing and navigation examples that follow.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [6]:
# Build the parse tree with Python's built-in parser, then print it indented.
soup = BeautifulSoup(html_doc, features='html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [7]:
# Attribute-style access by tag name returns the first <p> tag in the document.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# The <title> tag, reached the same way by its tag name.
soup.title

<title>The Dormouse's story</title>

In [9]:
# .name is the tag's name as a string.
soup.title.name

'title'

In [15]:
# .string is the text held inside the tag.
soup.title.string

"The Dormouse's story"

In [18]:
# .parent steps up to the enclosing tag; for <title> that is <head>.
soup.title.parent.name

'head'

In [11]:
# find() returns the first tag matching the given name.
soup.find('p')

<p class="title"><b>The Dormouse's story</b></p>

In [12]:
# First <a> tag in the document.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [13]:
# find_all() collects every tag matching the given name.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [19]:
# Print the URL stored in the href attribute of every <a> tag.
for anchor in soup.find_all('a'):
    print(anchor.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [20]:
# Keyword arguments filter on attribute values: here, the tag whose id is 'link3'.
soup.find(id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [22]:
# get_text() returns only the document's text, without any markup.
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



## Parsers


**Python's html.parser** - BeautifulSoup(markup, 'html.parser') : *pros* - batteries included, decent speed, lenient; *cons* - not as fast as lxml, less lenient than html5lib


**lxml's HTML parser** - BeautifulSoup(markup, 'lxml') : *pros* - very fast, lenient; *cons* - external C dependency


**lxml's XML parser** - BeautifulSoup(markup, 'lxml-xml') or BeautifulSoup(markup, 'xml') : *pros* - very fast, the only XML parser currently supported; *cons* - external C dependency


**html5lib** - BeautifulSoup(markup, 'html5lib') : *pros* - creates valid HTML5, parses pages the same way a web browser does; *cons* - very slow, external Python dependency

## Making the soup

The document is passed to the BeautifulSoup constructor, either as a string or as an open filehandle.

In [25]:
# Illustrative only (not executed): 'index.html' is a placeholder file name.
#
# From an open file handle:
# with open('index.html') as fp:
#     soup = BeautifulSoup(fp, 'html.parser')
#
# OR directly from a string:
# soup = BeautifulSoup('<html>This is html example</html>', 'html.parser')


In [30]:
# The parser is named explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed, emits GuessedAtParserWarning, and may build
# a different tree on another machine.
# The HTML entity &acute; is converted to the corresponding Unicode character.
print(BeautifulSoup("<html><head></head><body>Sacre&acute; bleu!</body></html>", 'html.parser'))


<html><head></head><body>Sacre´ bleu!</body></html>


## Kinds of objects

Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you’ll only ever have to deal with about four kinds of objects: **Tag, NavigableString, BeautifulSoup, and Comment.**

### 1.Tag
Tags have a lot of attributes and methods. The most important features of a tag are its **name** and **attributes**.

In [31]:
# Parse a one-tag fragment and grab the <b> tag for the examples that follow.
# The parser is named explicitly so the cell runs without GuessedAtParserWarning
# and gives the same tree in every environment.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag

<b class="boldest">Extremely bold</b>

In [32]:
# An element of the parse tree is a bs4.element.Tag object.
type(tag)

bs4.element.Tag

#### Name

In [33]:

# Every tag has a name, available as .name.
tag.name

'b'

In [34]:
# Assigning to .name renames the tag; the change shows up in the generated markup.
tag.name = 'blockquote'
tag

<blockquote class="boldest">Extremely bold</blockquote>

#### Attributes

In [35]:
# .attrs exposes all of the tag's attributes as a dictionary.
tag.attrs

{'class': ['boldest']}

In [36]:
# A single attribute is read with dict-style indexing; class comes back as a list.
tag['class']

['boldest']

In [39]:
# Attributes are added (or overwritten) with dict-style assignment.
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<blockquote another-attribute="1" class="boldest" id="verybold">Extremely bold</blockquote>

In [41]:
# del removes an attribute from the tag.
del tag['another-attribute']
tag

<blockquote class="boldest" id="verybold">Extremely bold</blockquote>

In [42]:
# .get() looks an attribute up by name.
print(tag.get('id'))

verybold


In [43]:
# Text content of the tag, without the surrounding markup.
tag.get_text()

'Extremely bold'

#### Multi-valued attributes

In [48]:
# class is a multi-valued attribute in HTML, so its value is split into a list.
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed.
css_soup = BeautifulSoup('<p class="good design"></p>', 'html.parser')
css_soup.p['class']

['good', 'design']

In [49]:
# id is not multi-valued, so a value containing a space stays a single string.
# The parser is named explicitly (the stray trailing comma used to leave it to a guess).
id_soup = BeautifulSoup('<p id="good design"></p>', 'html.parser')
id_soup.p['id']


'good design'

In [56]:
# get_attribute_list() returns the value wrapped in a list, even for a single-valued attribute.
id_soup.p.get_attribute_list('id')

['good design']

In [65]:
# rel is a multi-valued attribute: it is read as a list, and a list assigned
# to it is joined with spaces when the tag is turned back into markup.
# The parser is named explicitly so the cell is reproducible and warning-free.
rel_soup = BeautifulSoup('<p>Go back to <a rel="index">homepage</a></p>', 'html.parser')
print(rel_soup.a['rel'])

rel_soup.a['rel'] = ['index', 'name']
print(rel_soup.a)

['index']
<a rel="index name">homepage</a>


In [66]:
# When the document is parsed as XML there are no multi-valued attributes,
# so class is returned as one plain string.
xml_soup = BeautifulSoup('<p class="multi attribute"></p>', features='xml')
xml_soup.p['class']

'multi attribute'

### 2.Navigable String

A string corresponds to a bit of text within a tag. Beautiful Soup uses the **NavigableString** class to contain these strings.

In [67]:
# Parse a small fragment and look at the string inside its <p> tag.
# The parser is named explicitly so the cell runs without GuessedAtParserWarning.
soup = BeautifulSoup('<p class="index">Bolded text</p>', 'html.parser')
tag = soup.p
tag.string

'Bolded text'

In [75]:
# The text inside a tag is a bs4.element.NavigableString, not a plain str.
type(tag.string)

bs4.element.NavigableString

In [76]:
# A string in the tree is replaced by calling replace_with() on it
# with the new text.
tag.string.replace_with('Html text')
tag.string

'Html text'

### 3.BeautifulSoup

The BeautifulSoup object represents the entire parsed document. For most purposes it can be treated like a **Tag** object, so a BeautifulSoup object can be passed into a method that modifies the tree, such as replace_with(), to combine two parsed documents.

In [78]:
# Parse two XML documents, then splice the second into the first in place of
# the placeholder text.
doc = BeautifulSoup('<document><content/>INSERT FOOTER HERE</document>', 'xml')
footer = BeautifulSoup('<footer>Here is the footer</footer>', 'xml')
# `string=` is the current name of this filter; `text=` is its deprecated alias.
doc.find(string='INSERT FOOTER HERE').replace_with(footer)
doc

<?xml version="1.0" encoding="utf-8"?>
<document><content/><footer>Here is the footer</footer></document>

In [79]:
# A BeautifulSoup object has no real tag behind it, but it still has a .name:
# the special value '[document]'.
doc.name

'[document]'

### 4.Comments

In [85]:
markup = '<b><!-- hey buddy! want to buy a used parser --></b>'
# The parser is named explicitly instead of being guessed. 'lxml' is used here
# because it wraps the fragment in <html><body>, which is the tree this cell displays.
soup = BeautifulSoup(markup, 'lxml')
# The only child of <b> is the comment, so .string returns it.
comment = soup.b.string
soup

<html><body><b><!-- hey buddy! want to buy a used parser --></b></body></html>

In [86]:
# The object taken from inside <b> is a bs4.element.Comment.
type(comment)

bs4.element.Comment

In [87]:
# Its value is the comment text without the <!-- --> delimiters.
comment

' hey buddy! want to buy a used parser '

In [88]:
# When rendered as part of the document the comment gets its <!-- --> delimiters back.
print(soup.b.prettify())

<b>
 <!-- hey buddy! want to buy a used parser -->
</b>


Beautiful Soup also defines classes called **Stylesheet, Script, and TemplateString**, for embedded CSS stylesheets (any strings found inside a *style* tag), embedded Javascript (any strings found in a *script* tag), and HTML templates (any strings inside a *template* tag). These classes work exactly the same way as NavigableString; their only purpose is to make it easier to pick out the main body of the page, by ignoring strings that represent something else.