In [2]:
import requests as rq
from bs4 import BeautifulSoup as bs


In [3]:
# Minimal HTML document used as the parsing fixture for the cells below.
# Adjacent string literals are joined at compile time, so the resulting value
# is one continuous string with no newline characters in it.
html = (
    '<!DOCTYPE html>'
    '<html>'
    '<head>'
    '<title> Testing Web Page </title>'
    '</head>'
    '<body>'
    '<h1> Web Scraping </h1>'
    '<p id = "first_para">'
    "Let's start learning "
    '<b>'
    'Web Scraping'
    '</b>'
    '</p>'
    '<p class = "abc" id = "second_para">'
    'You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>'
    '</p>'
    '<p class = "abc">'
    '<a href = "https://codingninjas.in/"> Coding Ninjas </a>'
    '</p>'
    '</body>'
    '</html>'
)

In [4]:
# Parse the fixture with the 'html.parser' backend, then show the parsed
# document followed by the type of the object that was built.
data = bs(html, features='html.parser')
print(data, type(data), sep='\n')


<!DOCTYPE html>
<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>
<class 'bs4.BeautifulSoup'>


In [5]:
# Re-serialise the parsed tree with one node per line and nested indentation.
pretty_markup = data.prettify()
print(pretty_markup)

<!DOCTYPE html>
<html>
 <head>
  <title>
   Testing Web Page
  </title>
 </head>
 <body>
  <h1>
   Web Scraping
  </h1>
  <p id="first_para">
   Let's start learning
   <b>
    Web Scraping
   </b>
  </p>
  <p class="abc" id="second_para">
   You can read more about BeautifulSoup from
   <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">
    here
   </a>
  </p>
  <p class="abc">
   <a href="https://codingninjas.in/">
    Coding Ninjas
   </a>
  </p>
 </body>
</html>


In [6]:
# Attribute access by tag name navigates to the first tag with that name;
# here that is the <title> element.
data.title

<title> Testing Web Page </title>

In [7]:
# Same tag-name navigation: the <head> element together with its children.
data.head

<head><title> Testing Web Page </title></head>

In [8]:
# The first <h1> element in the document.
data.h1

<h1> Web Scraping </h1>

In [9]:
data.p  # returns only the first <p> tag, even though the document contains three

<p id="first_para">Let's start learning <b>Web Scraping</b></p>

In [10]:
# Show the <title> tag itself, then its tag name, then the single string it
# contains (surrounding spaces are preserved).
title_tag = data.title
print(title_tag, title_tag.name, title_tag.string, sep='\n')

<title> Testing Web Page </title>
title
 Testing Web Page 


In [11]:
# .attrs exposes a tag's attributes as a dict; <title> has none, so it is empty.
data.title.attrs

{}

In [12]:
# The first <p> carries a single attribute: its id.
data.p.attrs

{'id': 'first_para'}

In [13]:
data.p['id']  # a tag can be indexed like a Python dict to read one attribute

'first_para'

In [14]:
# Concatenate all the text in the document, with the markup removed.
data.get_text()

" Testing Web Page  Web Scraping Let's start learning Web ScrapingYou can read more about BeautifulSoup from  here  Coding Ninjas "

In [16]:
# get_attribute_list() has to be called to be useful (a bare reference only
# displays the bound method). It returns the named attribute's value wrapped
# in a list, even when the attribute holds a single value.
data.p.get_attribute_list('id')

['first_para']

In [17]:
# find() returns only the first tag that matches — the same element as data.p.
data.find('p')

<p id="first_para">Let's start learning <b>Web Scraping</b></p>

In [None]:
# find_all() returns every matching tag in a list-like result.
data.find_all('p')

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

In [18]:
# A list of names matches any of them: every <p> and every <a>, in document order.
data.find_all(['p','a'])

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>,
 <a href="https://codingninjas.in/"> Coding Ninjas </a>]

In [19]:
data.find_all(True)  # True matches every tag in the document; text strings are not included

[<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>,
 <head><title> Testing Web Page </title></head>,
 <title> Testing Web Page </title>,
 <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>,
 <h1> Web Scraping </h1>,
 <p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <b>Web Scraping</b>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href=

In [20]:
data.find_all(id = 'first_para') # keyword filter: every tag whose id attribute equals 'first_para'

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>]

In [21]:
data.find_all(class_ = 'abc') # every tag with CSS class 'abc'; the keyword is class_ because `class` is reserved in Python

[<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

In [22]:
# Collect every <p> tag, then print the combined text of each one
# (text of nested tags such as <b> and <a> is included).
li = data.find_all('p')

for paragraph in li:
    paragraph_text = paragraph.get_text()
    print(paragraph_text)

Let's start learning Web Scraping
You can read more about BeautifulSoup from  here 
 Coding Ninjas 


In [24]:
# .string is only defined when a tag has a single child (or a single chain of
# single children ending in text); for a tag with several children it is None.
li = data.find_all('p')

for paragraph in li:
    only_string = paragraph.string
    print(only_string)

None
None
 Coding Ninjas 


In [25]:
# .strings is a generator over all text pieces inside a tag. Printing it
# directly shows only the generator object, not the text it will yield.
li = data.find_all('p')

for paragraph in li:
    string_generator = paragraph.strings
    print(string_generator)

<generator object Tag._all_strings at 0x7fd054df5c78>
<generator object Tag._all_strings at 0x7fd054df5c78>
<generator object Tag._all_strings at 0x7fd054df5c78>


In [26]:
# .strings is lazy, so list() is used to materialise it and make the
# individual text pieces of each <p> visible.
li = data.find_all('p')

for paragraph in li:
    text_pieces = list(paragraph.strings)
    print(text_pieces)

["Let's start learning ", 'Web Scraping']
['You can read more about BeautifulSoup from ', ' here ']
[' Coding Ninjas ']


In [27]:
# .stripped_strings yields the same text pieces as .strings, but with the
# leading and trailing whitespace of each piece removed.
li = data.find_all('p')

for paragraph in li:
    trimmed_pieces = list(paragraph.stripped_strings)
    print(trimmed_pieces)

["Let's start learning", 'Web Scraping']
['You can read more about BeautifulSoup from', 'here']
['Coding Ninjas']
