## Parsing a basic html page

In [None]:
from bs4 import BeautifulSoup as bs
import requests

In [11]:
link = "http://www.apptronix.net/webscrap/demo1.html"

In [12]:
# Fetch the demo page. The timeout (in seconds) stops the cell from hanging
# forever on an unresponsive host, and raise_for_status() turns an HTTP error
# response into an exception instead of letting an error page be parsed.
page = requests.get(link, timeout=10)
page.raise_for_status()

In [13]:
page

<Response [200]>

In [14]:
page.content

b'<!DOCTYPE html>\n<html>\n<head>Head Of The Page</head>\n<body>\n<p>Simple Paragraph</p>\n</body>\n</html>'

In [16]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(page.content, features='html.parser')

In [17]:
soup

<!DOCTYPE html>

<html>
<head>Head Of The Page</head>
<body>
<p>Simple Paragraph</p>
</body>
</html>

In [18]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  Head Of The Page
 </head>
 <body>
  <p>
   Simple Paragraph
  </p>
 </body>
</html>


In [20]:
list(soup.children)

['html', '\n', <html>
 <head>Head Of The Page</head>
 <body>
 <p>Simple Paragraph</p>
 </body>
 </html>]

In [22]:
# The top-level nodes of the parsed document are the doctype, a newline string,
# and the <html> tag, so the root element sits at index 2.
html = soup.contents[2]
html

<html>
<head>Head Of The Page</head>
<body>
<p>Simple Paragraph</p>
</body>
</html>

In [23]:
list(html.children)

['\n', <head>Head Of The Page</head>, '\n', <body>
 <p>Simple Paragraph</p>
 </body>, '\n']

In [26]:
# Children of <html> alternate newline strings and tags:
# index 1 is <head>, index 3 is <body>.
body = html.contents[3]

In [27]:
body

<body>
<p>Simple Paragraph</p>
</body>

In [28]:
list(body.children)

['\n', <p>Simple Paragraph</p>, '\n']

In [29]:
# <body> holds a newline, the <p> tag, and another newline,
# so the paragraph is at index 1.
p = body.contents[1]

In [30]:
p

<p>Simple Paragraph</p>

In [31]:
data = p.get_text()

In [32]:
data

'Simple Paragraph'

## Finding all instances of a tag at once

In [33]:
from bs4 import BeautifulSoup as bs
import requests

In [34]:
link = "http://www.apptronix.net/webscrap/demo2.html"

In [35]:
link

'http://www.apptronix.net/webscrap/demo2.html'

In [36]:
# Fetch the second demo page. The timeout (in seconds) stops the cell from
# hanging on an unresponsive host, and raise_for_status() raises on an HTTP
# error response instead of letting an error page be parsed.
page = requests.get(link, timeout=10)
page.raise_for_status()

In [37]:
page

<Response [200]>

In [38]:
page.content

b'<!DOCTYPE html>\n<html>\n<head>Head Of The Page</head>\n<body>\n<p>First Paragraph</p>\n<p>Second Paragraph</p>\n<p>Third Paragraph</p>\n</body>\n</html>'

In [39]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(page.content, features='html.parser')

In [40]:
soup

<!DOCTYPE html>

<html>
<head>Head Of The Page</head>
<body>
<p>First Paragraph</p>
<p>Second Paragraph</p>
<p>Third Paragraph</p>
</body>
</html>

In [41]:
soup.find_all('p')

[<p>First Paragraph</p>, <p>Second Paragraph</p>, <p>Third Paragraph</p>]

In [42]:
soup.find_all('p')[1].get_text()

'Second Paragraph'

In [43]:
soup.find('p')

<p>First Paragraph</p>

## Searching for tags by class or id

In [45]:
link = "http://www.apptronix.net/webscrap/demo3.html"

In [46]:
# Fetch the third demo page. The timeout (in seconds) stops the cell from
# hanging on an unresponsive host, and raise_for_status() raises on an HTTP
# error response instead of letting an error page be parsed.
page = requests.get(link, timeout=10)
page.raise_for_status()

In [47]:
page

<Response [200]>

In [48]:
page.content

b'<html>\n<head>\n<title>A simple page</title>\n</head>\n<body>\n<div>\n<p class="inner-text first-item" id="first">\nFirst paragraph.\n</p>\n<p class="inner-text">\nSecond paragraph.\n</p>\n</div>\n<p class="outer-text first-item" id="second">\n<b>\nFirst outer paragraph.\n</b>\n</p>\n<p class="outer-text">\n<b>\nSecond outer paragraph.\n</b>\n</p>\n<a href = \'http://www.google.com\' id=\'link1\'>Google</a>\n<a href = \'http://www.facebook.com\' id=\'link2\'>Facebook</a>\n<a href = \'http://www.instagram.com\' id=\'link3\'>Instagram</a>\n</body>\n</html>'

In [49]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(page.content, features='html.parser')

In [51]:
print(soup.prettify())

<html>
 <head>
  <title>
   A simple page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
  <a href="http://www.google.com" id="link1">
   Google
  </a>
  <a href="http://www.facebook.com" id="link2">
   Facebook
  </a>
  <a href="http://www.instagram.com" id="link3">
   Instagram
  </a>
 </body>
</html>


In [52]:
soup.find_all('p',class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
 First outer paragraph.
 </b>
 </p>, <p class="outer-text">
 <b>
 Second outer paragraph.
 </b>
 </p>]

In [None]:
soup.find_all('p',id='first')

### Using CSS Selectors

In [54]:
soup.select("div p")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>, <p class="inner-text">
 Second paragraph.
 </p>]

In [56]:
soup.select("div p.first-item")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>]

In [57]:
soup.select("div p#first")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>]

In [59]:
soup.select("body p.outer-text")

[<p class="outer-text first-item" id="second">
 <b>
 First outer paragraph.
 </b>
 </p>, <p class="outer-text">
 <b>
 Second outer paragraph.
 </b>
 </p>]

In [60]:
soup.find_all('a')

[<a href="http://www.google.com" id="link1">Google</a>,
 <a href="http://www.facebook.com" id="link2">Facebook</a>,
 <a href="http://www.instagram.com" id="link3">Instagram</a>]

In [61]:
soup.find(id="link3")

<a href="http://www.instagram.com" id="link3">Instagram</a>

In [62]:
my_links = soup.find_all('a')

In [63]:
my_links

[<a href="http://www.google.com" id="link1">Google</a>,
 <a href="http://www.facebook.com" id="link2">Facebook</a>,
 <a href="http://www.instagram.com" id="link3">Instagram</a>]

In [64]:
# Collect the target URL of every anchor tag. A comprehension keeps its loop
# variable local, so the notebook-level `link` (the page URL string) is not
# overwritten with a Tag object, which would break re-running the
# `requests.get(link)` cells.
links = [anchor.get('href') for anchor in my_links]

In [65]:
links

['http://www.google.com',
 'http://www.facebook.com',
 'http://www.instagram.com']