Load in the necessary libraries

In [3]:
import re

import requests
from bs4 import BeautifulSoup as bs

Load the page

In [6]:
# load the page content; the timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (404, 500, ...)
# here instead of silently parsing an error page further down
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# convert to a beautiful soup object; the parser is named explicitly so the
# parse tree is the same on every machine (without it bs4 picks whichever
# parser happens to be installed and emits a GuessedAtParserWarning)
soup = bs(r.content, "html.parser")

# print out the html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



Start to scrape

In [9]:
# find: returns only the first tag that matches (here the first <h2> in the page)
soup.find("h2")

<h2>A Header</h2>

In [10]:
# find_all: returns a list-like collection of every matching tag, in document order
soup.find_all("h2")

[<h2>A Header</h2>, <h2>Another header</h2>]

In [12]:
# pass in a list of tag names to match any of them in a single search
# (results come back in document order, not grouped by tag name)
soup.find_all(["h1", "h2"])

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [14]:
# you can pass in attributes to the find/find_all function:
# here only <p> tags whose id attribute is "paragraph-id" are returned
soup.find_all("p", attrs={"id": "paragraph-id"})

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
# you can nest find/find_all calls: each call searches only inside the tag
# returned by the previous one (body -> first div in body -> first h1 in that div).
# NOTE: find returns None when nothing matches, so a missing intermediate tag
# makes the next call in the chain raise AttributeError.
soup.find('body').find('div').find('h1')

<h1>HTML Webpage</h1>

In [22]:
# we can search specific strings in the find/find_all function:
# `string` accepts a compiled regex and keeps the tags whose .string matches it
# (`re` comes from the imports cell at the top of the notebook)
soup.find_all("p", string=re.compile('Some'))

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# <h2> tags whose text contains "Header" or "header"
header_pattern = re.compile("(H|h)eader")
soup.find_all("h2", string=header_pattern)

[<h2>A Header</h2>, <h2>Another header</h2>]

Select (CSS selectors)
CSS selector reference: https://www.w3schools.com/cssref/css_selectors.php

In [26]:
# print the <body> subtree as a reference for the selector examples that follow
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [27]:
# descendant combinator (space): every <p> located anywhere inside a <div>
soup.select('div p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [28]:
# general sibling combinator (~): every <p> that comes after an <h2>
# and shares the same parent as that <h2>
soup.select('h2 ~ p')

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [32]:
# tag#id narrows to the <p> whose id is "paragraph-id"; the trailing ` b`
# then selects the <b> tags nested inside that paragraph
soup.select('p#paragraph-id b')

[<b>Some bold text</b>]

In [39]:
# child combinator (>): only <p> tags that are direct children of <body>,
# so the <p> nested inside the <div> is not included
paragraphs = soup.select('body > p')
print(paragraphs)

# select can also be called on an individual tag to search just its subtree
for p_tag in paragraphs:
    italic_tags = p_tag.select('i')
    print(italic_tags)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [41]:
# grab elements by a specific attribute value: the [attr = value] selector
# matches any tag whose `align` attribute equals "middle"
soup.select("[align = middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

Get different properties of the HTML

In [48]:
# .string: the text of a tag that contains a single piece of text
# (the first <h2> here holds only "A Header")
header = soup.find('h2')
header.string

'A Header'

In [47]:
# if a tag has multiple child elements use get_text: it joins the text of
# every descendant (here the <h1>, the <p> and the <a> inside the <div>)
div = soup.find('div')
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [50]:
# Get a specific attribute from an element: a tag can be indexed like a dict
# with the attribute name (indexing a missing attribute raises KeyError)
link = soup.find('a')
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [52]:
# select always returns a list, so take the first match before reading
# its `id` attribute
paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']

'paragraph-id'

Navigating the HTML tree

In [62]:
# path syntax: a tag name used as an attribute (soup.body) jumps to the first
# tag with that name; print the <body> subtree as a reference for navigation
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [69]:
# know the terms parent, sibling, child:
# find_next_siblings returns the tags that come after the <div> at the same
# level of the tree (its later siblings under <body>)
soup.body.find('div').find_next_siblings()


[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]