In [25]:
# 'requests' module behaves as kind of a virtual browser and allows us to request web pages the same way
# you would as if you typed a URL into your browser's location window.
import requests


In [26]:
# Here we will specify which web page we would like to load.
# requests.get will retrieve a page by its address
# In this example, the contents of a web page are stored in a variable called 'page'
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
print(page)

<Response [200]>


In [27]:
# Let's examine the page's contents (i.e. the HTML instide the page)
# As you will see, we get back a bunch of plain text and that it would take some work to parse it
# and to extract meaningful information
print(page.content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


### BeautifulSoup to the rescue!

Beautiful Soup is a library that makes it easy to scrape information from web pages. It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.

If you have not used BeautifulSoup before, you will need to install it.  You can do this in several different ways, depending on your development environment.

* Launch your terminal application:
    * On MacOS, go to Applications --> Terminal
    * On Windows, go to Start Menu --> Command Prompt.  You may have to right-click on Command Prompt and select 'Run As Administrator'
* If you are installing BeautifulSoup for your core Python installation:
    * For Python 2.7, run the following command: pip install beautifulsoup4
    * For Python 3.7, run the following command: pip3 install beautifulsoup4
    * For Anaconda, run the following command: conda install beautifulsoup4


In [28]:
# Load BeautifulSoup module
!pip3 install beautifulsoup4
from bs4 import BeautifulSoup



In [29]:
# BeautifulSoup function takes two parameters: content of an HTML page, and a parser specification.
# The parser specification basically tells BeautifulSoup what type of 'language' it needs to parse.
# BeautifulSoup can handle different versions of HTML, XML, etc...

# Note that the 'page.content' parameter comes from the page that we loaded using the 'request' module 
# a few blocks of code above:)

soup = BeautifulSoup(page.content, 'html.parser')

In [31]:
# Let's see what we get back
print(soup)

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [32]:
# What we got by printing 'soup' looks OK, but a bit difficult to read without indentations and formatting.
# Let's make it look prettier

print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [33]:
# Let's take a look at all individual HTML elements stored in 'soup'
print(list(soup.children))

['html', '\n', <html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>]


In [9]:
# We can iterate through them
for item in list(soup.children):
    print(item)
    #print(type(item))

html


<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [10]:
# We can also treat 'soup' as a list of elements
html = list(soup.children)[2]
print(html)

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [11]:
print(list(html.children))

['\n', <head>
<title>A simple example page</title>
</head>, '\n', <body>
<p>Here is some simple content for this page.</p>
</body>, '\n']


In [12]:
# Get HTML <body> tag contents
body = list(html.children)[3]
print(body)

<body>
<p>Here is some simple content for this page.</p>
</body>


In [13]:
# Take a look at elements inside the <body> tag
print(list(body.children))

['\n', <p>Here is some simple content for this page.</p>, '\n']


In [14]:
# In this example, we can grab individual tag <p>
p = list(body.children)[1]

In [15]:
# And get the text stored inside that tag
p.get_text()

'Here is some simple content for this page.'

In [16]:
# find_all() function allows us to find all instances of a particular element in an HTML page
# In this example, will get back a list with all instances of element <p>
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [17]:
# Get text inside the first <p> element in an HTML page
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [18]:
# We can also use the find() function to find only the first instance of a particular element
soup.find('p').get_text()

'Here is some simple content for this page.'

### Using CSS selectors to search for tags by class and id

In [34]:
# Load a web page's contents
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
# Parse web page with BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup)

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>


In [35]:
# Find all instances of a tag '<p>' that has CSS class of 'outer-text'
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [21]:
# Find all tags that have CSS class of 'outer_text'
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [36]:
# Find all HTML elements with id = 'first'
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [23]:
soup.find('p', class_='outer-text', id='second')

<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>

In [24]:
# We can also use BeautifulSoup for selecting nested elements
# In the example below, we are asking BeautifulSoup to find a <p> element that is a child of a <div> element
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

### Let's work through a more realistic example to see if we can download and parse weather data

* Download the web page containing the forecast.
* Create a BeautifulSoup class to parse the page.
* Find the div with id seven-day-forecast, and assign to seven_day
* Inside seven_day, find each individual forecast item.
* Extract and print the first forecast item.

In [37]:
# Let's examing the https://forecast.weather.gov/ web page and find weather for Pittsburgh

# Request the weather web page
# page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
page = requests.get("https://forecast.weather.gov/MapClick.php?CityName=Pittsburgh&state=PA&site=PBZ&textField1=40.4392&textField2=-79.9767&e=0#.XyLIEPhKid0")

# Parse the page with BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <!-- Meta -->
  <meta content="width=device-width" name="viewport"/>
  <link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/>
  <title>
   National Weather Service
  </title>
  <meta content="National Weather Service" name="DC.title">
   <meta content="NOAA National Weather Service National Weather Service" name="DC.description"/>
   <meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/>
   <meta content="" name="DC.date.created" scheme="ISO8601"/>
   <meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/>
   <meta content="weather, National Weather Service" name="DC.keywords"/>
   <meta content="NOAA's National Weather Service" name="DC.publisher"/>
   <meta content="National Weather Service" name="DC.contributor"/>
   <meta content="//www.weather.gov/disclaimer.php" name="DC.rights"/>
   <meta content="General" name="rating"/>
   <meta content="index,follow" name="robots"/>
   <

In [38]:
# After examining the HTML code, we can find that the seven day forcast is inside
# of a <div> tag with the id = "seven-day-forecast"
# Let's grab that element
seven_day = soup.find(id="seven-day-forecast")
print(seven_day.prettify())


<div class="panel panel-default" id="seven-day-forecast">
 <div class="panel-heading">
  <b>
   Extended Forecast for
  </b>
  <h2 class="panel-title">
   Pittsburgh PA
  </h2>
 </div>
 <div class="panel-body" id="seven-day-forecast-body">
  <div id="seven-day-forecast-container">
   <ul class="list-unstyled" id="seven-day-forecast-list">
    <li class="forecast-tombstone">
     <div class="tombstone-container">
      <p class="period-name">
       Today
       <br/>
       <br/>
      </p>
      <p>
       <img alt="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. " class="forecast-icon" src="newimages/medium/bkn.png" title="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. "/>
      </p>
      <p class="short-desc">
       Mostly Cloudy
      </p>
      <p class="temp temp-high">
       High: 34 °F
      </p>
     </div>
    </li>
    <li class="forecast-tombstone">
     <div class="tombstone-container">
      <p class="period-name">
       Tonight


In [39]:
# Further examination reveals that daily forcasts are inside of another <div> container
# with id = "tombstone-container"
forecast_items = seven_day.find_all(class_="tombstone-container")
print(forecast_items)

[<div class="tombstone-container">
<p class="period-name">Today<br/><br/></p>
<p><img alt="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. " class="forecast-icon" src="newimages/medium/bkn.png" title="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. "/></p><p class="short-desc">Mostly Cloudy</p><p class="temp temp-high">High: 34 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: A chance of light snow, mainly after 11pm.  Cloudy, with a low around 28. Calm wind.  Chance of precipitation is 40%. New snow accumulation of less than a half inch possible. " class="forecast-icon" src="newimages/medium/nsn40.png" title="Tonight: A chance of light snow, mainly after 11pm.  Cloudy, with a low around 28. Calm wind.  Chance of precipitation is 40%. New snow accumulation of less than a half inch possible. "/></p><p class="short-desc">Chance Light<br/>Snow</p><p class="temp temp-low">Low: 28 °F</p><

In [40]:

# forecast_items will give us a list of items for seven days of the week
# If we want to see only tonight's weather, we only need the first element
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. " class="forecast-icon" src="newimages/medium/bkn.png" title="Today: Mostly cloudy, with a high near 34. West wind around 6 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Cloudy
 </p>
 <p class="temp temp-high">
  High: 34 °F
 </p>
</div>


There are 4 pieces of information we can extract:

* The name of the forecast item — in this case, Tonight.
* The description of the conditions — this is stored in the title property of img.
* A short description of the conditions — in this case, Mostly Clear.
* The temperature low — in this case, 49 degrees.

In [41]:
# HTML element with class='period_name' will allow us to get the specific day for which we are looking at the weather
period = tonight.find(class_="period-name").get_text()

# HTML element with class='short-desc' will allow us to get a short description of the day's weather
short_desc = tonight.find(class_="short-desc").get_text()

# HTML element with class='temp' will give is the temperature for a given day
temp = tonight.find(class_="temp").get_text()

print(period)
print(short_desc)
print(temp)

Today
Mostly Cloudy
High: 34 °F


Now, we can extract the title attribute from the img tag. To do this, we just treat the BeautifulSoup object like a dictionary, and pass in the attribute we want as a key:

In [42]:
img = tonight.find("img")
desc = img['title']
print(desc)

Today: Mostly cloudy, with a high near 34. West wind around 6 mph. 


Now that we know how to extract each individual piece of information, we can combine our knowledge with css selectors and list comprehensions to extract everything at once.

In the below code, we:

* Select all items with the class period-name inside an item with the class tombstone-container in seven_day.
* Iterate through a list to call the get_text method on each BeautifulSoup object.

In [43]:
# Let's get all weather periods
period_tags = seven_day.select(".tombstone-container .period-name")

periods = []
for pt in period_tags:
    periods.append(pt.get_text())

print(periods)

['Today', 'Tonight', 'Wednesday', 'WednesdayNight', 'Thursday', 'ThursdayNight', 'Friday', 'FridayNight', 'Saturday']


In [44]:
# Let's get all short weather descriptions
short_descs = []
for sd in seven_day.select(".tombstone-container .short-desc"):
    short_descs.append(sd.get_text())
    
short_descs

['Mostly Cloudy',
 'Chance LightSnow',
 'Chance Snow',
 'Slight ChanceSnow thenMostly Clear',
 'Partly Sunny',
 'ChanceShowers',
 'ChanceShowers',
 'Showers',
 'Showers']

In [45]:
# Let's get all temperatures

temps = []

for t in seven_day.select(".tombstone-container .temp"):
    temps.append(t.get_text())
    
print(temps)

['High: 34 °F', 'Low: 28 °F', 'High: 37 °F', 'Low: 24 °F', 'High: 42 °F', 'Low: 34 °F', 'High: 56 °F', 'Low: 51 °F', 'High: 65 °F']


In [46]:
# Last, but not least, let's get all descriptions

descs = []

for d in seven_day.select(".tombstone-container img"):
    descs.append(d["title"])
    
print(descs)

['Today: Mostly cloudy, with a high near 34. West wind around 6 mph. ', 'Tonight: A chance of light snow, mainly after 11pm.  Cloudy, with a low around 28. Calm wind.  Chance of precipitation is 40%. New snow accumulation of less than a half inch possible. ', 'Wednesday: A chance of snow, mainly before 2pm.  Mostly cloudy, with a high near 37. Calm wind becoming west around 6 mph in the afternoon.  Chance of precipitation is 50%. New snow accumulation of less than a half inch possible. ', 'Wednesday Night: A slight chance of snow before 8pm.  Partly cloudy, with a low around 24. West wind around 5 mph becoming calm  in the evening.  Chance of precipitation is 20%.', 'Thursday: Partly sunny, with a high near 42. Calm wind becoming southeast around 5 mph in the afternoon. ', 'Thursday Night: A chance of showers after 8pm.  Mostly cloudy, with a low around 34. Chance of precipitation is 30%. New precipitation amounts of less than a tenth of an inch possible. ', 'Friday: A chance of shower

In [47]:
for i in range(0, len(periods)):
    print("Period: " + periods[i])
    print("Weather: " + short_descs[i])
    print("Temperature: " + temps[i])
    print("Overall forecast: " + descs[i])
    print("____________________________")

Period: Today
Weather: Mostly Cloudy
Temperature: High: 34 °F
Overall forecast: Today: Mostly cloudy, with a high near 34. West wind around 6 mph. 
____________________________
Period: Tonight
Weather: Chance LightSnow
Temperature: Low: 28 °F
Overall forecast: Tonight: A chance of light snow, mainly after 11pm.  Cloudy, with a low around 28. Calm wind.  Chance of precipitation is 40%. New snow accumulation of less than a half inch possible. 
____________________________
Period: Wednesday
Weather: Chance Snow
Temperature: High: 37 °F
Overall forecast: Wednesday: A chance of snow, mainly before 2pm.  Mostly cloudy, with a high near 37. Calm wind becoming west around 6 mph in the afternoon.  Chance of precipitation is 50%. New snow accumulation of less than a half inch possible. 
____________________________
Period: WednesdayNight
Weather: Slight ChanceSnow thenMostly Clear
Temperature: Low: 24 °F
Overall forecast: Wednesday Night: A slight chance of snow before 8pm.  Partly cloudy, with 