In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [2]:
# Perform web scraping using Python 3 and the BeautifulSoup library
# We'll be scraping weather forecasts from the National Weather Service site,
# and then analyzing them using the Pandas library.

# When we visit a web page, the browser downloads several kinds of files:
# HTML — contains the main content of the page.
# CSS — adds styling to make the page look nicer.
# JS — JavaScript files add interactivity to web pages.
# Images — image formats, such as JPG and PNG, allow web pages to show pictures.

# When we perform web scraping, we're interested in the main content of the web page, so we look at the HTML.

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Download the page with the requests library: requests.get issues an HTTP GET
# request to the web server and returns the server's response, which carries
# the HTML contents of the page.
# GET is only one of several request types that the requests library supports.
# A timeout is passed so that an unresponsive server raises an exception
# instead of blocking the notebook kernel indefinitely.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
page

<Response [200]>

In [10]:
# requests.get gave us a Response object.
# Its status_code property indicates whether the page was downloaded successfully.
page.status_code

# A status_code of 200 means that the page downloaded successfully.
# A status code starting with 2 generally indicates success, while a code
# starting with 4 or 5 indicates an error.

200

In [11]:
# Display the HTML content of the page using the content property.
# (As the last expression of the cell, the value is shown as the cell output.)
page.content


'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [12]:
# Parse the downloaded document with BeautifulSoup, using the 'html.parser'
# backend, so that tags and their text can be extracted from it later.
soup = BeautifulSoup(markup=page.content, features="html.parser")

# Show the parsed HTML, nicely indented.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [19]:
# Start from the nodes at the top level of the document. soup.children is an
# iterator, so materialise it into a list once and reuse that list below.
soup_children = list(soup.children)
print(len(soup_children))
print(soup_children)

# The top level holds three nodes: the <!DOCTYPE html> declaration,
# a newline string (\n), and the <html> tag. Inspect the type of each one:
soup_children_type = list(map(type, soup_children))
print(soup_children_type)

# - Doctype: contains information about the type of the document.
# - NavigableString: represents text found in the HTML document.
# - Tag: an element that can contain other nested tags.
#
# Tag is the type we deal with most often, because it lets us navigate
# through the document and extract other tags and text.
html = soup_children[-1]

# List the children found inside the html tag:
html_children = list(html.children)
print(html_children)

# Among them are the head and body tags; the p tag we want lives inside body.
body = html_children[3]

# List the children of the body tag to locate the p tag:
body_children = list(body.children)
print(body_children)

# Isolate the p tag:
p = body_children[1]

# get_text returns all of the text inside the tag:
print(p.get_text())


3
[u'html', u'\n', <html>\n<head>\n<title>A simple example page</title>\n</head>\n<body>\n<p>Here is some simple content for this page.</p>\n</body>\n</html>]


In [8]:
# Finding all instances of a tag at once
# find_all returns every instance of a tag on the page; search once and reuse the result.
paragraphs = soup.find_all('p')
print(paragraphs)
print(paragraphs[0].get_text())

# find returns only the first instance of a tag, as a single BeautifulSoup object:
print(soup.find('p'))

# Searching for tags by class and id
# Download a second example page and parse it. From here on, page and soup
# refer to this new page. The timeout makes an unresponsive server raise an
# exception instead of blocking the notebook kernel indefinitely.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,
)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

# find_all can also search by class or by id.
# p tags with the class outer-text:
find_p_class = soup.find_all('p', class_='outer-text')
print(find_p_class)

# Any tag with the class outer-text:
find_class = soup.find_all(class_="outer-text")
print(find_class)

# Any tag with the id first:
find_id = soup.find_all(id="first")
print(find_id)

# Use a CSS selector to find all the p tags that are inside of a div
p_div = soup.select("div p")
print(p_div)

[<p class="inner-text first-item" id="first">\n                First paragraph.\n            </p>, <p class="inner-text">\n                Second paragraph.\n            </p>, <p class="outer-text first-item" id="second">\n<b>\n                First outer paragraph.\n            </b>\n</p>, <p class="outer-text">\n<b>\n                Second outer paragraph.\n            </b>\n</p>]

                First paragraph.
            
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>
[<p class="outer-text first-item" id="second">\n<b>\n  

In [4]:
# Downloading weather data
# Fetch the forecast page for the latitude/longitude given in the URL.
# The timeout makes an unresponsive server raise an exception instead of
# blocking the notebook kernel indefinitely.
page = requests.get(
    "http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168",
    timeout=10,
)
# Fail here with a clear HTTP error if the download did not succeed, rather
# than later with an opaque AttributeError when the forecast element is missing.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The forecast lives in the element with id "seven-day-forecast"; each
# individual forecast item inside it has the class "tombstone-container".
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")

# Look at the first forecast item (it was "Tonight" when this notebook was run).
tonight = forecast_items[0]
print(tonight.prettify())


<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. " class="forecast-icon" src="newimages/medium/nsct.png" title="Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. "/>
 </p>
 <p class="short-desc">
  Partly Cloudy
 </p>
 <p class="temp temp-low">
  Low: 52 °F
 </p>
</div>


In [5]:
# Extracting information from the page
# Pull the text of the period name, the short description and the temperature
# out of the first forecast item, looking each one up by its CSS class.
period, short_desc, temp = (
    tonight.find(class_=css_class).get_text()
    for css_class in ("period-name", "short-desc", "temp")
)

for extracted_text in (period, short_desc, temp):
    print(extracted_text)

Tonight
Partly Cloudy
Low: 52 °F


In [6]:
# Extract the title attribute from the img tag.
# A tag can be indexed like a dictionary, with the attribute name as the key.
img = tonight.find(name="img")
desc = img["title"]

for shown in (img, desc):
    print(shown)

<img alt="Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. " class="forecast-icon" src="newimages/medium/nsct.png" title="Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. "/>
Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. 


In [7]:
# Extracting all the information from the page
# Select every period-name element nested inside a tombstone-container
# and collect the text of each one.
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
# print is called as a function: the Python 2 statement form `print periods`
# is a SyntaxError under Python 3, which this notebook targets.
print(periods)


[u'Tonight', u'Tuesday', u'TuesdayNight', u'Wednesday', u'WednesdayNight', u'Thursday', u'ThursdayNight', u'Friday', u'FridayNight']


In [8]:
# Collect the remaining fields for every forecast item in the same way:
# select the matching elements first, then read the text (or the title
# attribute, for the images) from each of them.
short_desc_tags = seven_day.select(".tombstone-container .short-desc")
temp_tags = seven_day.select(".tombstone-container .temp")
img_tags = seven_day.select(".tombstone-container img")

short_descs = [tag.get_text() for tag in short_desc_tags]
temps = [tag.get_text() for tag in temp_tags]
descs = [tag["title"] for tag in img_tags]

for collected in (short_descs, temps, descs):
    print(collected)

[u'Partly Cloudy', u'Mostly Sunny', u'Partly Cloudy', u'Mostly Sunny', u'Mostly Cloudy', u'Partly Sunny', u'Partly Cloudy', u'Mostly Sunny', u'Partly Cloudy']
[u'Low: 52 \xb0F', u'High: 67 \xb0F', u'Low: 52 \xb0F', u'High: 66 \xb0F', u'Low: 51 \xb0F', u'High: 63 \xb0F', u'Low: 51 \xb0F', u'High: 61 \xb0F', u'Low: 51 \xb0F']
[u'Tonight: Partly cloudy, with a low around 52. West southwest wind 11 to 18 mph, with gusts as high as 24 mph. ', u'Tuesday: Mostly sunny, with a high near 67. West southwest wind 10 to 17 mph, with gusts as high as 23 mph. ', u'Tuesday Night: Partly cloudy, with a low around 52. West wind 8 to 18 mph, with gusts as high as 24 mph. ', u'Wednesday: Mostly sunny, with a high near 66. West southwest wind 9 to 16 mph, with gusts as high as 21 mph. ', u'Wednesday Night: Mostly cloudy, with a low around 51. West wind 10 to 16 mph, with gusts as high as 21 mph. ', u'Thursday: Partly sunny, with a high near 63.', u'Thursday Night: Partly cloudy, with a low around 51.', u'

In [21]:
# Look at a single entry of the temperature list (index 3, i.e. the fourth item).
# print is called as a function, as Python 3 requires.
print(temps[3])

High: 65 °F


In [10]:
# Combining data into a Pandas DataFrame
# Each scraped list becomes one column, keyed by the column name.
forecast_columns = {
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc": descs,
}
weather = pd.DataFrame(forecast_columns)
weather

Unnamed: 0,desc,period,short_desc,temp
0,"Tonight: Partly cloudy, with a low around 52. ...",Tonight,Partly Cloudy,Low: 52 °F
1,"Tuesday: Mostly sunny, with a high near 67. We...",Tuesday,Mostly Sunny,High: 67 °F
2,"Tuesday Night: Partly cloudy, with a low aroun...",TuesdayNight,Partly Cloudy,Low: 52 °F
3,"Wednesday: Mostly sunny, with a high near 66. ...",Wednesday,Mostly Sunny,High: 66 °F
4,"Wednesday Night: Mostly cloudy, with a low aro...",WednesdayNight,Mostly Cloudy,Low: 51 °F
5,"Thursday: Partly sunny, with a high near 63.",Thursday,Partly Sunny,High: 63 °F
6,"Thursday Night: Partly cloudy, with a low arou...",ThursdayNight,Partly Cloudy,Low: 51 °F
7,"Friday: Mostly sunny, with a high near 61.",Friday,Mostly Sunny,High: 61 °F
8,"Friday Night: Partly cloudy, with a low around...",FridayNight,Partly Cloudy,Low: 51 °F


In [19]:
# Use a regular expression to pull out the numeric temperature values.
#
# Series.str gives access to the values of the series as strings and lets us
# apply several string methods to them, as Series.str.<function/property>.
# https://pandas.pydata.org/pandas-docs/stable/api.html#string-handling
#
# The pattern is written as a raw string so that \d is passed to the regex
# engine as-is; in a normal string literal "\d" is an invalid escape sequence
# that newer Python versions warn about.
temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)

# Store the extracted digits as integers in a new temp_num column.
weather["temp_num"] = temp_nums.astype('int')
weather

In [22]:
# Find the mean of all the high and low temperatures, using the numeric temp_num column:
weather["temp_num"].mean()

57.111111111111114

In [30]:
# Flag the rows that happen at night: lower-case the period name, then mark
# every row whose name contains "night". The flags are stored in a new
# is_night column of the weather frame.
period_lowercase = weather["period"].str.lower()
is_night = period_lowercase.str.contains("night")
weather["is_night"] = is_night
weather

Unnamed: 0,desc,period,short_desc,temp,temp_num,is_night
0,"Tonight: Partly cloudy, with a low around 52. ...",Tonight,Partly Cloudy,Low: 52 °F,52,True
1,"Tuesday: Mostly sunny, with a high near 67. We...",Tuesday,Mostly Sunny,High: 67 °F,67,False
2,"Tuesday Night: Partly cloudy, with a low aroun...",TuesdayNight,Partly Cloudy,Low: 52 °F,52,True
3,"Wednesday: Mostly sunny, with a high near 66. ...",Wednesday,Mostly Sunny,High: 66 °F,66,False
4,"Wednesday Night: Mostly cloudy, with a low aro...",WednesdayNight,Mostly Cloudy,Low: 51 °F,51,True
5,"Thursday: Partly sunny, with a high near 63.",Thursday,Partly Sunny,High: 63 °F,63,False
6,"Thursday Night: Partly cloudy, with a low arou...",ThursdayNight,Partly Cloudy,Low: 51 °F,51,True
7,"Friday: Mostly sunny, with a high near 61.",Friday,Mostly Sunny,High: 61 °F,61,False
8,"Friday Night: Partly cloudy, with a low around...",FridayNight,Partly Cloudy,Low: 51 °F,51,True
