In [1]:
import requests


In [5]:
# Download the sample page; the response object is kept in `page` for the cells below.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
# Print the HTTP status code of the response.
print(page.status_code)

200


In [6]:
# Raw response body; it prints as a bytes literal (b'...') containing the page's HTML.
print(page.content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [7]:
from bs4 import BeautifulSoup
# Parse the downloaded bytes with the 'html.parser' parser; `soup` is the parsed document.
soup = BeautifulSoup(page.content, 'html.parser')

In [9]:
# prettify() renders the parsed document as indented text, one tag or string per line.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [10]:
# Materialise the top-level nodes of the document so the notebook can display them.
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [12]:
# Print the bs4 class of every top-level node of the parsed document.
for node in soup.children:
    print(type(node))

<class 'bs4.element.Doctype'>
<class 'bs4.element.NavigableString'>
<class 'bs4.element.Tag'>


In [14]:
# Index 2 is the <html> Tag; indices 0 and 1 are the Doctype and a newline string.
html = list(soup.children)[2]
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [15]:
# Direct children of <html>: <head> and <body>, separated by newline strings.
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [16]:
# Index 3 is the <body> tag (indices 0, 2 and 4 are newline strings, index 1 is <head>).
body = list(html.children)[3]

In [17]:
# Direct children of <body>: the single <p> tag surrounded by newline strings.
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [18]:
# Index 1 is the <p> tag, sitting between the two newline strings.
p = list(body.children)[1]

In [19]:
# get_text() returns the text inside the tag as a plain string.
p.get_text()

'Here is some simple content for this page.'

In [20]:
# Re-parse the same response, then search the whole document for <p> tags in one call
# instead of walking .children level by level.
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [21]:
# find_all returns a list, so take the first match before extracting its text.
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [22]:
# Searching for tags by class and id

In [23]:
# Load a second sample page whose <p> tags carry class and id attributes.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
# Bare last expression so the notebook displays the parsed document.
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [24]:
# Match on tag name and class together: every <p> whose classes include "outer-text".
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [25]:
# Match on class alone, regardless of tag name; on this page the result is the same two <p> tags.
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [26]:
# Match on the id attribute.
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [27]:
# CSS selector search: every <p> tag located inside a <div>.
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

* Download the web page containing the forecast.
* Create a BeautifulSoup object to parse the page.
* Find the div with id seven-day-forecast, and assign it to seven_day.
* Inside seven_day, find each individual forecast item.
* Extract and print the first forecast item.

In [28]:
# Download the forecast page and parse it.
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
# The element whose id is "seven-day-forecast" holds the whole forecast strip.
seven_day = soup.find(id="seven-day-forecast")
# Each forecast period is an element with the class "tombstone-container".
forecast_items = seven_day.find_all(class_="tombstone-container")
# Keep the first forecast item; later cells read its fields through `tonight`.
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Mostly cloudy, with a low around 56. West southwest wind 7 to 14 mph, with gusts as high as 18 mph. " class="forecast-icon" src="newimages/medium/nbkn.png" title="Tonight: Mostly cloudy, with a low around 56. West southwest wind 7 to 14 mph, with gusts as high as 18 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Cloudy
 </p>
 <p class="temp temp-low">
  Low: 56 °F
 </p>
</div>


There are 4 pieces of information we can extract:

* The name of the forecast item — in this case, Tonight.
* The description of the conditions — this is stored in the title property of img.
* A short description of the conditions — in this case, Mostly Cloudy.
* The temperature low — in this case, 56 degrees.

In [29]:
# Look up each labelled field of the first forecast item by its CSS class
# and keep only the text content.
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()

# One value per line, in the order: period, short description, temperature.
print(period, short_desc, temp, sep="\n")

Tonight
Mostly Cloudy
Low: 56 °F


Now, we can extract the title attribute from the img tag. To do this, we just treat the tag object returned by find like a dictionary, and pass in the attribute we want as a key:

In [30]:
# Locate the <img> tag inside the first forecast item.
img = tonight.find("img")
# Attributes are read with dictionary-style indexing; 'title' holds the full description.
desc = img['title']
print(desc)

Tonight: Mostly cloudy, with a low around 56. West southwest wind 7 to 14 mph, with gusts as high as 18 mph. 


Now that we know how to extract each individual piece of information, we can combine our knowledge with css selectors and list comprehensions to extract everything at once.

In the below code, we:

* Select all items with the class period-name inside an item with the class tombstone-container in seven_day.
* Use a list comprehension to call the get_text method on each of the selected tags.

In [31]:
# Select every .period-name element nested inside a .tombstone-container within seven_day.
period_tags = seven_day.select(".tombstone-container .period-name")
# Reduce each selected tag to its text.
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight']

In [35]:
# Text of every .short-desc element nested inside a .tombstone-container,
# in the order the forecast items appear in seven_day.
short_descs = [
    tag.get_text()
    for tag in seven_day.select(".tombstone-container .short-desc")
]

short_descs

['Mostly Cloudy',
 'DecreasingClouds',
 'IncreasingClouds',
 'GradualClearing',
 'Partly Cloudy',
 'Partly Sunny',
 'Partly Cloudy',
 'Partly Sunny',
 'Partly Cloudy']

In [None]:
# Collect the temperature text (e.g. 'Low: 56 °F') of every forecast item,
# mirroring the `periods` and `short_descs` cells above.
# Each value is gathered into the list; calling the list itself
# (`temps(...)`) would raise TypeError because a list is not callable.
temps = [
    t.get_text()
    for t in seven_day.select(".tombstone-container .temp")
]

# Bare last expression so the notebook displays the collected values.
temps