> Notes:
> + [dataquest: Python Web Scraping Tutorial using BeautifulSoup](https://www.dataquest.io/blog/web-scraping-tutorial-python/)
> + [python generators](https://wiki.python.org/moin/Generators)

In [1]:
# objective:
# scrape weather forecasts from National Weather Service (http://www.weather.gov/)
# analyze data using pandas library

In [2]:
# requests library: download a static sample page over HTTP

import requests

# An explicit timeout keeps the cell from hanging indefinitely if the
# server never answers.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
# HTTP status codes - https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
# 2xx = success
# 3xx = redirection
# 4xx = client errors
# 5xx = server errors
print('page status code: {}'.format(page.status_code))
# Stop here on a 4xx/5xx response instead of parsing an error page later;
# done after the print so the status code is still shown on failure.
page.raise_for_status()
print('page content: {}'.format(page.content))

page status code: 200
page content: b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [51]:
# Beautiful Soup: parse the downloaded HTML into a navigable tree

from bs4 import BeautifulSoup

# Build the tree from the raw response bytes using the 'html.parser' backend
soup = BeautifulSoup(page.content, 'html.parser')
# Bare expression on the last line so the notebook displays the parsed document
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [52]:
# Pretty-print the parsed document: one tag or string per line,
# indented according to nesting depth
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [53]:
# soup children

# soup.children hands back the top-level nodes lazily (an iterator, not a list),
# so wrap it in list() to materialize the nodes and be able to index them.
# See the python generators link in the Notes at the top.
soup_children = list(soup.children)
soup_children
# The result has three top-level nodes: the <!DOCTYPE html> declaration,
# a newline string ('\n'), and the <html> tag that holds the rest of the page.

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [54]:
# class of each top-level node, in document order
list(map(type, soup_children))

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [55]:
# Index 2 of soup_children is the <html> Tag (the other two entries are the
# doctype and a newline string); list its direct children.
html_children = list(soup_children[2].children)
html_children

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [56]:
# body element: html_children is ['\n', <head>, '\n', <body>, '\n'],
# so the <body> tag sits at index 3
body_children = list(html_children[3].children)
body_children

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [49]:
# p element: body_children is ['\n', <p>, '\n'], so the <p> tag sits at index 1
p_element = body_children[1]
p_element

<p>Here is some simple content for this page.</p>

In [48]:
# p element text: get_text() returns the text inside the tag without the markup
p_text = p_element.get_text()
p_text

'Here is some simple content for this page.'

In [46]:
# Finding all instances of a tag at once: find_all() returns a list of the
# matching tags, so there is no need to walk .children level by level
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [57]:
# find_all() returns a list, so index into it before calling get_text()
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [58]:
# First instance only: find() returns a single tag rather than a list
soup.find('p')

<p>Here is some simple content for this page.</p>

In [60]:
# searching for tags by class and id
# (sample page whose <p> tags carry class and id attributes)
page2 = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,  # do not hang indefinitely if the server never answers
)
# Stop on a 4xx/5xx response rather than parsing an error page
page2.raise_for_status()
soup2 = BeautifulSoup(page2.content, 'html.parser')
soup2

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [65]:
# <p> tags whose class list includes 'outer-text'; the keyword argument is
# spelled class_ because `class` is a reserved word in Python
soup2.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [66]:
# Same class filter without a tag name: any tag with class 'outer-text' matches
soup2.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [67]:
# Filter by id attribute; the result is still a list
soup2.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [70]:
# Using CSS selectors - https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors
# select() takes a CSS selector string and returns a list of matching tags.
# Selector examples:
# p a – finds all a tags inside of a p tag.
# body p a – finds all a tags inside of a p tag inside of a body tag.
# html body – finds all body tags inside of an html tag.
# p.outer-text – finds all p tags with a class of outer-text.
# p#first – finds all p tags with an id of first.
# body p.outer-text – finds any p tags with a class of outer-text inside of a body tag.

# Simplest case: a bare tag name selects every <p> in the document
soup2.select('p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>, <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [71]:
# Descendant selector: only the <p> tags located inside a <div>
soup2.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [82]:
# weather data: National Weather Service forecast page for the lat/lon in the URL

page_weather = requests.get(
    "http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168",
    timeout=10,  # do not hang indefinitely if the server never answers
)
page_weather.raise_for_status()
soup_weather = BeautifulSoup(page_weather.content, 'html.parser')

# The forecast lives in the element with id="seven-day-forecast"; each
# forecast period is an element with class "tombstone-container" inside it.
seven_day = soup_weather.find(id='seven-day-forecast')
if seven_day is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError("no element with id 'seven-day-forecast' in the response")
forecast_items = seven_day.find_all(class_='tombstone-container')
if not forecast_items:
    raise RuntimeError("no 'tombstone-container' elements in the forecast section")
first_forecast_item = forecast_items[0]
print(first_forecast_item.prettify())

<div class="tombstone-container">
 <p class="period-name">
  This
  <br>
   Afternoon
  </br>
 </p>
 <p>
  <img alt="This Afternoon: Mostly sunny, with a high near 71. West wind 8 to 14 mph, with gusts as high as 18 mph. " class="forecast-icon" src="newimages/medium/sct.png" title="This Afternoon: Mostly sunny, with a high near 71. West wind 8 to 14 mph, with gusts as high as 18 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 71 °F
 </p>
</div>


In [87]:
# Period label. Its two words are separated only by a <br>, so join the text
# pieces with a space; a plain get_text() runs them together as 'ThisAfternoon'.
period_name = first_forecast_item.find(class_='period-name')
period_name.get_text(separator=' ', strip=True)

'This Afternoon'

In [91]:
# Full description: read the `title` attribute of the period's <img> tag
desc = first_forecast_item.find('img').get('title')
desc

'This Afternoon: Mostly sunny, with a high near 71. West wind 8 to 14 mph, with gusts as high as 18 mph. '

In [93]:
# Short description: text of the element with class 'short-desc'
short_desc = first_forecast_item.find(class_='short-desc')
short_desc.get_text()

'Mostly Sunny'

In [95]:
# Temperature: class_='temp' matches the element whose class attribute
# is "temp temp-high" (one matching class is enough)
temp = first_forecast_item.find(class_='temp')
temp.get_text()

'High: 71 °F'