> Notes:
> + [dataquest: Python Web Scraping Tutorial using BeautifulSoup](https://www.dataquest.io/blog/web-scraping-tutorial-python/)
> + [python generators](https://wiki.python.org/moin/Generators)
> + [html status_codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes)
> + [css selectors](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors)
> + [Series.str.extract](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html)

In [1]:
# objective:
# scrape weather forecasts from National Weather Service (http://www.weather.gov/)
# analyze data using pandas library

In [2]:
# requests library

import requests

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
# html status_codes - https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
# 2xx = success
# 3xx = redirection
# 4xx = client errors
# 5xx = server errors
print('page status code: {}'.format(page.status_code))
print('page content: {}'.format(page.content))

page status code: 200
page content: b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [3]:
# beautiful soup

from bs4 import BeautifulSoup

# get soup
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [4]:
# print soup content
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [5]:
# soup children

# soup.children returns python list generator, hence calling the list function on it
# check the python generators link in Notes: above
soup_children = list(soup.children)
soup_children
# The above tells us that there are two tags at the top level of the page – the initial 
# <!DOCTYPE html> tag, and the <html> tag. There is a newline character (\n) in the list as well

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [6]:
# soup children element type
[type(elem) for elem in soup_children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [7]:
# The third element [bs4.element.Tag] is most important for us
html_children = list(soup_children[2].children)
html_children

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [8]:
# body element
body_children = list(html_children[3].children)
body_children

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [9]:
# p element
p_element = body_children[1]
p_element

<p>Here is some simple content for this page.</p>

In [10]:
# p element text
p_text = p_element.get_text()
p_text

'Here is some simple content for this page.'

In [11]:
# finding all instances of a tag at once
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [12]:
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [13]:
# first instance only
soup.find('p')

<p>Here is some simple content for this page.</p>

In [14]:
# searching for tags by class and id
page2 = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup2 = BeautifulSoup(page2.content, 'html.parser')
soup2

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [15]:
soup2.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [16]:
soup2.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [17]:
soup2.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [18]:
# using css selectors - https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors
# p a – finds all a tags inside of a p tag.
# body p a – finds all a tags inside of a p tag inside of a body tag.
# html body – finds all body tags inside of an html tag.
# p.outer-text – finds all p tags with a class of outer-text.
# p#first – finds all p tags with an id of first.
# body p.outer-text – finds any p tags with a class of outer-text inside of a body tag.

soup2.select('p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>, <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [19]:
soup2.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [20]:
# weather data

page_weather = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
page_weather.raise_for_status()
soup_weather = BeautifulSoup(page_weather.content, 'html.parser')
seven_day = soup_weather.find(id='seven-day-forecast')
forecast_items = seven_day.find_all(class_='tombstone-container')
first_forecast_item = forecast_items[0]
print(first_forecast_item.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br>
   <br/>
  </br>
 </p>
 <p>
  <img alt="Today: Sunny, with a high near 69. North wind 5 to 14 mph becoming west in the afternoon. Winds could gust as high as 18 mph. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 69. North wind 5 to 14 mph becoming west in the afternoon. Winds could gust as high as 18 mph. "/>
 </p>
 <p class="short-desc">
  Sunny
 </p>
 <p class="temp temp-high">
  High: 69 °F
 </p>
</div>


In [21]:
period_name = first_forecast_item.find(class_='period-name')
period_name.get_text()

'Today'

In [22]:
desc = first_forecast_item.find('img').get('title')
desc

'Today: Sunny, with a high near 69. North wind 5 to 14 mph becoming west in the afternoon. Winds could gust as high as 18 mph. '

In [23]:
short_desc = first_forecast_item.find(class_='short-desc')
short_desc.get_text()

'Sunny'

In [24]:
temp = first_forecast_item.find(class_='temp')
temp.get_text()

'High: 69 °F'

In [25]:
# extracting all information from page

period_names = [pn.get_text() for pn in seven_day.select('.tombstone-container .period-name')]
print(period_names)
print()
short_descs = [sd.get_text() for sd in seven_day.select('.tombstone-container .short-desc')]
print(short_desc)
print()
temps = [t.get_text() for t in seven_day.select('.tombstone-container .temp')]
print(temp)
print()
descs = [i.get('title') for i in seven_day.select('.tombstone-container img')]
print(desc)
print()

['Today', 'Tonight', 'Tuesday', 'TuesdayNight', 'Wednesday', 'WednesdayNight', 'Thursday', 'ThursdayNight', 'Friday']

<p class="short-desc">Sunny</p>

<p class="temp temp-high">High: 69 °F</p>

Today: Sunny, with a high near 69. North wind 5 to 14 mph becoming west in the afternoon. Winds could gust as high as 18 mph. 



In [26]:
# pandas dataframe using data dictionary
import pandas as pd
weather = pd.DataFrame({
        'period' : period_names,
        'short_desc' :  short_descs,
        'temp' : temps,
        'desc' : descs        
    })
weather

Unnamed: 0,desc,period,short_desc,temp
0,"Today: Sunny, with a high near 69. North wind ...",Today,Sunny,High: 69 °F
1,"Tonight: Partly cloudy, with a low around 51. ...",Tonight,Partly Cloudy,Low: 51 °F
2,"Tuesday: Mostly sunny, with a high near 67. Ca...",Tuesday,Mostly Sunny,High: 67 °F
3,"Tuesday Night: Partly cloudy, with a low aroun...",TuesdayNight,Partly Cloudy,Low: 52 °F
4,"Wednesday: Partly sunny, with a high near 70. ...",Wednesday,Partly Sunny,High: 70 °F
5,"Wednesday Night: Mostly cloudy, with a low aro...",WednesdayNight,Mostly Cloudy,Low: 53 °F
6,Thursday: A 20 percent chance of rain. Partly...,Thursday,Slight ChanceRain,High: 67 °F
7,"Thursday Night: Rain. Mostly cloudy, with a l...",ThursdayNight,Rain,Low: 54 °F
8,"Friday: Rain. Cloudy, with a high near 64. Wi...",Friday,Rain andWindy,High: 64 °F


In [27]:
# data analysis

# Named groups will become column names in the result. Check the link up top
# $ s.str.extract('(?P<letter>[ab])(?P<digit>\d)')
#   letter digit
# 0      a     1
# 1      b     2
# 2    NaN   NaN

# numeric temp
temp_nums = weather['temp'].str.extract('(?P<temp_num>\d+)', expand=False)
weather['temp_num'] = temp_nums.astype('int')
weather

Unnamed: 0,desc,period,short_desc,temp,temp_num
0,"Today: Sunny, with a high near 69. North wind ...",Today,Sunny,High: 69 °F,69
1,"Tonight: Partly cloudy, with a low around 51. ...",Tonight,Partly Cloudy,Low: 51 °F,51
2,"Tuesday: Mostly sunny, with a high near 67. Ca...",Tuesday,Mostly Sunny,High: 67 °F,67
3,"Tuesday Night: Partly cloudy, with a low aroun...",TuesdayNight,Partly Cloudy,Low: 52 °F,52
4,"Wednesday: Partly sunny, with a high near 70. ...",Wednesday,Partly Sunny,High: 70 °F,70
5,"Wednesday Night: Mostly cloudy, with a low aro...",WednesdayNight,Mostly Cloudy,Low: 53 °F,53
6,Thursday: A 20 percent chance of rain. Partly...,Thursday,Slight ChanceRain,High: 67 °F,67
7,"Thursday Night: Rain. Mostly cloudy, with a l...",ThursdayNight,Rain,Low: 54 °F,54
8,"Friday: Rain. Cloudy, with a high near 64. Wi...",Friday,Rain andWindy,High: 64 °F,64


In [28]:
# temp mean
weather['temp_num'].mean()

60.77777777777778

In [29]:
# is night?
is_night = weather['period'].str.upper().str.lower().str.contains('night')
weather['is_night'] = is_night
weather

Unnamed: 0,desc,period,short_desc,temp,temp_num,is_night
0,"Today: Sunny, with a high near 69. North wind ...",Today,Sunny,High: 69 °F,69,False
1,"Tonight: Partly cloudy, with a low around 51. ...",Tonight,Partly Cloudy,Low: 51 °F,51,True
2,"Tuesday: Mostly sunny, with a high near 67. Ca...",Tuesday,Mostly Sunny,High: 67 °F,67,False
3,"Tuesday Night: Partly cloudy, with a low aroun...",TuesdayNight,Partly Cloudy,Low: 52 °F,52,True
4,"Wednesday: Partly sunny, with a high near 70. ...",Wednesday,Partly Sunny,High: 70 °F,70,False
5,"Wednesday Night: Mostly cloudy, with a low aro...",WednesdayNight,Mostly Cloudy,Low: 53 °F,53,True
6,Thursday: A 20 percent chance of rain. Partly...,Thursday,Slight ChanceRain,High: 67 °F,67,False
7,"Thursday Night: Rain. Mostly cloudy, with a l...",ThursdayNight,Rain,Low: 54 °F,54,True
8,"Friday: Rain. Cloudy, with a high near 64. Wi...",Friday,Rain andWindy,High: 64 °F,64,False


In [30]:
# A good next step would be to pick a site and try some web scraping on your own. 
# Some good examples of data to scrape are:
# (1) News articles
# (2) Sports scores
# (3) Weather forecasts
# (4) Stock prices
# (5) Online retailer prices
# You may also want to keep scraping the National Weather Service, 
# and see what other data you can extract from the page, or about your own city.