# Simple web scraping practice
**First things first, import those libraries**

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

**Perform a get requests on `http://dataquestio.github.io/web-scraping-pages/simple.html` and print the status code**

In [2]:
url = 'http://dataquestio.github.io/web-scraping-pages/simple.html'
response = requests.get(url)
response

<Response [200]>

**To see the content of the response, use the content attribute**

In [3]:
response.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

**When life gives you HTML, make soup. **

In [4]:
soup = BeautifulSoup(response.content, 'lxml')
soup

<!DOCTYPE html>
<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

**We can make this look pretty, with proper indentation, using the `.prettify()` method**

In [5]:
soup.prettify()

'<!DOCTYPE html>\n<html>\n <head>\n  <title>\n   A simple example page\n  </title>\n </head>\n <body>\n  <p>\n   Here is some simple content for this page.\n  </p>\n </body>\n</html>'

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


**We can move through the structure one level at a time. Select all of the top level elements using the `.children` attribute. _Note that this attribute returns a list generator. Wrap it in a `list()` function to see the actual values_**

In [7]:
list(soup.children)

['html', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [8]:
list(soup.children)[1]

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

**Using a list comprehension, check the type of each top level element**

In [9]:
[type(child) for child in soup.children]

[bs4.element.Doctype, bs4.element.Tag]

**When we manually select the second top-level element, we can see all of the html content**

In [10]:
soup.find_all('body')

[<body>
 <p>Here is some simple content for this page.</p>
 </body>]

**But the fastest way to grab this HTML is through the built in `.html` attribute**

In [11]:
html = soup.html
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

**We can grab the text out of this HTML with `.get_text()`**

In [12]:
html.get_text()

'\n\nA simple example page\n\n\nHere is some simple content for this page.\n\n'

In [13]:
html.get_text(strip=True)

'A simple example pageHere is some simple content for this page.'

In [14]:
html.get_text(separator=' ', strip=True)

'A simple example page Here is some simple content for this page.'

**And also with the `.text` attribute**

In [15]:
html.text

'\n\nA simple example page\n\n\nHere is some simple content for this page.\n\n'

In [16]:
# html.text gives you a string, which you use .strip() on
html.text.strip()

'A simple example page\n\n\nHere is some simple content for this page.'

## Quick Practice


1. Create a new request to get the html at `http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html`    
    
2. Turn that HTML into a soup object    
    
3. Using the `.find()` method, save the title tag as its own variable.    
    
4. Do the same with the body tag  
    


In [17]:
res = requests.get('http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html')
soup = BeautifulSoup(res.content, 'lxml')

In [18]:
head = soup.find('head')
head

<head>
<title>A simple example page</title>
</head>

In [19]:
# We can use '.text' or '.get_text()' on the find object
title = soup.find('head').get_text(strip=True)
title

'A simple example page'

In [20]:
body = soup.find('body')
body

<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>

In [21]:
body.get_text(strip=True, separator=' ')

'First paragraph. Second paragraph. First outer paragraph. Second outer paragraph.'

**Now we can start ripping through these paragraphs using `.find_all()`**

In [22]:
body.find_all('p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>, <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

**But what if I only care about paragraphs with the 'outer-text' class?**

In [23]:
body.find_all('p', {'class':'outer-text'})

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

**Even if it isn't a paragraph, we can select all elements with the class 'outer-text'**

In [24]:
body.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

**The same goes for ID's**

In [25]:
# Get first id
soup.find(id='first')

<p class="inner-text first-item" id="first">
                First paragraph.
            </p>

In [26]:
# Get second id
soup.find(id='second')

<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>

In [27]:
# You can use it in a dictionary
soup.find(attrs={'id':'second'})

<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>

**We can use the `.select()` method with CSS selectors**

In [28]:
soup.select('div p#first' )

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

## Lets put this into action on a real site

Our task is to scrape the Extended Forecast for Seattle, found [here](https://forecast.weather.gov/MapClick.php?lat=47.6036&lon=-122.3294). Before we begin, lets explore the site using chrome's dev tools:
- How would we select the entire extended forecast container?
- What method is best for selecting each forecast as individual elements?
- And how would we select them?

**Once we know how to drill down to the 'observations', we can begin scraping**

**Create the initial request and grab the HTML**

In [29]:
url = 'https://forecast.weather.gov/MapClick.php?lat=38.8904&lon=-77.032#.Xid0i1NKi1s'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

**Grab our seven-day container**

In [30]:
dc_forecast = soup.find(id='seven-day-forecast')

**Select all of the tombstone-containers**

In [31]:
forecast_items = dc_forecast.find_all(attrs={'class':'tombstone-container'})

**Print out the first one**

In [32]:
forecast_items[0]

<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Mostly clear, with a low around 24. North wind 5 to 7 mph. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 24. North wind 5 to 7 mph. "/></p><p class="short-desc">Mostly Clear</p><p class="temp temp-low">Low: 24 °F</p></div>

**Using the class of the html and the `.text` attribute, select the period for which the forecast is made, the short description of the forecast, and the temperature.**

In [33]:
tonight = forecast_items[0]
period = tonight.find(class_='period-name').text
period

'Tonight'

In [34]:
desc = tonight.find(class_='short-desc').text
desc

'Mostly Clear'

In [35]:
temp = tonight.find(class_='temp-low').text
temp

'Low: 24 °F'

**Back on the actual webpage, we can mouse over the images and see a bit more information**

In [36]:
# get the title from the images (the "alt text/title")
img = tonight.find('img')
desc = img['title'] 
desc

'Tonight: Mostly clear, with a low around 24. North wind 5 to 7 mph. '

**Now lets run this process for every forecast on the page, and print the data**

In [37]:
for item in forecast_items:
    print(item.find(class_='period-name').text)
    print(item.find(class_='short-desc').text)
    print(item.find(class_='temp').text)
    print('------')

Tonight
Mostly Clear
Low: 24 °F
------
Wednesday
Sunny
High: 41 °F
------
WednesdayNight
Mostly Clear
Low: 25 °F
------
Thursday
Mostly Sunny
High: 45 °F
------
ThursdayNight
Mostly Cloudy
Low: 32 °F
------
Friday
Partly Sunnythen SlightChance Rain
High: 49 °F
------
FridayNight
Rain
Low: 39 °F
------
Saturday
Rain
High: 50 °F
------
SaturdayNight
ChanceShowers
Low: 36 °F
------


**One more time, but lets save the data and turn it into a DataFrame**

In [38]:
forecast_list = []
for item in forecast_items:
    period = item.find(class_='period-name').text
    desc = item.find(class_='short-desc').text
    temp = item.find(class_='temp').text
    
    forecast_dict = {
        'Period':period,
        'Description':desc,
        'Temperature':temp
    }
    
    forecast_list.append(forecast_dict)

In [39]:
forecast_df = pd.DataFrame(forecast_list)
forecast_df

Unnamed: 0,Period,Description,Temperature
0,Tonight,Mostly Clear,Low: 24 °F
1,Wednesday,Sunny,High: 41 °F
2,WednesdayNight,Mostly Clear,Low: 25 °F
3,Thursday,Mostly Sunny,High: 45 °F
4,ThursdayNight,Mostly Cloudy,Low: 32 °F
5,Friday,Partly Sunnythen SlightChance Rain,High: 49 °F
6,FridayNight,Rain,Low: 39 °F
7,Saturday,Rain,High: 50 °F
8,SaturdayNight,ChanceShowers,Low: 36 °F


In [40]:
# Save to a csv file
forecast_df.to_csv('forecast.csv', index=False)