In [None]:
# Present this notebook as a slideshow via the RISE extension
# https://github.com/damianavila/RISE
from notebook.services.config import ConfigManager

# Options stored under the 'livereveal' configuration section.
livereveal_options = {
    'theme': 'sans-serif',
    'transition': 'default',
    'start_slideshow_at': 'selected',
}
cm = ConfigManager()
cm.update('livereveal', livereveal_options)

# Intro to Web Scraping


&nbsp;

### Matt Bauman
#### July 6, 2016

# What is HTML?

* Human *and* machine-readable text
* Supposed to be the semantic structure of a document

* Horribly abused
* Often terribly malformed
* Frequently unreadable by humans and just barely readable by machines

* It's a ~~miracle~~ ton of effort that makes browsers work at all

# Okay, but *what is it*?

* Plain-text markup that wraps content in **tags**
* Tags are marked in brackets like `<body>`
* And everything that follows is considered part of `body` until it's closed with a `</body>`.

* Tags can be nested
* Some tags never enclose content and can be closed immediately, like `<br />`.
* Can have attributes to modify their behavior or name them

In [None]:
# Render a snippet of HTML inline: a paragraph tag whose `style` attribute colors it red.
# `IPython.display` is the public home of these helpers.
from IPython.display import display, HTML
# The closing tag must match the opening <p>.
display(HTML('<p style="color:red;">Hello, world</p>'))

In [None]:
# Dump the raw HTML of a real page to show what the markup looks like.
import requests
# Call form of print works on both Python 2 and Python 3.
print(requests.get('http://www.nytimes.com/').text)

# Important tags for scraping

* `div` - major sections
* `table` - broken down into `tr` (rows) and `td` (datum)
* `form` - contains `input` tags that get submitted
* `ul`/`ol` - lists (unordered and ordered), contains `li` (list items)

# Important attributes for scraping

* `id` and `class`
* They *name* tags; web developers use these names for styling and interactivity
* `id`s are unique; `class`es are groups

# Why web scraping is terrible

### Invalid pages and incompatibilities

* W3C (World Wide Web Consortium) sets standards for HTML, CSS, XML, etc.
* They have [a validator](https://validator.w3.org) to ensure that pages meet their specs

### HTML can be extremely hard to read

* Fortunately, web inspector tools can make your life easier
* Check out [The NY Times](http://www.nytimes.com/) in the browser

### Some sites require JavaScript to work

* There aren't any libraries (that I'm aware of) that implement JavaScript
* Try turning off JavaScript in your browser and make sure the site still works
* You can often *emulate* the JavaScript code to make the same requests... but it's a pain

### It's fragile 

* While the *markup* is machine readable, that just specifies page layout
* The *same content* can be coded in HTML in an infinite number of ways and still look identical
* Web authors can change their code at any point...

* **and the page can still look very similar**. [An extreme example](https://web.archive.org/web/20001109144000/http://www1.nytimes.com/)

# Working around the terrible-ness

* Don't worry about parsing it yourself -- no regexes or string searches!
* Don't worry about traversing individual nested levels (e.g., inside two divs and ...)

### Instead...

* Think of each webpage as a "tag soup"
* Try to find a way to describe the tags you're looking for in a minimal way
* And use a good library

# Scraping in five lines:

In [None]:
# Look for headlines in the NYTimes
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.nytimes.com/')
# Name the parser explicitly so every machine builds the same tree
# (otherwise bs4 picks whichever parser happens to be installed).
soup = BeautifulSoup(r.text, 'html.parser')
# Every tag, of any type, whose class is 'story-heading'
tags = soup.find_all(attrs={'class': 'story-heading'})


In [None]:
# Render each matched tag's markup as live HTML in the output area
for headline in tags:
    markup = str(headline)
    display(HTML(markup))

# Hedging your bets

* There are lots of ways to specify a search through the tag soup
* Some methods may be more robust than others...
* But it's not worth spending too much time trying to out-wit whatever might be updating the site on the other side

In [None]:
# Another way to get the headlines
import re
articles = soup.find_all('article')
# Raw string so `\d` reaches the regex engine untouched (a plain string
# triggers an invalid-escape warning on newer Pythons). The pattern selects
# tags whose name starts with "h" followed by a digit: h1, h2, ...
heading_name = re.compile(r'^h\d')
# One list of heading tags per <article>
[article.find_all(heading_name) for article in articles]

# Advanced topics: HTTP

* HTTP specifies *how* you ask for and retrieve content
* Also specifies metadata in headers that control caching, redirects, sessions, and more

In [None]:
# Request Google's home page, then show the response's HTTP headers as the cell output
r = requests.get('http://google.com/')
r.headers

# Searches and forms

* Typically, the most interesting things to scrape are hidden behind searches and forms
* How do you enter text into Google's search box via Python?

In [None]:
# Fetch Google's home page and pretty-print its first <form> to see which inputs it submits.
# Name the parser explicitly so the result does not depend on which parsers are installed.
soup = BeautifulSoup(requests.get('http://google.com').text, 'html.parser')
# Call form of print works on both Python 2 and Python 3.
print(soup.find('form').prettify())

In [None]:
# Send the search ourselves: these key/value pairs are passed as the request's `params`.
search_params = {
    'q':  'how long does a walrus live?',
    'btnI': "I'm Feeling Lucky",
}
r = requests.get('http://google.com/search', params=search_params)


# Types of requests

* `requests.get` is actually doing a `GET`
    * It encodes the parameters (if any) directly into the url: `?param=value&param2=value2...`
    * This means that it gets *saved into your browser history*
    * The back button or a refresh may send the same parameters again

### Other HTTP verbs:

* `POST` is the other most common method
    * Just like `GET`, except that it sends its parameters in the request body instead of the URL
    * Often used for purchases, posts, etc., that you don't want to submit twice
* There are [others](https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol#Request_methods) (`PUT`, `DELETE`, `HEAD`, ...), but they're rarer

# A slightly more complicated example

* Let's look for satellites! [heavens-above.com](http://heavens-above.com)

In [None]:
# Scrape the times that the ISS is visible
# The query string (satid, lat, lng, loc, alt, tz) presumably selects the
# satellite and the observer's location -- here labelled Chicago.
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def scrape_times(text):
    """Pull one time string per table row out of a pass-summary page.

    Parameters:
        text: HTML source of the page, as a string.

    Returns:
        A list of strings, one per ``<tr class="clickableRow">`` row, each
        made of the text of the row's first and third ``<td>`` cells joined
        by a single space.

    Raises:
        IndexError: if a matching row has fewer than three ``<td>`` cells.
    """
    # Name the parser explicitly so the tree is built the same way everywhere
    # instead of depending on which parsers are installed.
    soup = BeautifulSoup(text, 'html.parser')
    rows = soup.find_all('tr', attrs={'class': 'clickableRow'})
    times = []
    for row in rows:
        cols = row.find_all('td')
        # presumably column 0 is the date and column 2 the start time --
        # verify against the live page
        times.append(cols[0].text + ' ' + cols[2].text)
    return times
# Parse the page fetched above; the returned list is shown as the cell output
scrape_times(r.text)

In [None]:
# Get the next page
# Request the same pass-summary URL again so this cell starts from a fresh response
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def get_next_page(r):
    """Re-submit the form found in a response and return the resulting response.

    Collects every named ``<input>`` on the page into a dict of form data,
    drops the "previous" button, sets the visibility radio value, and POSTs
    the result to the form's ``action`` URL.

    Parameters:
        r: a ``requests`` response whose ``.text`` is the current page and
           whose ``.url`` is used to resolve a relative form action.

    Returns:
        The ``requests`` response from POSTing the form.
    """
    # urljoin lives in urllib.parse on Python 3 and in urlparse on Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    # Name the parser explicitly so the tree does not depend on which
    # parsers are installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    form_data = {}
    for field in soup.find_all('input'):
        name = field.attrs.get('name')
        if name is None:
            # Nameless inputs have no key to submit under.
            continue
        # Inputs without a value attribute are sent as an empty string.
        form_data[name] = field.attrs.get('value', '')

    # Leave the "previous" button out of the submission, presumably so the
    # server treats it as a click on "next" -- confirm against the live form.
    # Tolerate pages that have no such button.
    form_data.pop('ctl00$cph1$btnPrev', None)
    # NOTE(review): this key uses underscores while the button name above uses
    # '$' separators -- confirm this is the field name the server expects.
    form_data['ctl00_cph1_radioAll'] = 'radioVisible'

    url = urljoin(r.url, soup.find('form').attrs['action'])
    return requests.post(url, form_data)
# Post the form to get the following page, then scrape that page's times
scrape_times(get_next_page(r).text)

In [None]:
# Get the next 10 pages!
from tqdm import tqdm
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')

# Accumulate the times from each page, following the form to the next page each round.
times = []
# `range` exists on both Python 2 and 3 (`xrange` is Python 2 only);
# tqdm wraps it to draw a progress bar.
for i in tqdm(range(10)):
    times.extend(scrape_times(r.text))
    r = get_next_page(r)
times