# Scraping Data with BeautifulSoup

## Getting the Soup

In [14]:
from bs4 import BeautifulSoup
import requests

BASE_URL = 'http://en.wikipedia.org'
#Wikipedia will reject requests unless we add
# a user-agent attribute to our http header
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_Nobel_soup(timeout=10):
    '''Return a parsed tag tree of our Nobel prize page.

    Downloads the Wikipedia list of Nobel laureates, sending HEADERS so the
    request carries a user-agent, and parses the body with the lxml parser.

    timeout -- seconds to wait on the server before giving up; handed
               straight to requests.get.

    Raises requests.HTTPError when the server answers with an error status,
    rather than quietly parsing an error page.
    '''
    response = requests.get(
        BASE_URL + '/wiki/List_of_Nobel_Laureates',
        headers=HEADERS,
        # without a timeout a stalled connection can block forever
        timeout=timeout)
    # fail loudly on 4xx/5xx instead of handing an error page to the parser
    response.raise_for_status()
    #return the response parsed by BeautifulSoup
    return BeautifulSoup(response.content, 'lxml') #lxml is one of the parser options

## Selecting Tags

In [15]:
# Fetch and parse the laureates page once; the statements below query this tree.
soup = get_Nobel_soup()
soup.find('table', {'class': 'wikitable sortable'})
# This works, but find() is not very robust here: a multi-class string is
# compared against the class attribute exactly as it is written in the HTML,
# so listing the same two classes in a different order finds nothing:
soup.find('table', {'class': 'sortable wikitable'})

# So instead of matching on the literal attribute string (which is fragile)
# we recommend the CSS-selector methods select() / select_one(). These are
# BeautifulSoup methods too -- lxml is only the parser that built the tree.
soup.select('table.sortable.wikitable') #CSS style selectors ('.' is class, '#' is id, etc.)
# This works no matter the order of the classes and returns a *list* of all the matches

table = soup.select_one('table.sortable.wikitable') #selects just the first one
#print(table)
table.select('th')
# CSS selectors cover many other patterns (ids, attributes, nesting);
# find()/find_all() additionally accept regular expressions and functions.

def get_column_titles(table):
    '''Read the Nobel category names (and their links) from the header row.

    Looks at the <th> cells of the table's first <tr>, skipping the
    leftmost one (the year column). Returns one dict per remaining cell
    with the keys 'name' and 'href'; 'href' is None for a cell that
    contains no link.
    '''
    header_row = table.select_one('tr')
    category_cells = header_row.select('th')[1:]

    def describe(cell):
        # prefer the link's own text and target when the cell wraps one
        anchor = cell.select_one('a')
        if not anchor:
            return {'name': cell.text, 'href': None}
        return {'name': anchor.text, 'href': anchor.attrs['href']}

    return [describe(cell) for cell in category_cells]
            
print( get_column_titles(table) )

[{'href': '/wiki/List_of_Nobel_laureates_in_Physics', 'name': u'Physics'}, {'href': '/wiki/List_of_Nobel_laureates_in_Chemistry', 'name': u'Chemistry'}, {'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine', 'name': u'Physiology\nor Medicine'}, {'href': '/wiki/List_of_Nobel_laureates_in_Literature', 'name': u'Literature'}, {'href': '/wiki/List_of_Nobel_Peace_Prize_laureates', 'name': u'Peace'}, {'href': '/wiki/List_of_Nobel_laureates_in_Economics', 'name': u'Economics'}]


In [16]:
def get_Nobel_winners(table):
    '''Collect one record per laureate link found in the Nobel prize table.

    table -- the parsed laureates table. Its first <tr> supplies the
             category names (via get_column_titles) and is skipped here,
             as is the table's last <tr>.

    Returns a list of dicts with the keys 'year', 'category', 'name' and
    'link'. 'year' is None when a row has no <td> at all or its first
    cell is not an integer.
    '''
    cols = get_column_titles(table)
    winners = []
    for row in table.select('tr')[1:-1]:
        cells = row.select('td')
        try:
            year = int(cells[0].text) #first <td> holds the year
        except (IndexError, ValueError):
            # row without any <td>, or a first cell that is not a number
            year = None
        for i, td in enumerate(cells[1:]):
            for winner in td.select('a'):
                # an anchor without an href has no page to follow
                href = winner.attrs.get('href')
                if href is None or href.startswith('#endnote'):
                    continue
                winners.append({
                    'year':year,
                    'category':cols[i]['name'],
                    'name':winner.text,
                    'link':href
                })
    return winners

winners = get_Nobel_winners(table)
# Slice before printing: on Python 3 print() returns None, so subscripting
# its result raises TypeError. This form shows the first two on 2 and 3.
print(winners[:2])

[{'category': u'Physics', 'link': '/wiki/Wilhelm_R%C3%B6ntgen', 'name': u'Wilhelm R\xf6ntgen', 'year': 1901}, {'category': u'Chemistry', 'link': '/wiki/Jacobus_Henricus_van_%27t_Hoff', 'name': u"Jacobus Henricus van 't Hoff", 'year': 1901}]


# Caching web pages

Since scraping can generate a large number of requests, it is best to cache the results.  The Python package `requests-cache` makes this easy.  It has useful options such as choosing the cache backend (in memory or a database) and setting an expiration time for the cache.

In [17]:
import requests
import requests_cache

# Install a cache named 'nobel_pages' on the SQLite backend for the requests
# library. NOTE(review): expire_after=7200 is presumably seconds (two hours)
# -- confirm against the installed requests-cache version.
requests_cache.install_cache('nobel_pages', backend='sqlite', expire_after=7200)
#use requests as usual...

## Follow link and scrape nationality from bio page

In [18]:
def get_winner_nationality(w, timeout=10):
    '''Scrape the nationality from the winner's wikipedia page.

    w -- winner dict; 'link' (site-relative URL of the bio page) and
         'name' are read from it.
    timeout -- seconds to wait on the server before giving up; handed
               straight to requests.get.

    Returns a dict holding 'name' and, when the page's infobox has a row
    whose header text is exactly 'Nationality', a 'Nationality' entry with
    the text of that row's first <td>.
    '''
    # reuse BASE_URL so the site root is defined in one place only
    response = requests.get(BASE_URL + w['link'], headers=HEADERS,
                            timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    person_data = {'name': w['name']}
    for tr in soup.select('table.infobox tr'): #remember, this is CSS-style selectors
        header = tr.select_one('th')
        # rows without a header cell, or about another attribute, are skipped
        if header is None or header.text != 'Nationality':
            continue
        value = tr.select_one('td')
        if value is not None:
            person_data['Nationality'] = value.text
    return person_data

# test the get_winner_nationality
# look up only the first 50 winners
wdata = [get_winner_nationality(winner) for winner in winners[:50]]
# keep the records that came back without a (non-empty) 'Nationality'
missing_nationality = [person for person in wdata
                       if not person.get('Nationality')]
print(missing_nationality)

[{'name': u'\xc9lie Ducommun'}, {'name': u'Charles Albert Gobat'}, {'name': u'Marie Curie'}, {'name': u'Niels Ryberg Finsen'}, {'name': u'Ivan Pavlov'}, {'name': u'Institut de Droit International'}, {'name': u'Philipp Lenard'}, {'name': u'Bertha von Suttner'}, {'name': u'Santiago Ram\xf3n y Cajal'}, {'name': u'Theodore Roosevelt'}, {'name': u'Ernesto Teodoro Moneta'}, {'name': u'Louis Renault'}, {'name': u'Paul Ehrlich'}, {'name': u'Rudolf Christoph Eucken'}, {'name': u'Klas Pontus Arnoldson'}]
