# Scraping Nobel Prize Wikipedia Page with BeautifulSoup
We are using BeautifulSoup to *scrape* the [Nobel Prize Winners Wikipedia Page](https://en.wikipedia.org/wiki/List_of_Nobel_laureates)

In [5]:
# install necessary libraries if absent
!pip install beautifulsoup4
!pip install --upgrade requests-cache



In [7]:
from bs4 import BeautifulSoup
import requests
import requests_cache

# Set cache for two hours
requests_cache.install_cache('nobel_pages', backend='sqlite', expire_after=7200)

In [8]:
BASE_URL = 'http://en.wikipedia.org'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

In [9]:
def get_Nobel_soup():
    """Get the BeautifulSoup Tree Hierarchy of the Webpage"""
    response = requests.get(BASE_URL + '/wiki/List_of_Nobel_laureates', headers=HEADERS)
    return BeautifulSoup(response.content, 'lxml')

In [10]:
soup = get_Nobel_soup()

In [11]:
table = soup.select_one('table.sortable.wikitable') # Select the table with the data

In [12]:
def get_column_titles(table):
    """Get the Nobel Categories from the table header"""
    cols = []
    for th in table.select_one('tr').select('th')[1:]: # Avoid the first column with the Year
        link = th.select_one('a')
        if link:
            cols.append({'name': link.text,
                        'href': link.attrs['href']})
        else:
            cols.append({'name': th.text, 'href': None})
    return cols

In [13]:
get_column_titles(table)

[{'name': 'Physics', 'href': '/wiki/List_of_Nobel_laureates_in_Physics'},
 {'name': 'Chemistry', 'href': '/wiki/List_of_Nobel_laureates_in_Chemistry'},
 {'name': 'Physiologyor Medicine',
  'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine'},
 {'name': 'Literature', 'href': '/wiki/List_of_Nobel_laureates_in_Literature'},
 {'name': 'Peace', 'href': '/wiki/List_of_Nobel_Peace_Prize_laureates'},
 {'name': 'Economics', 'href': '/wiki/List_of_Nobel_laureates_in_Economics'}]

In [14]:
def get_Nobel_winners(table):
    """Get the Nobel Winners"""
    cols = get_column_titles(table)
    winners = []
    for row in table.select('tr')[1:-1]:
        year = int(row.select_one('td').text) # Get 1st <td>
        for i, td in enumerate(row.select('td')[1:]):
            for winner in td.select('a'):
                href = winner.attrs['href']
                if not href.startswith('#endnote'):
                  winners.append({
                      'year': year,
                      'category': cols[i]['name'],
                      'name': winner.text,
                      'link': winner.attrs['href']
                  })
    return winners

In [33]:
winners = get_Nobel_winners(table)

In [34]:
def get_winner_nationality(winner):
    """Scapre biographic data from the winner's wikipedia page"""
    data = requests.get(BASE_URL + winner['link'], headers=HEADERS)
    soup = BeautifulSoup(data.content, 'lxml')
    person_data = {'name': winner['name']}
    attr_rows = soup.select('table.infobox tr')
    for tr in attr_rows:
        try:
            attribute = tr.select_one('th').text
            if attribute == 'Nationality':
                person_data[attribute] = tr.select_one('td').text
        except AttributeError:
            pass
    return person_data

In [35]:
winner_data = []
# test first 50 winners
for winner in winners[:50]:
    winner_data.append(get_winner_nationality(winner))
missing_nationality = []
for winner in winner_data:
    # if mising 'Nationality,' add to list
    if not winner.get('Nationality'):
        missing_nationality.append(winner)
winner_data

[{'name': 'Wilhelm Röntgen', 'Nationality': 'German[1]'},
 {'name': "Jacobus Henricus van 't Hoff", 'Nationality': 'Dutch'},
 {'name': 'Emil Adolf von Behring', 'Nationality': 'German'},
 {'name': 'Sully Prudhomme', 'Nationality': 'French'},
 {'name': 'Henry Dunant', 'Nationality': 'Swiss'},
 {'name': 'Frédéric Passy', 'Nationality': 'French'},
 {'name': 'Hendrik Lorentz', 'Nationality': 'Dutch'},
 {'name': 'Pieter Zeeman', 'Nationality': 'Netherlands'},
 {'name': 'Hermann Emil Fischer', 'Nationality': 'Germany'},
 {'name': 'Ronald Ross', 'Nationality': 'British'},
 {'name': 'Theodor Mommsen', 'Nationality': 'German'},
 {'name': 'Élie Ducommun'},
 {'name': 'Charles Albert Gobat'},
 {'name': 'Henri Becquerel', 'Nationality': 'French'},
 {'name': 'Pierre Curie'},
 {'name': 'Marie Curie'},
 {'name': 'Svante Arrhenius', 'Nationality': 'Swedish'},
 {'name': 'Niels Ryberg Finsen'},
 {'name': 'Bjørnstjerne Bjørnson', 'Nationality': 'Norwegian'},
 {'name': 'Randal Cremer', 'Nationality': 'Brit

In [36]:
missing_nationality

[{'name': 'Élie Ducommun'},
 {'name': 'Charles Albert Gobat'},
 {'name': 'Pierre Curie'},
 {'name': 'Marie Curie'},
 {'name': 'Niels Ryberg Finsen'},
 {'name': 'Ivan Pavlov'},
 {'name': 'Institut de Droit International'},
 {'name': 'Philipp Lenard'},
 {'name': 'Bertha von Suttner'},
 {'name': 'Santiago Ramón y Cajal'},
 {'name': 'Theodore Roosevelt'},
 {'name': 'Ernesto Teodoro Moneta'},
 {'name': 'Louis Renault'},
 {'name': 'Paul Ehrlich'},
 {'name': 'Rudolf Christoph Eucken'},
 {'name': 'Klas Pontus Arnoldson'}]