In [2]:
import cs109style
cs109style.customize_mpl()
cs109style.customize_css()

# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
#from pattern import web



Setting custom matplotlib visual style
Setting custom CSS for the IPython Notebook


## Fetching population data from Wikipedia

In this example we will fetch data about countries and their population from Wikipedia.

http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables covering individual countries and subcontinents for different ranges of years. We will combine the data for all countries and all years in a single pandas DataFrame and visualize the change in population for different countries.

### We will go through the following steps:
* fetching html with embedded data
* parsing html to extract the data
* collecting the data in a pandas DataFrame
* displaying the data

To give you some starting points for your homework, we will also show the different sub-steps that can be taken to reach the presented solution.

## Fetching the Wikipedia site

In [4]:
url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'
# Use a timeout so an unresponsive server cannot hang the notebook, and raise on
# HTTP errors so that an error page is never parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
website_html = response.text
# print(website_html)

## Parsing html data

In [18]:
def get_population_html_tables(html):
    """Parse html and return html tables of wikipedia population data.

    Parameters
    ----------
    html : str
        Markup of the downloaded page.

    Returns
    -------
    list
        Every ``<table>`` element whose ``class`` attribute is exactly
        ``"sortable wikitable"``, in document order.
    """
    # Name the parser explicitly: without it BeautifulSoup warns and silently
    # picks whichever parser happens to be installed, so results can differ
    # between machines. 'html.parser' ships with Python.
    soup = BeautifulSoup(html, 'html.parser')

    tbls = soup.find_all('table', {'class': 'sortable wikitable'})

    return tbls

# Parse the downloaded page; the resulting list of tables is reused by the cells below.
tables = get_population_html_tables(website_html)

#print ("table length: %d" %len(tables))
# Sanity check: show the text of the first header cell of the first table.
print(tables[0].find('th').get_text())

Country (or dependent territory)


In [21]:
def table_type(tbl):
    """Label a table by the text of its first header (``th``) cell."""
    first_header_cell = tbl.find('th')
    label = first_header_cell.get_text()
    return label

# Variant 1: group with a plain dict; setdefault creates the list the first
# time a table type is seen.
table_by_type = {}
for tbl in tables:
    typ = table_type(tbl)
    table_by_type.setdefault(typ, list()).append(tbl)


# Variant 2: group the tables by type with a defaultdict, which inserts its
# default value (here an empty list) whenever a new key is accessed.
tables_by_type = defaultdict(list)
for tbl in tables:
    typ = table_type(tbl)
    tables_by_type[typ].append(tbl)

print(tables_by_type)

defaultdict(<class 'list'>, {'Country (or dependent territory)': [<table class="sortable wikitable" style="text-align: right">
<tbody><tr>
<th>Country (or dependent territory)</th>
<th>1950</th>
<th>1955</th>
<th>%</th>
<th>1960</th>
<th>%</th>
<th>1965</th>
<th>%</th>
<th>1970</th>
<th>%</th>
<th>1975</th>
<th>%</th>
<th>1980</th>
<th>%
</th></tr>
<tr>
<td align="left"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a></td>
<td>8,151</td>
<td>8,892</td>
<td>1.76</td>
<td>9,830</td>

## Extracting data and filling it into a dictionary

In [37]:
# Every row (<tr>) of the first table; row 0 is the header row.
headers = tables[0].find_all('tr')
first_header = headers[0]
th_s = first_header.find_all('th')

# Header cells that hold a year: remember both the year and the column position,
# which skips the country-name column and the '%' columns.
years = [int(val.get_text()) for val in th_s if val.get_text().isnumeric()]
year_indices = [idx for idx, val in enumerate(th_s) if val.get_text().isnumeric()]

# Reuse the rows parsed above instead of searching the table a second time.
rows = headers[1:]

for row in rows:
    tds = row.find_all('td')
    if not tds:
        # Row without data cells (presumably a repeated header row) - nothing to print.
        continue
    # Prefer the link text; fall back to the whole cell when the name is not linked.
    name_cell = tds[0].find('a')
    if name_cell is None:
        name_cell = tds[0]
    country_name = name_cell.get_text().strip()
    print(country_name)

Afghanistan
Albania
Algeria
American Samoa
Andorra
Angola
Anguilla
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
British Virgin Islands
Brunei
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Cape Verde
Cayman Islands
Central African Republic
Chad
Chile
China
Colombia
Comoros
Cook Islands
Costa Rica
Croatia
Cuba
Curaçao
Cyprus
Czech Republic
Democratic Republic of the Congo
Denmark
Djibouti
Dominica
Dominican Republic
Timor-Leste
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Ethiopia
Faroe Islands
Federated States of Micronesia
Fiji
Finland
France
French Polynesia
Gabon
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Isle of Man
Israel
Italy
Ivory Coast
Jamaica
Japan
Jersey
Jordan


In [None]:
def get_countries_population(tables):
    """Extract population data for countries from all tables and store it in dictionary.

    Parameters
    ----------
    tables : iterable
        Table elements exposing ``find_all``/``find``/``get_text`` (e.g. the
        BeautifulSoup tags returned by ``get_population_html_tables``).

    Returns
    -------
    collections.defaultdict
        ``{country_name: {year: population}}``. Years are ints taken from the
        header row; populations are floats (see ``_parse_population``). Data
        from several tables for the same country is merged into one inner dict.
    """
    result = defaultdict(dict)

    for tbl in tables:
        table_rows = tbl.find_all('tr')
        if not table_rows:
            continue

        # 1. step: find the header cells that hold a year and remember their
        # column position. The position is needed because year columns are
        # interleaved with the country-name column and '%' columns.
        header_cells = table_rows[0].find_all('th')
        year_columns = []
        for idx, cell in enumerate(header_cells):
            text = cell.get_text().strip()  # the last header cell may end in '\n'
            if text.isdecimal():
                year_columns.append((idx, int(text)))

        # 2. step: read every data row and store the value found at each year column.
        for row in table_rows[1:]:
            cells = row.find_all('td')
            if not cells:
                # Row without data cells (e.g. made only of <th>): skip it.
                continue

            # Prefer the link text; fall back to the whole cell if the name is not linked.
            link = cells[0].find('a')
            name_cell = link if link is not None else cells[0]
            country_name = name_cell.get_text().strip()
            if not country_name:
                continue

            for idx, year in year_columns:
                if idx >= len(cells):
                    continue  # short row: this year column is missing
                population = _parse_population(cells[idx].get_text())
                if population is not None:
                    result[country_name][year] = population

    return result


def _parse_population(text):
    """Convert cell text such as '8,151' to a float, or None if it is not a number.

    The value is divided by 1000: the table figures appear to be given in
    thousands, while the plots in this notebook label the axis
    "# people (million)" - TODO confirm the unit against the source page.
    """
    cleaned = text.strip().replace(',', '')
    try:
        return float(cleaned) / 1000.0
    except ValueError:
        return None


# Only the per-country tables are processed; they are looked up by the text of their first header cell.
result = get_countries_population(tables_by_type['Country (or dependent territory)'])
print(result)

## Creating a dataframe from a dictionary

In [None]:
# create dataframe: the outer dict keys become the row index, the inner dict keys the columns
df = pd.DataFrame.from_dict(result, orient='index')
# sort the columns based on year; DataFrame.sort() no longer exists in pandas,
# and sort_index returns a new frame instead of mutating in place
df = df.sort_index(axis=1)
print(df)


## Some data accessing functions for a pandas DataFrame

In [None]:
# The removed `.ix` indexer is replaced by its explicit equivalents:
# `.iloc` for positions, `.loc` for labels. `print` is called as a function,
# as in the earlier cells of this notebook.

# positional slice: first two rows, first two columns
subtable = df.iloc[0:2, 0:2]
print("subtable")
print(subtable)
print("")

# a column is selected by its label (the year)
column = df[1955]
print("column")
print(column)
print("")

row = df.iloc[0] #row 0
print("row")
print(row)
print("")

rows = df.iloc[:2] #rows 0,1
print("rows")
print(rows)
print("")

# row by position, column by label
element = df[1955].iloc[0] #element
print("element")
print(element)
print("")

# max along column
print("max")
print(df[1950].max())
print("")

# axes
print("axes")
print(df.axes)
print("")

row = df.iloc[0]
print("row info")
print(row.name)
print(row.index)
print("")

countries =  df.index
print("countries")
print(countries)
print("")

# label-based row lookup
print("Austria")
print(df.loc['Austria'])

## Plotting population of 4 countries

In [None]:
plotCountries = ['Austria', 'Germany', 'United States', 'France']

for country in plotCountries:
    # label-based row lookup (`.ix` has been removed from pandas)
    row = df.loc[country]
    plt.plot(row.index, row, label=row.name)

plt.ylim(ymin=0) # start y axis at 0

plt.xticks(rotation=70)
plt.legend(loc='best')
plt.xlabel("Year")
plt.ylabel("# people (million)")
plt.title("Population of countries");  # trailing ';' hides the Text(...) repr in the cell output

## Plot 5 most populous countries from 2010 and 2050

In [None]:
def plot_populous(df, year, n=5):
    """Plot the population over time of the ``n`` most populous rows in ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per country, one column per year.
    year : int
        Column label used to rank the rows; also shown in the plot title.
    n : int, optional
        Number of top-ranked rows to draw. Defaults to 5, the value that was
        previously hard-coded. Fewer lines are drawn if ``df`` has fewer rows.

    Raises
    ------
    KeyError
        If ``year`` is not a column of ``df``.
    """
    # sort table depending on data value in year column
    # (DataFrame.sort was removed from pandas; sort_values is its replacement)
    df_by_year = df.sort_values(year, ascending=False)

    plt.figure()
    # head(n) never runs past the end of the frame, unlike indexing range(5) by position
    for _, row in df_by_year.head(n).iterrows():
        plt.plot(row.index, row, label=row.name)

    plt.ylim(ymin=0)

    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Year")
    plt.ylabel("# people (million)")
    plt.title("Most populous countries in %d" % year)

# Each call opens its own figure, so this produces one plot per reference year.
plot_populous(df, 2010)
plot_populous(df, 2050)