# Gapminder data choropleths

In [1]:
import plotly.plotly as py
import plotly.tools as tls
import pandas as pd
import json
import urllib2
import re

The most famous gapminder data is featured in [Hans Rosling](https://en.wikipedia.org/wiki/Hans_Rosling)'s bubble charts (see a plotly version [here](https://plot.ly/~etpinard/191/fig-31b-hans-roslings-bubble-chart-for-the-year-2007/)).

In this notebook, we'll explore the same dataset using plotly choropleths.

In [2]:
# Gapminder five-year data, tab-separated (big thanks to Jennifer Bryan!)
gapminder_url = 'http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderDataFiveYear.txt'
df_full = pd.read_csv(gapminder_url, sep='\t')

# Preview the first few rows
df_full.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106


Select a year and truncate the dataframe

In [3]:
# Year to map; df_full['year'].unique() lists the other available years
the_year = 2007

# Boolean mask flagging the rows recorded in 'the_year'
i_year = df_full['year'].eq(the_year)

# Keep only the rows corresponding to 'the_year'
df = df_full.loc[i_year]

Plotly uses [ISO-3](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3) identifier codes to draw choropleths.

The JSON below will help us convert the country names found in the gapminder dataset to ISO-3 codes.

In [4]:
# Lookup table used to map country names to ISO-3 codes: a list of
# records, each holding an 'iso3' code and the 'regex' that identifies it.
iso3_url = 'https://raw.githubusercontent.com/etpinard/country-iso3/master/get-iso3.json'

# Close the HTTP response explicitly once the JSON is parsed, so the
# connection is not left open until garbage collection.
response = urllib2.urlopen(iso3_url)
try:
    get_iso3 = json.load(response)
finally:
    response.close()

get_iso3

[{u'iso3': u'AFG', u'regex': u'afghan'},
 {u'iso3': u'ALA', u'regex': u'\\b(a|\xe5)land'},
 {u'iso3': u'ALB', u'regex': u'albania'},
 {u'iso3': u'DZA', u'regex': u'algeria'},
 {u'iso3': u'ASM', u'regex': u'^(?=.*americ).*samoa'},
 {u'iso3': u'AND', u'regex': u'andorra'},
 {u'iso3': u'AGO', u'regex': u'angola'},
 {u'iso3': u'AIA', u'regex': u'anguill?a'},
 {u'iso3': u'ATA', u'regex': u'antarctica'},
 {u'iso3': u'ATG', u'regex': u'antigua'},
 {u'iso3': u'ARG', u'regex': u'argentin'},
 {u'iso3': u'ARM', u'regex': u'armenia'},
 {u'iso3': u'ABW', u'regex': u'^(?!.*bonaire).*\\baruba'},
 {u'iso3': u'AUS', u'regex': u'australia'},
 {u'iso3': u'AUT', u'regex': u'^(?!.*hungary).*austria|\\baust.*\\bemp'},
 {u'iso3': u'AZE', u'regex': u'azerbaijan'},
 {u'iso3': u'BHS', u'regex': u'bahamas'},
 {u'iso3': u'BHR', u'regex': u'bahrain'},
 {u'iso3': u'BGD', u'regex': u'bangladesh|^(?=.*east).*paki?stan'},
 {u'iso3': u'BRB', u'regex': u'barbados'},
 {u'iso3': u'BLR', u'regex': u'belarus|byelo'},
 {u'is

Loop through all the countries in the dataset. For each country, try each of the regular expressions of the above JSON. When a match is found, append the country's values to the data lists.

In [5]:
# Parallel lists: one entry per country that matched a regex
locations = []  # ISO-3 codes
pops = []       # population, in millions
lExps = []      # life expectancy
gdps = []       # GDP per capita

# keep track of country names that do not match any regex
countries_no_match = []

for country, pop, lExp, gdp in zip(df['country'], df['pop'], df['lifeExp'], df['gdpPercap']):
    # Lower-case once per country; it is reused for every regex tried
    country_lower = country.lower()
    for item in get_iso3:
        # re.search returns None (falsy) when the pattern does not match
        if re.search(item['regex'], country_lower):
            locations.append(item['iso3'])
            pops.append(pop / 1e6)
            lExps.append(lExp)
            gdps.append(gdp)
            # first matching regex wins
            break
    else:
        # for/else: reached only when the inner loop finished without a
        # break, i.e. no regex matched this country
        print(country_lower)
        countries_no_match.append(country)

In [6]:
# Sizes of the matched list, the input column and the unmatched list.
# The first two being equal (and the last being 0) means that
# all countries have found a match!
tuple(len(seq) for seq in (locations, df['country'], countries_no_match))

(142, 142, 0)

Define a plot function

In [10]:
def plot(z, title, units):
    """Send a world choropleth of `z` to plotly and print what `py.plot` returns.

    Relies on the notebook-level names `locations` (used as the trace
    locations), `the_year` (appended to the figure title) and `py`.

    Parameters
    ----------
    z : values passed as the trace's `z`, aligned with `locations`.
    title : figure title prefix; also used, lower-cased and with spaces
        replaced by dashes, as the plotly filename.
    units : colorbar unit label. '$' is attached as a tick prefix; any
        other value is attached as a tick suffix.
    """
    unit_label = ' ' + units

    # Only the last colorbar tick carries the unit label
    if units != '$':
        colorbar = {'ticksuffix': unit_label, 'showticksuffix': 'last'}
    else:
        colorbar = {'tickprefix': unit_label, 'showtickprefix': 'last'}

    trace = {
        'type': 'choropleth',
        'locations': locations,
        'z': z,
        'colorbar': colorbar,
    }

    layout = {
        'title': title + ' by country in ' + str(the_year),
        'titlefont': {'size': 22},
        'geo': {'projection': {'type': 'kavrayskiy7'}},
        'width': 1000,
        'height': 600,
    }

    figure = {'data': [trace], 'layout': layout}
    filename = title.lower().replace(' ', '-')

    url = py.plot(figure, validate=False, filename=filename, auto_open=False)
    print(url)

Use the plot function for each of the three dependent variables

In [11]:
# One choropleth per dependent variable: (values, title, colorbar units)
for z, title, units in [
    (pops, 'World population', 'million'),
    (lExps, 'Life expectancy', 'year'),
    (gdps, 'GDP per capita', '$'),
]:
    plot(z, title, units)

https://plot.ly/~etpinard/4250
https://plot.ly/~etpinard/4252
https://plot.ly/~etpinard/4254


In [12]:
# Embed the first figure printed by the plot calls above (World population)
tls.embed('https://plot.ly/~etpinard/4250')

We note that Russia is not part of the dataset.

In [13]:
# Embed the second figure printed by the plot calls above (Life expectancy)
tls.embed('https://plot.ly/~etpinard/4252')

In [14]:
# Embed the third figure printed by the plot calls above (GDP per capita)
tls.embed('https://plot.ly/~etpinard/4254')

In [19]:
# Inject CSS styling in the NB
from IPython.display import display, HTML

# Read the stylesheet inside a `with` block so the file handle is closed
# as soon as its contents have been loaded.
with open('../_custom.css') as css_file:
    custom_css = css_file.read()

display(HTML(custom_css))