See https://www.gapminder.org/videos/dont-panic-the-facts-about-population/

In [None]:
import numpy as np
from datascience import *

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the population data set and preview its first three rows.
population = Table.read_table('../../data/population.csv')
population.show(3)

In [None]:
# Bangladesh's population for the years 1970 through 2015.
# 'bgd' is the Alpha-3 country code for Bangladesh.

b_pop = (
    population
    .where('geo', 'bgd')
    .where('time', are.between_or_equal_to(1970, 2015))
    .drop('geo')
)
b_pop

In [None]:
# Load the life-expectancy, child-mortality and fertility tables.
life_expectancy = Table.read_table('../../data/life_expectancy.csv')
# The third column of the child-mortality table gets a descriptive label.
child_mortality = (
    Table.read_table('../../data/child_mortality.csv')
    .relabeled(2, 'child_mortality_under_5_per_1000_born')
)
fertility = Table.read_table('../../data/fertility.csv')

In [None]:
# Print the first row of each indicator table to inspect its columns.
print(life_expectancy.row(0))
print(child_mortality.row(0))
print(fertility.row(0))

In [None]:
def fertility_over_time(country, start):
    """Return a two-column table of a country's total fertility rate per year.

    country -- value matched against the 'geo' column (an Alpha-3 code such as 'bgd').
    start   -- earliest year to keep; rows whose 'time' is >= start are included.

    The columns of the result are labeled 'Year' and 'Children per woman'.
    """
    matching_rows = fertility.where('geo', are.equal_to(country))
    from_start = matching_rows.where('time', are.above_or_equal_to(start))
    # Drop the first column (presumably 'geo', the column filtered on above),
    # leaving only the year and the fertility rate.
    yearly = from_start.drop(0)
    return (
        yearly
        .relabeled('time', 'Year')
        .relabeled('children_per_woman_total_fertility', 'Children per woman')
    )

In [None]:
# Display Bangladesh's fertility table from 1970 onward.
fertility_over_time('bgd', 1970)

In [None]:
# Line plot of Bangladesh's fertility rate against the year, from 1970 onward.
bangladesh_code = 'bgd'
fertility_over_time(bangladesh_code, 1970).plot('Year', 'Children per woman')

In [None]:
# Q 1.3
# Pair Bangladesh's fertility rate with its child mortality, year by year,
# starting in 1970, then print the joined table and draw a scatter plot.
bgd_fertility = fertility_over_time('bgd', 1970)

bgd_child_mortality = (
    child_mortality
    .where('geo', 'bgd')
    .where('time', are.above_or_equal_to(1970))
    .drop(0)
    .relabeled(0, 'Year')
    .relabeled(1, 'Child deaths per 1000 born')
)

post_1969_fertility_and_child_mortality = bgd_fertility.join('Year', bgd_child_mortality)
print(post_1969_fertility_and_child_mortality)
post_1969_fertility_and_child_mortality.scatter('Children per woman', 'Child deaths per 1000 born')

In [None]:
# Patch: store population_total as 64-bit integers so that adding it up
# across countries cannot overflow a 32-bit integer.
x = population.column(2).astype('int64')
population = population.drop(2).with_column('population_total', x)

# Total population per year (summed over all countries), plotted against the
# year for the rows selected by are.between(1800, 2006).
(
    population
    .where('time', are.between(1800, 2006))
    .drop('geo')
    .group('time', sum)
    .plot(0)
)

In [None]:
# Q 2.1

# Restrict the population table to the 50 countries that had the largest
# populations in 2010, keeping only the years after 1959. Working with just
# these 50 countries makes the later plots run faster.
big_50 = (
    population
    .where('time', 2010)
    .sort(2, descending=True)
    .take(np.arange(50))
    .column('geo')
)
population_of_big_50 = (
    population
    .where('time', are.above(1959))
    .where('geo', are.contained_in(big_50))
)


# stats_for_year takes a year and returns a table of statistics with four
# columns: geo, population_total, children_per_woman_total_fertility, and
# child_mortality_under_5_per_1000_born.
# Each row holds one Alpha-3 country code plus that country's population,
# fertility rate, and child mortality for the year, taken from the
# population, fertility and child_mortality tables.
# Only rows for which all three statistics are available are included.
def stats_for_year(year):
    """Return one row of statistics per country for ``year``.

    Filters population_of_big_50, fertility and child_mortality down to the
    given year, drops their 'time' columns, and joins the results on 'geo'.
    """
    pop_rows, fertility_rows, mortality_rows = (
        table.where('time', year).drop('time')
        for table in (population_of_big_50, fertility, child_mortality)
    )
    return pop_rows.join('geo', fertility_rows).join('geo', mortality_rows)

# Display the joined statistics table for 2015.
stats_for_year(2015)

In [None]:
def pop_for_year(year):
    """Return the total population in ``year`` of the countries in stats_for_year(year).

    The total is accumulated as a 64-bit integer regardless of the dtype in
    which the 'population_total' column is stored.
    """
    populations = stats_for_year(year).column('population_total')
    # Built-in sum() adds NumPy scalars one at a time in the column's own
    # dtype, so a 32-bit column can overflow; np.sum with an explicit int64
    # accumulator avoids that and performs the reduction in native code.
    return np.sum(populations, dtype=np.int64)

# Display the total computed by pop_for_year for 1960.
pop_for_year(1960)

In [None]:
# Preview the decade years 1960, 1970, ..., 2010 (the stop value 2011 is exclusive).
np.arange(1960, 2011, 10)

In [None]:
# Q 2.2
# One row per decade year from 1960 to 2010, with the population total that
# pop_for_year reports for that year.
decades = Table().with_column('decade', np.arange(1960, 2011, 10))
pop_by_decade = decades.with_column(
    'population', decades.apply(pop_for_year, 'decade'),
)
# Apply NumberFormatter to the population column for display.
pop_by_decade.set_format('population', NumberFormatter)
pop_by_decade


In [None]:
# Keep only the countries whose code appears in the population table's 'geo'
# column, then show the code, name and region of the first one.
countries = (
    Table.read_table('../../data/countries.csv')
    .where('country', are.contained_in(population.group('geo').column(0)))
)
countries.select('country', 'name', 'world_6region').row(0)

In [None]:
# Q 2.3
# Attach each country's region to the 1960 statistics and group the rows by
# region, labeling the grouping column 'region'.

(
    stats_for_year(1960)
    .join('geo', countries, 'country')
    .group('world_6region')
    .relabeled(0, 'region')
)

In [None]:
from functools import lru_cache as cache

@cache(None)
def stats_relabeled(year):
    """Return stats_for_year(year) with plot-friendly labels, cached per year.

    Columns 2 and 3 are relabeled 'Children per woman' and
    'Child deaths per 1000 born'. Results are memoized without a size limit.
    """
    stats = stats_for_year(year)
    with_fertility_label = stats.relabeled(2, 'Children per woman')
    return with_fertility_label.relabeled(3, 'Child deaths per 1000 born')

def fertility_vs_child_mortality(year):
    """Plot fertility against child mortality for ``year``, one dot per country.

    Dots are sized by the population column and colored by world region. The
    axis limits are fixed, and the year is used as the plot title.
    """
    regions = countries.select('country', 'world_6region')
    with_region = stats_relabeled(year).join('geo', regions, 'country')
    with_region.scatter(
        'Children per woman',
        'Child deaths per 1000 born',
        sizes='population_total',
        colors='world_6region',
        s=500,
    )
    plt.xlim(0, 10)
    plt.ylim(-50, 500)
    plt.title(year)

# Draw the fertility vs. child mortality scatter diagram for 2010.
fertility_vs_child_mortality(2010)

In [None]:
import ipywidgets as widgets

# Pre-compute the relabeled statistics for every year the slider can select,
# so that moving the slider redraws from the cache.
# Iterate with built-in range() rather than np.arange(): lru_cache stores a
# lone plain-int argument under the int itself but any other argument type
# under the argument tuple, so entries warmed with NumPy integer years would
# never be hit by the plain Python ints that IntSlider passes.
for year in range(1960, 2016):
    stats_relabeled(year)

_ = widgets.interact(fertility_vs_child_mortality,
                     year=widgets.IntSlider(min=1960, max=2015, value=1960))