In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import matplotlib as mpl

# Render matplotlib figures at a higher resolution
mpl.rcParams.update({'figure.dpi': 150})

In [3]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl
# Set up plotly's offline notebook rendering for this session.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN
# rather than embedding it in the notebook -- confirm this is intended for offline use.
init_notebook_mode(connected=True)

In [4]:
# Personal plotting function for pretty plots
def clean_plot(leg=True, grid=None, font=None, ax=None):
    """Apply a minimal, light-grey style to a matplotlib Axes.

    Hides the top and right spines, keeps ticks on the left/bottom only and
    recolours the remaining spines and tick marks light grey.

    Parameters
    ----------
    leg : bool, default True
        When true, add a frameless legend anchored outside the axes at
        ``bbox_to_anchor=(1, 1)``.
    grid : str or None, default None
        Forwarded as the ``axis`` argument of ``Axes.grid`` to draw thin
        light-grey grid lines. ``None`` leaves the grid untouched.
    font : dict or None, default None
        Optional mapping with ``'family'`` and/or ``'color'`` keys, applied to
        the title, both axis labels and every tick label.
    ax : matplotlib Axes or None, default None
        Axes to style. Falls back to the current axes (``plt.gca()``) so
        existing ``clean_plot()`` calls keep working.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()

    # Drop the box: only the left and bottom spines stay visible.
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    axis_color = 'lightgrey'
    for side in ('bottom', 'left'):
        ax.spines[side].set_color(axis_color)
    ax.tick_params(axis='both', color=axis_color)

    if leg:
        ax.legend(frameon=False, loc='upper left', bbox_to_anchor=(1, 1))

    if grid is not None:
        # Use the Axes method so the grid lands on `ax`, not on whatever the
        # pyplot state machine currently considers the active axes.
        ax.grid(color='lightgrey', axis=grid, linestyle='-', linewidth=.5)

    if font is not None:
        text_items = ([ax.title, ax.xaxis.label, ax.yaxis.label]
                      + list(ax.get_xticklabels())
                      + list(ax.get_yticklabels()))
        for item in text_items:
            # Each key is optional so callers can restyle only family or colour.
            if 'family' in font:
                item.set_fontfamily(font['family'])
            if 'color' in font:
                item.set_color(font['color'])

## Problem 1

GapMinder Test

Score: 31%

Many things surprised me about my 31% score. One answer that surprised me in particular was that the proportion of people living in extreme poverty has halved over the past 20 years. I may have misunderstood the term "extreme poverty". I know that, statistically, less wealthy populations tend to have more children and would therefore be expanding faster. However, if the definition of "extreme poverty" relates to access to electricity, clean water, and medicine, then the proportional decline makes more sense. In general, I took the Gapminder test with a pessimistic view, assuming the worst for many of the questions. The world seems to be improving more than I had assumed in many of the areas the questions covered.

Question Restatement: What is the income of the world population?

In [6]:
# Function taken from https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy
def weight_array(ar, weights):
    """Expand ``ar`` by repeating each value ``int(weight)`` times.

    Parameters
    ----------
    ar : iterable
        Values to repeat.
    weights : iterable
        Repeat count for the value at the same position. Each weight is
        truncated with ``int()``; zero or negative counts contribute nothing.
        Iteration stops at the shorter of the two inputs.

    Returns
    -------
    list
        The expanded values in input order. Its length is the sum of the
        truncated positive weights, so very large weights produce very large
        lists.
    """
    weighted = []
    for value, weight in zip(ar, weights):
        # extend() appends in place; the previous `weighted = weighted + [...]`
        # copied the whole list on every iteration.
        weighted.extend([value] * int(weight))
    return weighted

# Gapminder path head
gp = 'ddf--gapminder--systema_globalis'

income_col = 'income_per_person_gdppercapita_ppp_inflation_adjusted'

adj_income = pd.read_csv(f'{gp}/ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv')

popfm = pd.read_csv(f'{gp}/ddf--datapoints--female_population_with_projections--by--geo--time.csv')
popm = pd.read_csv(f'{gp}/ddf--datapoints--male_population_with_projections--by--geo--time.csv')

# Total population per geo/year = female + male.
pop = popfm.merge(popm, how='left', on=['geo', 'time'])
pop['total'] = pop['female_population_with_projections'] + pop['male_population_with_projections']

adj_income = adj_income[adj_income.time <= 2019]

adj_income = adj_income.merge(pop, how='left', on=['geo', 'time'])

# Get most recent year up to 2019 (don't include projections)
adj_income = adj_income.sort_values(['geo', 'time'], ascending=False).drop_duplicates('geo').reset_index(drop=True)

# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered one (avoids SettingWithCopyWarning).
adj_income = adj_income[adj_income.total.notnull()].copy()

# Share of the (covered) world population living in each geo.
adj_income['pop_perc'] = adj_income['total'] / adj_income['total'].sum()

# Population-weighted income percentiles. Computed from the cumulative
# population share of geos ordered by income, rather than via weight_array,
# which would have to build a list with one entry per person.
by_income = adj_income.sort_values(income_col)
cum_share = by_income['pop_perc'].cumsum().to_numpy()
p_25 = by_income[income_col].iloc[np.searchsorted(cum_share, 0.25)]
p_75 = by_income[income_col].iloc[np.searchsorted(cum_share, 0.75)]

In [None]:
# NOTE: p_25 (the 25th income percentile) must already be defined before this cell runs.
ax = sns.scatterplot(y='total', x='income_per_person_gdppercapita_ppp_inflation_adjusted', data=adj_income)
ax.set_yscale('log')
ax.set_xscale('log')

# pyplot has no `vline`; Axes.vlines draws a vertical line whose ymin/ymax are
# given in data coordinates, which is what the population bounds below are.
ax.vlines(p_25, ymin=adj_income['total'].min(), ymax=adj_income['total'].max());

In [32]:
# Sanity check: rows that still lack a population total. Expected to be empty,
# because rows with a null total are filtered out when adj_income is built.
adj_income[adj_income['total'].isnull()]

Unnamed: 0,geo,time,income_per_person_gdppercapita_ppp_inflation_adjusted,female_population_with_projections,male_population_with_projections,total,pop_perc


## Problem 2

In [34]:
# Country lookup table (names and region labels) and the GDP datapoints.
cont_map = pd.read_csv(f'{gp}/ddf--entities--geo--country.csv')
gdp = pd.read_csv(f'{gp}/ddf--datapoints--gdppercapita_us_inflation_adjusted--by--geo--time.csv')
gdp = gdp.rename(columns={'gdppercapita_us_inflation_adjusted' : 'gdp'})

# Attach the country name and region columns, then switch to display-friendly labels.
region_cols = ['country', 'name', 'world_4region', 'world_6region']
gdp = gdp.merge(cont_map[region_cols], how='left', left_on='geo', right_on='country')
gdp = gdp.rename(columns={'name' : 'Country', 'time' : 'Year', 'gdp' : 'GDP'})
gdp.head()

In [36]:
# One line per country: GDP against year.
fig = px.line(
    data_frame=gdp,
    x="Year",
    y="GDP",
    color='Country',
)
fig.show()

In [40]:
# Total the GDP column per 4-region grouping and year, one line per region.
# NOTE(review): the GDP column is loaded from a per-capita file; summing
# per-capita values across countries may not be the intended aggregate -- confirm.
viz = gdp.groupby(['world_4region', 'Year'])['GDP'].sum().reset_index()

fig = px.line(data_frame=viz, x="Year", y="GDP", color='world_4region')
fig.show()

In [42]:
# Total the GDP column per 6-region grouping and year, one line per region.
# NOTE(review): the GDP column is loaded from a per-capita file; summing
# per-capita values across countries may not be the intended aggregate -- confirm.
viz = gdp.groupby(['world_6region', 'Year'])['GDP'].sum().reset_index()

fig = px.line(data_frame=viz, x="Year", y="GDP", color='world_6region')
fig.show()

## Problem 3

In [68]:
# Load the three indicator tables and the country lookup table.
life_exp = pd.read_csv(f'{gp}/ddf--datapoints--life_expectancy_years--by--geo--time.csv')
mortality = pd.read_csv(f'{gp}/ddf--datapoints--child_mortality_0_5_year_olds_more_years_version_7--by--geo--time.csv')
gdp = pd.read_csv(f'{gp}/ddf--datapoints--gdppercapita_us_inflation_adjusted--by--geo--time.csv')
gdp = gdp.rename(columns={'gdppercapita_us_inflation_adjusted' : 'gdp'})
cont_map = pd.read_csv(f'{gp}/ddf--entities--geo--country.csv')

region_cols = ['country', 'name', 'world_4region', 'world_6region']
display_names = {
    'name' : 'Country',
    'time' : 'Year',
    'gdp' : 'GDP',
    'child_mortality_0_5_year_olds_more_years_version_7': 'Child Mortality',
    'life_expectancy_years' : 'Life Expectancy',
}

# Left-join everything onto the GDP rows, so geo/year pairs without GDP are
# not included and missing indicators show up as NaN.
gdp = gdp.merge(cont_map[region_cols], how='left', left_on='geo', right_on='country')
gdp = gdp.merge(life_exp, how='left', on=['geo', 'time'])
gdp = gdp.merge(mortality, how='left', on=['geo', 'time'])

# Friendly column labels, ordered by year.
gdp = gdp.rename(columns=display_names).sort_values('Year')

In [67]:
xcol = "GDP"
ycol = "Life Expectancy"

# Fixed axis ranges (5% padding) so the axes do not rescale between animation
# frames. Series.min()/.max() skip NaN; the builtin min()/max() do not, and the
# y column can contain NaN because it was attached with a left merge.
x_range = [round(gdp[xcol].min() * .95), round(gdp[xcol].max() * 1.05)]
y_range = [round(gdp[ycol].min() * .95), round(gdp[ycol].max() * 1.05)]

px.scatter(gdp, x=xcol, y=ycol, animation_frame="Year", animation_group="Country",
           color="world_6region", hover_name="Country", log_x=True,
           range_x=x_range, range_y=y_range)


In [69]:
xcol = "GDP"
ycol = "Child Mortality"

# Fixed axis ranges (5% padding) so the axes do not rescale between animation
# frames. Series.min()/.max() skip NaN; the builtin min()/max() do not, and the
# y column can contain NaN because it was attached with a left merge.
x_range = [round(gdp[xcol].min() * .95), round(gdp[xcol].max() * 1.05)]
y_range = [round(gdp[ycol].min() * .95), round(gdp[ycol].max() * 1.05)]

px.scatter(gdp, x=xcol, y=ycol, animation_frame="Year", animation_group="Country",
           color="world_6region", hover_name="Country", log_x=True,
           range_x=x_range, range_y=y_range)


## Problem 4

## Problem 5

* Static shows quick snapshot of story
* Interactive helps people understand more intuitively, but can get lost
* Animation tells an interactive story

The man doesn't use the data as the story; he uses the data to tell the story