In [17]:
# %load ../firstcell.py
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib/seaborn plots.
sns.set()

In [18]:
import json

# `json_normalize` lives at the top level of pandas since 1.0; the
# `pandas.io.json` location is the legacy one and is not importable on
# recent pandas releases. Try the modern location first and fall back so the
# notebook runs on both old and new installations. The public name
# `json_normalize` is unchanged for the cells below.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# About the data

### Population
World Bank population estimates are compiled from three main sources: (1) the United Nations Population Division’s World Population Prospects (WPP), (2) census reports and other publications from national statistical offices (NSOs), and (3) Eurostat Demographic Statistics, which collects data directly from European NSOs. The WPP is the primary source for World Bank population estimates because for many countries it provides the most reliable, comprehensive, and internally consistent population and demographic datasets.

Population estimates are usually based on national population censuses. More countries conducted a census in the 2010 census round (2005–14) than in previous rounds, with about 93 percent of the estimated world population enumerated in that period. Recency and quality of a census and of complementary data available from vital registration systems or surveys are important for high quality population and demographic estimates.

Errors and undercounting in censuses occur even in high-income countries; in some low- and middle-income countries the errors may be substantial. Census-year population estimates for low- and middle-income countries that lack reliable recent census data are provided by the United Nations Population Division. Those estimates are derived from demographic modeling based on a cohort component method, a standard method for estimating and projecting population that draws on fertility, mortality, and net migration data, often collected from sample surveys. A similar approach is used to calculate pre- and post-census (“intercensal”) estimates for all countries, even those whose census data are reliable.

### GDP (current US$)
GDP at purchaser's prices is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in current U.S. dollars. Dollar figures for GDP are converted from domestic currencies using single year official exchange rates. For a few countries where the official exchange rate does not reflect the rate effectively applied to actual foreign exchange transactions, an alternative conversion factor is used.

In [43]:
# Retrieve world population per year and per country from the World Bank.
from urllib.request import urlretrieve

# Inclusive (first_year, last_year) range; also reused by the GDP download cell.
time_range=(1986,2017)

# Use HTTPS (as the GDP request does) so the dataset is not fetched over an
# unauthenticated plaintext connection.
# per_page=8448 matches the 'total' reported in the response metadata
# (264 entities x 32 years); it must be raised if `time_range` is widened,
# otherwise the response is split over several pages and only the first is saved.
url='https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?date='+str(time_range[0])+\
':'+str(time_range[1])+'&per_page=8448&format=json'

# Saves the raw JSON response next to the notebook.
urlretrieve(url,'pop_country_year.json')

In [52]:
# Download GDP (current US$) for every entity and every year in `time_range`,
# and store the raw JSON response next to the notebook.
url = (
    'https://api.worldbank.org/v2/countries/all/indicators/NY.GDP.MKTP.CD'
    '?date={}:{}'
    '&per_page=8448&format=json'
).format(time_range[0], time_range[1])

urlretrieve(url, 'GDP_country_year.json')

('GDP_country_year.json', <http.client.HTTPMessage at 0x7f4640e84c10>)

In [93]:
# Parse the two downloaded World Bank responses.
# JSON is UTF-8 by specification; state the encoding explicitly instead of
# relying on the platform default, which would garble or reject non-ASCII
# country names on systems whose locale encoding is not UTF-8.
# NOTE: the (misspelled) name `poplulation` is kept because later cells use it.
with open('pop_country_year.json','r',encoding='utf-8') as p, \
     open('GDP_country_year.json','r',encoding='utf-8') as g:
    poplulation=json.load(p)
    gross_domP=json.load(g)

In [83]:
# Element 0 of the population response: the paging metadata
# (page, pages, per_page, total, ...) rather than the observations themselves.
poplulation[0]

{'page': 1,
 'pages': 1,
 'per_page': 8448,
 'total': 8448,
 'sourceid': '2',
 'lastupdated': '2020-07-01'}

In [84]:
# Element 0 of the GDP response: the paging metadata
# (page, pages, per_page, total, ...) rather than the observations themselves.
gross_domP[0]

{'page': 1,
 'pages': 1,
 'per_page': 8448,
 'total': 8448,
 'sourceid': '2',
 'lastupdated': '2020-07-01'}

In [98]:
# Element 1 of each response is the list of observation records. Flatten the
# nested 'indicator' / 'country' dicts into dotted columns such as
# 'country.value', producing one DataFrame per indicator.
pop_df, gdp_df = (
    json_normalize(response[1])
    for response in (poplulation, gross_domP)
)

In [95]:
# Column dtypes and non-null counts of the GDP frame (shows how many 'value' entries are missing).
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8448 entries, 0 to 8447
Data columns (total 10 columns):
countryiso3code    8448 non-null object
date               8448 non-null object
value              7627 non-null float64
unit               8448 non-null object
obs_status         8448 non-null object
decimal            8448 non-null int64
indicator.id       8448 non-null object
indicator.value    8448 non-null object
country.id         8448 non-null object
country.value      8448 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 660.1+ KB


In [87]:
# Column dtypes and non-null counts of the population frame (shows how many 'value' entries are missing).
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8448 entries, 0 to 8447
Data columns (total 10 columns):
countryiso3code    8448 non-null object
date               8448 non-null object
value              8387 non-null float64
unit               8448 non-null object
obs_status         8448 non-null object
decimal            8448 non-null int64
indicator.id       8448 non-null object
indicator.value    8448 non-null object
country.id         8448 non-null object
country.value      8448 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 660.1+ KB


In [88]:
# count / unique / top / freq summary of the string-typed (object) columns of the population frame.
pop_df.describe(include=['object'])

Unnamed: 0,countryiso3code,date,unit,obs_status,indicator.id,indicator.value,country.id,country.value
count,8448.0,8448,8448.0,8448.0,8448,8448,8448,8448
unique,260.0,32,1.0,1.0,1,1,264,264
top,,2015,,,SP.POP.TOTL,"Population, total",FR,Central African Republic
freq,160.0,264,8448.0,8448.0,8448,8448,32,32


In [89]:
# count / unique / top / freq summary of the string-typed (object) columns of the GDP frame.
gdp_df.describe(include=['object'])

Unnamed: 0,countryiso3code,date,unit,obs_status,indicator.id,indicator.value,country.id,country.value
count,8448.0,8448,8448.0,8448.0,8448,8448,8448,8448
unique,260.0,32,1.0,1.0,1,1,264,264
top,,2015,,,NY.GDP.MKTP.CD,GDP (current US$),FR,Central African Republic
freq,160.0,264,8448.0,8448.0,8448,8448,32,32


In [90]:
# Distinct ISO3 codes in order of appearance. The output lists regional/income
# aggregates first (ending with 'WLD'), includes an empty-string code, and only
# then the individual countries.
gdp_df['countryiso3code'].unique()

array(['ARB', 'CSS', 'CEB', 'EAR', 'EAS', 'EAP', 'TEA', 'EMU', 'ECS',
       'ECA', 'TEC', 'EUU', 'FCS', 'HPC', '', 'IBD', 'IBT', 'IDB', 'IDX',
       'IDA', 'LTE', 'LCN', 'LAC', 'TLA', 'LDC', 'LMY', 'MEA', 'MNA',
       'TMN', 'MIC', 'NAC', 'OED', 'OSS', 'PSS', 'PST', 'PRE', 'SST',
       'SAS', 'TSA', 'SSF', 'SSA', 'TSS', 'WLD', 'AFG', 'ALB', 'DZA',
       'ASM', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT',
       'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN',
       'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'VGB', 'BRN', 'BGR',
       'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CYM', 'CAF', 'TCD',
       'CHI', 'CHL', 'CHN', 'COL', 'COM', 'COD', 'COG', 'CRI', 'CIV',
       'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM',
       'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FRO',
       'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA',
       'GIB', 'GRC', 'GRL', 'GRD', 'GUM', 'GTM', 'GIN', 'GNB', 'GUY',
       'HTI', 'H

In [99]:
# Filtering only countries: drop the regional / income aggregates.
def _names_after_marker(frame, marker='World'):
    """Locate the entries that follow `marker` among the unique country names.

    The API lists its aggregates first, with 'World' as the last of them,
    so every name after `marker` is an individual country.

    Parameters
    ----------
    frame : DataFrame with a 'country.value' column.
    marker : name of the last aggregate row.

    Returns
    -------
    (start, names) : index just past `marker` in the unique-name array, and
        the names from that index onwards. If `marker` is absent (e.g. this
        cell is re-run on frames it has already filtered) `start` is 0 and
        all names are kept, instead of failing with an IndexError.
    """
    names = frame['country.value'].unique()
    hits = np.where(names == marker)[0]
    start = hits[0] + 1 if hits.size else 0
    return start, names[start:]


cols_touse=['countryiso3code','date','value','indicator.value','country.value']

idxg, gfiltering_set = _names_after_marker(gdp_df)
idxp, popfiltering_set = _names_after_marker(pop_df)

# Keep only country rows and the columns needed downstream; the index is
# rebuilt so both frames are numbered 0..n-1 again.
gdp_df=gdp_df[gdp_df['country.value'].isin(gfiltering_set)].reset_index(drop=True)[cols_touse]
pop_df=pop_df[pop_df['country.value'].isin(popfiltering_set)].reset_index(drop=True)[cols_touse]

In [104]:
# gdp_df.to_csv('GDP_filtered.csv')
# pop_df.to_csv('Population_filtered.csv')

In [79]:
# Checking that there is the same number of iso3codes throughout the years in both datasets.

# Number of distinct ISO3 codes expected in every year after the country filter.
expected_n_codes = 217


def _codes_per_year(frame):
    """Return, for each 'date' in `frame`, the number of distinct 'countryiso3code' values."""
    # dropna=False mirrors len(Series.unique()), which counts NaN as a value.
    return frame.groupby('date')['countryiso3code'].nunique(dropna=False)


print(np.all(_codes_per_year(pop_df) == expected_n_codes))

# Each frame is grouped by its OWN 'date' column. The previous version masked
# gdp_df with pop_df.date, which only gave the right answer as long as the two
# frames happened to be row-aligned.
print(np.all(_codes_per_year(gdp_df) == expected_n_codes))


True
True


In [75]:
# Same country names, in the same order, in both datasets.
v1=gdp_df['country.value'].unique()
v2=pop_df['country.value'].unique()
# array_equal compares shape as well as content, so frames with a different
# number of countries yield False instead of broadcasting or raising on `==`.
np.array_equal(v1, v2)

True