### Step 1: download data sets from the World Bank World Development Indicators database

This workbook downloads the variables of interest from the appropriate WDI database vintages.

The next step checks the patent application counts against the WIPO data (which have no easily accessible API).

In [1]:
from IPython.display import display, HTML
from datetime import date
import os
import pandas as pd
import requests
from pyjstat import pyjstat

# pandas_datareader spams FutureWarning on import
import warnings
warnings.filterwarnings('ignore', module='pandas_datareader')
from pandas_datareader import wb

# Find the target path for the data files, assuming the notebook is in the right
# place. `_dh` is provided by IPython; its first entry is taken as the directory
# the CSV files are written to. str() keeps the check working whether that entry
# is a plain string or a path object.
data_path = str(_dh[0])
_expected_suffix = os.path.join('em-2020', 'data')
if not data_path.endswith(_expected_suffix):
    # An explicit raise (rather than a bare assert) is not stripped under
    # `python -O` and tells the user what is wrong before any file is written.
    raise RuntimeError(
        'Expected the notebook directory to end with {!r}, got {!r}'.format(
            _expected_suffix, data_path
        )
    )

Fetch all indicators and select out those of interest:

In [2]:
# Every indicator returned by the World Bank API, keyed by indicator id and
# restricted to those whose source is "World Development Indicators".
all_indicators = wb.get_indicators().set_index('id')
all_indicators = all_indicators.query('source == "World Development Indicators"')
# Alternative filter, kept for switching to the archive source:
# all_indicators = all_indicators.query('source == "WDI Database Archives"')

# WDI indicator id -> column name used in the saved data sets
indicator_map = {
    'NV.IND.TOTL.ZS': 'ISG',
    'NE.TRD.GNFS.ZS': 'TO',
    'NY.GDP.PCAP.CD': 'P_GDP',
    'NY.GDP.PCAP.PP.CD': 'P_GDPb',
    'EG.USE.PCAP.KG.OE': 'kg oil per cap',
    'IP.PAT.RESD': 'resident patents',
    'IP.PAT.NRES': 'nonresident patents',
}


def search(x):
    """Return the names of all indicators whose name matches `x`, ignoring case."""
    return all_indicators.loc[all_indicators.name.str.contains(x, case=False), 'name']


# Energy-related indicators mapped to themselves; only takes effect if the
# update below is uncommented.
to_test = {k: k for k in search('energy').index}
# indicator_map.update(to_test)

# Materialise the ids as a list: a list of labels is the documented form of a
# pandas `.loc` indexer, and unlike a dict view it does not silently change if
# `indicator_map` is modified later in the session.
indicator_ids = list(indicator_map)
pd.options.display.max_colwidth = 100
all_indicators.loc[indicator_ids, ['name', 'sourceNote', 'source']].sort_values('name')

Unnamed: 0_level_0,name,sourceNote,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EG.USE.PCAP.KG.OE,Energy use (kg of oil equivalent per capita),"Energy use refers to use of primary energy before transformation to other end-use fuels, which i...",World Development Indicators
NY.GDP.PCAP.CD,GDP per capita (current US$),GDP per capita is gross domestic product divided by midyear population. GDP is the sum of gross ...,World Development Indicators
NY.GDP.PCAP.PP.CD,"GDP per capita, PPP (current international $)",GDP per capita based on purchasing power parity (PPP). PPP GDP is gross domestic product convert...,World Development Indicators
NV.IND.TOTL.ZS,"Industry (including construction), value added (% of GDP)",Industry corresponds to ISIC divisions 10-45 and includes manufacturing (ISIC divisions 15-37). ...,World Development Indicators
IP.PAT.NRES,"Patent applications, nonresidents",Patent applications are worldwide patent applications filed through the Patent Cooperation Treat...,World Development Indicators
IP.PAT.RESD,"Patent applications, residents",Patent applications are worldwide patent applications filed through the Patent Cooperation Treat...,World Development Indicators
NE.TRD.GNFS.ZS,Trade (% of GDP),Trade is the sum of exports and imports of goods and services measured as a share of gross domes...,World Development Indicators


Fetch all countries and select out those of interest:

In [3]:
# ISO3 codes of the countries of interest
country_ids = ['BGD', 'GBR', 'IND']

# Every country returned by the World Bank API, keyed by ISO3 code
all_countries = wb.get_countries().set_index('iso3c')

# ISO3 -> ISO2 code lookup for the selected countries
country_map = all_countries.loc[country_ids]['iso2c'].to_dict()

# Shown as the cell output: name and region of each selected country
all_countries.loc[country_ids][['name', 'region']]

Unnamed: 0_level_0,name,region
iso3c,Unnamed: 1_level_1,Unnamed: 2_level_1
BGD,Bangladesh,South Asia
GBR,United Kingdom,Europe & Central Asia
IND,India,South Asia


Download the data and save it to CSV:

In [4]:
# URL template for one series of one country from one World Bank data source.
# Placeholders: {source} (numeric source id), {country} and {indicator}.
# HTTPS is used so the downloaded figures cannot be read or altered in transit.
# The trailing slash is kept because fetch() appends 'version/<vintage>' to it.
url_template = (
    'https://api.worldbank.org/v2/'
    'sources/{source}/'
    'country/{country}/'
    'series/{indicator}/'
    'time/ALL/'
)

In [5]:
def fetch(country_id, version=None):
    """Download the selected indicators for one country, derive the model variables and save them.

    One request is made per id in the module-level `indicator_ids`. The responses
    are combined into a single frame indexed by year, with columns renamed
    through `indicator_map`. Three derived columns are then added and the raw
    columns they were built from are dropped:

    * ``EI``  = 'kg oil per cap' / 'P_GDP'
    * ``EIb`` = 'kg oil per cap' / 'P_GDPb'
    * ``TI``  = 'resident patents' + 'nonresident patents'

    The result is written to ``WB_<country_id>_<version or 'current'>.csv``
    inside `data_path`.

    Parameters:
        country_id: country code substituted into the request URL (the notebook
            passes ISO3 codes such as 'BGD').
        version: data vintage to request, e.g. '201805'; None requests the
            current data.

    Returns:
        The per-country DataFrame that was written to disk.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    # Source 2 is queried for current data and source 57 whenever a vintage is
    # requested (presumably the "WDI Database Archives" source -- confirm
    # against the World Bank source list). Invariant, so computed once.
    source = 2 if version is None else 57

    results = {}
    for indicator_id in indicator_ids:
        url = url_template.format(source=source, indicator=indicator_id, country=country_id)
        if version is not None:
            url += 'version/{}'.format(version)
        r = requests.get(
            url,
            params={'format': 'jsonstat', 'per_page': 1000},
            timeout=60,  # never hang the notebook on a stalled connection
        )
        # Surface HTTP failures here instead of as an obscure parse error below.
        r.raise_for_status()
        ds = pyjstat.Dataset.read(r.text)
        df = ds.write('dataframe')
        df['time'] = df['time'].astype('int')
        results[indicator_id] = df.set_index('time')['value']
    country_df = pd.DataFrame(results).rename(columns=indicator_map)
    country_df.index.name = 'year'

    # energy intensity is given in kg. oil equiv. per capita; divide by $GDP per capita
    # to replicate kg. oil equiv. per $GDP
    country_df['EI'] = country_df['kg oil per cap'] / country_df['P_GDP']
    # and again for kg. oil equiv. per $GDP PPP-adjusted
    country_df['EIb'] = country_df['kg oil per cap'] / country_df['P_GDPb']
    country_df = country_df.drop(columns=['kg oil per cap'])

    # technological innovation is proxied by total number of patents
    # (missing whenever either component is missing)
    country_df['TI'] = country_df['resident patents'] + country_df['nonresident patents']
    country_df = country_df.drop(columns=['resident patents', 'nonresident patents'])

    file = 'WB_{}_{}.csv'.format(country_id, version or 'current')
    path = os.path.join(data_path, file)
    country_df.to_csv(path)

    return country_df

Save down the reference year from Pan et al. (2019)

In [6]:
# Fetch the '201805' vintage for Bangladesh; fetch() also writes it to CSV.
reference = fetch('BGD', '201805')
display(HTML('<h2>Bangladesh</h2>'))

# Rows labelled 1986 through 2015, shared by both tables below
reference_window = reference.loc[1986:2015]

# Summary statistics, shown with two decimals
pd.options.display.float_format = '{:.2f}'.format
summary = reference_window.describe().T
display(summary[['min', 'max', 'mean', 'std', 'count']].sort_values('min'))

# Correlation matrix, shown with three decimals; this format stays in effect
# for the cells that follow
pd.options.display.float_format = '{:.3f}'.format
display(reference_window[['ISG', 'TO', 'TI', 'P_GDP', 'EI']].corr())

Unnamed: 0,min,max,mean,std,count
EIb,0.07,0.14,0.1,0.02,25.0
EI,0.2,0.49,0.35,0.07,29.0
TO,16.69,48.11,30.56,9.98,30.0
ISG,20.05,28.15,24.07,2.28,30.0
TI,93.0,354.0,239.2,93.5,30.0
P_GDP,227.42,1210.16,498.83,261.51,30.0
P_GDPb,832.43,3335.76,1715.36,757.89,26.0


Unnamed: 0,ISG,TO,TI,P_GDP,EI
ISG,1.0,0.933,0.797,0.872,-0.855
TO,0.933,1.0,0.842,0.876,-0.919
TI,0.797,0.842,1.0,0.693,-0.782
P_GDP,0.872,0.876,0.693,1.0,-0.949
EI,-0.855,-0.919,-0.782,-0.949,1.0


Save down up-to-date data and extend to India and the UK

In [7]:
def _show_summary(title, frame):
    """Display an <h2> heading followed by per-column summary statistics of `frame`."""
    display(HTML('<h2>{}</h2>'.format(title)))
    stats = frame.describe().T[['min', 'max', 'mean', 'std', 'count']]
    display(stats.sort_values('min'))


# Current Bangladesh data are only saved by fetch(), not summarised here.
fetch('BGD', None)

ind = fetch('IND', None)
_show_summary('India', ind)

gbr = fetch('GBR', None)
_show_summary('United Kingdom', gbr)

Unnamed: 0,min,max,mean,std,count
EIb,0.112,0.283,0.184,0.056,25.0
EI,0.396,2.254,1.03,0.453,44.0
TO,7.662,55.794,23.114,15.16,59.0
ISG,20.089,31.137,25.93,3.054,59.0
P_GDP,82.189,2009.979,526.941,528.472,59.0
P_GDPb,1236.671,7762.882,3389.982,1925.106,29.0
TI,2901.0,50055.0,18629.658,17300.132,38.0


Unnamed: 0,min,max,mean,std,count
EI,0.059,2.17,0.624,0.681,56.0
EIb,0.065,0.218,0.134,0.05,26.0
ISG,17.509,27.899,21.593,3.297,29.0
TO,41.361,62.305,52.526,5.2,49.0
P_GDP,1397.595,50566.827,19773.976,16396.15,59.0
P_GDPb,16698.342,45973.574,30785.658,9230.86,29.0
TI,20941.0,41612.0,28284.692,4987.338,39.0
