In [1]:
from collections import defaultdict
import numpy as np
import os
import pandas as pd
from pandas import IndexSlice as ix
import pycountry

%reload_ext autoreload
%autoreload 1
%aimport util

In [2]:
# Initial load of the raw dataset with the default integer index.
# NOTE(review): `dataset_raw` is re-read with a (Country Code, Year) index in
# In [10] before any visible use, so this load looks redundant -- confirm.
dataset_raw = pd.read_csv(util.RAW_DATASET)

In [3]:
def common_name(country):
    """Return the preferred display name of a country record.

    Uses the record's ``common_name`` attribute when it exists and falls
    back to ``name`` when reading ``common_name`` raises AttributeError.
    """
    absent = object()  # sentinel, so a stored value of None is still returned
    preferred = getattr(country, 'common_name', absent)
    if preferred is absent:
        return country.name
    return preferred

### Rafiq et al. (2016): 22 "increasingly urbanized emerging economies"

In [4]:
# Country names for the Rafiq et al. (2016) sample, one per line
# (the variable name suggests Table 2 of the paper -- confirm).
rafiq2016_table2 = """Angola
Costa Rica
Ghana
Indonesia
Lebanon
Mongolia
Namibia
Panama
Sudan
Thailand
Zambia
Bangladesh
China
Ethiopia
India
Jordan
Malaysia
Mozambique
Nigeria
Singapore
Tanzania
Vietnam
"""

# Resolve every name to a pycountry record: try the `name` field first and
# fall back to `common_name` when that lookup yields nothing.
rafiq2016_names = rafiq2016_table2.strip().split('\n')
records = [pycountry.countries.get(name=x)
           or pycountry.countries.get(common_name=x)
           for x in rafiq2016_names]

# Fail loudly and say WHICH names could not be resolved, instead of a bare
# assertion that only reports that something went wrong.
unresolved = [x for x, rec in zip(rafiq2016_names, records) if rec is None]
assert not unresolved, 'No pycountry match for: %s' % unresolved

# Order by display name, then keep only the ISO 3166-1 alpha-3 codes.
records = sorted(records, key=common_name)
rafiq2016_codes = [x.alpha_3 for x in records]
repr(rafiq2016_codes)

"['AGO', 'BGD', 'CHN', 'CRI', 'ETH', 'GHA', 'IND', 'IDN', 'JOR', 'LBN', 'MYS', 'MNG', 'MOZ', 'NAM', 'NGA', 'PAN', 'SGP', 'SDN', 'TZA', 'THA', 'VNM', 'ZMB']"

In [5]:
# Cross-check: the list hard-coded in `util` must equal the codes derived
# above, element for element and in the same order (list equality).
assert util.RAFIQ2016_COUNTRIES == rafiq2016_codes

### Tiba and Frikha (2018): middle-income countries (minus Venezuela)

In [6]:
# Country names for the Tiba and Frikha (2018) middle-income group, one per line.
tiba2018_middle = """
Algeria
Argentina
Brazil
Bulgaria
Chile
China
Colombia
Malaysia
Mexico
Thailand
Turkey
Venezuela
"""

# Resolve every name to a pycountry record: try the `name` field first and
# fall back to `common_name` when that lookup yields nothing.
tiba2018_middle_names = tiba2018_middle.strip().split('\n')
records = [pycountry.countries.get(name=x)
           or pycountry.countries.get(common_name=x)
           for x in tiba2018_middle_names]

# Fail loudly and say WHICH names could not be resolved, instead of a bare
# assertion that only reports that something went wrong.
unresolved = [x for x, rec in zip(tiba2018_middle_names, records) if rec is None]
assert not unresolved, 'No pycountry match for: %s' % unresolved

# Order by display name, then keep only the ISO 3166-1 alpha-3 codes.
records = sorted(records, key=common_name)
tiba2018_middle_codes = [x.alpha_3 for x in records]
repr(tiba2018_middle_codes)

"['DZA', 'ARG', 'BRA', 'BGR', 'CHL', 'CHN', 'COL', 'MYS', 'MEX', 'THA', 'TUR', 'VEN']"

In [7]:
# Cross-check: the list hard-coded in `util` must equal the codes derived
# above, element for element and in the same order (list equality).
assert util.TIBA2018_MID_COUNTRIES == tiba2018_middle_codes

### Tiba and Frikha (2018): high-income countries

In [8]:
# Country names for the Tiba and Frikha (2018) high-income group, one per line.
tiba2018_high = """
Australia
Canada
France
Germany
Japan
Netherlands
Portugal
Spain
Sweden
Switzerland
United Kingdom
United States
"""

# Resolve every name to a pycountry record: try the `name` field first and
# fall back to `common_name` when that lookup yields nothing.
tiba2018_high_names = tiba2018_high.strip().split('\n')
records = [pycountry.countries.get(name=x)
           or pycountry.countries.get(common_name=x)
           for x in tiba2018_high_names]

# Fail loudly and say WHICH names could not be resolved, instead of a bare
# assertion that only reports that something went wrong.
unresolved = [x for x, rec in zip(tiba2018_high_names, records) if rec is None]
assert not unresolved, 'No pycountry match for: %s' % unresolved

# Order by display name, then keep only the ISO 3166-1 alpha-3 codes.
records = sorted(records, key=common_name)
tiba2018_high_codes = [x.alpha_3 for x in records]
repr(tiba2018_high_codes)

"['AUS', 'CAN', 'FRA', 'DEU', 'JPN', 'NLD', 'PRT', 'ESP', 'SWE', 'CHE', 'GBR', 'USA']"

In [9]:
# Cross-check: the list hard-coded in `util` must equal the codes derived
# above, element for element and in the same order (list equality).
assert util.TIBA2018_HIGH_COUNTRIES == tiba2018_high_codes

## Extract contiguous panel sets

In [10]:
# Re-read the raw data with a (Country Code, Year) MultiIndex so later cells
# can select rows by country and by year range.
dataset_raw = pd.read_csv(util.RAW_DATASET, index_col=['Country Code', 'Year'])

In [11]:
# Display the indexed raw panel (pandas truncates the rendering of long frames).
dataset_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,ENI,POP,URB,AFL,TI,TS,CTS,KAOPEN
Country Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ABW,1980,,333.866667,50.472,,,,,
ABW,1981,,336.483333,50.456,,,,,
ABW,1982,,340.805556,50.441,,,,,
ABW,1983,,345.561111,50.426,,,,,
ABW,1984,,349.088889,50.411,,,,,
...,...,...,...,...,...,...,...,...,...
ZWE,2015,,35.710557,32.385,2679.510136,0.202684,0.567488,0.029254,-0.148388
ZWE,2016,,36.268295,32.296,2806.458631,0.092656,0.512190,0.024596,-0.148388
ZWE,2017,,36.801719,32.237,3028.245976,,0.500283,0.022614,-0.148388
ZWE,2018,,37.324591,32.209,3203.888554,,0.500054,0.022107,-0.148388


In [19]:
# Restrict the panel to the study countries and to 1990-2015 (label slices are
# inclusive on both ends).  The index is sorted BEFORE slicing because label
# slicing on a MultiIndex needs a lexsorted index; the trailing sort keeps the
# result in (country, year) order regardless of the order of ALL_COUNTRIES.
dataset = dataset_raw.sort_index().loc[ix[util.ALL_COUNTRIES, 1990:2015], :].sort_index()
def find_years(x):
    """Join the 'Year' index-level values of `x` into one comma-separated string.

    `x` must carry an index with a level named 'Year'; the values are
    emitted in index order.
    """
    year_labels = x.index.get_level_values('Year')
    return ','.join(str(year) for year in year_labels)

# Long-format boolean mask: one entry per (country, year, variable), True where
# the value is NaN.  `dropna=False` keeps the NaN cells so they can be flagged.
missing = dataset.stack(dropna=False).isna()
# Keep only the True entries, then list the affected years per (country, variable).
missing[missing].groupby(level=[0, 2]).agg(find_years)

Country Code        
ARG           ENI                                2015
AUS           TI             1990,1991,1992,1993,1994
BGR           ENI                                2015
              KAOPEN              1990,1991,1992,1993
BRA           ENI                                2015
CHE           KAOPEN    1990,1991,1992,1993,1994,1995
CHN           ENI                                2015
COL           ENI                                2015
              TI                                 1990
DZA           ENI                                2015
              TI                            2008,2009
MYS           ENI                                2015
THA           ENI                                2015
VEN           ENI                 2012,2013,2014,2015
              AFL                 2012,2013,2014,2015
              TI                  1995,1998,1999,2001
              TS                                 2015
              CTS                                2015
dtype: 

In [13]:
# For every country, report the years that are missing from INSIDE the span of
# its complete rows (rows with no NaN in any column).  Leading and trailing
# missing years are not reported here; they only shorten the span.
for c, df in dataset.groupby(level='Country Code'):
    complete_years = df.dropna(axis='rows', how='any').index.get_level_values('Year')
    if len(complete_years) == 0:
        # No fully observed year at all: there is no span to check.
        print('%s has no complete rows' % c)
        continue
    contiguous_years = range(int(complete_years.min()), int(complete_years.max()) + 1)
    # A set difference avoids comparing an Index with a range of a different
    # length, and a dedicated name avoids overwriting the `missing` Series.
    gap_years = set(contiguous_years) - set(complete_years)
    if gap_years:
        print('%s %s' % (c, gap_years))


DZA {2008, 2009}
VEN {2001, 1995, 1998, 1999}


In [14]:
# (first_year, last_year) window kept for each country; both ends inclusive.
# Countries without an explicit entry keep the full 1990-2015 window.
# NOTE(review): the windows appear to be chosen to skip the leading/trailing
# missing years listed in the missing-value summary -- confirm.  DZA keeps
# 2008 and 2009 even though TI is missing there.
year_overrides = dict.fromkeys(['ARG', 'BRA', 'CHN', 'DZA', 'MYS', 'THA'], (1990, 2014))
year_overrides.update({
    'AUS': (1995, 2015),
    'BGR': (1994, 2014),
    'CHE': (1996, 2014),
    'COL': (1994, 2014),
    'VEN': (2002, 2011),
})
country_years = {c: year_overrides.get(c, (1990, 2015)) for c in util.ALL_COUNTRIES}

# Cut each country's window out of the panel and stack the pieces back together.
panel_slices = [dataset.loc[ix[c, start:end], :]
                for c, (start, end) in country_years.items()]
data_subset = pd.concat(panel_slices)

# Every study country must still be present after the cut.
kept_countries = set(data_subset.index.get_level_values('Country Code'))
assert kept_countries == set(util.ALL_COUNTRIES)

In [15]:
# Same missing-value summary as before, now on the trimmed panel: flag NaN
# cells in long format, then list the affected years per (country, variable).
missing = data_subset.stack(dropna=False).isna()
missing[missing].groupby(level=[0, 2]).agg(find_years)

Country Code    
DZA           TI    2008,2009
dtype: object

In [16]:
def format_date(code):
    """Format the year window of country `code` as 'start-end'.

    The window is read from the module-level `country_years` mapping.
    """
    first_year, last_year = country_years[code]
    return '%s-%s' % (first_year, last_year)
    
def sorted_names(codes):
    """Build a table of country display names and their date ranges.

    `codes` is an iterable of ISO alpha-3 codes.  Returns a DataFrame with
    columns 'Country' (display names, sorted alphabetically) and
    'Date range' (the matching `format_date` strings).
    """
    code_by_name = {}
    for code in codes:
        country = pycountry.countries.get(alpha_3=code)
        code_by_name[common_name(country)] = code
    ordered_names = sorted(code_by_name)
    date_ranges = [format_date(code_by_name[name]) for name in ordered_names]
    return pd.DataFrame({'Country': ordered_names, 'Date range': date_ranges})

# Emit one LaTeX table per income group: middle-income first, then high-income.
for group_codes in (tiba2018_middle_codes, tiba2018_high_codes):
    print(sorted_names(group_codes).to_latex(index=False))


\begin{tabular}{ll}
\toprule
   Country & Date range \\
\midrule
   Algeria &  1990-2014 \\
 Argentina &  1990-2014 \\
    Brazil &  1990-2014 \\
  Bulgaria &  1994-2014 \\
     Chile &  1990-2015 \\
     China &  1990-2014 \\
  Colombia &  1994-2014 \\
  Malaysia &  1990-2014 \\
    Mexico &  1990-2015 \\
  Thailand &  1990-2014 \\
    Turkey &  1990-2015 \\
 Venezuela &  2002-2011 \\
\bottomrule
\end{tabular}

\begin{tabular}{ll}
\toprule
        Country & Date range \\
\midrule
      Australia &  1995-2015 \\
         Canada &  1990-2015 \\
         France &  1990-2015 \\
        Germany &  1990-2015 \\
          Japan &  1990-2015 \\
    Netherlands &  1990-2015 \\
       Portugal &  1990-2015 \\
          Spain &  1990-2015 \\
         Sweden &  1990-2015 \\
    Switzerland &  1996-2014 \\
 United Kingdom &  1990-2015 \\
  United States &  1990-2015 \\
\bottomrule
\end{tabular}



In [17]:
# Persist the trimmed panel; the (Country Code, Year) index is written as the
# leading columns of the CSV.
data_subset.to_csv(util.DATASET_SUBSET)