# Pre-processing script
This script pulls SDG data from API and transforms it into csv files.
The steps are:
- Pull data from [API](https://unstats.un.org/SDGAPI/swagger/) 
- Join with geography
- save as "long" table
- pivot into "wide" format and split regional and country data 

### Load necessary libraries

In [None]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import hashlib
import numpy as np
import pandas as pd
import re
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [2]:
release = '2019.Q1.G.03' # Make sure to have the correct release here

dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

wd_dir = r'../../'
print('data inputs dir: ' + wd_dir)

## Utilities

#### Convert string to camelCase

In [3]:
def camelCase(st):
    """
    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library
    
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    return output[0].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [4]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [5]:
def numeric_part(v):
    numeric_part_f = re.compile(r'-?\d+(?:\.\d+)?(?:e-?\d+)?')
    x = numeric_part_f.findall(v)
    if len(x) > 0:
        return float(x[0])
    else:
        return None


#### Compute a hash of a dictionary

In [6]:
def dict_hash(d):
    out = hashlib.md5()
    for key, value in d.items():
        out.update(key.encode('utf-8'))
        out.update(value.encode('utf-8'))
    return out.hexdigest()


#### Get unique dictionaries in a list

In [7]:
def unique_dicts(dictionary_list):

    uniques_map = {}

    for d in dictionary_list:
        uniques_map[dict_hash(d)] = d

    return list(uniques_map.values())


#### Extract subset of key-value pairs from Python dictionary object

In [8]:
def subdict_list(dict_list, keys_list, exclude = False):
    sub_d_list = []
    if exclude:
        for d in dict_list:
            sub_d= {k: d[k] for k in d.keys() if k not in keys_list}
            sub_d_list.append(sub_d)
    else:
        for d in dict_list:
            sub_d= {k: d[k] for k in keys_list}
            sub_d_list.append(sub_d)
    
    return sub_d_list




#### Get a dict from a list based on something inside the dict

In [9]:
def select_dict(dict_list, k, v):
    selected = []
    for d in dict_list:
        if d[k] == v:
            selected.append(d)
    return selected

### List of countries to be plotted on a map (with XY coordinates)

In [10]:
def countryListXY(file):
    
    countryListXY = []
    
    with open(file, newline = '', encoding='latin-1') as countryList:                                                                                          
        countryList = csv.DictReader(countryList, delimiter='\t')
        for row in countryList:
            countryListXY.append(dict(row))
            
    countryListXY = pd.DataFrame(countryListXY).astype({'M49':'str'})
    
    return(countryListXY)

    #print(countryListXY[1])
    #for c in countryListXY:

In [11]:
countryListXY(wd_dir + 'globalResources/refAreas.txt').head(10)

Unnamed: 0,Country_Profile,ISO3,M49,UN_Member,X,Y,areaName
0,1,AFG,4,1,66.02688198,33.83160199,Afghanistan
1,1,ALB,8,1,20.06660928,41.13897007,Albania
2,1,ATA,10,0,21.47585697,-80.40897662,Antarctica
3,1,DZA,12,1,2.678164227,28.15940032,Algeria
4,1,ASM,16,0,-170.7187269,-14.30587306,American Samoa
5,1,AND,20,1,1.576257417,42.54548611,Andorra
6,1,AGO,24,1,17.57817062,-12.33724746,Angola
7,1,ATG,28,1,-61.7999755,17.07761471,Antigua and Barbuda
8,1,AZE,31,1,50.01064725,40.39229544,Azerbaijan
9,1,ARG,32,1,-65.14563274,-35.19446255,Argentina


#### Call the endpoint of the SDG API that provides the list of hierarchical groupings of geographic Areas:

In [12]:
def geoAreaTree():
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree")
    responseData = json.loads(response.data.decode('UTF-8'))
    
    return responseData

Note: The geoAreaTree object has various "trees" in it.  We usually use the "World" tree; however, some economic and geographic groupings are only in other trees.

In [13]:
print(len(geoAreaTree()))
for t in geoAreaTree():
    print('root='+t['geoAreaName'])

7
root=World
root=Least Developed Countries (LDC)
root=Land Locked Developing Countries (LLDC)
root=Small Island Developing States (SIDS)
root=Developed Regions
root=Developing Regions
root=Custom groupings of data providers


#### Traverse a hierarchical tree of geographic areas and convert it to a parent-child hierarchy table:

In [14]:
def flatten(tree):
    
    global hierarchy
    
    hierarchy = []
    traverse.level = 1
    traverse(tree)
    
    return pd.DataFrame(hierarchy)

In [15]:
def traverse(tree, parentCode=None, parentName=None):
    
    # print(' ' * traverse.level + 'Level: ' + str(traverse.level) + ', ' + tree['type'] + ', ' + str(tree['geoAreaCode']) + '-' + tree['geoAreaName'] )
    
    d = {}
    
    d['level'] = traverse.level
    d['type'] = tree['type']
    d['parentCode'] = parentCode
    d['parentName'] = parentName
    d['geoAreaCode'] = str(tree['geoAreaCode'])
    d['geoAreaName'] = tree['geoAreaName']
    
    hierarchy.append(d)
        
    if tree['children']:
        for child in tree['children']:
            traverse.level += 1
            traverse(child, str(tree['geoAreaCode']), tree['geoAreaName'])
            traverse.level -= 1
    
    return pd.DataFrame(hierarchy)

#### `geoAreas` holds the flattened list of geographic areas under 'World':

In [16]:
geoAreas = flatten(geoAreaTree()[0])
geoAreas.head()
print('...')
geoAreas.tail()

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
0,1,World,1,,,Region
1,10,Antarctica,2,1.0,World,Country
2,2,Africa,2,1.0,World,Region
3,15,Northern Africa,3,2.0,Africa,Region
4,12,Algeria,4,15.0,Northern Africa,Country


...


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
291,543,Oceania (exc. Australia and New Zealand),3,9,Oceania,Region
292,62,Central and Southern Asia,2,1,World,Region
293,513,Europe and Northern America,2,1,World,Region
294,747,Northern Africa and Western Asia,2,1,World,Region
295,753,Eastern and South-Eastern Asia,2,1,World,Region


##### Temporary Fix for missing regions: 

In [17]:
if('910' not in geoAreas['geoAreaCode']):
    d_910 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '910',
              'geoAreaName' : 'High income economies (WB)'
             }
    x = pd.DataFrame({k: [v] for k, v in d_910.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('911' not in geoAreas['geoAreaCode']):
    d_911 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '911',
              'geoAreaName' : 'Low income economies (WB)'
             }
    x = pd.DataFrame({k: [v] for k, v in d_911.items()})
    geoAreas = geoAreas.append(x, sort = True)

    
if('912' not in geoAreas['geoAreaCode']):
    d_912 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '912',
              'geoAreaName' : 'Lower middle economies (WB)'
             }
    x = pd.DataFrame({k: [v] for k, v in d_912.items()})
    geoAreas = geoAreas.append(x, sort = True)

if('913' not in geoAreas['geoAreaCode']):
    d_913 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '913',
              'geoAreaName' : 'Low and middle income economies (WB)'
             }
    x = pd.DataFrame({k: [v] for k, v in d_913.items()})
    geoAreas = geoAreas.append(x, sort = True)

if('914' not in geoAreas['geoAreaCode']):
    d_914 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '914',
              'geoAreaName' : 'Upper middle economies (WB)'
             }
    x = pd.DataFrame({k: [v] for k, v in d_914.items()})
    geoAreas = geoAreas.append(x, sort = True)

if('135' not in geoAreas['geoAreaCode']):
    d_135 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '135',
              'geoAreaName' : 'Caucasus and Central Asia'
            }
    x = pd.DataFrame({k: [v] for k, v in d_135.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('127' not in geoAreas['geoAreaCode']):
    d_127 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '127',
              'geoAreaName' : 'Southern Asia (excluding India)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_127.items()})
    geoAreas = geoAreas.append(x, sort = True)

if('199' not in geoAreas['geoAreaCode']):
    d_199 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '199',
              'geoAreaName' : 'Least Developed Countries (LDC)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_199.items()})
    geoAreas = geoAreas.append(x, sort = True)

if('485' not in geoAreas['geoAreaCode']):
    d_485 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '485',
              'geoAreaName' : 'Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_485.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('514' not in geoAreas['geoAreaCode']):
    d_514 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '514',
              'geoAreaName' : 'Developed Regions'
            }
    x = pd.DataFrame({k: [v] for k, v in d_514.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('515' not in geoAreas['geoAreaCode']):
    d_515 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '515',
              'geoAreaName' : 'Developing Regions'
            }
    x = pd.DataFrame({k: [v] for k, v in d_515.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('518' not in geoAreas['geoAreaCode']):
    d_518 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '518',
              'geoAreaName' : 'Eastern Asia (excluding Japan)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_518.items()})
    geoAreas = geoAreas.append(x, sort = True)


if('738' not in geoAreas['geoAreaCode']):
    d_738 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '738',
              'geoAreaName' : 'Sub-Saharan Africa (inc. Sudan)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_738.items()})
    geoAreas = geoAreas.append(x, sort = True)

                 
if('746' not in geoAreas['geoAreaCode']):
    d_746 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '746',
              'geoAreaName' : 'Northern Africa (exc. Sudan)'
            }
    x = pd.DataFrame({k: [v] for k, v in d_746.items()})
    geoAreas = geoAreas.append(x, sort = True)

#==================================

geoAreas = geoAreas.reset_index(drop=True)

geoAreas.tail(20)


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
290,876,Wallis and Futuna Islands,4,61.0,Polynesia,Country
291,543,Oceania (exc. Australia and New Zealand),3,9.0,Oceania,Region
292,62,Central and Southern Asia,2,1.0,World,Region
293,513,Europe and Northern America,2,1.0,World,Region
294,747,Northern Africa and Western Asia,2,1.0,World,Region
295,753,Eastern and South-Eastern Asia,2,1.0,World,Region
296,910,High income economies (WB),1,,,Group
297,911,Low income economies (WB),1,,,Group
298,912,Lower middle economies (WB),1,,,Group
299,913,Low and middle income economies (WB),1,,,Group


#### Merge coordinates and list of geographic areas in SDG database

In [18]:
geoAreas_dict = geoAreas.to_dict('records')
geoAreas_dict[0]


{'geoAreaCode': '1',
 'geoAreaName': 'World',
 'level': 1,
 'parentCode': None,
 'parentName': None,
 'type': 'Region'}

In [19]:
def geoareasXY(geoareas,coordinates_file):
    
    xy = countryListXY(coordinates_file)
    
    x = pd.merge(geoareas,xy.loc[:, xy.columns != 'geoAreaName'],
         how='outer',
         left_on='geoAreaCode', right_on='M49')
    x['order'] = x['geoAreaCode']
    x['order'] = x['order'].astype(float)
    x = x.sort_values('order')
    del x['order']
    del x['M49']
    
    x = x.reset_index(drop=True)
    
    x = x.to_dict('records')
    
    
    x_clean = []
    
    for i in x:
        i['geoAreaCode'] = i['geoAreaCode'].zfill(3)
        if i['parentCode']:
            i['parentCode'] = i['parentCode'].zfill(3)
        x_clean.append({k: None if str(v) == 'nan' else v for k, v in i.items()})
    
    
    
    return(x_clean)

# x.to_excel('test.xlsx', engine ='xlsxwriter')

In [20]:
geoareasXY(geoAreas, wd_dir + 'globalResources/refAreas.txt')[0:3]

[{'geoAreaCode': '001',
  'geoAreaName': 'World',
  'level': 1,
  'parentCode': None,
  'parentName': None,
  'type': 'Region',
  'Country_Profile': None,
  'ISO3': None,
  'UN_Member': None,
  'X': None,
  'Y': None,
  'areaName': None},
 {'geoAreaCode': '002',
  'geoAreaName': 'Africa',
  'level': 2,
  'parentCode': '001',
  'parentName': 'World',
  'type': 'Region',
  'Country_Profile': None,
  'ISO3': None,
  'UN_Member': None,
  'X': None,
  'Y': None,
  'areaName': None},
 {'geoAreaCode': '004',
  'geoAreaName': 'Afghanistan',
  'level': 4,
  'parentCode': '034',
  'parentName': 'Southern Asia',
  'type': 'Country',
  'Country_Profile': '1',
  'ISO3': 'AFG',
  'UN_Member': '1',
  'X': '66.02688198',
  'Y': '33.83160199',
  'areaName': 'Afghanistan'}]

### Get the list of goals, targets, indicators and series

In [21]:
with open(wd_dir + 'globalResources/metadata.json') as json_file:  
    metadata = json.load(json_file)
    

In [22]:
metadata[0]['targets'][0]['indicators'][0]['series'][0]

{'code': 'SI_POV_DAY1',
 'description': 'Proportion of population below international poverty line (%)',
 'release': '2019.Q1.G.03',
 'tags': ['poverty line', 'poverty', 'standard of living', 'basic needs']}

#### Get list of unique data series

In [23]:
series = []
for g in metadata:
    for t in g['targets']:
        for i in t['indicators']:
            if 'series' in i.keys():
                for s in i['series']:
                    series.append(s['code'])
series = list(set(series))

series[0:10]

['SG_NHR_INTEXST',
 'SI_COV_PENSN',
 'SG_PLN_MSTKSDG',
 'EN_WWT_WWDS',
 'SG_REG_BRTH90',
 'DC_TOF_TRDCML',
 'SH_STA_MMR',
 'ER_WAT_PROCED',
 'SH_STA_STUNT',
 'SG_DSR_SILN']

### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [24]:
def series_request_details(seriesCode,release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    pageSize = 500
    nPages = math.floor(responseData['totalElements'] / pageSize) + 1
    totalElements = responseData['totalElements']
    
    return {'series' : seriesCode,
            'totalElements' : totalElements,
            'nPages' : nPages, 
            'pageSize' : pageSize
           }


In [26]:
series_request_details('SL_EMP_INJUR', '2019.Q1.G.03')

{'series': 'SL_EMP_INJUR', 'totalElements': 3353, 'nPages': 7, 'pageSize': 500}

#### Explore the code lists of the attributes and dimensions of a series
Describe each attribute or dimension as a simple dictionary made of a set of `code`-`description` pairs.  For the code, use the SDMX code, and not the internal codeof the database.  Keep all labels in camelCase.

In [27]:
def series_code_lists(seriesCode, release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    series_attributes = responseData['attributes']
    series_dimensions = responseData['dimensions']
    
    new_dict = {}
    
    new_dict['seriesCode'] = seriesCode
    
    for a in series_attributes:
        codelist_dict = {}
        for c in a['codes']:
            codelist_dict[c['code']] = c['description']
        new_dict[camelCase(a['id'])] = codelist_dict
    
    for d in series_dimensions:
        codelist_dict = {}
        for c in d['codes']:
            codelist_dict[c['code']] = c['description']
        new_dict[camelCase(d['id'])] = codelist_dict
        
    return new_dict


In [28]:
series_code_lists('SL_EMP_INJUR', '2019.Q1.G.03')

{'seriesCode': 'SL_EMP_INJUR',
 'nature': {'C': 'Country data',
  'CA': 'Country adjusted data',
  'E': 'Estimated data',
  'G': 'Global monitoring data',
  'M': 'Modeled data',
  'N': 'Non-relevant',
  'NA': 'Data nature not available'},
 'units': {'PER_100000_EMP': 'Per 100,000 employees',
  'PER_100000_PRSN_INSR': 'Per 100,000 persons insured',
  'PER_100000_WKRS_EMP': 'Per 100,000 workers employed'},
 'migratoryStatus': {'_T': 'No breakdown',
  'MIGPER': 'Migrants',
  'NONMIG': 'Non-migrant',
  'EUMIG': 'EU Migrants',
  'NONEUMIG': 'Non-EU Migrants'},
 'reportingType': {'N': 'National', 'G': 'Global'},
 'sex': {'FEMALE': 'Female', 'MALE': 'Male', 'BOTHSEX': 'Both sexes'}}

#### Simplify further by presenting all the codes and their descriptions in a single table:

In [33]:
def series_code_lists2(seriesCode, release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2" 
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    series_attributes = responseData['attributes']
    series_dimensions = responseData['dimensions']
    
    code_list = []
    
    for a in series_attributes:
       
        for c in a['codes']:
            new_dict = {}
            new_dict['series'] = seriesCode
            new_dict['role'] = 'attribute'
            new_dict['concept'] = camelCase(a['id'])
            new_dict['code'] = c['code']
            new_dict['sdmx'] = c['sdmx']
            new_dict['description'] = c['description']
            code_list.append(new_dict)
        
    for d in series_dimensions:
        for c in d['codes']:
            new_dict = {}
            new_dict['series'] = seriesCode
            new_dict['role'] = 'dimension'
            new_dict['concept'] = camelCase(d['id'])
            new_dict['code'] = c['code']
            new_dict['sdmx'] = c['sdmx']
            new_dict['description'] = c['description']
            code_list.append(new_dict)
        
    return pd.DataFrame(code_list)


In [34]:
def series_code_lists3(seriesCode, release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2" 
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    series_attributes = responseData['attributes']
    series_dimensions = responseData['dimensions']
    
    concepts = []
    
    for d in series_dimensions:

        new_dict = {}
        new_dict['concept'] = camelCase(d['id'])
        new_dict['role'] = 'dimension'
        new_dict['codes'] = []
        for c in d['codes']:
            new_dict2 = {}
            new_dict2['code'] = c['code']
            new_dict2['sdmx'] = c['sdmx']
            new_dict2['description'] = c['description']
            new_dict['codes'].append(new_dict2)
        concepts.append(new_dict)
    
    for a in series_attributes:
        
        new_dict = {}
        new_dict['concept'] = camelCase(a['id'])
        new_dict['role'] = 'attribute'
        new_dict['codes'] = []
        for c in a['codes']:
            new_dict2 = {}
            new_dict2['code'] = c['code']
            new_dict2['sdmx'] = c['sdmx']
            new_dict2['description'] = c['description']
            new_dict['codes'].append(new_dict2)
        concepts.append(new_dict)
        
        
    return concepts #pd.DataFrame(code_list)


In [39]:
series_code_lists2('SL_EMP_INJUR', '2019.Q1.G.03')
series_code_lists3('SL_EMP_INJUR', '2019.Q1.G.03')

Unnamed: 0,code,concept,description,role,sdmx,series
0,C,nature,Country data,attribute,C,SL_EMP_INJUR
1,CA,nature,Country adjusted data,attribute,CA,SL_EMP_INJUR
2,E,nature,Estimated data,attribute,E,SL_EMP_INJUR
3,G,nature,Global monitoring data,attribute,G,SL_EMP_INJUR
4,M,nature,Modeled data,attribute,M,SL_EMP_INJUR
5,N,nature,Non-relevant,attribute,N,SL_EMP_INJUR
6,,nature,Data nature not available,attribute,,SL_EMP_INJUR
7,PER_100000_EMP,units,"Per 100,000 employees",attribute,PER_100000_EMP,SL_EMP_INJUR
8,PER_100000_PRSN_INSR,units,"Per 100,000 persons insured",attribute,PER_100000_PRSN_INSR,SL_EMP_INJUR
9,PER_100000_WKRS_EMP,units,"Per 100,000 workers employed",attribute,PER_100000_WKRS_EMP,SL_EMP_INJUR


[{'concept': 'migratoryStatus',
  'role': 'dimension',
  'codes': [{'code': '_T', 'sdmx': '_T', 'description': 'No breakdown'},
   {'code': 'MIGPER', 'sdmx': 'MIGPER', 'description': 'Migrants'},
   {'code': 'NONMIG', 'sdmx': 'NONMIG', 'description': 'Non-migrant'},
   {'code': 'EUMIG', 'sdmx': 'MS_EUMIGRANT', 'description': 'EU Migrants'},
   {'code': 'NONEUMIG',
    'sdmx': 'MS_NONEUMIGRANT',
    'description': 'Non-EU Migrants'}]},
 {'concept': 'reportingType',
  'role': 'dimension',
  'codes': [{'code': 'N', 'sdmx': 'N', 'description': 'National'},
   {'code': 'G', 'sdmx': 'G', 'description': 'Global'}]},
 {'concept': 'sex',
  'role': 'dimension',
  'codes': [{'code': 'FEMALE', 'sdmx': 'F', 'description': 'Female'},
   {'code': 'MALE', 'sdmx': 'M', 'description': 'Male'},
   {'code': 'BOTHSEX', 'sdmx': '_T', 'description': 'Both sexes'}]},
 {'concept': 'nature',
  'role': 'attribute',
  'codes': [{'code': 'C', 'sdmx': 'C', 'description': 'Country data'},
   {'code': 'CA', 'sdmx': '

#### Build query string to collect data for a specific series from the global SDG API

In [40]:
def series_query(seriesCode, release, page, pageSize):
    queryString =  r'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + '&page=' + str(page) + '&pageSize=' + str(pageSize)
    http = urllib3.PoolManager()
    response = http.request('GET', queryString)
    responseData =  json.loads(response.data.decode('UTF-8'))
    return(responseData)

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [41]:
def series_data(seriesCode, release):
    x = series_request_details(seriesCode,release)
    series_data = []
    if x['totalElements'] > 0:
        for p in range(x['nPages']):
            print("---Series " + seriesCode + ": Processing page " + str(p+1) + " of " + str(x['nPages']))
            responseData =  series_query(seriesCode, release, p+1, x['pageSize'])
            if len(responseData['data'])>0:
                series_data = series_data + responseData['data'] 
    return series_data

In [43]:
x = series_data('SL_EMP_INJUR','2019.Q1.G.03')
if len(x) > 0:
    x[0]

---Series SL_EMP_INJUR: Processing page 1 of 7
---Series SL_EMP_INJUR: Processing page 2 of 7
---Series SL_EMP_INJUR: Processing page 3 of 7
---Series SL_EMP_INJUR: Processing page 4 of 7
---Series SL_EMP_INJUR: Processing page 5 of 7
---Series SL_EMP_INJUR: Processing page 6 of 7
---Series SL_EMP_INJUR: Processing page 7 of 7


{'goal': ['8'],
 'target': ['8.8'],
 'indicator': ['8.8.1'],
 'series': 'SL_EMP_INJUR',
 'seriesDescription': 'Non-fatal occupational injuries among employees, by sex and migrant status (per 100,000 employees)',
 'seriesCount': '3353',
 'geoAreaCode': '12',
 'geoAreaName': 'Algeria',
 'timePeriodStart': 2000.0,
 'value': '1541.5',
 'valueType': 'Float',
 'time_detail': None,
 'source': 'ILOSTAT - ADM-IR - Insurance records',
 'footnotes': ['Coverage of occupational injuries: Compensated injuries | Reference group coverage: Insured persons'],
 'attributes': {'Nature': 'C', 'Units': 'PER_100000_EMP'},
 'dimensions': {'Sex': 'BOTHSEX',
  'Migratory status': '_T',
  'Reporting Type': 'G'}}

#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.
Also convert the years (`timePeriod`) variable to `int`

In [44]:
geo = geoareasXY(geoAreas, wd_dir + 'globalResources/refAreas.txt')
geo[0]

{'geoAreaCode': '001',
 'geoAreaName': 'World',
 'level': 1,
 'parentCode': None,
 'parentName': None,
 'type': 'Region',
 'Country_Profile': None,
 'ISO3': None,
 'UN_Member': None,
 'X': None,
 'Y': None,
 'areaName': None}

In [45]:
def flat_series_data(seriesCode,release):

    codeLists = series_code_lists3(seriesCode,release)
    
    new_x = []
    for d in series_data(seriesCode,release):
        new_d = {}
        for key, value in d.items():
            if type(value) is list:
                new_d[key] = ', '.join(value)
            elif type(value) is dict:
                for k, v in value.items():
                    new_d[camelCase(k+' Code')] = v
                    for cl in codeLists:
                        if cl['concept'] == camelCase(k):
                            for c in cl['codes']:
                                if c['code'] == v:
                                    new_d[camelCase(k+' Desc')] = c['description']
                                    new_d[camelCase(k+' Code')] = c['sdmx']
                                    break
                            break
            elif key == 'time_detail':
                new_d[camelCase(key)] = value
            elif key == 'timePeriodStart':
                new_d['timePeriod'] = int(value)
            elif key == 'series':
                new_d['seriesCode'] = value
            elif key == 'seriesDescription':
                new_d['seriesDesc'] = value
            elif key == 'geoAreaCode':
                new_d['geoAreaCode'] = str(value).zfill(3)
            else:
                new_d[key] = value

        new_d['value_numeric_part'] = numeric_part(new_d['value'])
        new_d['value_is_censored'] = (new_d['valueType'] != 'Float')
        new_d['value_detail'] = new_d['value']

        del new_d['value']
        del new_d['valueType']
        del new_d['seriesCount']

        new_x.append(new_d)

    return new_x


In [46]:
x = flat_series_data('SL_EMP_INJUR','2019.Q1.G.03')


---Series SL_EMP_INJUR: Processing page 1 of 7
---Series SL_EMP_INJUR: Processing page 2 of 7
---Series SL_EMP_INJUR: Processing page 3 of 7
---Series SL_EMP_INJUR: Processing page 4 of 7
---Series SL_EMP_INJUR: Processing page 5 of 7
---Series SL_EMP_INJUR: Processing page 6 of 7
---Series SL_EMP_INJUR: Processing page 7 of 7


In [47]:
x[0]

{'goal': '8',
 'target': '8.8',
 'indicator': '8.8.1',
 'seriesCode': 'SL_EMP_INJUR',
 'seriesDesc': 'Non-fatal occupational injuries among employees, by sex and migrant status (per 100,000 employees)',
 'geoAreaCode': '012',
 'geoAreaName': 'Algeria',
 'timePeriod': 2000,
 'timeDetail': None,
 'source': 'ILOSTAT - ADM-IR - Insurance records',
 'footnotes': 'Coverage of occupational injuries: Compensated injuries | Reference group coverage: Insured persons',
 'natureCode': 'C',
 'natureDesc': 'Country data',
 'unitsCode': 'PER_100000_EMP',
 'unitsDesc': 'Per 100,000 employees',
 'sexCode': '_T',
 'sexDesc': 'Both sexes',
 'migratoryStatusCode': '_T',
 'migratoryStatusDesc': 'No breakdown',
 'reportingTypeCode': 'G',
 'reportingTypeDesc': 'Global',
 'value_numeric_part': 1541.5,
 'value_is_censored': False,
 'value_detail': '1541.5'}

##### Select distinct indicator-series that are included in the dataset

When there is a 'multi-purpose indicator', the same data series is part of two different goal-target-indicator specification.

In [48]:
indicator_series = unique_dicts(subdict_list(x,['goal', 'target', 'indicator', 'seriesCode', 'seriesDesc'])
)

Now, for each `indicator_series`, we want to build a json file that contains all the necessary information to be published as a layer. This requires to build a tree structure with the following levels:
- Information about the goal, target, indicator, and series
- Information about the geographic reference area
- Data grouped by time period

In [49]:
series = 'SL_EMP_INJUR'
release = '2019.Q1.G.03'

x = flat_series_data(series,release)

indicator_series =  unique_dicts(subdict_list(x,['goal', 'target', 'indicator', 'seriesCode', 'seriesDesc'])
)

---Series SL_EMP_INJUR: Processing page 1 of 7
---Series SL_EMP_INJUR: Processing page 2 of 7
---Series SL_EMP_INJUR: Processing page 3 of 7
---Series SL_EMP_INJUR: Processing page 4 of 7
---Series SL_EMP_INJUR: Processing page 5 of 7
---Series SL_EMP_INJUR: Processing page 6 of 7
---Series SL_EMP_INJUR: Processing page 7 of 7


In [50]:
x[0]
geo[0]

{'goal': '8',
 'target': '8.8',
 'indicator': '8.8.1',
 'seriesCode': 'SL_EMP_INJUR',
 'seriesDesc': 'Non-fatal occupational injuries among employees, by sex and migrant status (per 100,000 employees)',
 'geoAreaCode': '012',
 'geoAreaName': 'Algeria',
 'timePeriod': 2000,
 'timeDetail': None,
 'source': 'ILOSTAT - ADM-IR - Insurance records',
 'footnotes': 'Coverage of occupational injuries: Compensated injuries | Reference group coverage: Insured persons',
 'natureCode': 'C',
 'natureDesc': 'Country data',
 'unitsCode': 'PER_100000_EMP',
 'unitsDesc': 'Per 100,000 employees',
 'sexCode': '_T',
 'sexDesc': 'Both sexes',
 'migratoryStatusCode': '_T',
 'migratoryStatusDesc': 'No breakdown',
 'reportingTypeCode': 'G',
 'reportingTypeDesc': 'Global',
 'value_numeric_part': 1541.5,
 'value_is_censored': False,
 'value_detail': '1541.5'}

{'geoAreaCode': '001',
 'geoAreaName': 'World',
 'level': 1,
 'parentCode': None,
 'parentName': None,
 'type': 'Region',
 'Country_Profile': None,
 'ISO3': None,
 'UN_Member': None,
 'X': None,
 'Y': None,
 'areaName': None}

In [54]:
def get_data_to_json(series, release):
    
    x = flat_series_data(series,release)

    indicator_series =  unique_dicts(subdict_list(x,['goal', 'target', 'indicator', 'seriesCode', 'seriesDesc'])
    )

    dataset = []
    for s in indicator_series:
        d = s.copy()
        d['release'] = release
        indicator = d['indicator']
        data = select_dict(x, 'indicator', indicator)
        d['refAreas'] = geo.copy()
        for g in d['refAreas']:
            g_data = subdict_list(select_dict(data, 'geoAreaCode', g['geoAreaCode']),
                                  ['goal', 'target', 'indicator', 'seriesCode', 'seriesDesc','geoAreaCode', 'geoAreaName'], 
                                  exclude = True)
            g['data'] = g_data

        file_name = release + '\Indicator_' + indicator + '_Series_' + d['seriesCode'] + '.json'

        with open(wd_dir + r'data\unsd\\' + file_name, 'w') as f:
            json.dump(d, f, indent=4)
        
        print('created file ' + file_name)


In [55]:
get_data_to_json(series = 'SL_EMP_INJUR', release = '2019.Q1.G.03')

---Series SL_EMP_INJUR: Processing page 1 of 7
---Series SL_EMP_INJUR: Processing page 2 of 7
---Series SL_EMP_INJUR: Processing page 3 of 7
---Series SL_EMP_INJUR: Processing page 4 of 7
---Series SL_EMP_INJUR: Processing page 5 of 7
---Series SL_EMP_INJUR: Processing page 6 of 7
---Series SL_EMP_INJUR: Processing page 7 of 7
created file 2019.Q1.G.03\Indicator_8.8.1_Series_SL_EMP_INJUR.json


#### Produce 'long' files for each indicator/series combination
(Notice that multi-purpose indicators need to be split)

In [56]:
series = []
for g in metadata:
    for t in g['targets']:
        for i in t['indicators']:
            if 'series' in i.keys():
                for s in i['series']:
                    series.append(s['code'])
series = list(set(series))

for s in series:
    get_data_to_json(series = s, release = '2019.Q1.G.03')

---Series SG_NHR_INTEXST: Processing page 1 of 1
created file 2019.Q1.G.03\Indicator_16.a.1_Series_SG_NHR_INTEXST.json
---Series SI_COV_PENSN: Processing page 1 of 2
---Series SI_COV_PENSN: Processing page 2 of 2
created file 2019.Q1.G.03\Indicator_1.3.1_Series_SI_COV_PENSN.json
---Series SG_PLN_MSTKSDG: Processing page 1 of 1
created file 2019.Q1.G.03\Indicator_17.16.1_Series_SG_PLN_MSTKSDG.json
---Series EN_WWT_WWDS: Processing page 1 of 1
created file 2019.Q1.G.03\Indicator_6.3.1_Series_EN_WWT_WWDS.json
---Series SG_REG_BRTH90: Processing page 1 of 1
created file 2019.Q1.G.03\Indicator_17.19.2_Series_SG_REG_BRTH90.json
---Series DC_TOF_TRDCML: Processing page 1 of 4
---Series DC_TOF_TRDCML: Processing page 2 of 4
---Series DC_TOF_TRDCML: Processing page 3 of 4
---Series DC_TOF_TRDCML: Processing page 4 of 4
created file 2019.Q1.G.03\Indicator_8.a.1_Series_DC_TOF_TRDCML.json
---Series SH_STA_MMR: Processing page 1 of 35
---Series SH_STA_MMR: Processing page 2 of 35
---Series SH_STA_M