# Pre-processing script 2
- pivot into "wide" format

### Load necessary libraries

In [1]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import hashlib
import numpy as np
import pandas as pd
import re
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [2]:

# '__file__' is passed as a quoted string literal, not as the module attribute,
# so realpath() resolves it against the current working directory and
# dirname() then yields that directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Directory paths are relative and end with '/' ; the cells below build file
# paths from output_dir by plain string concatenation.
data_dir = r'../../data/unsd/UNSYB/input/'
print('data inputs dir: ' + data_dir)

output_dir = r'../../data/unsd/UNSYB/output/'
print('outputs dir: ' + output_dir)



C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\notebooks\unsdSYB
data inputs dir: ../../data/unsd/UNSYB/input/
outputs dir: ../../data/unsd/UNSYB/output/


## Utilities

#### Convert string to camelCase

In [3]:
def camelCase(st):
    """
    Convert a string to camelCase.

    The string is title-cased, every non-alphanumeric character is dropped,
    and the first remaining character is lower-cased.

    Adapted from:
    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    Parameters
    ----------
    st : str
        Text to convert.

    Returns
    -------
    str
        The camelCase form of `st`; '' when `st` contains no alphanumeric
        characters (including the empty string).
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Slicing (rather than indexing [0]) keeps an empty result from raising
    # IndexError.
    return output[:1].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [4]:
# Suppress urllib3's InsecureRequestWarning for the rest of the session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [5]:
def numeric_part(v):
    """
    Extract the first numeric value embedded in `v`.

    Recognised form: optional '-', digits, optional fractional part, and an
    optional exponent written with 'e' or 'E' and an optional '+' or '-'
    sign (e.g. '12', '-3.5', '1.5e-3', '2E+6').

    Parameters
    ----------
    v : object
        Value to scan. Non-string values are converted with str() first.

    Returns
    -------
    float or None
        The first number found, or None when `v` is None or contains no
        number.
    """
    if v is None:
        return None

    # search() returns the first match, which is what findall()[0] gave before.
    # The exponent accepts upper-case 'E' and an explicit '+' so that values
    # such as '1.5E3' or '2e+2' are not truncated to their mantissa.
    match = re.search(r'-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?', str(v))
    if match is None:
        return None
    return float(match.group(0))


#### Return empty string if None

In [6]:
def xstr(s):
    """Return str(s), except that None becomes the empty string."""
    return '' if s is None else str(s)

#### Compute a hash of a dictionary

In [7]:
def dict_hash(d):
    """
    Compute an MD5 digest identifying the contents of a dictionary.

    Keys and values are compared through their str() form, so 1 and '1'
    are treated as the same value.

    Parameters
    ----------
    d : dict
        Dictionary to fingerprint.

    Returns
    -------
    str
        32-character hexadecimal digest. Two dictionaries holding the same
        key/value pairs get the same digest regardless of insertion order.
    """
    out = hashlib.md5()
    # Sort by key so the digest does not depend on insertion order.
    for key, value in sorted(d.items(), key=lambda item: str(item[0])):
        # A NUL byte after each field stops {'ab': 'c'} and {'a': 'bc'}
        # from feeding identical bytes into the hash.
        out.update(str(key).encode('utf-8'))
        out.update(b'\x00')
        out.update(str(value).encode('utf-8'))
        out.update(b'\x00')
    return out.hexdigest()


#### Get unique dictionaries in a list

In [8]:
def unique_dicts(dictionary_list):
    """
    Remove duplicate dictionaries from a list.

    Dictionaries are considered duplicates when dict_hash() gives them the
    same digest. The result keeps the position of the first occurrence of
    each digest and the dictionary object of its last occurrence.
    """
    by_digest = {dict_hash(entry): entry for entry in dictionary_list}
    return list(by_digest.values())


#### Extract a subset of key-value pairs from each dictionary in a list

In [9]:
def subdict_list(dict_list, keys_list, exclude = False):
    """
    Project every dictionary in `dict_list` onto a subset of its keys.

    exclude = False: keep only the keys in `keys_list` (in that order);
                     dictionaries missing any of those keys are skipped.
    exclude = True:  keep every key that is NOT in `keys_list`; no
                     dictionary is skipped.

    Returns a new list of new dictionaries.
    """
    if exclude:
        return [
            {key: value for key, value in d.items() if key not in keys_list}
            for d in dict_list
        ]

    return [
        {key: d[key] for key in keys_list}
        for d in dict_list
        if set(keys_list).issubset(d.keys())
    ]




#### Select the dictionaries in a list that contain a given key-value pair

In [10]:
def select_dict(dict_list, k, v):
    """
    Return the dictionaries in `dict_list` that have key `k` with value `v`.

    Dictionaries without key `k` are ignored; the input order is kept.
    """
    return [d for d in dict_list if k in d and d[k] == v]

#### Find the coverage of an (unordered) list of years

In [11]:
def year_intervals (years_list):
    """
    Summarise the coverage of a list of years as comma-separated intervals.

    Parameters
    ----------
    years_list : iterable
        Years as ints or int-like strings, in any order (the values are
        sorted internally). Duplicates are allowed.

    Returns
    -------
    str
        Runs of consecutive years are written 'start-end', isolated years
        on their own, all joined by ','. For example
        ['1995', '2000', '1996', '2001'] gives '1995-1996,2000-2001'.
        An empty input gives ''.
    """
    years = sorted(map(int, years_list))

    # Nothing to cover: return an empty label instead of failing on years[0].
    if not years:
        return ''

    intervals = []
    start = previous = years[0]

    for year in years[1:]:
        # A gap of more than one year closes the current run.
        if year - previous > 1:
            intervals.append((start, previous))
            start = year
        previous = year

    intervals.append((start, previous))

    return ",".join(
        str(first) if first == last else str(first) + '-' + str(last)
        for first, last in intervals
    )


In [12]:
# Quick checks of year_intervals: unsorted input with two runs, a single
# year, and two isolated years.
year_intervals(['1995','2000', '1996', '2001','2002','2003','2004'])
year_intervals(['1995'])
year_intervals(['2000','2004'])

'1995-1996,2000-2004'

'1995'

'2000,2004'

### Read data file

In [13]:
# Identify one series file by its zero-padded topic and table ids and its
# series code.
topic = '02'
table = '008'
series = 'SYB011'

# File names follow the pattern Topic<topic>_Table<table>_Series<series>.json
file = 'Topic'+topic+'_Table'+table+'_Series'+series+'.json'

# The file is read from output_dir (defined in the user-parameters cell).
with open(output_dir + file) as json_file:  
    data = json.load(json_file)

In [14]:
def read_data_file(topic, table, series, dir_path):
    """
    Load the JSON file of one series.

    Parameters
    ----------
    topic : str
        Topic id as it appears in the file name (e.g. '02').
    table : str
        Table id as it appears in the file name (e.g. '008').
    series : str
        Series code (e.g. 'SYB011').
    dir_path : str
        Directory containing the file, with or without a trailing
        path separator.

    Returns
    -------
    object
        The parsed content of
        'Topic<topic>_Table<table>_Series<series>.json'.
    """
    file = 'Topic'+topic+'_Table'+table+'_Series'+series+'.json'
    # os.path.join gives the same path as plain concatenation when dir_path
    # ends with a separator, and inserts one when it does not.
    with open(os.path.join(dir_path, file)) as json_file:
        return json.load(json_file)

In [15]:
# Re-read the sample series through the helper and list its top-level keys.
data = read_data_file(topic = '02', table = '008', series = 'SYB011', dir_path = output_dir)
data.keys()

# Series-level metadata: every top-level key except the 'sources',
# 'footnotes' and 'refAreas' collections.
data_header = {k: data[k] for k in data.keys() if k in ['topicId', 'topicNameEN', 'topicNameFR',
                                          'tableCode', 'tableId', 'tableName', 'tableStatusId', 'tbBkCode', 'tbBkPrint', 
                                          'newSeriesName', 'SELECT', 'seriesCode', 'SYB_series']}

# Lookup list of sources for this series.
data_sources = data['sources']

# Lookup list of footnotes for this series.
data_footnotes = data['footnotes']



dict_keys(['topicId', 'topicNameEN', 'topicNameFR', 'tableCode', 'tableId', 'tableName', 'tableStatusId', 'tbBkCode', 'tbBkPrint', 'newSeriesName', 'SELECT', 'seriesCode', 'SYB_series', 'sources', 'footnotes', 'refAreas'])

In [16]:
# Display the series header and the first source and footnote entries.
data_header

data_sources[0]
data_footnotes[0]

{'topicId': 2,
 'topicNameEN': 'Population and migration',
 'topicNameFR': 'Population et migration',
 'tableCode': 'lifecbmort',
 'tableId': 8,
 'tableName': 'Population growth and indicators of fertility and mortality',
 'tableStatusId': 3,
 'tbBkCode': 'X03 ',
 'tbBkPrint': 'Y',
 'newSeriesName': 'Infant mortality ratio',
 'SELECT': None,
 'seriesCode': 'SYB011',
 'SYB_series': [253]}

{'sourceCode': 'UNSD_DYB',
 'sourceId': 35,
 'sourceNameEN': 'United Nations Statistics Division, New York, "Demographic Yearbook 2015" and the demographic statistics database, last accessed June 2017.',
 'sourceNameFR': 'Organisation des Nations Unies, Division de statistique, New York, Annuaire démographique 2015 et recueil de statistiques démographiques, denier accès juin 2017.'}

{'footnoteId': 1443,
 'footnoteCode': 'incuneastjerusalem',
 'footnoteTextEN': 'Including East Jerusalem.',
 'footnoteTextFR': 'Y compris Jérusalem-Est.'}

### List of countries to be plotted on a map (with XY coordinates)

In [17]:
def countryListXY(file):
    """
    Read the tab-delimited reference-area file into a DataFrame.

    The file is opened with latin-1 encoding and its first line is used as
    the column header. Every row becomes one DataFrame row, and the 'M49'
    column is cast to string.

    Parameters
    ----------
    file : str
        Path of the tab-delimited text file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    with open(file, newline = '', encoding='latin-1') as handle:
        records = [dict(row) for row in csv.DictReader(handle, delimiter='\t')]

    return pd.DataFrame(records).astype({'M49':'str'})

In [18]:
# Load the reference areas as a list of dicts (one per row) and show the first.
country_dictXY = countryListXY('../../globalResources/refAreas.txt').to_dict(orient = 'records')
country_dictXY[0]

{'Country_Profile': '1',
 'ISO3': 'AFG',
 'M49': '4',
 'UN_Member': '1',
 'X': '66.02688198',
 'Y': '33.83160199',
 'areaName': 'Afghanistan'}

### Add coordinates to data file

In [19]:
def add_coordinates (data, coordinates):
    """
    Return a copy of a series dictionary whose reference areas carry
    map coordinates and country attributes.

    Parameters
    ----------
    data : dict
        Series dictionary with a 'refAreas' list; every reference area needs
        the keys 'refAreaCode' and 'data'.
    coordinates : list of dict
        Rows with the keys 'M49', 'Country_Profile', 'ISO3', 'UN_Member',
        'X' and 'Y'. 'M49' may be a string or a number.

    Returns
    -------
    dict
        All top-level entries of `data` other than 'refAreas', plus a new
        'refAreas' list. Each new reference area holds the original non-'data'
        entries, then 'countryProfile', 'ISO3', 'UN_Member', 'X', 'Y'
        (None when no coordinate row has the same code), then the original
        'data' object (shared, not copied).

    Codes are compared after zero-padding to three characters, so 4, '4' and
    '004' all match. If several coordinate rows share a code, the last one
    is used.
    """
    # Index the coordinate rows once instead of scanning the whole list for
    # every reference area. Later rows overwrite earlier ones, which keeps
    # the "last matching row wins" outcome of a full scan.
    xy_by_code = {str(xy['M49']).zfill(3): xy for xy in coordinates}

    new_data = {k: v for k, v in data.items() if k != 'refAreas'}
    new_data['refAreas'] = []

    for g in data['refAreas']:

        geo_details = {k: v for k, v in g.items() if k != 'data'}

        xy = xy_by_code.get(str(g['refAreaCode']).zfill(3))
        found = xy is not None

        geo_details['countryProfile'] = xy['Country_Profile'] if found else None
        geo_details['ISO3'] = xy['ISO3'] if found else None
        geo_details['UN_Member'] = xy['UN_Member'] if found else None
        geo_details['X'] = xy['X'] if found else None
        geo_details['Y'] = xy['Y'] if found else None

        # 'data' is added last so it stays at the end of the dictionary.
        geo_details['data'] = g['data']

        new_data['refAreas'].append(geo_details)

    return new_data




In [20]:
# Attach coordinates to the sample series.
data = add_coordinates(read_data_file(topic = '02', table = '008', series = 'SYB011', dir_path = output_dir),
                       coordinates = countryListXY('../../globalResources/refAreas.txt').to_dict(orient = 'records'))


# Series-level entries only (the large collections are left out of the display).
{k: data[k] for k in data.keys() if k not in ['refAreas', 'sources', 'footnotes']}

# Attributes of the third reference area, without its data records.
{k: data['refAreas'][2][k] for k in data['refAreas'][2].keys() if k not in ['data']}

{'topicId': 2,
 'topicNameEN': 'Population and migration',
 'topicNameFR': 'Population et migration',
 'tableCode': 'lifecbmort',
 'tableId': 8,
 'tableName': 'Population growth and indicators of fertility and mortality',
 'tableStatusId': 3,
 'tbBkCode': 'X03 ',
 'tbBkPrint': 'Y',
 'newSeriesName': 'Infant mortality ratio',
 'SELECT': None,
 'seriesCode': 'SYB011',
 'SYB_series': [253]}

{'parentRegionId': '34',
 'parentRegion_DescEN': 'Southern Asia',
 'refAreaType': 'Area',
 'refAreaCode': 4,
 'refAreaDesc': 'Afghanistan',
 'countryProfile': '1',
 'ISO3': 'AFG',
 'UN_Member': '1',
 'X': '66.02688198',
 'Y': '33.83160199'}

### Select `refAreas` that have coordinates

In [21]:
# Keep only the reference areas for which both an X and a Y coordinate were found.
ref_areas_publish = [area for area in data['refAreas'] if area['X'] and area['Y']]


In [22]:
# Display the first publishable reference area, including its data records.
ref_areas_publish[0]

{'parentRegionId': '34',
 'parentRegion_DescEN': 'Southern Asia',
 'refAreaType': 'Area',
 'refAreaCode': 4,
 'refAreaDesc': 'Afghanistan',
 'countryProfile': '1',
 'ISO3': 'AFG',
 'UN_Member': '1',
 'X': '66.02688198',
 'Y': '33.83160199',
 'data': [{'year': 2010,
   'value': 76.725,
   'sourceId': 35,
   'footnoteId': [61246],
   'units': 'Number of deaths per 1,000 live births',
   'multiplier': 0,
   'baseYear': None},
  {'year': 2015,
   'value': 68.558,
   'sourceId': 35,
   'footnoteId': [61246],
   'units': 'Number of deaths per 1,000 live births',
   'multiplier': 0,
   'baseYear': None},
  {'year': 1985,
   'value': 158.868,
   'sourceId': 35,
   'footnoteId': [61246],
   'units': 'Number of deaths per 1,000 live births',
   'multiplier': 0,
   'baseYear': None},
  {'year': 1990,
   'value': 133.93,
   'sourceId': 35,
   'footnoteId': [61246],
   'units': 'Number of deaths per 1,000 live births',
   'multiplier': 0,
   'baseYear': None},
  {'year': 1995,
   'value': 111.164,


### Select distinct years among all data records:

In [23]:
def distinct_years (ref_areas_publish):
    """
    Return the sorted list of distinct years found in the data records of
    all reference areas.

    Records without a 'year' key are ignored. Distinctness is decided by
    unique_dicts().
    """
    year_records = [
        record
        for area in ref_areas_publish
        for record in subdict_list(area['data'], ['year'], exclude = False)
    ]

    return sorted(entry['year'] for entry in unique_dicts(year_records))
    


In [24]:
# Distinct years across all publishable reference areas of the sample series.
distinct_years(ref_areas_publish)

[1985, 1990, 1995, 2000, 2005, 2008, 2010, 2011, 2012, 2013, 2014, 2015, 2020]

### Select distinct slices among all data records

In [25]:
def distinct_slices(ref_areas_publish):
    """
    Return the distinct "slices" found in the data records of all
    reference areas.

    A slice is what remains of a record once 'year', 'value', 'sourceId'
    and 'footnoteId' are removed. Distinctness is decided by unique_dicts().
    """
    non_slice_keys = ['year', 'value', 'sourceId', 'footnoteId']

    candidates = []
    for area in ref_areas_publish:
        candidates += subdict_list(area['data'], non_slice_keys, exclude = True)

    return unique_dicts(candidates)

In [26]:
# Distinct slices across all publishable reference areas of the sample series.
distinct_slices(ref_areas_publish)

[{'units': 'Number of deaths per 1,000 live births',
  'multiplier': 0,
  'baseYear': None}]

### Open catalogue of SYB Series

In [27]:
# Load the series catalogue: a list of topics, each with 'tables', each table
# with 'series' (see the displayed first entry below).
with open(output_dir + 'SYB_Series_Catalog.json') as json_file:  
    catalogue = json.load(json_file)
catalogue[0]

{'topicId': 2,
 'topicNameEN': 'Population and migration',
 'topicNameFR': 'Population et migration',
 'tables': [{'tableCode': 'lifecbmort',
   'tableId': 8,
   'tableName': 'Population growth and indicators of fertility and mortality',
   'tableNote': None,
   'tableStatusId': 3,
   'tbBkCode': 'X03 ',
   'tbBkPrint': 'Y',
   'series': [{'newSeriesName': 'Infant mortality ratio',
     'units': 'Number of deaths per 1,000 live births',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB011',
     'SYB_series': [253]},
    {'newSeriesName': 'Life expectancy at birth',
     'units': 'Number of years',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB012',
     'SYB_series': [36, 268, 269]},
    {'newSeriesName': 'Maternal mortality ratio',
     'units': 'Number of deaths per 1,000 population',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB013',
     'SYB_series': [115

In [28]:
# ---------------------------------------------------------------------------
# Pivot every series of the catalogue into "wide" format and write it to
# 'wide_Topic<tt>_Table<ttt>_Series<code>.json' in output_dir.
#
# For every reference area that has coordinates, and for every distinct slice
# of the series, one wide row is produced with:
#   - the slice's own keys, plus 'multiplierDesc'
#   - 'footnotesEN' / 'footnotesFR' and 'sourceEN' / 'sourceFR'
#   - one 'value_<year>' entry per distinct year of the series
#   - 'value_latest_year' and 'latest_year'
#
# Relies on functions defined in earlier cells: xstr, year_intervals,
# select_dict, distinct_years, distinct_slices, read_data_file,
# countryListXY and add_coordinates.
# ---------------------------------------------------------------------------

def _multiplier_desc(multiplier):
    """Return the label of a power-of-ten multiplier, or None if it has none."""
    labels = {3: 'Thousands', 6: 'Millions', 9: 'Billions', 12: 'Trillions'}
    return labels.get(multiplier)


def _text_lookup(entries, id_key, en_key, fr_key):
    """Map each entry id to its (English, French) texts; None texts become ''."""
    return {
        entry[id_key]: (xstr(entry.get(en_key)), xstr(entry.get(fr_key)))
        for entry in entries
    }


def _joined_notes(years_by_id, texts, full_coverage):
    """
    Build the English and French note strings of one slice.

    years_by_id maps a note id to the years of the slice records carrying it.
    A note is prefixed with '[<years>]: ' unless its years equal
    full_coverage. Returns (None, None) when there are no notes.
    """
    notes_en = []
    notes_fr = []

    for note_id, note_years in years_by_id.items():
        # An id missing from the lookup gets empty text; it must not inherit
        # the text of whichever note happened to be processed before it.
        text_en, text_fr = texts.get(note_id, ('', ''))

        coverage = year_intervals(note_years)
        prefix = '' if coverage == full_coverage else '[' + coverage + ']: '

        notes_en.append(prefix + text_en)
        notes_fr.append(prefix + text_fr)

    if not notes_en:
        return None, None

    # Each language is sorted on its own text, then joined with a space.
    return ' '.join(sorted(notes_en)), ' '.join(sorted(notes_fr))


def _pivot_slice(records, slice_keys, years, footnote_texts, source_texts, full_coverage):
    """Return the wide row of one slice for the records of one reference area."""

    # Records belonging to the slice: every slice key present with an equal value.
    slice_data = [
        r for r in records
        if all(k in r and r[k] == v for k, v in slice_keys.items())
    ]

    # Years in which each footnote / source applies within the slice.
    footnote_years = {}
    source_years = {}
    for r in slice_data:
        for fn in set(r.get('footnoteId') or []):
            footnote_years.setdefault(fn, []).append(r['year'])
        # Records with a missing or empty source id contribute no source.
        if r.get('sourceId'):
            source_years.setdefault(r['sourceId'], []).append(r['year'])

    wide = slice_keys.copy()

    # The label comes from the slice's own multiplier, so it always agrees
    # with the 'multiplier' value stored in the same row.
    wide['multiplierDesc'] = _multiplier_desc(slice_keys.get('multiplier'))

    wide['footnotesEN'], wide['footnotesFR'] = _joined_notes(footnote_years, footnote_texts, full_coverage)
    wide['sourceEN'], wide['sourceFR'] = _joined_notes(source_years, source_texts, full_coverage)

    # One column per year of the series; when several records share a year,
    # the last one provides the value.
    for y in years:
        same_year = select_dict(slice_data, 'year', y)
        wide['value_' + str(y)] = same_year[-1].get('value') if same_year else None

    slice_years = [r['year'] for r in slice_data if 'year' in r]
    if slice_years:
        latest = max(slice_years)
        wide['value_latest_year'] = wide['value_' + str(latest)]
        wide['latest_year'] = latest
    else:
        wide['value_latest_year'] = None
        wide['latest_year'] = None

    return wide


# The reference-area file is the same for every series: read it once.
ref_area_coordinates = countryListXY('../../globalResources/refAreas.txt').to_dict(orient = 'records')

for t1 in catalogue:
    for t2 in t1['tables']:
        for s in t2['series']:

            topic_code = str(t1['topicId']).zfill(2)
            table_code = str(t2['tableId']).zfill(3)

            # Read data file and attach coordinates:
            data = add_coordinates(read_data_file(topic = topic_code,
                                                  table = table_code,
                                                  series = s['seriesCode'],
                                                  dir_path = output_dir),
                                   ref_area_coordinates)

            footnote_texts = _text_lookup(data.get('footnotes') or [],
                                          'footnoteId', 'footnoteTextEN', 'footnoteTextFR')
            source_texts = _text_lookup(data.get('sources') or [],
                                        'sourceId', 'sourceNameEN', 'sourceNameFR')

            # Select only refAreas that have coordinates:
            ref_areas_publish = [d for d in data['refAreas'] if d['X'] and d['Y']]

            # Unique time periods and unique slices among all records:
            years = distinct_years(ref_areas_publish)
            slices = distinct_slices(ref_areas_publish)

            # Coverage label of the whole series, computed once per series.
            full_coverage = year_intervals(years) if years else ''

            for g in ref_areas_publish:
                data_new = [
                    _pivot_slice(g['data'], j, years, footnote_texts, source_texts, full_coverage)
                    for j in slices
                ]
                # Replace the long-format records with the wide rows, keeping
                # 'data' as the last key of the reference area.
                del g['data']
                g['data'] = data_new

            new_data = {}
            new_data['topicId'] = data['topicId']
            new_data['topicNameEN'] = data['topicNameEN']
            new_data['topicNameFR'] = data['topicNameFR']
            new_data['tableCode'] = data['tableCode']
            new_data['tableId'] = data['tableId']
            new_data['tableName'] = data['tableName']
            new_data['seriesDesc'] = data['newSeriesName']
            new_data['seriesCode'] = data['seriesCode']
            new_data['SYB_series'] = data['SYB_series']
            new_data['data'] = ref_areas_publish

            file_name = 'wide_Topic' + topic_code + '_Table' + table_code + '_Series' + s['seriesCode'] + '.json'

            with open(output_dir + file_name, 'w') as f:
                json.dump(new_data, f, indent=4)

            print(file_name)


                            

                        

wide_Topic02_Table008_SeriesSYB011.json
wide_Topic02_Table008_SeriesSYB012.json
wide_Topic02_Table008_SeriesSYB013.json
wide_Topic02_Table008_SeriesSYB014.json
wide_Topic02_Table008_SeriesSYB015.json
wide_Topic02_Table078_SeriesSYB072.json
wide_Topic02_Table078_SeriesSYB073.json
wide_Topic02_Table078_SeriesSYB074.json
wide_Topic02_Table078_SeriesSYB075.json
wide_Topic02_Table078_SeriesSYB076.json
wide_Topic02_Table078_SeriesSYB077.json
wide_Topic02_Table083_SeriesSYB078.json
wide_Topic02_Table083_SeriesSYB079.json
wide_Topic02_Table083_SeriesSYB080.json
wide_Topic02_Table083_SeriesSYB081.json
wide_Topic02_Table083_SeriesSYB082.json
wide_Topic02_Table089_SeriesSYB087.json
wide_Topic02_Table089_SeriesSYB088.json
wide_Topic02_Table089_SeriesSYB089.json
wide_Topic02_Table089_SeriesSYB090.json
wide_Topic02_Table089_SeriesSYB091.json
wide_Topic03_Table001_SeriesSYB001.json
wide_Topic03_Table001_SeriesSYB002.json
wide_Topic03_Table109_SeriesSYB108.json
wide_Topic03_Table109_SeriesSYB109.json
