# Integration example using SSSOM mappings of country classification

In [1]:
import pandas as pd
import os
from IPython.display import clear_output
import requests
import json
from fuzzywuzzy import fuzz
from datetime import datetime

## Load original SDG dataset

### 1a. Data table

In [2]:
# Load the SDG observations with every column kept as text so that codes are
# not coerced to numbers.
# keep_default_na=False stops pandas from treating literal tokens such as
# 'NA', 'N/A' or 'NULL' as missing values; with the default setting they would
# become NaN even under dtype=str and then be blanked by fillna(''), silently
# erasing what may be a legitimate code. Empty cells still end up as ''.
data_sdg = pd.read_csv(
    '../datasets/observations_SI_POV_EMP1.csv',
    dtype=str,
    keep_default_na=False,
).fillna('')
data_sdg.head(3)

Unnamed: 0,SERIES_CODE,SERIES_DESCRIPTION,VARIABLE_CODE,VARIABLE_DESCRIPTION,VARIABLE_ACTIVE_DIMS,GEOGRAPHY_CODE,GEOGRAPHY_NAME,GEOGRAPHY_TYPE,GEO_AREA_CODE,GEO_AREA_NAME,...,UNIT_MULT,BASE_PERIOD,NATURE,SOURCE,GEO_INFO_URL,FOOT_NOTE,REPORTING_TYPE,OBS_STATUS,RELEASE_STATUS,RELEASE_NAME
0,SI_POV_EMP1,Employed population below international povert...,SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],1,World,Region,1,World,...,,,N,"Source = ILO modelled estimates, Nov. 2022",,,G,E,Published,2023.Q2.G.01
1,SI_POV_EMP1,Employed population below international povert...,SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],1,World,Region,1,World,...,,,N,"Source = ILO modelled estimates, Nov. 2022",,,G,E,Published,2023.Q2.G.01
2,SI_POV_EMP1,Employed population below international povert...,SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],1,World,Region,1,World,...,,,N,"Source = ILO modelled estimates, Nov. 2022",,,G,E,Published,2023.Q2.G.01


### 2a. Schema

In [3]:
# Describe every column of the SDG data table as a concept: its prefixed ID,
# a human-readable name, whether its values are coded, and its role.

# Coded concepts that act as dimensions.
sdg_coded_dimensions = {'SERIES_CODE', 'GEOGRAPHY_CODE', 'FREQ', 'AGE', 'SEX'}

# Coded concepts that act as attributes.
sdg_coded_attributes = {
    'VARIABLE_CODE', 'GEO_AREA_CODE', 'CITIES', 'SAMPLING_STATIONS',
    'IS_LATEST_PERIOD', 'VALUE_TYPE', 'UNIT_MEASURE', 'UNIT_MULT', 'NATURE',
    'REPORTING_TYPE', 'OBS_STATUS',
}

# Uncoded concepts with a dedicated role; any other uncoded column is a plain
# attribute.
sdg_uncoded_roles = {'TIME_PERIOD': 'TimeDimension', 'OBS_VALUE': 'Measurement'}

sdg_schema_rows = []
for column_name in data_sdg.columns:
    if column_name in sdg_coded_dimensions:
        is_coded, role = True, 'Dimension'
    elif column_name in sdg_coded_attributes:
        is_coded, role = True, 'Attribute'
    else:
        is_coded, role = False, sdg_uncoded_roles.get(column_name, 'Attribute')

    sdg_schema_rows.append({
        'ConceptID': 'sdg:' + column_name,
        'ConceptName': column_name.replace("_", " ").title(),
        'Coded': is_coded,
        'Role': role,
    })

schema_sdg = pd.DataFrame(sdg_schema_rows)
schema_sdg


Unnamed: 0,ConceptID,ConceptName,Coded,Role
0,sdg:SERIES_CODE,Series Code,True,Dimension
1,sdg:SERIES_DESCRIPTION,Series Description,False,Attribute
2,sdg:VARIABLE_CODE,Variable Code,True,Attribute
3,sdg:VARIABLE_DESCRIPTION,Variable Description,False,Attribute
4,sdg:VARIABLE_ACTIVE_DIMS,Variable Active Dims,False,Attribute
5,sdg:GEOGRAPHY_CODE,Geography Code,True,Dimension
6,sdg:GEOGRAPHY_NAME,Geography Name,False,Attribute
7,sdg:GEOGRAPHY_TYPE,Geography Type,False,Attribute
8,sdg:GEO_AREA_CODE,Geo Area Code,True,Attribute
9,sdg:GEO_AREA_NAME,Geo Area Name,False,Attribute


### 3a. Convert code values to IRIs in data table

In [4]:
# Prefix every column name with the 'sdg:' namespace in a single rename.
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce 'sdg:sdg:...' names (which would make the lookup below
# fail with a KeyError).
data_sdg = data_sdg.rename(
    columns=lambda col: col if col.startswith('sdg:') else f'sdg:{col}'
)

# Rewrite the values of every coded concept as '<ConceptID>/<code>'.
# Empty values stay empty, and values that already start with the prefix are
# not prefixed a second time.
for concept_id in schema_sdg.loc[schema_sdg['Coded'], 'ConceptID']:
    iri_prefix = f'{concept_id}/'
    codes = data_sdg[concept_id]
    needs_prefix = (codes != '') & ~codes.str.startswith(iri_prefix, na=False)
    data_sdg[concept_id] = codes.mask(needs_prefix, iri_prefix + codes)
data_sdg.head(3)

Unnamed: 0,sdg:SERIES_CODE,sdg:SERIES_DESCRIPTION,sdg:VARIABLE_CODE,sdg:VARIABLE_DESCRIPTION,sdg:VARIABLE_ACTIVE_DIMS,sdg:GEOGRAPHY_CODE,sdg:GEOGRAPHY_NAME,sdg:GEOGRAPHY_TYPE,sdg:GEO_AREA_CODE,sdg:GEO_AREA_NAME,...,sdg:UNIT_MULT,sdg:BASE_PERIOD,sdg:NATURE,sdg:SOURCE,sdg:GEO_INFO_URL,sdg:FOOT_NOTE,sdg:REPORTING_TYPE,sdg:OBS_STATUS,sdg:RELEASE_STATUS,sdg:RELEASE_NAME
0,sdg:SERIES_CODE/SI_POV_EMP1,Employed population below international povert...,sdg:VARIABLE_CODE/SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],sdg:GEOGRAPHY_CODE/1,World,Region,sdg:GEO_AREA_CODE/1,World,...,,,sdg:NATURE/N,"Source = ILO modelled estimates, Nov. 2022",,,sdg:REPORTING_TYPE/G,sdg:OBS_STATUS/E,Published,2023.Q2.G.01
1,sdg:SERIES_CODE/SI_POV_EMP1,Employed population below international povert...,sdg:VARIABLE_CODE/SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],sdg:GEOGRAPHY_CODE/1,World,Region,sdg:GEO_AREA_CODE/1,World,...,,,sdg:NATURE/N,"Source = ILO modelled estimates, Nov. 2022",,,sdg:REPORTING_TYPE/G,sdg:OBS_STATUS/E,Published,2023.Q2.G.01
2,sdg:SERIES_CODE/SI_POV_EMP1,Employed population below international povert...,sdg:VARIABLE_CODE/SI_POV_EMP1@AGE--Y15T24,Employed population below international povert...,['AGE'],sdg:GEOGRAPHY_CODE/1,World,Region,sdg:GEO_AREA_CODE/1,World,...,,,sdg:NATURE/N,"Source = ILO modelled estimates, Nov. 2022",,,sdg:REPORTING_TYPE/G,sdg:OBS_STATUS/E,Published,2023.Q2.G.01


### 4a. Geography code list

In [5]:
# Derive the SDG geography code list from the distinct (code, name) pairs that
# occur in the data table, renamed to the CodeID / CodeDescription columns.
sdg_geo_pairs = data_sdg[['sdg:GEOGRAPHY_CODE', 'sdg:GEOGRAPHY_NAME']]
cl_geo_sdg = (
    sdg_geo_pairs
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={
        'sdg:GEOGRAPHY_CODE': 'CodeID',
        'sdg:GEOGRAPHY_NAME': 'CodeDescription',
    })
)
cl_geo_sdg

Unnamed: 0,CodeID,CodeDescription
0,sdg:GEOGRAPHY_CODE/1,World
1,sdg:GEOGRAPHY_CODE/2,Africa
2,sdg:GEOGRAPHY_CODE/5,South America
3,sdg:GEOGRAPHY_CODE/8,Albania
4,sdg:GEOGRAPHY_CODE/9,Oceania
...,...,...
155,sdg:GEOGRAPHY_CODE/858,Uruguay
156,sdg:GEOGRAPHY_CODE/860,Uzbekistan
157,sdg:GEOGRAPHY_CODE/862,Venezuela (Bolivarian Republic of)
158,sdg:GEOGRAPHY_CODE/887,Yemen


---

## Load original ILO dataset

### 1b. Data table

In [6]:
# Load the ILO labour income distribution observations with every column kept
# as text so that codes are not coerced to numbers.
# keep_default_na=False stops pandas from treating literal tokens such as
# 'NA', 'N/A' or 'NULL' as missing values; with the default setting they would
# become NaN even under dtype=str and then be blanked by fillna(''). Empty
# cells still end up as ''.
data_ilo = pd.read_csv(
    '../datasets/ILO Labour income distribution.csv',
    dtype=str,
    keep_default_na=False,
).fillna('')
data_ilo.head(3)

Unnamed: 0,DATAFLOW,REF_AREA,FREQ,MEASURE,QTL,TIME_PERIOD,OBS_VALUE,OBS_STATUS,UNIT_MEASURE_TYPE,UNIT_MEASURE,UNIT_MULT,SOURCE,NOTE_SOURCE,NOTE_INDICATOR,NOTE_CLASSIF,DECIMALS,UPPER_BOUND,LOWER_BOUND
0,ILO:DF_LAP_2LID_QTL_RT(1.0),AFG,A,LAP_2LID_RT,QTL_DECILE_01,2013,0.44,M,RT,PT,0,ILO - Modelled Estimates,,,,1,,
1,ILO:DF_LAP_2LID_QTL_RT(1.0),AFG,A,LAP_2LID_RT,QTL_DECILE_02,2013,1.01,M,RT,PT,0,ILO - Modelled Estimates,,,,1,,
2,ILO:DF_LAP_2LID_QTL_RT(1.0),AFG,A,LAP_2LID_RT,QTL_DECILE_03,2013,1.7,M,RT,PT,0,ILO - Modelled Estimates,,,,1,,


In [7]:
# Show the column names of the ILO data table as a plain list.
data_ilo.columns.tolist()

['DATAFLOW',
 'REF_AREA',
 'FREQ',
 'MEASURE',
 'QTL',
 'TIME_PERIOD',
 'OBS_VALUE',
 'OBS_STATUS',
 'UNIT_MEASURE_TYPE',
 'UNIT_MEASURE',
 'UNIT_MULT',
 'SOURCE',
 'NOTE_SOURCE',
 'NOTE_INDICATOR',
 'NOTE_CLASSIF',
 'DECIMALS',
 'UPPER_BOUND',
 'LOWER_BOUND']

### 2b. Schema

In [8]:
# Describe every column of the ILO data table as a concept: its prefixed ID,
# a human-readable name, whether its values are coded, and its role.

# Coded concepts that act as dimensions.
ilo_coded_dimensions = {'REF_AREA', 'FREQ', 'MEASURE', 'QTL'}

# Coded concepts that act as attributes.
ilo_coded_attributes = {'UNIT_MEASURE_TYPE', 'UNIT_MEASURE', 'UNIT_MULT', 'OBS_STATUS'}

# Uncoded concepts with a dedicated role; any other uncoded column is a plain
# attribute.
ilo_uncoded_roles = {'TIME_PERIOD': 'TimeDimension', 'OBS_VALUE': 'Measurement'}

ilo_schema_rows = []
for column_name in data_ilo.columns:
    if column_name in ilo_coded_dimensions:
        is_coded, role = True, 'Dimension'
    elif column_name in ilo_coded_attributes:
        is_coded, role = True, 'Attribute'
    else:
        is_coded, role = False, ilo_uncoded_roles.get(column_name, 'Attribute')

    ilo_schema_rows.append({
        'ConceptID': 'ilo:' + column_name,
        'ConceptName': column_name.replace("_", " ").title(),
        'Coded': is_coded,
        'Role': role,
    })

schema_ilo = pd.DataFrame(ilo_schema_rows)
schema_ilo


Unnamed: 0,ConceptID,ConceptName,Coded,Role
0,ilo:DATAFLOW,Dataflow,False,Attribute
1,ilo:REF_AREA,Ref Area,True,Dimension
2,ilo:FREQ,Freq,True,Dimension
3,ilo:MEASURE,Measure,True,Dimension
4,ilo:QTL,Qtl,True,Dimension
5,ilo:TIME_PERIOD,Time Period,False,TimeDimension
6,ilo:OBS_VALUE,Obs Value,False,Measurement
7,ilo:OBS_STATUS,Obs Status,True,Attribute
8,ilo:UNIT_MEASURE_TYPE,Unit Measure Type,True,Attribute
9,ilo:UNIT_MEASURE,Unit Measure,True,Attribute


### 3b. Convert code values to IRIs in data table

In [9]:
# Prefix every column name with the 'ilo:' namespace in a single rename.
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce 'ilo:ilo:...' names (which would make the lookup below
# fail with a KeyError).
data_ilo = data_ilo.rename(
    columns=lambda col: col if col.startswith('ilo:') else f'ilo:{col}'
)

# Rewrite the values of every coded concept as '<ConceptID>/<code>'.
# Empty values stay empty, and values that already start with the prefix are
# not prefixed a second time.
for concept_id in schema_ilo.loc[schema_ilo['Coded'], 'ConceptID']:
    iri_prefix = f'{concept_id}/'
    codes = data_ilo[concept_id]
    needs_prefix = (codes != '') & ~codes.str.startswith(iri_prefix, na=False)
    data_ilo[concept_id] = codes.mask(needs_prefix, iri_prefix + codes)

data_ilo.head(3)

Unnamed: 0,ilo:DATAFLOW,ilo:REF_AREA,ilo:FREQ,ilo:MEASURE,ilo:QTL,ilo:TIME_PERIOD,ilo:OBS_VALUE,ilo:OBS_STATUS,ilo:UNIT_MEASURE_TYPE,ilo:UNIT_MEASURE,ilo:UNIT_MULT,ilo:SOURCE,ilo:NOTE_SOURCE,ilo:NOTE_INDICATOR,ilo:NOTE_CLASSIF,ilo:DECIMALS,ilo:UPPER_BOUND,ilo:LOWER_BOUND
0,ILO:DF_LAP_2LID_QTL_RT(1.0),ilo:REF_AREA/AFG,ilo:FREQ/A,ilo:MEASURE/LAP_2LID_RT,ilo:QTL/QTL_DECILE_01,2013,0.44,ilo:OBS_STATUS/M,ilo:UNIT_MEASURE_TYPE/RT,ilo:UNIT_MEASURE/PT,ilo:UNIT_MULT/0,ILO - Modelled Estimates,,,,1,,
1,ILO:DF_LAP_2LID_QTL_RT(1.0),ilo:REF_AREA/AFG,ilo:FREQ/A,ilo:MEASURE/LAP_2LID_RT,ilo:QTL/QTL_DECILE_02,2013,1.01,ilo:OBS_STATUS/M,ilo:UNIT_MEASURE_TYPE/RT,ilo:UNIT_MEASURE/PT,ilo:UNIT_MULT/0,ILO - Modelled Estimates,,,,1,,
2,ILO:DF_LAP_2LID_QTL_RT(1.0),ilo:REF_AREA/AFG,ilo:FREQ/A,ilo:MEASURE/LAP_2LID_RT,ilo:QTL/QTL_DECILE_03,2013,1.7,ilo:OBS_STATUS/M,ilo:UNIT_MEASURE_TYPE/RT,ilo:UNIT_MEASURE/PT,ilo:UNIT_MULT/0,ILO - Modelled Estimates,,,,1,,


### 4b. Geography code list

In [10]:
# Build the ILO geography code list from the CL_AREA enumeration file: keep the
# code and name columns, drop duplicate pairs, and rename them to the
# CodeID / CodeDescription columns.
# keep_default_na=False stops pandas from treating literal tokens such as
# 'NA' as missing values (they would otherwise be blanked by fillna('')).
cl_geo_ilo = (
    pd.read_csv(
        '../enumerations/ILO__CL_AREA(1.0).csv',
        dtype=str,
        keep_default_na=False,
    )
    .fillna('')
    .loc[:, ['code_id', 'code_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'code_id': 'CodeID', 'code_name': 'CodeDescription'})
)

# Express each code as an 'ilo:REF_AREA/<code>' IRI.
cl_geo_ilo['CodeID'] = 'ilo:REF_AREA/' + cl_geo_ilo['CodeID']
cl_geo_ilo

Unnamed: 0,CodeID,CodeDescription
0,ilo:REF_AREA/AGO,Angola
1,ilo:REF_AREA/BDI,Burundi
2,ilo:REF_AREA/BEN,Benin
3,ilo:REF_AREA/BFA,Burkina Faso
4,ilo:REF_AREA/BWA,Botswana
...,...,...
326,ilo:REF_AREA/X25,Americas: High income
327,ilo:REF_AREA/X01,World
328,ilo:REF_AREA/X59,Southern Asia: Upper-middle income
329,ilo:REF_AREA/X65,"Northern, Southern and Western Europe: Upper-m..."


---

In [11]:
# Bundle the SDG data table, its schema and its geography code list into one
# workbook, one sheet per table, without writing the row index.
sdg_sheets = {'data': data_sdg, 'schema': schema_sdg, 'cl_geo': cl_geo_sdg}
with pd.ExcelWriter('../inputs/sdg_dataset.xlsx', engine='xlsxwriter') as sdg_workbook:
    for sheet_label, sheet_frame in sdg_sheets.items():
        sheet_frame.to_excel(sdg_workbook, sheet_name=sheet_label, index=False)

In [12]:
# Bundle the ILO data table, its schema and its geography code list into one
# workbook, one sheet per table, without writing the row index.
ilo_sheets = {'data': data_ilo, 'schema': schema_ilo, 'cl_geo': cl_geo_ilo}
with pd.ExcelWriter('../inputs/ilo_dataset.xlsx', engine='xlsxwriter') as ilo_workbook:
    for sheet_label, sheet_frame in ilo_sheets.items():
        sheet_frame.to_excel(ilo_workbook, sheet_name=sheet_label, index=False)