This script is used to read the raw data and generate `education.csv`.

In [1]:
import pandas as pd
import numpy as np

from constants import HEALTH_DIMENSION_CSV

In [2]:
df = pd.read_csv('../raw_data/health.csv')
df.replace("..", np.nan, inplace=True)

In [3]:
columns = df.columns.tolist()
# fix column years format
for i in range(4, len(columns)):
	columns[i] = columns[i][:4]
df.columns = columns
df.head(6)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Canada,CAN,,,,,,,,,,,,,,,
1,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,United States,USA,,,2.3,,2.8,,2.2,,2.4,,3.5,,3.4,,
2,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Mexico,MEX,,,,,,,13.6,,,12.4,10.0,,,14.1,
3,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,India,IND,,,,,,,,,38.7,37.9,,34.7,,,
4,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Indonesia,IDN,,40.1,,,39.2,,,36.4,,,,,30.8,,
5,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,"Iran, Islamic Rep.",IRN,,,,,6.8,,,,,,,,,,


In [4]:
# Replace missing value with the mean value for each country and attribute 
df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T
df.head(6)


  df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T


Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Canada,CAN,,,,,,,,,,,,,,,
1,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,United States,USA,,,2.3,,2.8,,2.2,,2.4,,3.5,,3.4,,
2,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Mexico,MEX,,,,,,,13.6,,,12.4,10.0,,,14.1,
3,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,India,IND,,,,,,,,,38.7,37.9,,34.7,,,
4,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Indonesia,IDN,,40.1,,,39.2,,,36.4,,,,,30.8,,
5,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,"Iran, Islamic Rep.",IRN,,,,,6.8,,,,,,,,,,


In [5]:
attributes_dic = {
    "capital_health_expenditure_percent_of_gdp": "SH.XPD.KHEX.GD.ZS",
    "current_health_expenditure_percent_of_gdp": "SH.XPD.CHEX.GD.ZS",
    "government_health_expenditure_percent_of_gdp": "SH.XPD.GHED.GD.ZS",
    "community_health_workers_per_1000": "SH.MED.CMHW.P3",
    "prevalence_of_hiv": "SH.DYN.AIDS.ZS",
    "prevalence_of_hiv_male": "SH.HIV.1524.MA.ZS",
    "prevalence_of_hiv_female": "SH.HIV.1524.FE.ZS",
    "adults_with_hiv": "SH.DYN.AIDS",
    "children_with_hiv": "SH.HIV.0014",
    "prevalence_of_overweight_adult": "SH.STA.OWAD.ZS",
    "prevalence_of_overweight_children_under_5": "SH.STA.OWGH.ZS",
    "prevalence_of_severe_wasting_children_under_5": "SH.SVR.WAST.ZS",
    "prevalence_of_stunting_children_under_5": "SH.STA.STNT.ZS",
    "prevalence_of_undernourishment": "SN.ITK.DEFC.ZS",
    "hospital_beds_per_1000": "SH.MED.BEDS.ZS",
    "physicians_per_1000": "SH.MED.PHYS.ZS",
    "number_of_stillbirths": "SH.DTH.STLB",
}


In [6]:
def transpose_attributes(dataframe, attribute):
	dataframe.drop(columns=["Series Name", "Series Code", "Country Name"], inplace=True)
	columns = dataframe.columns.tolist()
	return dataframe.melt(id_vars="Country Code", value_vars=columns[1:], var_name="Year", value_name=attribute)

In [7]:
# Transpose Attributes
attributes_df_dic = {}
for key, value in attributes_dic.items(): 
	filt = (df["Series Code"] == value)
	attribute_df = df.loc[filt].copy()
	attributes_df_dic[key] = transpose_attributes(attribute_df, key)

In [8]:
# Merge attributes together
attributes_lst_dic = list(attributes_df_dic.values())
health_df = attributes_lst_dic[0] 
for i in range(1, len(attributes_lst_dic)):
	health_df = pd.merge(health_df, attributes_lst_dic[i], how="left", left_on=["Country Code", "Year"], right_on=["Country Code", "Year"])
health_df.rename_axis("health_key", inplace=True)
health_df.head(30)

Unnamed: 0_level_0,Country Code,Year,capital_health_expenditure_percent_of_gdp,current_health_expenditure_percent_of_gdp,government_health_expenditure_percent_of_gdp,community_health_workers_per_1000,prevalence_of_hiv,prevalence_of_hiv_male,prevalence_of_hiv_female,adults_with_hiv,children_with_hiv,prevalence_of_overweight_adult,prevalence_of_overweight_children_under_5,prevalence_of_severe_wasting_children_under_5,prevalence_of_stunting_children_under_5,prevalence_of_undernourishment,hospital_beds_per_1000,physicians_per_1000,number_of_stillbirths
health_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,CAN,2005,0.4640432,9.03540611,6.59905815,,,,,,,58.6,,,,2.5,3.1,,1037
1,USA,2005,0.65363973,14.60504532,6.62940884,,,,,,,61.9,,,,2.5,3.2,,12382
2,MEX,2005,0.12093282,5.83612156,2.46497631,,0.3,0.1,0.1,190000.0,1900.0,58.9,,,,4.4,1.05,3.5303,18512
3,IND,2005,0.16114938,3.79116249,0.76303685,0.627,,,,,,14.2,,,,21.6,0.41,0.5885,734078
4,IDN,2005,0.19221886,2.53189397,0.73213077,,0.2,0.2,0.1,200000.0,2700.0,19.8,,,,19.2,,,66751
5,IRN,2005,0.11898085,5.38635683,1.98597813,,0.1,0.1,0.1,54000.0,500.0,53.4,,,,5.2,1.72,0.8869,12218
6,EGY,2005,,4.92229176,1.5893116,,0.1,0.1,0.1,2300.0,100.0,56.3,14.1,2.5,23.9,6.4,2.2,,28256
7,KEN,2005,,5.31436443,1.48269117,,6.8,1.3,3.8,1200000.0,230000.0,19.5,7.9,2.4,40.3,28.5,,,32030
8,NGA,2005,,4.46591902,1.14133215,,1.5,0.4,0.8,1100000.0,140000.0,21.8,,,,7.1,,0.2824,154544
9,CAN,2007,0.61307919,9.44500446,6.57271051,,,,,,,59.7,,,,2.5,2.96,,1129


In [9]:
# Output Education Dimension
health_df.to_csv(HEALTH_DIMENSION_CSV)