This script is used to read the raw data and generate `life_quality.csv`.

In [1]:
import pandas as pd
import numpy as np

from constants import LIFE_QUALITY_DIMENSION_CSV

In [2]:
df = pd.read_csv('../raw_data/life_quality.csv')
df.replace("..", np.nan, inplace=True)

In [3]:
columns = df.columns.tolist()
# fix column years format
for i in range(4, len(columns)):
	columns[i] = columns[i][:4]
df.columns = columns
df.head(6)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Canada,CAN,,,,,,,,,,,,,,,,
1,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,United States,USA,,3.1,,2.3,,2.8,,2.2,,2.4,,3.5,,3.4,,
2,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Mexico,MEX,,15.5,,,,,,13.6,,,12.4,10.0,,,14.1,
3,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,India,IND,,47.8,,,,,,,,38.7,37.9,,34.7,,,
4,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Indonesia,IDN,,,40.1,,,39.2,,,36.4,,,,,30.8,,
5,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,"Iran, Islamic Rep.",IRN,,,,,,6.8,,,,,,,,,,


In [4]:
# Replace missing value with the mean value for each country and attribute 
df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T
df.head(6)


  df[columns[4:]] = df[columns[4:]].T.fillna(df.mean(axis=1)).T


Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Canada,CAN,,,,,,,,,,,,,,,,
1,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,United States,USA,,3.1,,2.3,,2.8,,2.2,,2.4,,3.5,,3.4,,
2,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Mexico,MEX,,15.5,,,,,,13.6,,,12.4,10.0,,,14.1,
3,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,India,IND,,47.8,,,,,,,,38.7,37.9,,34.7,,,
4,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,Indonesia,IDN,,,40.1,,,39.2,,,36.4,,,,,30.8,,
5,"Prevalence of stunting, height for age (% of c...",SH.STA.STNT.ZS,"Iran, Islamic Rep.",IRN,,,,,,6.8,,,,,,,,,,


In [5]:
#df.rename(columns={
#    "People using at least basic drinking water services (% of population)": "basic_drinking_water_rate",
#    "People using at least basic drinking water services, rural (% of rural population)": "basic_drinking_water_rate_rural",
#    "People using at least basic drinking water services, urban (% of urban population)": "basic_drinking_water_rate_urban",
#    "People using safely managed drinking water services (% of population)": "safe_drinking_water_rate",
#    "People using safely managed drinking water services, rural (% of rural population)": "safe_drinking_water_rate_rural",
#    "People using safely managed drinking water services, urban (% of urban population)": "safe_drinking_water_rate_urban",
#    "People with basic handwashing facilities including soap and water (% of population)": "basic_handwashing_facilities_rate",
#    "People with basic handwashing facilities including soap and water, rural (% of rural population)": "basic_handwashing_facilities_rate_rural",
#    "People with basic handwashing facilities including soap and water, urban (% of urban population)": "basic_handwashing_facilities_rate_urban",
#    "People practicing open defecation (% of population)": "open_defecation_rate",
#    "People practicing open defecation, rural (% of rural population)": "open_defecation_rate_rural",
#    "People practicing open defecation, urban (% of urban population)": "open_defecation_rate_urban"
#}, inplace=True)


# rename and reorder columns
#df.rename_axis("life_quality_key", inplace=True)
#df


attributes_dic = {
    "basic_drinking_water_rate": "SH.H2O.BASW.ZS",
    "basic_drinking_water_rate_rural": "SH.H2O.BASW.RU.ZS",
    "basic_drinking_water_rate_urban": "SH.H2O.BASW.UR.ZS",
    "safe_drinking_water_rate": "SH.H2O.SMDW.ZS",
    "safe_drinking_water_rate_rural": "SH.H2O.SMDW.RU.ZS",
    "safe_drinking_water_rate_urban": "SH.H2O.SMDW.UR.ZS",
    "basic_handwashing_facilities_rate": "SH.STA.HYGN.ZS",
    "basic_handwashing_facilities_rate_rural": "SH.STA.HYGN.RU.ZS",
    "basic_handwashing_facilities_rate_urban": "SH.STA.HYGN.UR.ZS",
    "open_defecation_rate": "SH.STA.ODFC.ZS",
    "open_defecation_rate_rural": "SH.STA.ODFC.RU.ZS",
    "open_defecation_rate_urban": "SH.STA.ODFC.UR.ZS",
}


In [6]:
def transpose_attributes(dataframe, attribute):
	dataframe.drop(columns=["Series Name", "Series Code", "Country Name"], inplace=True)
	columns = dataframe.columns.tolist()
	return dataframe.melt(id_vars="Country Code", value_vars=columns[1:], var_name="Year", value_name=attribute)

In [7]:
# Transpose Attributes
attributes_df_dic = {}
for key, value in attributes_dic.items(): 
	filt = (df["Series Code"] == value)
	attribute_df = df.loc[filt].copy()
	attributes_df_dic[key] = transpose_attributes(attribute_df, key)

In [8]:
# Merge attributes together
attributes_lst_dic = list(attributes_df_dic.values())
life_quality_df = attributes_lst_dic[0] 
for i in range(1, len(attributes_lst_dic)):
	life_quality_df = pd.merge(life_quality_df, attributes_lst_dic[i], how="left", left_on=["Country Code", "Year"], right_on=["Country Code", "Year"])
life_quality_df.rename_axis("life_quality_key", inplace=True)
life_quality_df.head(30)

Unnamed: 0_level_0,Country Code,Year,basic_drinking_water_rate,basic_drinking_water_rate_rural,basic_drinking_water_rate_urban,safe_drinking_water_rate,safe_drinking_water_rate_rural,safe_drinking_water_rate_urban,basic_handwashing_facilities_rate,basic_handwashing_facilities_rate_rural,basic_handwashing_facilities_rate_urban,open_defecation_rate,open_defecation_rate_rural,open_defecation_rate_urban
life_quality_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,CAN,2005,99.23464422,98.20544302,99.48998183,98.24079534,,,,,,0.0,0.0,0.0
1,USA,2005,98.8536874,95.51131593,99.69304221,95.23581292,,96.41144911,,,,0.0,0.0,0.0
2,MEX,2005,93.09668937,81.80368497,96.60292525,40.35948897,,,,,,6.910224385,19.94258519,2.863955603
3,IND,2005,83.13360985,79.11866408,92.85201703,,35.93492395,,,,,58.84314476,74.7755284,20.27789371
4,IDN,2005,80.02217385,70.56315099,91.15220456,,,,,,,25.27204,37.137722,11.31019562
5,IRN,2005,95.15500125,88.83003719,98.19230917,91.43323292,82.53148404,95.70793742,,,,0.488579216,1.43,0.0365
6,EGY,2005,98.29760753,97.51632721,99.33211222,,,,73.53833157,62.42338542,88.25587378,1.232966031,1.688038667,0.63039419
7,KEN,2005,51.02456418,41.09441822,86.90824272,,,60.30574932,,,,15.37391052,18.89905303,2.635421007
8,NGA,2005,51.35268075,37.83069287,72.43679318,15.88227547,11.63975858,22.49740513,,,,24.57233598,32.47784849,12.24569318
9,CAN,2006,99.23461957,98.26651823,99.47343153,98.29946776,,,,,,0.0,0.0,0.0


In [9]:
# Output life quality Dimension
life_quality_df.to_csv(LIFE_QUALITY_DIMENSION_CSV)