This script is to process the raw data in csv format and generate the `measures.csv` which will be used to create the database.

In [417]:
import pandas as pd
import numpy as np

from constants import *

In [418]:
YEAR_LIST = [str(i) for i in range(2005, 2021)]

DEVELOPMENT_INDEX = {
    "Canada": 1,
	"United States": 1,
	"Mexico": 2,
	"India": 2,
	"Indonesia": 2,
	"Iran": 2,
	"Egypt": 2,
	"Kenya": 3,
	"Nigeria": 3,
}

COUNTRY_CODE_MAP = {
    "CAN": "Canada",
	"USA": "United States",
	"MEX": "Mexico",
	"IND": "India",
	"IDN": "Indonesia",
	"IRN": "Iran",
	"EGY": "Egypt",
	"KEN": "Kenya",
	"NGA": "Nigeria",
}

In [419]:
# Quality of life index
qoli_df = pd.read_csv("../raw_data/quality_of_life_index.csv")
cols = list(qoli_df.columns)
for y in YEAR_LIST:
    if cols.count(y) == 0:
        qoli_df[y] = np.nan
qoli_df = qoli_df[["Country"] + YEAR_LIST]
qoli_df.set_index("Country", inplace=True)
qoli_df

Unnamed: 0_level_0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Canada,,,,,,,,164.99,186.03,178.29,177.63,177.23,167.18,173.9,170.32,163.47
United States,,,,,,,,140.62,199.56,195.55,192.49,183.96,179.73,180.56,179.2,172.11
Mexico,,,,,,,,60.31,87.94,83.47,84.97,137.01,129.06,126.42,123.48,118.55
India,,,,,,,,44.01,73.7,78.01,78.6,109.28,101.52,122.09,117.51,108.63
Indonesia,,,,,,,,37.22,-12.54,21.85,22.7,72.19,62.02,112.89,107.2,97.47
Iran,,,,,,,,-35.23,-6.96,-11.08,1.07,96.93,97.17,92.43,87.02,74.14
Egypt,,,,,,,,-22.17,7.59,5.41,-7.07,88.79,91.81,84.42,83.98,86.54
Kenya,,,,,,,,,,,7.32,,,,,70.56
Nigeria,,,,,,,,,,,2.99,,,,,55.65


In [420]:
di_df = pd.DataFrame(columns=(["Country"] + YEAR_LIST))
di_df["Country"] = COUNTRY_LIST
for y in YEAR_LIST:
    di_df[y] = DEVELOPMENT_INDEX.values()
di_df.set_index("Country", inplace=True)
di_df

Unnamed: 0_level_0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Canada,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
United States,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Mexico,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
India,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Indonesia,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Iran,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Egypt,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Kenya,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Nigeria,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


In [421]:
# HDI data from UN only is only collected from 2005 to 2019
hdi_df = pd.read_csv("../raw_data/HDI.csv")
hdi_df = hdi_df[["Country"] + [str(i) for i in range(2005, 2020)]]
hdi_df["Country"] = hdi_df["Country"].str.strip()
hdi_df = hdi_df.loc[hdi_df["Country"].isin(COUNTRY_LIST)]
hdi_df["2020"] = np.nan
hdi_df.set_index("Country", inplace=True)
hdi_df

Unnamed: 0_level_0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Canada,0.894,0.898,0.896,0.898,0.898,0.901,0.903,0.906,0.913,0.918,0.921,0.923,0.926,0.928,0.929,
Egypt,0.637,0.645,0.653,0.66,0.662,0.668,0.671,0.677,0.683,0.685,0.691,0.696,0.698,0.701,0.707,
India,0.536,0.546,0.555,0.563,0.569,0.579,0.588,0.597,0.604,0.616,0.624,0.63,0.64,0.642,0.645,
Indonesia,0.632,0.642,0.643,0.647,0.658,0.665,0.673,0.681,0.687,0.69,0.695,0.703,0.707,0.712,0.718,
Iran,0.683,0.719,0.723,0.728,0.734,0.742,0.753,0.768,0.771,0.774,0.774,0.784,0.787,0.785,0.783,
Kenya,0.5,0.515,0.523,0.532,0.541,0.551,0.559,0.566,0.573,0.58,0.587,0.591,0.595,0.599,0.601,
Mexico,0.737,0.745,0.746,0.748,0.748,0.748,0.755,0.759,0.756,0.761,0.766,0.768,0.771,0.776,0.779,
Nigeria,0.465,0.473,0.478,0.484,0.49,0.482,0.492,0.5,0.519,0.523,0.526,0.526,0.531,0.534,0.539,
United States,0.9,0.903,0.906,0.911,0.912,0.916,0.919,0.92,0.918,0.92,0.921,0.922,0.924,0.925,0.926,


In [422]:
# country and year dimensions as join keys
year_dim = pd.read_csv(YEAR_DIMENSION_CSV)
country_dim = pd.read_csv(COUNTRY_DIMENSION_CSV)
# dimensions to join
education_dim = pd.read_csv(EDUCATION_DIMENSION_CSV)
health_dim = pd.read_csv(HEALTH_DIMENSION_CSV)
life_quality_dim = pd.read_csv(LIFE_QUALITY_DIMENSION_CSV)
political_event_dim = pd.read_csv(POLITICAL_EVENT_DIMENSION_CSV)
population_dim = pd.read_csv(POPULATION_DIMENSION_CSV)

In [423]:
big_join = education_dim.merge(health_dim, on=["Country Code", "Year"], how="left")
big_join = big_join.merge(life_quality_dim, on=["Country Code", "Year"], how="right")
big_join = big_join.merge(political_event_dim, on=["Country Code", "Year"], how="left")
big_join = big_join.merge(population_dim, on=["Country Code", "Year"])
big_join = big_join[["Year", "Country Code", "education_key", "health_key", "life_quality_key", "population_key", "event_key"]]

# Add 3 measures
big_join[["quality_of_life_index", "development_index", "human_development_index"]] = np.nan
for i, row in big_join.iterrows():
    year = str(row.at["Year"])
    country = str(COUNTRY_CODE_MAP[row.at["Country Code"]])
    big_join.at[i, "quality_of_life_index"] = qoli_df.at[country, year]
    big_join.at[i, "development_index"] = di_df.at[country, year]
    big_join.at[i, "human_development_index"] = hdi_df.at[country, year]


big_join["health_key"] = big_join["health_key"].fillna(-1).astype(int)
big_join["event_key"] = big_join["event_key"].fillna(-1).astype(int)
big_join


Unnamed: 0,Year,Country Code,education_key,health_key,life_quality_key,population_key,event_key,quality_of_life_index,development_index,human_development_index
0,2005,CAN,0,0,0,0,0,,1.0,0.894
1,2005,USA,1,1,1,1,104,,1.0,0.900
2,2005,MEX,2,2,2,2,78,,2.0,0.737
3,2005,IND,3,3,3,3,39,,2.0,0.536
4,2005,IDN,4,4,4,4,26,,2.0,0.632
...,...,...,...,...,...,...,...,...,...,...
139,2020,IDN,139,130,139,139,-1,97.47,2.0,
140,2020,IRN,140,131,140,140,-1,74.14,2.0,
141,2020,EGY,141,132,141,141,-1,86.54,2.0,
142,2020,KEN,142,133,142,142,-1,70.56,3.0,


In [424]:
fact_df = big_join

# map year to year key
year_rdict = year_dim[["year_key", "year_num"]].set_index("year_num").to_dict()["year_key"]
year_rdict
fact_df["Year"].replace(year_rdict, inplace=True)
fact_df.rename(columns={"Year": "year_key"})

# map country code to country key
cc_rdict = country_dim[["country_key", "country_code"]].set_index("country_code").to_dict()["country_key"]
fact_df["Country Code"].replace(cc_rdict, inplace=True)
fact_df.rename(columns={"Country Code": "country_key"})
fact_df

Unnamed: 0,Year,Country Code,education_key,health_key,life_quality_key,population_key,event_key,quality_of_life_index,development_index,human_development_index
0,0,0,0,0,0,0,0,,1.0,0.894
1,0,8,1,1,1,1,104,,1.0,0.900
2,0,6,2,2,2,2,78,,2.0,0.737
3,0,3,3,3,3,3,39,,2.0,0.536
4,0,2,4,4,4,4,26,,2.0,0.632
...,...,...,...,...,...,...,...,...,...,...
139,15,2,139,130,139,139,-1,97.47,2.0,
140,15,4,140,131,140,140,-1,74.14,2.0,
141,15,1,141,132,141,141,-1,86.54,2.0,
142,15,5,142,133,142,142,-1,70.56,3.0,


In [425]:
fact_df.to_csv(FACT_TABLE_CSV, index=False)