Data preprocessing

* handling missing values
* handling categorical attributes
* normalisation of numeric attributes
* feature selection to remove potentially redundant attributes

In [45]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-driven randomness in this notebook is repeatable from run to run.
np.random.seed(42)

In [46]:
# Read the fact table and each dimension table from ../data. The CSV paths
# are listed in the same order as the names they are unpacked into.
(
    fact_df,
    year_df,
    country_df,
    population_df,
    education_df,
    health_df,
    life_quality_df,
    event_df,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/fact_table.csv",
        "../data/year_dimension.csv",
        "../data/country_dimension.csv",
        "../data/population_dimension.csv",
        "../data/education_dimension.csv",
        "../data/health_dimension.csv",
        "../data/life_quality_dimension.csv",
        "../data/event_dimension.csv",
    )
]

In [47]:
# Remove the "Country Code" and "Year" columns from the population, education,
# health, life-quality and event tables (presumably so they do not collide
# when the tables are joined later; verify against the merge step).
columns = ["Country Code", "Year"]
population_df, education_df, health_df, life_quality_df, event_df = [
    dimension.drop(columns=columns)
    for dimension in (
        population_df,
        education_df,
        health_df,
        life_quality_df,
        event_df,
    )
]

In [48]:
# Inner-join the fact table with the country, year, education, health,
# life-quality and population dimensions, each on its surrogate key.
# NOTE(review): event_df is not joined in this cell; confirm that is intended.
df = (
    fact_df
    .merge(country_df, how="inner", on="country_key")
    .merge(year_df, how="inner", on="year_key")
    .merge(education_df, how="inner", on="education_key")
    .merge(health_df, how="inner", on="health_key")
    .merge(life_quality_df, how="inner", on="life_quality_key")
    .merge(population_df, how="inner", on="population_key")
)

In [49]:
# Keep only the target column, the year/country identifiers and the selected
# predictor columns from the merged frame.
LABEL = "human_development_index"
features = [
    LABEL,
    "year_num",
    "country_name",
    "secondary_school_enrollment_percent_of_gross",
    "life_expectancy_female",
    "life_expectancy_male",
    "tertiary_school_enrollment_percent_of_gross",
    "government_health_expenditure_percent_of_gdp",
    "birth_rate",
    "hospital_beds_per_1000",
    "prevalence_of_overweight_adult",
    "fertility_rate",
    "growth_rate",
    "current_health_expenditure_percent_of_gdp",
    "basic_drinking_water_rate_rural",
    "basic_drinking_water_rate",
    "prevalence_of_undernourishment",
    "open_defecation_rate",
    "open_defecation_rate_rural",
    "open_defecation_rate_urban",
    "prevalence_of_hiv",
    "capital_health_expenditure_percent_of_gdp",
]
dataset = df.loc[:, features]
dataset

Unnamed: 0,human_development_index,year_num,country_name,secondary_school_enrollment_percent_of_gross,life_expectancy_female,life_expectancy_male,tertiary_school_enrollment_percent_of_gross,government_health_expenditure_percent_of_gdp,birth_rate,hospital_beds_per_1000,...,growth_rate,current_health_expenditure_percent_of_gdp,basic_drinking_water_rate_rural,basic_drinking_water_rate,prevalence_of_undernourishment,open_defecation_rate,open_defecation_rate_rural,open_defecation_rate_urban,prevalence_of_hiv,capital_health_expenditure_percent_of_gdp
0,0.894,2005,Canada,101.391190,82.600000,77.900000,,6.599058,10.600000,3.10,...,0.944467,9.035406,98.205443,99.234644,2.5,0.000000,0.000000,0.000000,,0.464043
1,0.900,2005,United States,95.052483,80.100000,75.000000,80.634827,6.629409,14.000000,3.20,...,0.921713,14.605045,95.511316,98.853687,2.5,0.000000,0.000000,0.000000,,0.653640
2,0.737,2005,Mexico,82.304077,77.999000,72.575000,24.216400,2.464976,21.741000,1.05,...,1.415817,5.836122,81.803685,93.096689,4.4,6.910224,19.942585,2.863956,0.3,0.120933
3,0.536,2005,India,53.969059,65.357000,63.689000,10.679640,0.763037,24.087000,0.41,...,1.579709,3.791162,79.118664,83.133610,21.6,58.843145,74.775528,20.277894,,0.161149
4,0.632,2005,Indonesia,60.154289,69.107000,65.579000,17.226299,0.732131,21.765000,,...,1.336305,2.531894,70.563151,80.022174,19.2,25.272040,37.137722,11.310196,0.2,0.192219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,,2020,Indonesia,,71.846067,67.713533,,,20.019533,,...,1.065179,,85.667962,92.415350,,6.106439,11.154300,2.242273,0.4,
140,,2020,Iran,,75.929733,73.271600,58.222488,,18.364400,,...,1.293027,,93.828735,97.482636,,,,0.000000,0.1,
141,,2020,Egypt,,73.025000,68.525733,,,26.880933,,...,1.920248,,99.332835,99.440176,,0.000000,0.000000,0.000000,0.1,
142,,2020,Kenya,,64.064800,59.792867,,,33.321733,,...,2.251879,,51.779979,61.632892,,8.506340,11.303955,1.310686,4.2,


In [50]:
# Columns kept out of imputation and scaling: the target, the year and the
# country name.
static_features = [
    LABEL,
    "year_num",
    "country_name",
]

In [51]:
# Separate the static columns from the numeric columns that will be imputed
# and scaled; `drop` returns a new frame, so `dataset` itself is untouched.
df_static = dataset.loc[:, static_features]
df_to_impute = dataset.drop(columns=static_features)

In [52]:
# Build the preprocessing pipeline: fill in missing values with a 5-nearest-
# neighbour imputer, then standardise every numeric column.
# NOTE(review): the pipeline is fitted on every row of df_to_impute, so the
# 2020 rows that are later split off as the test set also influence the
# imputer and scaler statistics. Confirm this is acceptable, or fit on the
# training years only and merely transform the 2020 rows.
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

numerical_pipeline = Pipeline([
    ("imputer", imputer),
    ("scaler", scaler),
])

num_feature_names = df_to_impute.columns

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, num_feature_names),
])

prepared_data = full_pipeline.fit_transform(df_to_impute)

# Reuse the source index for the transformed frame. The static columns are
# re-attached below by index alignment, so a fresh 0..n-1 index would pair
# rows with the wrong label/year/country (or NaN) whenever the source index
# is not already a contiguous 0..n-1 range.
# NOTE(review): if a column is entirely missing the imputer may drop it, in
# which case the column count no longer matches num_feature_names — verify.
df_imputed = pd.DataFrame(
    prepared_data,
    columns=num_feature_names,
    index=df_to_impute.index,
)

df_imputed[static_features] = df_static

In [53]:
# Split by year and persist both splits: rows before 2020 form the training
# set, rows from 2020 form the test set.
train_set = df_imputed.loc[df_imputed["year_num"] < 2020]
test_set = df_imputed.loc[df_imputed["year_num"] == 2020]

# Create the output directory (and any missing parents) if it does not exist.
DATA_DIR = "../data_imputed/"
Path(DATA_DIR).mkdir(exist_ok=True, parents=True)

# index=False is the documented way to leave the row index out of the CSV.
train_set.to_csv(Path(DATA_DIR) / "train.csv", index=False)
test_set.to_csv(Path(DATA_DIR) / "test.csv", index=False)