Data preprocessing

* handling missing values
* handling categorical attributes
* normalisation of numeric attributes
* feature selection to remove potentially redundant attributes

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Fix the seed of NumPy's global random number generator for repeatable runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# Read the fact table and every dimension table from ../data, one CSV per frame.
# The targets on the left are filled in the same order as the paths below.
(
	fact_df,
	year_df,
	country_df,
	population_df,
	education_df,
	health_df,
	life_quality_df,
	event_df,
) = (
	pd.read_csv(csv_path)
	for csv_path in (
		"../data/fact_table.csv",
		"../data/year_dimension.csv",
		"../data/country_dimension.csv",
		"../data/population_dimension.csv",
		"../data/education_dimension.csv",
		"../data/health_dimension.csv",
		"../data/life_quality_dimension.csv",
		"../data/event_dimension.csv",
	)
)

In [3]:
# Remove the "Country Code" and "Year" columns from five of the dimension tables
# (presumably so they do not collide when the tables are merged - TODO confirm).
columns = ["Country Code", "Year"]
population_df, education_df, health_df, life_quality_df, event_df = (
	dimension.drop(columns=columns)
	for dimension in (
		population_df,
		education_df,
		health_df,
		life_quality_df,
		event_df,
	)
)

In [4]:
# Inner-join the fact table with each dimension table in turn, using the key
# column listed next to that dimension.
df = fact_df
for dimension_df, join_key in (
	(country_df, "country_key"),
	(year_df, "year_key"),
	(education_df, "education_key"),
	(health_df, "health_key"),
	(life_quality_df, "life_quality_key"),
	(population_df, "population_key"),
	(event_df, "event_key"),
):
	df = df.merge(dimension_df, how="inner", on=join_key)

In [13]:
# Keep only the label and the attributes selected for modelling.
LABEL = "human_development_index"  # column that is split off as the label later on

# Every retained column other than the label, in display order.
non_label_columns = [
	"year_num",
	"country_name",
	"secondary_school_enrollment_percent_of_gross",
	"life_expectancy_female",
	"life_expectancy_male",
	"development_index",
	"tertiary_school_enrollment_percent_of_gross",
	"government_health_expenditure_percent_of_gdp",
	"birth_rate",
	"hospital_beds_per_1000",
	"prevalence_of_overweight_adult",
	"fertility_rate",
	"growth_rate",
	"current_health_expenditure_percent_of_gdp",
	"basic_drinking_water_rate_rural",
	"basic_drinking_water_rate",
	"prevalence_of_undernourishment",
	"open_defecation_rate",
	"open_defecation_rate_rural",
	"open_defecation_rate_urban",
	"prevalence_of_hiv",
	"capital_health_expenditure_percent_of_gdp",
]

# The label comes first, followed by the remaining columns.
features = [LABEL] + non_label_columns
dataset = df[features]
dataset

Unnamed: 0,human_development_index,year_num,country_name,secondary_school_enrollment_percent_of_gross,life_expectancy_female,life_expectancy_male,development_index,tertiary_school_enrollment_percent_of_gross,government_health_expenditure_percent_of_gdp,birth_rate,...,growth_rate,current_health_expenditure_percent_of_gdp,basic_drinking_water_rate_rural,basic_drinking_water_rate,prevalence_of_undernourishment,open_defecation_rate,open_defecation_rate_rural,open_defecation_rate_urban,prevalence_of_hiv,capital_health_expenditure_percent_of_gdp
0,0.894,2005,Canada,101.391190,82.600,77.900,1.0,,6.599058,10.600,...,0.944467,9.035406,98.205443,99.234644,2.5,0.000000,0.000000,0.000000,,0.464043
1,0.900,2005,United States,95.052483,80.100,75.000,1.0,80.634827,6.629409,14.000,...,0.921713,14.605045,95.511316,98.853687,2.5,0.000000,0.000000,0.000000,,0.653640
2,0.737,2005,Mexico,82.304077,77.999,72.575,2.0,24.216400,2.464976,21.741,...,1.415817,5.836122,81.803685,93.096689,4.4,6.910224,19.942585,2.863956,0.3,0.120933
3,0.536,2005,India,53.969059,65.357,63.689,2.0,10.679640,0.763037,24.087,...,1.579709,3.791162,79.118664,83.133610,21.6,58.843145,74.775528,20.277894,,0.161149
4,0.632,2005,Indonesia,60.154289,69.107,65.579,2.0,17.226299,0.732131,21.765,...,1.336305,2.531894,70.563151,80.022174,19.2,25.272040,37.137722,11.310196,0.2,0.192219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,0.707,2017,Indonesia,87.059219,73.515,69.156,2.0,36.444439,1.349845,18.422,...,1.176197,2.904172,82.634073,90.104624,6.4,9.630592,16.350985,4.055858,0.4,0.214948
113,0.787,2017,Iran,86.311493,77.436,75.217,2.0,68.116089,4.346505,19.011,...,1.385334,8.427907,92.828996,97.088096,5.2,,,0.000000,0.1,
114,0.698,2017,Egypt,86.714447,73.967,69.453,2.0,35.164520,1.744669,27.050,...,2.090833,5.633056,99.059827,99.269514,4.9,0.000000,0.000000,0.000000,0.1,
115,0.595,2017,Kenya,,68.244,63.539,3.0,11.464710,1.778124,29.296,...,2.356477,4.140983,49.681773,59.584152,24.6,9.835456,12.822975,1.575633,4.7,0.591823


In [10]:
# Prepare Data
# Hold out 33% of the rows for testing. random_state uses the same value as the
# global np.random.seed(42) call, so re-running this cell on its own (or out of
# order) still produces the same split.
train_set, test_set = train_test_split(dataset, test_size=0.33, random_state=42)

# Separate the attributes from the label for the training rows.
train_data = train_set.drop(columns=[LABEL])
train_labels = train_set[LABEL].to_numpy(copy=True) # HDI values

# Same separation for the held-out rows. The labels must be read from test_set
# (they were previously read from train_set), otherwise they do not line up
# with the rows of test_data.
test_data = test_set.drop(columns=[LABEL])
test_labels = test_set[LABEL].to_numpy(copy=True) # HDI values

[0.761 0.5   0.559 0.746 0.642 0.916 0.536 0.922 0.92  0.658 0.92  0.588
 0.519 0.903 0.901 0.473 0.911 0.784 0.671 0.653 0.5   0.532 0.924 0.624
 0.912 0.703 0.687 0.695 0.482 0.723 0.526 0.591 0.683 0.923 0.465 0.647
 0.898 0.768 0.745 0.748 0.921 0.9   0.753 0.787 0.637 0.734 0.563 0.643
 0.665 0.755 0.595 0.894 0.918 0.748 0.569 0.728 0.668 0.566 0.774 0.696
 0.63  0.597 0.523 0.896 0.484 0.771 0.906 0.737 0.766 0.573 0.677 0.673
 0.913 0.691 0.645 0.64  0.903 0.69 ]


In [8]:
# Create pipeline to fill in missing values and normalise the numeric attributes
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

# Imputation runs first so that the scaler never sees missing values.
numerical_pipeline = Pipeline([
	("imputer", imputer),
	("scaler", scaler),
])

# Only numeric columns can go through KNNImputer/StandardScaler. train_data also
# holds the string column "country_name"; passing every column (as before) makes
# fit_transform fail when it tries to convert the country names to floats.
num_feature_names = train_data.select_dtypes(include="number").columns

# Columns not listed in a transformer are dropped by ColumnTransformer's default
# remainder setting, so non-numeric attributes are left out of the result until
# a categorical transformer is added for them.
full_pipeline = ColumnTransformer([
	("num", numerical_pipeline, num_feature_names)
])

# Fit the imputer and scaler on the training rows only; the result is a NumPy array.
prepared_training_data = full_pipeline.fit_transform(train_data)
# NOTE: when the test rows are prepared, call full_pipeline.transform(test_data)
# rather than fit_transform, so that test-set statistics do not leak into the
# fitted imputer and scaler.

<class 'numpy.ndarray'>
