Data preprocessing

* handling missing values
* handling categorical attributes
* normalisation of numeric attributes
* feature selection to remove potentially redundant attributes

In [32]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seed NumPy's global random number generator with a fixed value so that
# operations drawing from it give the same results on every run.
np.random.seed(42)

In [33]:
# Read the fact table and every dimension table from CSV, one DataFrame each.
# The names on the left line up positionally with the paths on the right.
(
    fact_df,
    year_df,
    country_df,
    population_df,
    education_df,
    health_df,
    life_quality_df,
    event_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/fact_table.csv",
        "../data/year_dimension.csv",
        "../data/country_dimension.csv",
        "../data/population_dimension.csv",
        "../data/education_dimension.csv",
        "../data/health_dimension.csv",
        "../data/life_quality_dimension.csv",
        "../data/event_dimension.csv",
    )
)

In [34]:
# Strip the "Country Code" and "Year" columns from the population, education,
# health, life-quality and event dimension tables.
columns = ["Country Code", "Year"]
population_df, education_df, health_df, life_quality_df, event_df = (
    dimension.drop(columns=columns)
    for dimension in (
        population_df,
        education_df,
        health_df,
        life_quality_df,
        event_df,
    )
)

In [35]:
# Join the fact table to each dimension table on that dimension's key column.
# Every join is an inner join, so a fact row survives only if it has a match
# in all seven dimensions.
df = (
    fact_df
    .merge(country_df, on="country_key", how="inner")
    .merge(year_df, on="year_key", how="inner")
    .merge(education_df, on="education_key", how="inner")
    .merge(health_df, on="health_key", how="inner")
    .merge(life_quality_df, on="life_quality_key", how="inner")
    .merge(population_df, on="population_key", how="inner")
    .merge(event_df, on="event_key", how="inner")
)

In [40]:
# Keep only the label column and the selected predictor columns.
LABEL = "human_development_index"

# Despite its name, `features` starts with the label column; the predictor
# columns follow it.
# NOTE(review): confirm "development_index" is not derived from the label
# (possible target leakage).
features = [LABEL] + [
    "secondary_school_enrollment_percent_of_gross",
    "life_expectancy_female",
    "life_expectancy_male",
    "development_index",
    "tertiary_school_enrollment_percent_of_gross",
    "government_health_expenditure_percent_of_gdp",
    "birth_rate",
    "hospital_beds_per_1000",
    "prevalence_of_overweight_adult",
    "fertility_rate",
    "growth_rate",
    "current_health_expenditure_percent_of_gdp",
    "basic_drinking_water_rate_rural",
    "basic_drinking_water_rate",
    "prevalence_of_undernourishment",
    "open_defecation_rate",
    "open_defecation_rate_rural",
    "open_defecation_rate_urban",
    "prevalence_of_hiv",
    "capital_health_expenditure_percent_of_gdp",
]
dataset = df[features]

In [37]:
# Split the rows into a training subset and a test subset (33% of the rows),
# then separate the predictor columns from the label column in each subset.
# No random_state is passed, so the shuffle draws from NumPy's global random
# state and is only repeatable if that state was seeded beforehand.
train_set, test_set = train_test_split(dataset, test_size=0.33)

train_data = train_set.drop(columns=LABEL)
train_labels = train_set[LABEL].to_numpy(copy=True)

test_data = test_set.drop(columns=LABEL)
# The labels must come from the test split so they line up row-for-row with
# test_data (they were previously taken from the training split by mistake).
test_labels = test_set[LABEL].to_numpy(copy=True)

In [38]:
# Preprocessing for the numeric columns: fill each missing value from the
# 5 nearest neighbouring rows, then standardise every column.
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

numerical_pipeline = Pipeline([
    ("imputer", imputer),
    ("scaler", scaler),
])

# Every column of train_data is routed through the numeric pipeline.
num_feature_names = train_data.columns

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, num_feature_names),
])

# Fit the imputer and scaler on the training data only ...
prepared_training_data = full_pipeline.fit_transform(train_data)
# ... and apply those already-fitted transformers to the test data. Refitting
# here (fit_transform) would impute and scale the test set with its own
# statistics, leaking test information and putting the two sets on
# different scales.
prepared_testing_data = full_pipeline.transform(test_data)