In [None]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [None]:
# Load the raw indicator table from disk. The reshape cell that follows melts it on
# "Country Name", "Country Code", "Indicator Name" and "Indicator Code" and treats
# every remaining column as a year.
df = pd.read_csv('input/raw/WDICSV.csv')

# Display the frame for a quick visual check.
df

In [None]:
# Reshape the raw table into long form: the four identifier columns are kept and
# every other column (one per year) is folded into a "Year"/"Value" pair.
df_long = df.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
                  var_name="Year", value_name="Value")

# Pivot so that each indicator code becomes its own column, with one row per
# country and year. pivot_table applies its default aggregation (mean) should a
# (country, year, indicator) key ever occur more than once.
df_wide = df_long.pivot_table(index=["Country Name", "Country Code", "Year"],
                              columns="Indicator Code", values="Value")

# Turn the country/year index levels back into ordinary columns.
df_wide = df_wide.reset_index()

# Stamp every observation with the last day of its year (31 December).
# The whole column is parsed in a single vectorized call instead of running a
# Python lambda with relativedelta once per row; the resulting dates are the same.
df_wide['Year'] = pd.to_datetime(df_wide['Year'].astype(str) + '-12-31', format='%Y-%m-%d')

# Persist the wide table so later steps can reuse it without reshaping again.
df_wide.to_csv('input/transformed/df_wide.csv', index=False)

df_wide

In [None]:
# Prediction targets, given as indicator codes.
# NOTE(review): WDI appears to describe the SI.DST.* series as *income* shares rather
# than wealth shares — confirm against the "Indicator Name" column of the raw file.
target_top = "SI.DST.10TH.10"     # share held by the top 10%
target_bottom = "SI.DST.FRST.10"  # share held by the bottom 10%
target_gini = "SI.POV.GINI"       # Gini index

# Start the exclusion list with the identifier columns, then add the three targets.
exclude_columns = ["Country Name", "Country Code", "Year"]
exclude_columns += [target_top, target_bottom, target_gini]

# Any other SI.DST* column is withheld from the features as well.
wealth_share_columns = [
    name
    for name in df_wide.columns
    if name not in exclude_columns and name.startswith("SI.DST")
]
exclude_columns += wealth_share_columns

# Whatever is left is a feature, kept in df_wide's column order.
feature_columns = [name for name in df_wide.columns if name not in exclude_columns]

In [None]:
# Show the full list of columns withheld from the feature set
# (identifiers, the three targets, and the remaining SI.DST* columns).
exclude_columns

## Missing Value Handling with KNN Imputation
- Note: the imputation cell below took roughly 25 minutes to run.

In [None]:
# Work on an independent copy of the feature columns so df_wide itself is not modified.
X_impute = df_wide.loc[:, feature_columns].copy()


def _as_feature_frame(values):
    """Wrap a 2-D array in a DataFrame that carries X_impute's columns and index."""
    return pd.DataFrame(values, columns=X_impute.columns, index=X_impute.index)


# Baseline: total number of missing cells across all feature columns.
missing_before = X_impute.isna().sum().sum()
print(f"Missing values before imputation: {missing_before:,}")

# Step 1: standardise the features (zero mean, unit variance) before imputing,
# presumably so that no single indicator's magnitude dominates the neighbour search.
scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = _as_feature_frame(scaler.fit_transform(X_impute))

# Step 2: fill the gaps from the 5 nearest neighbours, working in the scaled space.
imputer = KNNImputer(n_neighbors=5)
X_imputed_scaled = _as_feature_frame(imputer.fit_transform(X_scaled))

# Step 3: undo the scaling so the imputed values are back in their original units.
X_imputed = _as_feature_frame(scaler.inverse_transform(X_imputed_scaled))

# Step 4: write the imputed features into a copy of the wide table; identifier and
# excluded columns keep their original values.
df_wide_knn = df_wide.copy()
df_wide_knn[feature_columns] = X_imputed

# Persist the imputed table so later steps can reuse it.
df_wide_knn.to_csv('input/imputed/df_wide_knn_imputed.csv', index=False)

# Report how many missing cells remain in the feature columns.
missing_after = df_wide_knn[feature_columns].isna().sum().sum()
print(f"Missing values after imputation: {missing_after:,}")