In [1]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [2]:
# Read the raw indicator table from disk. Per the displayed output, each row is
# one country/indicator pair and each year has its own column.
raw_csv_path = 'input/raw/WDICSV.csv'
df = pd.read_csv(raw_csv_path)

df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,17.488497,18.001597,18.558234,19.043572,19.586457,20.192064,20.828814,21.372164,22.100884,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.811504,7.096003,7.406706,7.666648,8.020952,8.403358,8.718306,9.097176,9.473374,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,38.152090,38.488233,38.779953,39.068462,39.445526,39.818645,40.276374,40.687817,41.211606,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.871956,33.922276,38.859598,40.223744,43.035073,44.390861,46.282371,48.127211,48.742043,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,17.672943,16.527554,24.627753,25.432092,27.061929,29.154282,31.022083,32.809138,33.760782,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397931,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,14.500000,,,,,,,,
397932,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,3.700000,,,,5.400000,,,,
397933,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,32.400000,,,,33.700000,,,,
397934,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.606951,59.740456,59.888983,60.053623,60.216147,60.377610,60.551609,60.693180,60.825294,


In [3]:
# Reshape the raw table: first stack the per-year columns into a long
# (country, indicator, year, value) layout ...
df_long = df.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
                  var_name="Year", value_name="Value")

# ... then spread the indicator codes back out so that each row is one
# (country, year) observation and each indicator code is its own column.
df_wide = df_long.pivot_table(index=["Country Name", "Country Code", "Year"],
                              columns="Indicator Code", values="Value")

# Turn the (Country Name, Country Code, Year) index levels back into ordinary columns.
df_wide = df_wide.reset_index()

# Stamp every observation with the last day of its year (YYYY-12-31).
# Appending "-12-31" and parsing the whole column in a single to_datetime call
# is vectorised; it replaces a per-row .apply(...) that built each timestamp
# with relativedelta and produces the same dates.
df_wide['Year'] = pd.to_datetime(df_wide['Year'].astype(str) + '-12-31', format='%Y-%m-%d')

# Persist the wide table so later steps can reuse it without re-pivoting.
df_wide.to_csv('input/transformed/df_wide.csv', index=False)

df_wide

Indicator Code,Country Name,Country Code,Year,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.HA,AG.LND.ARBL.HA.PC,AG.LND.ARBL.ZS,...,per_sa_allsa.cov_q4_tot,per_sa_allsa.cov_q5_tot,per_si_allsi.adq_pop_tot,per_si_allsi.ben_q1_tot,per_si_allsi.cov_pop_tot,per_si_allsi.cov_q1_tot,per_si_allsi.cov_q2_tot,per_si_allsi.cov_q3_tot,per_si_allsi.cov_q4_tot,per_si_allsi.cov_q5_tot
0,Afghanistan,AFG,1960-12-31,,,,,,,,...,,,,,,,,,,
1,Afghanistan,AFG,1961-12-31,,0.143791,377500.0,57.878356,7650000.0,0.830251,11.728991,...,,,,,,,,,,
2,Afghanistan,AFG,1962-12-31,,0.142857,378000.0,57.955016,7700000.0,0.818765,11.805651,...,,,,,,,,,,
3,Afghanistan,AFG,1963-12-31,,0.141935,378500.0,58.031676,7750000.0,0.806915,11.882311,...,,,,,,,,,,
4,Afghanistan,AFG,1964-12-31,,0.141026,379050.0,58.116002,7800000.0,0.794757,11.958972,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16955,Zimbabwe,ZWE,2019-12-31,623.474178,42.386111,162000.0,39.518358,4000000.0,0.261928,8.099009,...,18.893734,14.565084,15.862261,5.703138,4.807448,1.800944,5.747923,2.041201,4.674356,9.559706
16956,Zimbabwe,ZWE,2020-12-31,212.474438,32.278234,162000.0,39.754073,4000000.0,0.257618,8.320764,...,,,,,,,,,,
16957,Zimbabwe,ZWE,2021-12-31,212.474438,33.771180,162000.0,39.385906,4000000.0,0.253209,7.952922,...,,,,,,,,,,
16958,Zimbabwe,ZWE,2022-12-31,212.474438,33.330587,,39.489284,,,8.058050,...,,,,,,,,,,


In [5]:
# Indicator codes of the inequality measures treated as targets.
# Note: the World Bank labels the two SI.DST.* series as *income* shares.
target_gini = "SI.POV.GINI"        # Gini index
target_bottom = "SI.DST.FRST.10"   # Bottom 10% share (WDI: "Income share held by lowest 10%")
target_top = "SI.DST.10TH.10"      # Top 10% share (WDI: "Income share held by highest 10%")

In [20]:
# Start the exclusion list with the identifier columns and the three targets.
exclude_columns = ["Country Name", "Country Code", "Year", target_top, target_bottom, target_gini]

# Every column that is neither an identifier nor a target, in df_wide's column order.
candidate_columns = [name for name in df_wide.columns if name not in exclude_columns]

# Remaining SI.DST.* series and anything with 'POV' in its code.
# poverty_columns is collected for reference only; it is NOT added to the exclusions.
wealth_share_columns = [name for name in candidate_columns if name.startswith("SI.DST")]
poverty_columns = [name for name in candidate_columns if 'POV' in name]

# The other SI.DST.* series are excluded as well; what is left are the model features.
exclude_columns += wealth_share_columns
feature_columns = [name for name in candidate_columns if name not in wealth_share_columns]

In [22]:
# Load the indicator lookup table and keep only its 'Indicator Code' column as a Series.
indicator_lookup = pd.read_csv('reference/indicator_lookup.csv')
indicators = indicator_lookup['Indicator Code']
indicators

0          EG.CFT.ACCS.ZS
1       EG.CFT.ACCS.RU.ZS
2       EG.CFT.ACCS.UR.ZS
3          EG.ELC.ACCS.ZS
4       EG.ELC.ACCS.RU.ZS
              ...        
1491       SG.VAW.REFU.ZS
1492    SP.M15.2024.FE.ZS
1493    SP.M18.2024.FE.ZS
1494    SH.DYN.AIDS.FE.ZS
1495       SH.HIV.INCD.YG
Name: Indicator Code, Length: 1496, dtype: object

In [None]:
# Group indicator codes by a literal substring of the code itself.
# regex=False matches the pattern as plain text; na=False keeps the mask
# strictly boolean even if a code is missing, so it is safe to index with.
wealth_share_indicators = indicators[indicators.str.contains('DST', regex=False, na=False)].tolist()

# A list of matching codes (not a boolean mask), the same kind of result as
# wealth_share_indicators.
poverty_indicators = indicators[indicators.str.contains('POV', regex=False, na=False)].tolist()

# TODO: define the economic indicator group. The unfinished bare name `economic_`
# was dropped from this cell because evaluating an undefined name raises NameError.

1

In [21]:
# Inspect the final exclusion list: identifier columns, the three target
# indicators, and the remaining SI.DST.* columns appended to it.
exclude_columns

['Country Name',
 'Country Code',
 'Year',
 'SI.DST.10TH.10',
 'SI.DST.FRST.10',
 'SI.POV.GINI',
 'SI.DST.02ND.20',
 'SI.DST.03RD.20',
 'SI.DST.04TH.20',
 'SI.DST.05TH.20',
 'SI.DST.50MD',
 'SI.DST.FRST.20']

## Missing Value Handling with KNN Imputation
- Note: the imputation cell below took ~25 min to run.

In [None]:
# Work on a copy of the feature columns so df_wide itself is never modified.
X_impute = df_wide[feature_columns].copy()


def _as_feature_frame(values):
    """Wrap a 2-D array in a DataFrame that reuses X_impute's columns and index."""
    return pd.DataFrame(values, columns=X_impute.columns, index=X_impute.index)


# Report how many cells are missing before anything is filled in.
missing_before = X_impute.isna().to_numpy().sum()
print(f"Missing values before imputation: {missing_before:,}")

# Step 1: standardise every feature, so that neighbour distances are not
# dominated by whichever indicators happen to have the largest magnitudes.
scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = _as_feature_frame(scaler.fit_transform(X_impute))

# Step 2: fill the gaps in the standardised data from the 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)
X_imputed_scaled = _as_feature_frame(imputer.fit_transform(X_scaled))

# Step 3: undo the standardisation so imputed values are in the original units.
X_imputed = _as_feature_frame(scaler.inverse_transform(X_imputed_scaled))

# Write the imputed features into a fresh copy of the wide table; identifier,
# target and other excluded columns are carried over unchanged.
df_wide_knn = df_wide.copy()
df_wide_knn[feature_columns] = X_imputed

# Persist the imputed table for later use.
df_wide_knn.to_csv('input/imputed/df_wide_knn_imputed.csv', index=False)

# Report how many feature cells are still missing after imputation.
missing_after = df_wide_knn[feature_columns].isna().to_numpy().sum()
print(f"Missing values after imputation: {missing_after:,}")