In [33]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import os

In [34]:
# params
YEAR_RANGE = range(2000, 2020)

# Define target indicators
target_top = "SI.DST.10TH.10"    # Top 10% wealth share
target_bottom = "SI.DST.FRST.10" # Bottom 10% wealth share
target_gini = "SI.POV.GINI"      # Gini index

targets = [target_top, target_bottom, target_gini]

# Load Data

In [35]:
df = pd.read_csv('input/raw/WDICSV.csv')
lookup = pd.read_csv('output/reference/indicator_lookup.csv')
df_uncorrelated = pd.read_csv('input/correlation_analysis/filtered_dataset_without_correlated_indicators.csv')
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,17.488497,18.001597,18.558234,19.043572,19.586457,20.192064,20.828814,21.372164,22.100884,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.811504,7.096003,7.406706,7.666648,8.020952,8.403358,8.718306,9.097176,9.473374,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,38.152090,38.488233,38.779953,39.068462,39.445526,39.818645,40.276374,40.687817,41.211606,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.871956,33.922276,38.859598,40.223744,43.035073,44.390861,46.282371,48.127211,48.742043,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,17.672943,16.527554,24.627753,25.432092,27.061929,29.154282,31.022083,32.809138,33.760782,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397931,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,14.500000,,,,,,,,
397932,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,3.700000,,,,5.400000,,,,
397933,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,32.400000,,,,33.700000,,,,
397934,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.606951,59.740456,59.888983,60.053623,60.216147,60.377610,60.551609,60.693180,60.825294,


# Transform to Wide Table Format

In [36]:
df_long = df.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
                  var_name="Year", value_name="Value")

df_wide = df_long.pivot_table(index=["Country Name", "Country Code", "Year"],
                              columns="Indicator Code", values="Value")

df_wide = df_wide.reset_index()

df_wide['Year'] = df_wide['Year'].apply(lambda year: pd.to_datetime(year, format='%Y') + relativedelta(month=12, day=31))

# filter if applicable
print(f"Data date range: {df_wide['Year'].dt.year.min()} to {df_wide['Year'].dt.year.max()}")
if YEAR_RANGE != None:
    df_wide = df_wide[df_wide['Year'].dt.year.isin(YEAR_RANGE)]
    print(f"Filtered: {YEAR_RANGE.start} to {YEAR_RANGE.stop}")
else:
    YEAR_RANGE = range(df_wide['Year'].dt.year.min(), df_wide['Year'].dt.year.max() + 1)

# Save the complete wide dataset
df_wide.to_csv(f'input/transformed/df_wide_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)

# Summary statistics
total_columns = df_wide.shape[1] - 3  # Excluding Country Name, Country Code, Year
non_null_counts = df_wide.count()
missing_pct = (1 - non_null_counts / len(df_wide)) * 100

print(f"\nDataset shape: {df_wide.shape[0]} rows × {df_wide.shape[1]} columns")
print(f"Total indicators (features): {total_columns}")
print(f"Average missing values: {missing_pct.mean():.1f}%")

df_wide

Data date range: 1960 to 2023
Filtered: 2000 to 2020

Dataset shape: 5300 rows × 1497 columns
Total indicators (features): 1494
Average missing values: 44.7%


Indicator Code,Country Name,Country Code,Year,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.HA,AG.LND.ARBL.HA.PC,AG.LND.ARBL.ZS,...,per_sa_allsa.cov_q4_tot,per_sa_allsa.cov_q5_tot,per_si_allsi.adq_pop_tot,per_si_allsi.ben_q1_tot,per_si_allsi.cov_pop_tot,per_si_allsi.cov_q1_tot,per_si_allsi.cov_q2_tot,per_si_allsi.cov_q3_tot,per_si_allsi.cov_q4_tot,per_si_allsi.cov_q5_tot
40,Afghanistan,AFG,2000-12-31,100.000000,0.650787,377940.0,57.945817,7683000.0,0.381663,11.779587,...,,,,,,,,,,
41,Afghanistan,AFG,2001-12-31,100.000000,2.394898,377950.0,57.947350,7683000.0,0.378766,11.779587,...,,,,,,,,,,
42,Afghanistan,AFG,2002-12-31,104.751559,3.194390,377900.0,57.939684,7678000.0,0.359152,11.771921,...,,,,,,,,,,
43,Afghanistan,AFG,2003-12-31,166.822535,3.478546,378840.0,58.083805,7772000.0,0.341881,11.916042,...,,,,,,,,,,
44,Afghanistan,AFG,2004-12-31,170.466492,4.243778,379280.0,58.151266,7816000.0,0.331740,11.983503,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16951,Zimbabwe,ZWE,2015-12-31,251.118211,23.988735,162000.0,39.917986,4000000.0,0.277797,8.469789,...,,,,,,,,,,
16952,Zimbabwe,ZWE,2016-12-31,944.202899,38.774366,162000.0,40.138999,4000000.0,0.273967,8.686746,...,,,,,,,,,,
16953,Zimbabwe,ZWE,2017-12-31,671.559633,45.259348,162000.0,39.808908,4000000.0,0.270043,8.361615,...,9.032539,4.561982,29.259153,0.817641,3.407481,0.609872,1.100615,2.446502,4.778166,8.099734
16954,Zimbabwe,ZWE,2018-12-31,623.474178,40.147088,162000.0,39.983535,4000000.0,0.266056,8.550695,...,,,,,,,,,,


# Identify Columns to Exclude

We exclude columns based on two criteria:
1. **Indicator Category**: Remove merchandise indicators that are country-specific, wealth distribution indicators, and poverty indicators
2. **Data Quality**: Remove columns with too many missing values

This two-step approach ensures we use high-quality data for the imputation process.

In [37]:
# STEP 1: Exclude based on indicator category
print("\n=== Step 1: Excluding Based on Indicator Category ===")

# 1.1. Merchandise indicators that are country-specific (with region codes R1-R6)
merch_indicators = lookup[lookup['Indicator Name'].str.contains('merch', case=False, na=False)].copy()
merch_indicators = merch_indicators[merch_indicators['Indicator Code'].str.contains(r'R[1-6]', na=False)].copy()
print(f"Region-specific merchandise indicators identified: {len(merch_indicators)}")

# 1.2. Wealth share indicators (our targets)
wealth_share_indicators = lookup[lookup['Indicator Code'].str.contains('DST')].copy()
print(f"Wealth distribution indicators identified: {len(wealth_share_indicators)}")

# 1.3. Poverty indicators (our targets)
poverty_indicators = lookup[lookup['Indicator Code'].str.contains('POV')].copy()
print(f"Poverty indicators identified: {len(poverty_indicators)}")

# Combine all indicators to drop based on category
category_indicators_to_drop = pd.concat([merch_indicators, wealth_share_indicators, poverty_indicators]).reset_index(drop=True)
category_drop_list = category_indicators_to_drop['Indicator Code'].tolist()
print(f"Total indicators to drop based on category: {len(category_drop_list)}")

category_indicators_to_drop


=== Step 1: Excluding Based on Indicator Category ===
Region-specific merchandise indicators identified: 12
Wealth distribution indicators identified: 12
Poverty indicators identified: 11
Total indicators to drop based on category: 35


Unnamed: 0,Indicator Code,Indicator Name,Topic,Coarse_Topic
0,TM.VAL.MRCH.R1.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
1,TM.VAL.MRCH.R2.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
2,TM.VAL.MRCH.R3.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
3,TM.VAL.MRCH.R4.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
4,TM.VAL.MRCH.R5.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
5,TM.VAL.MRCH.R6.ZS,Merchandise imports from low- and middle-incom...,Private Sector & Trade: Imports,Economy & Finance
6,TX.VAL.MRCH.R1.ZS,Merchandise exports to low- and middle-income ...,Private Sector & Trade: Exports,Economy & Finance
7,TX.VAL.MRCH.R2.ZS,Merchandise exports to low- and middle-income ...,Private Sector & Trade: Exports,Economy & Finance
8,TX.VAL.MRCH.R3.ZS,Merchandise exports to low- and middle-income ...,Private Sector & Trade: Exports,Economy & Finance
9,TX.VAL.MRCH.R4.ZS,Merchandise exports to low- and middle-income ...,Private Sector & Trade: Exports,Economy & Finance


In [38]:
# STEP 2: Analyze missing data
print("\n=== Step 2: Analyzing Missing Data ===")

# Calculate missing value percentage for each column (excluding identifiers)
descriptors = ['Country Name', 'Country Code', 'Year']
feature_columns = [col for col in df_wide.columns if col not in descriptors]

# Calculate missing percentages
missing_percentages = df_wide[feature_columns].isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

# Identify completely empty columns
completely_empty_cols = df_wide.columns[df_wide.isnull().all()].tolist()
print(f"Completely empty columns: {len(completely_empty_cols)}")

# Identify columns with > 95% missing values
high_missing_cols = missing_percentages[missing_percentages > 95].index.tolist()
print(f"Columns with >95% missing values: {len(high_missing_cols)}")

# Identify columns with > 80% missing values
medium_missing_cols = missing_percentages[(missing_percentages > 80)].index.tolist()
print(f"Columns with >80% missing values: {len(medium_missing_cols)}")

# Create a summary dataframe of missing data
missing_summary = pd.DataFrame({
    'Indicator Code': missing_percentages.index,
    'Missing Percentage': missing_percentages.values,
    'Available Count': df_wide.shape[0] - df_wide[missing_percentages.index].isnull().sum().values,
    'Missing Count': df_wide[missing_percentages.index].isnull().sum().values
})

# Add indicator names if available
missing_summary = missing_summary.merge(lookup[['Indicator Code', 'Indicator Name', 'Topic']], on='Indicator Code', how='left')

# We'll exclude completely empty columns and those with > 80% missing values
quality_drop_list = medium_missing_cols
print(f"\nTotal indicators to drop based on quality (>80% missing): {len(quality_drop_list)}")

# Display the top 10 columns with the most missing values
missing_summary.head(10)


=== Step 2: Analyzing Missing Data ===
Completely empty columns: 10
Columns with >95% missing values: 98
Columns with >80% missing values: 290

Total indicators to drop based on quality (>80% missing): 290


Unnamed: 0,Indicator Code,Missing Percentage,Available Count,Missing Count,Indicator Name,Topic
0,DT.NFL.SPRP.CD,100.0,0,5300,"Net official flows from UN agencies, SPRP (cur...",Economic Policy & Debt: Official development a...
1,DT.NFL.UNCV.CD,100.0,0,5300,"Net official flows from UN agencies, UNCOVID (...",Economic Policy & Debt: Official development a...
2,DT.NFL.SDGF.CD,100.0,0,5300,"Net official flows from UN agencies, SDGFUND (...",Economic Policy & Debt: Official development a...
3,DT.NFL.WITC.CD,100.0,0,5300,"Net official flows from UN agencies, WTO-ITC (...",Economic Policy & Debt: Official development a...
4,DT.DOD.PVLX.EX.ZS,100.0,0,5300,Present value of external debt (% of exports o...,Economic Policy & Debt: External debt: Debt ra...
5,DT.NFL.UNWN.CD,100.0,0,5300,"Net official flows from UN agencies, UNWOMEN (...",Economic Policy & Debt: Official development a...
6,DT.NFL.UNIDO.CD,100.0,0,5300,"Net official flows from UN agencies, UNIDO (cu...",Economic Policy & Debt: Official development a...
7,DT.DOD.PVLX.CD,100.0,0,5300,Present value of external debt (current US$),Economic Policy & Debt: External debt: Debt ou...
8,DT.NFL.UNCTAD.CD,100.0,0,5300,"Net official flows from UN agencies, UNCTAD (c...",Economic Policy & Debt: Official development a...
9,DT.NFL.UNCD.CD,100.0,0,5300,"Net official flows from UN agencies, UNCDF (cu...",Economic Policy & Debt: Official development a...


In [39]:
print("\n=== Step 3: Exclude based on correlation ===")
# Exclude indicators based on correlation analysis that was performed in correlation_analysis.py

uncorrelated_features = df_uncorrelated['Indicator Code'].unique().tolist()
raw_features = df_wide.columns[3:].tolist()  # Exclude Country Name, Country Code, Year

# figure out which features to drop
correlation_drop_list = list(set(raw_features) - set(uncorrelated_features))
print(f"Total indicators to drop based on correlation analysis: {len(correlation_drop_list)}")


=== Step 3: Exclude based on correlation ===
Total indicators to drop based on correlation analysis: 913


In [40]:
print("\n=== Step 4: Combining Exclusion Lists ===")

# Combine category and quality-based exclusions
drop_list = list(set(category_drop_list + quality_drop_list + correlation_drop_list))
print(f"Total indicators to drop (combined): {len(drop_list)}")
print(f"  - From category exclusions: {len(category_drop_list)}")
print(f"  - From quality exclusions: {len(quality_drop_list)}")
print(f"  - From correlation exclusions: {len(correlation_drop_list)}")
print(f"  - Overlap: {len(set(category_drop_list) & set(quality_drop_list) & set(correlation_drop_list))}")

# Create a comprehensive reference of all dropped indicators
drop_reasons = pd.DataFrame({
    'Indicator Code': drop_list,
    'Category Exclusion': [code in category_drop_list for code in drop_list],
    'Quality Exclusion': [code in quality_drop_list for code in drop_list],
    'Correlated Exclusion': [code in correlation_drop_list for code in drop_list],
    'Missing Percentage': [missing_percentages.get(code, 100) for code in drop_list]
})

# Add indicator details
drop_reasons = drop_reasons.merge(lookup[['Indicator Code', 'Indicator Name', 'Topic', 'Coarse_Topic']], on='Indicator Code', how='left')

# Save the comprehensive drop list
drop_reasons.to_csv(f'output/reference/dropped_indicators_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)

drop_reasons


=== Step 4: Combining Exclusion Lists ===
Total indicators to drop (combined): 1056
  - From category exclusions: 35
  - From quality exclusions: 290
  - From correlation exclusions: 913
  - Overlap: 3


Unnamed: 0,Indicator Code,Category Exclusion,Quality Exclusion,Correlated Exclusion,Missing Percentage,Indicator Name,Topic,Coarse_Topic
0,DT.NFL.UNRW.CD,False,True,False,88.301887,"Net official flows from UN agencies, UNRWA (cu...",Economic Policy & Debt: Official development a...,Economy & Finance
1,DT.ODA.ODAT.GN.ZS,False,False,True,30.566038,Net ODA received (% of GNI),Economic Policy & Debt: Official development a...,Economy & Finance
2,SP.POP.65UP.FE.ZS,False,False,True,0.000000,"Population ages 65 and above, female (% of fem...",Health: Population: Structure,Health
3,SG.TIM.UWRK.FE,False,True,False,96.679245,Proportion of time spent on unpaid domestic an...,Gender: Participation & access,Other
4,BM.KLT.DINV.CD.WD,False,False,True,19.377358,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economy & Finance
...,...,...,...,...,...,...,...,...
1051,SP.POP.DPND.OL,False,False,True,0.000000,"Age dependency ratio, old (% of working-age po...",Health: Population: Dynamics,Health
1052,SL.AGR.EMPL.ZS,False,False,True,11.320755,Employment in agriculture (% of total employme...,Social Protection & Labor: Economic activity,Economy & Finance
1053,FM.LBL.BMNY.GD.ZS,False,False,True,24.358491,Broad money (% of GDP),Financial Sector: Monetary holdings (liabilities),Other
1054,SP.POP.7074.MA.5Y,False,False,True,0.000000,"Population ages 70-74, male (% of male populat...",Health: Population: Structure,Health


In [41]:
# STEP 4: Create filtered datasets
print("\n=== Step 4: Creating Filtered Datasets ===")

# Create filtered dataset (excluding dropped indicators)
valid_columns = [col for col in df_wide.columns if col not in drop_list]
df_wide_filtered = df_wide[valid_columns].copy()

# Create excluded dataset (only dropped indicators + identifiers)
excluded_columns = [col for col in df_wide.columns if col in drop_list]
df_wide_excluded = df_wide[excluded_columns].copy()

# Create targets dataset (only target variables + identifiers)
df_wide_targets = df_wide[targets].copy()

# Print summary statistics
total_indicators = len(df_wide.columns) - 3  # Excluding identifiers
included_indicators = len(df_wide_filtered.columns) - 3
excluded_indicators = len(excluded_columns) - 3

print(f"Original dataset: {df_wide.shape[0]} rows × {total_indicators} indicators")
print(f"Filtered dataset: {df_wide_filtered.shape[0]} rows × {included_indicators} indicators")
print(f"Excluded dataset: {df_wide_excluded.shape[0]} rows × {excluded_indicators} indicators")
print(f"Targets dataset: {df_wide_targets.shape[0]} rows × {len(targets)} targets")

# Calculate percentage of indicators retained
retained_pct = included_indicators / total_indicators * 100
print(f"\nRetained {retained_pct:.1f}% of original indicators")

# Save the filtered datasets
df_wide_filtered.to_csv(f'input/transformed/df_wide_filtered_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)
df_wide_excluded.to_csv(f'input/transformed/df_wide_excluded_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)
df_wide_targets.to_csv(f'input/transformed/df_wide_targets_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)

# Display target availability
print("\nTarget variable availability:")
for target in targets:
    available = df_wide_targets[target].notna().sum()
    pct = available / len(df_wide_targets) * 100
    print(f"  - {target}: {available:,} available ({pct:.2f}%)")

# Verify filtering worked correctly
print("\nVerifying filtering logic:")
for col in df_wide_filtered.columns:
    if col not in descriptors and col in drop_list:
        print(f"WARNING: Column {col} was in drop_list but still appears in filtered dataset!")
        
print("Filtering validation complete.")


=== Step 4: Creating Filtered Datasets ===
Original dataset: 5300 rows × 1494 indicators
Filtered dataset: 5300 rows × 438 indicators
Excluded dataset: 5300 rows × 1053 indicators
Targets dataset: 5300 rows × 3 targets

Retained 29.3% of original indicators

Target variable availability:
  - SI.DST.10TH.10: 1,456 available (27.47%)
  - SI.DST.FRST.10: 1,456 available (27.47%)
  - SI.POV.GINI: 1,457 available (27.49%)

Verifying filtering logic:
Filtering validation complete.


# KNN Imputation

We'll use K-Nearest Neighbors imputation to fill in missing values in the filtered dataset.
Before imputation, we need to:
1. Identify any completely missing columns that remain in the filtered dataset
2. Apply standardization before imputation to ensure features are on the same scale
3. Inverse the standardization after imputation to restore original scale

In [44]:
# Start with the filtered dataset, drop where target is null
df_wide_knn = df_wide_filtered[df_wide[targets].isnull().any(axis=1)].copy()

# Select only the numeric columns to impute
impute_columns = [col for col in df_wide_knn.columns if col not in excluded_columns and col not in descriptors]

# Check dimensions before imputation
print(f"\nDataFrame shape: {df_wide_knn.shape}")
print(f"Number of columns to impute: {len(impute_columns)}")

# Analyze missing values before imputation
missing_before = df_wide_knn[impute_columns].isna().sum().sum()
total_elements = df_wide_knn[impute_columns].size
missing_pct = missing_before / total_elements * 100
print(f"Missing values before imputation: {missing_before:,} ({missing_pct:.2f}%)")

# Initialize the scaler and imputer
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights='distance')

# Scale -> Impute -> Inverse Scale
print("\nPerforming KNN imputation...")

X_scaled = scaler.fit_transform(df_wide_knn[impute_columns])
X_imputed_scaled = imputer.fit_transform(X_scaled)
X_imputed = scaler.inverse_transform(X_imputed_scaled)

# Update only the imputed columns in the dataframe
df_wide_knn[impute_columns] = X_imputed

# Check missing values after imputation
missing_after = df_wide_knn[impute_columns].isna().sum().sum()
print(f"Missing values after imputation: {missing_after:,}")

# Save the imputed data
df_wide_knn.to_csv(f'input/imputed/df_wide_knn_imputed_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv', index=False)
print(f"\nImputed dataset saved to input/imputed/df_wide_knn_imputed_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")

# Display a summary of imputation results
print(f"\nImputation Summary:")
print(f"  - Total values: {total_elements:,}")
print(f"  - Values imputed: {missing_before:,} ({missing_pct:.2f}%)")
print(f"  - Final dataset shape: {df_wide_knn.shape[0]:,} rows × {df_wide_knn.shape[1]:,} columns")


DataFrame shape: (3844, 441)
Number of columns to impute: 438
Missing values before imputation: 726,385 (43.14%)

Performing KNN imputation...
Missing values after imputation: 0

Imputed dataset saved to input/imputed/df_wide_knn_imputed_2000_to_2020.csv

Imputation Summary:
  - Total values: 1,683,672
  - Values imputed: 726,385 (43.14%)
  - Final dataset shape: 3,844 rows × 441 columns


In [43]:
# Final summary of the processed datasets
print("=== Summary of Processed Datasets ===\n")
print(f"Date range: {YEAR_RANGE.start} to {YEAR_RANGE.stop}")
print(f"\nOutput files:")
print(f"  1. Complete dataset: input/transformed/df_wide_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  2. Filtered dataset: input/transformed/df_wide_filtered_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  3. Excluded indicators: input/transformed/df_wide_excluded_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  4. Target variables: input/transformed/df_wide_targets_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  5. Imputed dataset: input/imputed/df_wide_knn_imputed_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"\nReference files:")
print(f"  1. Category-based exclusions: output/reference/category_dropped_indicators_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  2. Missing data analysis: output/reference/missing_data_analysis_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"  3. Complete drop list: output/reference/dropped_indicators_{YEAR_RANGE.start}_to_{YEAR_RANGE.stop}.csv")
print(f"\nDataset statistics:")
print(f"  - Total observations: {df_wide_knn.shape[0]:,}")
print(f"  - Indicators retained: {df_wide_knn.shape[1] - 3:,}")
print(f"  - Indicators excluded: {len(drop_list):,}")
print(f"\nNext steps:")
print("  - Validate imputed data")
print("  - Proceed with analysis and modeling")

=== Summary of Processed Datasets ===

Date range: 2000 to 2020

Output files:
  1. Complete dataset: input/transformed/df_wide_2000_to_2020.csv
  2. Filtered dataset: input/transformed/df_wide_filtered_2000_to_2020.csv
  3. Excluded indicators: input/transformed/df_wide_excluded_2000_to_2020.csv
  4. Target variables: input/transformed/df_wide_targets_2000_to_2020.csv
  5. Imputed dataset: input/imputed/df_wide_knn_imputed_2000_to_2020.csv

Reference files:
  1. Category-based exclusions: output/reference/category_dropped_indicators_2000_to_2020.csv
  2. Missing data analysis: output/reference/missing_data_analysis_2000_to_2020.csv
  3. Complete drop list: output/reference/dropped_indicators_2000_to_2020.csv

Dataset statistics:
  - Total observations: 3,844
  - Indicators retained: 438
  - Indicators excluded: 1,056

Next steps:
  - Validate imputed data
  - Proceed with analysis and modeling
