<a href="https://colab.research.google.com/github/dohyung-kim/ccri/blob/main/script/pillar2_processing_ccri1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rasterio scikit-learn

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3


In [2]:
import pandas as pd
import glob
import os
import shutil
import numpy as np
from scipy.stats import gmean
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Helper that trims extreme values until the distribution is acceptably shaped
# and reports the min/max of what remains (used as normalization bounds).
def trim_outliers_iteratively(values, max_skew=2, max_kurtosis=3.5):
    """
    Iteratively remove the extreme min/max values until skewness and kurtosis are acceptable.

    On every pass, all elements equal to the current minimum or the current
    maximum are dropped (ties included). Trimming stops as soon as
    ``abs(skew) <= max_skew`` and ``kurtosis <= max_kurtosis``, or when no
    further trimming is possible (all remaining values are identical, or
    removing the extremes would leave nothing).

    Parameters
    ----------
    values : pandas.Series or array-like
        Input data; NaNs are dropped before processing.
    max_skew : float, default 2
        Largest absolute skewness accepted.
    max_kurtosis : float, default 3.5
        Largest kurtosis accepted. It is compared against
        ``scipy.stats.kurtosis`` called with its default arguments
        (Fisher / excess definition).

    Returns
    -------
    tuple
        ``(trimmed_values, min, max)`` where ``trimmed_values`` is a NumPy
        array and ``min`` / ``max`` are its extremes.

    Raises
    ------
    ValueError
        If ``values`` contains no non-NaN entries.
    """
    data = pd.Series(values).dropna().to_numpy()  # Ensure no NaNs
    if data.size == 0:
        raise ValueError(
            "trim_outliers_iteratively() requires at least one non-NaN value"
        )

    while True:
        lowest, highest = data.min(), data.max()

        # Constant data has no extremes to trim, and its skew/kurtosis are
        # undefined, so there is nothing more to do.
        if lowest == highest:
            break

        if abs(skew(data)) <= max_skew and kurtosis(data) <= max_kurtosis:
            break  # Stop if conditions are met

        # Remove every occurrence of the current min and max
        trimmed = data[(data > lowest) & (data < highest)]
        if trimmed.size == 0:
            # Only two distinct values are left; trimming again would discard
            # everything, so keep the current data instead of failing.
            break
        data = trimmed

    return data, data.min(), data.max()

Normalize each Pillar 2 (P2) indicator to a 0–10 scale and record the trimmed min and max values used for the normalization.

In [5]:
# Normalize every indicator CSV in the Pillar 2 folder to a 0-10 scale, merge the
# results on 'iso3', add arithmetic and geometric row averages, and save the result.

# Define source and destination folder paths
source_folder = "/content/drive/MyDrive/"
destination_folder = "/content/drive/MyDrive/CCRI/pillar2/"

# The merged output is written into the same folder the inputs are read from,
# so it has to be skipped when the folder is scanned again on a later run.
output_filename = "P2_Merged_Normalized_avg.csv"

# Get all copied CSV files; sorted so the processing order (and therefore the
# column order of the output) does not depend on the arbitrary order glob returns
csv_files = sorted(glob.glob(os.path.join(destination_folder, "*.csv")))

# List of variables to reverse normalize
reverse_columns = [
    'P2_LSCED',
    'P2_Birth_Attendant_Y15T19',
    'P2_ED_CR_L2',
    'P2_Immunization_DTP1',
    'P2_Immunization_DTP3',
    'P2_WASH_Drinking_Water',
    'P2_WASH_Sanitation',
    'P2_basic_hygiene',
    'P2_electricity_access',
    'P2_Social_Protection'
]

# Merged results; None until the first indicator has been processed
merged_df = None

for file in csv_files:
    if os.path.basename(file) == output_filename:
        continue
    df = pd.read_csv(file)

    if 'iso3' not in df.columns or 'value' not in df.columns:
        continue  # Skip files missing required columns

    # Work on an independent copy so the column assignments below do not
    # write into a slice of the frame returned by read_csv
    df = df[['iso3', 'value']].dropna().copy()

    # Extract filename (without extension) for column naming
    filename = os.path.splitext(os.path.basename(file))[0]

    if df.empty:
        print(f"skipped : {filename} | no non-missing iso3/value rows")
        continue

    # Apply iterative trimming
    trimmed_values, min_trimmed, max_trimmed = trim_outliers_iteratively(df['value'])

    # Normalize using final min/max, then force the result into [0, 10]
    # (values outside the trimmed range would otherwise fall outside the scale)
    df['value_normalized'] = np.clip(
        10 * (df['value'] - min_trimmed) / (max_trimmed - min_trimmed), 0, 10
    )
    df['min'] = min_trimmed
    df['max'] = max_trimmed

    # Reverse normalization for specific columns
    if filename in reverse_columns:
        df['value_normalized'] = 10 - df['value_normalized']  # Reverse normalize

    # Prefix the generic column names with the indicator name
    df = df.rename(columns={
        'value_normalized': filename + '_value_normalized',
        'min': filename + '_min',
        'max': filename + '_max',
    })

    print(f"processed : {filename} | min: {min_trimmed} | max: {max_trimmed}" )

    indicator_columns = ['iso3', filename + '_value_normalized', filename + '_min', filename + '_max']

    # Outer merge so that a country is kept if it appears in ANY indicator file,
    # not only in whichever file happened to be processed first.
    # NOTE(review): assumes each file has at most one row per iso3 - duplicate
    # keys would multiply rows in the merge; confirm against the input data.
    if merged_df is None:
        merged_df = df[indicator_columns]
    else:
        merged_df = merged_df.merge(df[indicator_columns], on='iso3', how='outer')

if merged_df is None:
    raise RuntimeError(
        f"No usable indicator CSV (with 'iso3' and 'value' columns) found in {destination_folder}"
    )

# Filter columns that end with '_value_normalized'
pillar2_columns = [col for col in merged_df.columns if col.endswith('_value_normalized')]

# Compute the average across all '_value_normalized' columns (ignoring NaNs)
merged_df["P2_arithmetic_avg"] = merged_df[pillar2_columns].mean(axis=1, skipna=True)

# Calculate geometric average over the non-missing indicators of each row;
# the small offset keeps a single 0 score from forcing the whole mean to 0
merged_df["P2_geometric_avg"] = merged_df[pillar2_columns].apply(
    lambda x: gmean(x[~np.isnan(x)] + 1e-10) if np.any(~np.isnan(x)) else np.nan, axis=1
)

# Save the final merged dataset
output_file = os.path.join(destination_folder, output_filename)
merged_df.to_csv(output_file, index=False)

print(f"Processed data saved to {output_file}")


processed : P2_WASH_Drinking_Water | min: 35.11656657990258 | max: 100.0
processed : P2_WASH_Sanitation | min: 9.336216099 | max: 100.0
processed : P2_LSCED | min: 0.0267945 | max: 72.58609
processed : P2_Nutrition_Wasting | min: 0.1 | max: 22.7
processed : P2_Nutrition_Stunting_Modeled | min: 1.2 | max: 56.5
processed : P2_Child_Mortality | min: 0.404633951697431 | max: 30.2232985695993
processed : P2_Immunization_DTP1 | min: 45.0 | max: 98.0
processed : P2_Immunization_DTP3 | min: 35.0 | max: 98.0
processed : P2_PT_Labor | min: 0.3 | max: 41.5
processed : P2_Learning_Poverty | min: 2.330512762069702 | max: 98.50421142578124
processed : P2_ED_CR_L2 | min: 5.8079791 | max: 99.800003
processed : P2_Birth_Attendant_Y15T19 | min: 25.4 | max: 100.0
processed : P2_Child_poverty | min: 2.6400771141052246 | max: 83.46825408935547
processed : P2_Child_Marriage | min: 0.0 | max: 76.3
processed : P2_food_poverty | min: 0.0 | max: 70.1
processed : P2_Social_Protection | min: 0.0 | max: 100.0
proc