In [None]:
# Combine all data from P1, P2 into final CCRI layer

In [59]:
import os
import pandas as pd
import geopandas as gpd
import glob
import numpy as np

In [69]:
# Query data dir (avoiding hard-coding paths when working between users).
# The typed value is cleaned up before use: surrounding whitespace is stripped
# and a leading `~` is expanded. A path that is not an existing directory is
# rejected here, instead of surfacing later as a file-not-found error inside
# one of the read calls.
data_dir = os.path.expanduser(input('Enter data directory:').strip())
if not os.path.isdir(data_dir):
    raise NotADirectoryError('Data directory not found: {!r}'.format(data_dir))
print(data_dir)

Enter data directory: /Users/kelseydoerksen/code/unicef/ccri/data


/Users/kelseydoerksen/code/unicef/ccri/data


In [70]:
# Check that data dir was input correctly: the bare expression makes the
# notebook echo the stored string as this cell's output.
data_dir

'/Users/kelseydoerksen/code/unicef/ccri/data'

In [365]:
# Read in all the data we need. Every input lives under `data_dir`, in one of
# two sub-directories.
results_dir = os.path.join(data_dir, 'CCRI_results_misc')
misc_dir = os.path.join(data_dir, 'Misc')

# -- exposure files
p1_exposure_file = pd.read_csv(os.path.join(results_dir, 'Merged_Exposure_Data.csv'))
p2_exposure_file = pd.read_csv(os.path.join(results_dir, 'P2_Merged_Normalized_avg.csv'))
p1p2_scores = pd.read_csv(os.path.join(results_dir, 'p1_p2_avg_ccri.csv'))

# -- attribute files
wb_income = pd.read_csv(os.path.join(misc_dir, 'WB_INCOME.csv'))
unicef_ro = pd.read_csv(os.path.join(misc_dir, 'UNICEF_PROG_REG_GLOBAL.csv'))

# -- population files
childpop = pd.read_csv(os.path.join(results_dir, 'child_pop_sum_adm0.csv'))
worldpop = pd.read_csv(os.path.join(misc_dir, 'World_Population_ByAOI_adm0.csv'))

# -- boundary file
# Read from 'Misc' like the other attribute files. The previous lower-case
# 'misc' spelling only resolves on case-insensitive filesystems.
adm0 = gpd.read_file(os.path.join(misc_dir, 'adm0_boundaries_simple.geojson'))

In [366]:
# Merge P exposures by ISO3.
# Step 1: left-join the P2 table onto the P1 table (P1 'ISO3' against P2
# 'iso3'), then discard the 'iso3' column.
p1_with_p2 = p1_exposure_file.merge(
    p2_exposure_file,
    how='left',
    left_on='ISO3',
    right_on='iso3',
)
merged_P = p1_with_p2.drop(columns='iso3')

# Step 2: left-join the P1/P2 score table on 'ISO3' and again discard 'iso3'.
p_with_scores = merged_P.merge(
    p1p2_scores,
    how='left',
    left_on='ISO3',
    right_on='ISO3',
)
all_P = p_with_scores.drop(columns='iso3')

In [367]:
# Tidy up the _x/_y suffixed columns left by the merges: keep one copy of
# each pair and give it back its unsuffixed name.
all_P = (
    all_P
    .drop(columns=['P2_arithmetic_avg_y', 'rank_reverse_x'])
    .rename(columns={
        'P2_arithmetic_avg_x': 'P2_arithmetic_avg',
        'rank_reverse_y': 'rank_reverse',
    })
)

In [368]:
# Add WB income
# Only the code column and the ISO3 key are needed from the lookup table.
wb_income = wb_income[['Region_Code', 'ISO3Code']]

# Left-join on the ISO3 code, drop the right-hand key, and store the code
# under the name 'wb_income'.
with_income = all_P.merge(wb_income, how='left', left_on='ISO3', right_on='ISO3Code')
df = with_income.drop(columns='ISO3Code').rename(columns={'Region_Code': 'wb_income'})

In [369]:
# Add Regional Office
# Only the code column and the ISO3 key are needed from the lookup table.
unicef_ro = unicef_ro[['Region_Code', 'ISO3Code']]

# Left-join on the ISO3 code, drop the right-hand key, and store the code
# under the name 'unicef_ro'.
with_regional_office = df.merge(unicef_ro, how='left', left_on='ISO3', right_on='ISO3Code')
df = with_regional_office.drop(columns='ISO3Code').rename(columns={'Region_Code': 'unicef_ro'})

In [370]:
# Add population data
# No join keys are passed, so pandas joins on every column label the two
# frames have in common.
with_child_population = df.merge(childpop, how='left')
df_w_childpop = with_child_population.rename(columns={'child_population': 'u18_pop'})

In [371]:
# Attach the world population table by matching 'name' against 'adm0_name',
# relabel its 'sum' column as 'total_pop', and drop the 'iso3' and
# 'adm0_name' columns.
with_world_population = df_w_childpop.merge(
    worldpop,
    how='left',
    left_on='name',
    right_on='adm0_name',
)
df_w_allpop = (
    with_world_population
    .rename(columns={'sum': 'total_pop'})
    .drop(columns=['iso3', 'adm0_name'])
)

# Set population data as integer
# NOTE(review): astype(int) raises if either column holds a missing value,
# e.g. a row whose name found no match in the left join above.
for population_column in ('u18_pop', 'total_pop'):
    df_w_allpop[population_column] = df_w_allpop[population_column].astype(int)

In [372]:
# Using simplified boundaries for geometry
# Keep only the join keys, the two identifier columns and the geometry.
boundary_columns = ['ISO3', 'name', 'ucode', 'uuid', 'geometry']
adm0 = adm0[boundary_columns]

# Left-join so every row of the population table is kept.
df_combined = df_w_allpop.merge(adm0, how='left', on=['ISO3', 'name'])

In [373]:
# Grabbing actual exposure numbers: build one table per hazard CSV holding the
# exposed child population and that number as a percentage of the total child
# population. The tables are collected in `exposure_data_list`.

# Define file paths
exposure_path = "{}/p1_exposure".format(data_dir)
total_population_file = "{}/CCRI_results_misc/child_pop_sum_adm0.csv".format(data_dir)

# Load total child population data
total_pop_df = pd.read_csv(total_population_file, usecols=['ISO3', 'name', 'child_population'])

# Ensure unique ISO3-name pairs before merging (duplicates are averaged)
total_pop_df = total_pop_df.groupby(['ISO3', 'name'], as_index=False).agg({'child_population': 'mean'})

# Get all CSV files for exposure. glob returns them in filesystem order, so
# sort to process the hazards in the same order on every machine.
exposure_files = sorted(glob.glob(os.path.join(exposure_path, "*.csv")))

# Initialize empty list for processed data
exposure_data_list = []

# Columns every exposure file must provide to be usable
required_cols = {'ISO3', 'name', 'child_population_exposed'}

# Process each file in one loop. The per-file frame has its own name
# (`hazard_df`) so the attribute-merged `df` built earlier in the notebook is
# not overwritten.
for csv_path in exposure_files:
    # Skip the total population file; compare normalised paths so differing
    # spellings of the same location still match.
    if os.path.abspath(csv_path) == os.path.abspath(total_population_file):
        continue

    hazard_df = pd.read_csv(csv_path)  # Read full file to check available columns

    # Hazard name = first two underscore-separated tokens of the file name
    hazard_name = '_'.join(os.path.basename(csv_path).split('_')[:2])

    # Ensure required columns exist
    missing_cols = required_cols - set(hazard_df.columns)
    if missing_cols:
        print(f"Skipping {csv_path}: Missing columns {missing_cols}")
        continue  # Skip if required columns are missing

    # Drop rows without an exposure value, then attach the total child
    # population for each (ISO3, name) pair.
    hazard_df = (
        hazard_df
        .dropna(subset=['child_population_exposed'])
        .merge(total_pop_df, on=['ISO3', 'name'], how='left')
    )

    # Compute relative exposure (%). A missing population compares False, so
    # rows with a missing or non-positive population get 0.
    has_population = hazard_df['child_population'] > 0
    hazard_df['{}_u18_rel_exp'.format(hazard_name)] = np.where(
        has_population,
        (hazard_df['child_population_exposed'] / hazard_df['child_population']) * 100,
        0
    )

    # Rename the exposure column after the hazard and drop the helper column
    hazard_df = (
        hazard_df
        .rename(columns={'child_population_exposed': '{}_u18_exp'.format(hazard_name)})
        .drop(columns=['child_population'])
    )

    exposure_data_list.append(hazard_df)

In [374]:
# Combine the per-hazard tables into one wide table with a single row per
# (ISO3, name) pair.
#
# The tables are aligned on their (ISO3, name) keys, not on row position.
# pd.concat(axis=1) matches rows by index label, and each per-hazard table has
# its own 0..n-1 index. Concatenating them directly would put one country's
# values on another country's row whenever two files differ in row order or
# in which rows were dropped.
if not exposure_data_list:
    raise ValueError('No exposure tables were loaded; nothing to combine')

exposure_keys = ['ISO3', 'name']
merged_exposure_df = pd.concat(
    [
        # Keys must be unique within each table to be usable as an index
        hazard_frame.drop_duplicates(subset=exposure_keys).set_index(exposure_keys)
        for hazard_frame in exposure_data_list
    ],
    axis=1,
).reset_index()

# Ensure no duplicate columns before merging (the first occurrence is kept)
merged_exposure_df = merged_exposure_df.loc[:, ~merged_exposure_df.columns.duplicated()]

In [375]:
# Left-join the per-hazard exposure columns onto the combined table.
exposure_join_keys = ['ISO3', 'name']
df_combined = df_combined.merge(merged_exposure_df, how='left', on=exposure_join_keys)

In [376]:
# --- Adding P2 data: build one table per vulnerability CSV holding the
# indicator value applied to the child population (absolute) and the
# indicator value itself (relative). The tables are collected in
# `vul_data_list`.
vul_path = "{}/p2_vulnerability".format(data_dir)
total_population_file = "{}/CCRI_results_misc/child_pop_sum_adm0.csv".format(data_dir)

# Load total child population data
total_pop_df = pd.read_csv(total_population_file, usecols=['ISO3', 'name', 'child_population'])

# Ensure unique ISO3-name pairs before merging (duplicates are averaged)
total_pop_df = total_pop_df.groupby(['ISO3', 'name'], as_index=False).agg({'child_population': 'mean'})

# Get all CSV files for vulnerability. glob returns them in filesystem order,
# so sort to process the indicators in the same order on every machine.
p2_vul_files = sorted(glob.glob(os.path.join(vul_path, "*.csv")))

# Initialize empty list for processed data
vul_data_list = []

# The per-file frame has its own name (`indicator_df`) so the attribute-merged
# `df` built earlier in the notebook is not overwritten.
for csv_path in p2_vul_files:
    indicator_df = pd.read_csv(csv_path)  # Read full file to check available columns

    # Skip files missing required columns, and say so, matching the exposure loop
    missing_cols = {'iso3', 'value'} - set(indicator_df.columns)
    if missing_cols:
        print(f"Skipping {csv_path}: Missing columns {missing_cols}")
        continue

    # Keep only the key and the value, drop incomplete rows, and rename the
    # key to the 'ISO3' spelling used by the population table.
    indicator_df = (
        indicator_df[['iso3', 'value']]
        .dropna()
        .rename(columns={'iso3': 'ISO3'})
    )

    # Indicator name = file name up to the '.csv' extension
    indicator_name = os.path.basename(csv_path).split('.csv')[0]

    # Merge with population data on ISO3; this also brings in the 'name' column
    indicator_df = indicator_df.merge(total_pop_df, on='ISO3', how='left')

    # Compute total and relative vulnerability. A missing population compares
    # False, so rows with a missing or non-positive population get 0 in both.
    has_population = indicator_df['child_population'] > 0

    # 'value' is divided by 100 before being applied to the population
    # NOTE(review): this treats 'value' as a percentage; confirm per indicator.
    indicator_df['{}_u18_vul'.format(indicator_name)] = np.where(
        has_population,
        indicator_df['child_population'] * (indicator_df['value'] / 100),
        0
    )

    indicator_df['{}_u18_vul_rel'.format(indicator_name)] = np.where(
        has_population,
        indicator_df['value'],
        0
    )

    # Drop the helper columns; only ISO3, name and the two new columns remain
    indicator_df = indicator_df.drop(columns=['child_population', 'value'])

    vul_data_list.append(indicator_df)

In [377]:
# Left-join each vulnerability table onto the combined table, one at a time.
for vul_frame in vul_data_list:
    df_combined = df_combined.merge(vul_frame, how='left', on=['ISO3', 'name'])

In [380]:
# Remove any columns we don't want and rearrange as well
# -- Dropping
# Drop max, then min: every column whose label contains the pattern is removed.
for unwanted_pattern in ('max', 'min'):
    matching_columns = df_combined.filter(regex=unwanted_pattern).columns
    df_combined = df_combined.drop(columns=matching_columns)

# -- Renaming
final_column_names = {
    'name': 'adm_name',
    'P1_P2_geometric_avg': 'ccri',
    'ISO3': 'iso3',
}
df_combined = df_combined.rename(columns=final_column_names)

In [381]:
# Show every column label currently in the combined table.
df_combined.columns.tolist()

['iso3',
 'adm_name',
 'heatwave_frequency_absolute',
 'heatwave_frequency_relative',
 'heatwave_duration_absolute',
 'heatwave_duration_relative',
 'heatwave_severity_absolute',
 'heatwave_severity_relative',
 'river_flood_absolute',
 'river_flood_relative',
 'coastal_flood_absolute',
 'coastal_flood_relative',
 'pluvial_flood_absolute',
 'pluvial_flood_relative',
 'tropical_storm_absolute',
 'tropical_storm_relative',
 'extreme_heat_absolute',
 'extreme_heat_relative',
 'sand_dust_absolute',
 'sand_dust_relative',
 'vectorborne_malariapv_absolute',
 'vectorborne_malariapv_relative',
 'vectorborne_malariapf_absolute',
 'vectorborne_malariapf_relative',
 'agricultural_drought_absolute',
 'agricultural_drought_relative',
 'air_pollution_absolute',
 'air_pollution_relative',
 'drought_spi_absolute',
 'drought_spi_relative',
 'drought_sma_absolute',
 'drought_sma_relative',
 'fire_frequency_absolute',
 'fire_frequency_relative',
 'fire_FRP_absolute',
 'fire_FRP_relative',
 'P2_WASH_Drinki

In [383]:
# Reorder
# The final column order is assembled section by section. Within a section,
# each base name is expanded with that section's suffixes, in the order listed.

# Identification, attributes, population and geometry
id_columns = ['adm_name', 'iso3', 'ucode', 'uuid', 'unicef_ro', 'wb_income',
              'total_pop', 'u18_pop', 'geometry']

# Hazards for the '<hazard>_u18_exp' / '<hazard>_u18_rel_exp' pairs
exposure_hazards = [
    'drought_sma',
    'agricultural_drought',
    'heatwave_severity',
    'coastal_flood',
    'river_flood',
    'fire_FRP',
    'heatwave_frequency',
    'vectorborne_malariapv',
    'air_pollution',
    'tropical_storm',
    'extreme_heat',
    'drought_spi',
    'vectorborne_malariapf',
    'sand_dust',
    'pluvial_flood',
    'heatwave_duration',
    'fire_frequency',
]

# Indicators for the '<indicator>_u18_vul' / '<indicator>_u18_vul_rel' pairs
vulnerability_indicators = [
    'P2_Nutrition_Stunting_Modeled',
    'P2_Immunization_DTP3',
    'P2_Immunization_DTP1',
    'P2_electricity_access',
    'P2_food_poverty',
    'P2_PT_Labor',
    'P2_Child_Marriage',
    'P2_Child_Mortality',
    'P2_Social_Protection',
    'P2_Birth_Attendant_Y15T19',
    'P2_Child_poverty',
    'P2_ED_CR_L2',
    'P2_Learning_Poverty',
    'P2_WASH_Sanitation',
    'P2_LSCED',
    'P2_WASH_Drinking_Water',
    'P2_basic_hygiene',
]

# Hazards for the '<hazard>_absolute' / '<hazard>_relative' pairs
scored_hazards = [
    'heatwave_frequency',
    'heatwave_duration',
    'heatwave_severity',
    'river_flood',
    'coastal_flood',
    'pluvial_flood',
    'tropical_storm',
    'extreme_heat',
    'sand_dust',
    'vectorborne_malariapv',
    'vectorborne_malariapf',
    'agricultural_drought',
    'air_pollution',
    'drought_spi',
    'drought_sma',
    'fire_frequency',
    'fire_FRP',
]

# Indicators for the '<indicator>_value_normalized' columns
normalized_indicators = [
    'P2_WASH_Drinking_Water',
    'P2_WASH_Sanitation',
    'P2_LSCED',
    'P2_Nutrition_Stunting_Modeled',
    'P2_Child_Mortality',
    'P2_Immunization_DTP1',
    'P2_Immunization_DTP3',
    'P2_PT_Labor',
    'P2_Learning_Poverty',
    'P2_ED_CR_L2',
    'P2_Birth_Attendant_Y15T19',
    'P2_Child_poverty',
    'P2_Child_Marriage',
    'P2_food_poverty',
    'P2_Social_Protection',
    'P2_basic_hygiene',
    'P2_electricity_access',
]

# Aggregate scores, placed last
score_columns = ['P2_arithmetic_avg', 'P1_geometric_avg', 'ccri', 'rank_reverse']

ordered_columns = (
    id_columns
    + [hazard + suffix
       for hazard in exposure_hazards
       for suffix in ('_u18_exp', '_u18_rel_exp')]
    + [indicator + suffix
       for indicator in vulnerability_indicators
       for suffix in ('_u18_vul', '_u18_vul_rel')]
    + [hazard + suffix
       for hazard in scored_hazards
       for suffix in ('_absolute', '_relative')]
    + [indicator + '_value_normalized' for indicator in normalized_indicators]
    + score_columns
)

# Selecting with the full list both reorders the table and drops any column
# that is not listed.
df_combined = df_combined[ordered_columns]

In [384]:
# Wrap the combined table in a GeoDataFrame, using its 'geometry' column as
# the active geometry and declaring the coordinate reference system EPSG:4326.
geometry_values = df_combined['geometry']
gdf = gpd.GeoDataFrame(
    df_combined,
    geometry=geometry_values,
    crs='EPSG:4326',
)

In [386]:
# Write the final layer into the data directory as GeoJSON.
# The driver is named explicitly so the output format does not depend on the
# installed geopandas version inferring it from the '.geojson' extension.
output_path = '{}/CCRI_P1_P2_format.geojson'.format(data_dir)
gdf.to_file(output_path, driver='GeoJSON')