## Bootstrap Sampling

This script:
- Identifies all valid observations (e.g., the valid border edges)
- Creates an empty array for camp exposure values
- Draws a random sample of size n, with replacement, from the valid observations
- Adds the camp to the random sample (i.e., n+1 as sample size)
- Normalizes data within each random sample using existing methodology
- Appends the camp's exposure percentile within that sample to the sample array
- Saves output observations to the full bootstrap object


The structure of the sample is recorded as:

```
{ 'Kakuma': [0.65, 0.78,...],
  'Bidi Bidi': [0.62, 0.64,...],
  ...
  }
```


- These observations are then sorted for each camp; the values at the 2.5% and 97.5% positions give the lower and upper bounds of the interval, and the value at the 50% position gives the median

In [None]:
import pandas as pd
import json
import ast
import numpy as np
import plotly.express as px

### Load in the required data

In [None]:
# Base observation table (presumably already winsorized, per its name -- confirm).
# Later cells read its 'camp_name' and 'edge' columns plus the variables
# listed for normalization and index construction.
final_data_winsorized = pd.read_csv("aggregated_base_data.csv")
# Camp listing; only its 'camp_name' column is used in this notebook section.
camp_data = pd.read_csv("camp_data_final.csv")

### Define primary index construction functions

In [None]:
# Variables that are summed and divided by their count (i.e. equally weighted)
# to form the exposure index in generate_index / generate_min_max_exposure.
index_variables_update = [
    'seasonal_precipitation_max',
    'pdsi',
    'temperature_anomaly',
    'daytime_maximum_temperature',
    'specific_humidity',
    'precipitation',
    'interannual_coefficient_variation_precipitation',
    'ssm',
    'slope',
    'flow_accumulation',
    'friction',
]

# Columns passed to normalize_winsorized_data as `min_max`, i.e. rescaled as
# (x - min) / (max - min). This list also contains columns that are not part
# of index_variables_update (coefficient_variation_ndvi, evi_change,
# population); they are normalized but do not enter the index.
variables_min_max = [
    'coefficient_variation_ndvi',
    'daytime_maximum_temperature',
    'evi_change',
    'flow_accumulation',
    'friction',
    'interannual_coefficient_variation_precipitation',
    'pdsi',
    'population',
    'precipitation',
    'seasonal_precipitation_max',
    'slope',
    'specific_humidity',
    'temperature_anomaly',
]

# Columns passed to normalize_winsorized_data as `max_min`, i.e. rescaled with
# the inverted form (max - x) / (max - min), so the largest raw value maps
# to 0. Of these, only 'ssm' is part of index_variables_update.
variables_max_min = ['ssm', 'susm']

In [None]:
def normalize_winsorized_data(dataset, min_max, max_min):
    """Rescale the selected columns of ``dataset`` by their own min and max.

    Columns named in ``min_max`` become ``(x - min) / (max - min)``; columns
    named in ``max_min`` use the inverted form ``(max - x) / (max - min)``.
    The minimum and maximum are taken per column over the rows of
    ``dataset``. The frame is modified in place and the same object is
    returned.
    """

    def _ascending(column):
        # Smallest value maps to 0, largest to 1.
        lowest, highest = column.min(), column.max()
        return (column - lowest) / (highest - lowest)

    def _descending(column):
        # Largest value maps to 0, smallest to 1.
        lowest, highest = column.min(), column.max()
        return (highest - column) / (highest - lowest)

    dataset[min_max] = dataset[min_max].transform(_ascending)
    dataset[max_min] = dataset[max_min].transform(_descending)
    return dataset

def generate_index(index_vars, normalized_dataset):
    """Add 'exposure', 'rank' and 'percentile' columns to ``normalized_dataset``.

    'exposure' is the row-wise sum of the ``index_vars`` columns divided by
    the number of variables. 'rank' is the dense rank of exposure with the
    highest exposure ranked 1; 'percentile' is the fractional rank of exposure
    in ascending order. The frame is modified in place and returned.
    """
    row_totals = normalized_dataset.loc[:, index_vars].sum(axis=1)
    normalized_dataset['exposure'] = row_totals / len(index_vars)

    exposure = normalized_dataset['exposure']
    normalized_dataset['rank'] = exposure.rank(method="dense", ascending=False)
    normalized_dataset['percentile'] = exposure.rank(pct=True)
    return normalized_dataset

def generate_min_max_exposure(index_vars, normalized_input):
    """Return per-row minimum and maximum exposure under leave-one-out.

    The exposure (row sum of the selected columns divided by the number of
    columns) is computed once with every variable in ``index_vars`` and once
    for each subset obtained by dropping a single variable. For every row the
    smallest and largest of those values are kept.

    Returns a ``(min_list, max_list)`` tuple of plain Python lists, one entry
    per row of ``normalized_input``.
    """

    def _row_means(columns):
        totals = list(normalized_input.loc[:, columns].sum(axis=1))
        return [total / len(columns) for total in totals]

    # Start both running bounds from the exposure that uses every variable.
    baseline = _row_means(index_vars)
    lowest = np.array(baseline)
    highest = np.array(baseline)

    for dropped in index_vars:
        remaining = list(index_vars)
        remaining.remove(dropped)
        candidate = np.array(_row_means(remaining))

        highest = np.maximum(highest, candidate)
        lowest = np.minimum(lowest, candidate)

    return lowest.tolist(), highest.tolist()

def iterate_min_max_exposure(normalized_dataset):
    """Return a copy of ``normalized_dataset`` with leave-one-out bounds added.

    The copy gains 'min_exposure' / 'max_exposure' (from
    generate_min_max_exposure over index_variables_update) and, for each of
    them, a dense descending rank and an ascending fractional rank. The input
    frame itself is left untouched.
    """
    lower_bounds, upper_bounds = generate_min_max_exposure(index_variables_update, normalized_dataset)

    result = normalized_dataset.copy(deep=True)
    result['min_exposure'] = lower_bounds
    result['max_exposure'] = upper_bounds

    # Min columns are added before max columns to keep the column order stable.
    derived_columns = (
        ('min_exposure', 'min_rank', 'min_percentile'),
        ('max_exposure', 'max_rank', 'max_percentile'),
    )
    for source_col, rank_col, percentile_col in derived_columns:
        values = result[source_col]
        result[rank_col] = values.rank(method="dense", ascending=False)
        result[percentile_col] = values.rank(pct=True)
    return result

In [None]:
def gen_normalized_exposure_indices_bootstrap(camp, n_iterations=10000, random_state=None):
    """Bootstrap the exposure percentile of ``camp`` against its valid edges.

    The rows of ``final_data_winsorized`` whose 'edge' value appears in the
    camp's own 'edge' list form the candidate pool. On every iteration the
    pool is resampled with replacement (same size as the pool), the camp's
    row(s) are placed in front of the resample, the combined frame is
    normalized and indexed, and the camp's resulting 'percentile' is recorded.

    Parameters
    ----------
    camp : value compared against the 'camp_name' column.
    n_iterations : int, default 10000
        Number of bootstrap resamples to draw.
    random_state : None, int or numpy.random.Generator, default None
        ``None`` keeps pandas' default (unseeded) sampling. Anything else is
        turned into a single ``numpy.random.Generator`` shared by all
        iterations, so a run is reproducible while each resample still differs.

    Returns
    -------
    list
        One percentile value per iteration, in draw order.

    Raises
    ------
    IndexError
        If no row of ``final_data_winsorized`` has ``camp_name == camp``.
    """
    camp_rows = final_data_winsorized.loc[final_data_winsorized['camp_name'] == camp]
    if camp_rows.empty:
        raise IndexError(f"camp {camp!r} not found in final_data_winsorized['camp_name']")

    # Everything below up to the loop is the same for every resample, so it is
    # done once. The 'edge' cell stores a Python literal as text; it is parsed
    # with ast.literal_eval and used to select the candidate pool.
    valid_edges = ast.literal_eval(camp_rows["edge"].to_list()[0])
    candidate_pool = final_data_winsorized[final_data_winsorized['edge'].isin(valid_edges)]

    # One shared generator: passing the same int seed to every .sample() call
    # would return the identical resample each time.
    rng = None if random_state is None else np.random.default_rng(random_state)

    bootstrap_exposure = []
    for _ in range(n_iterations):
        resample = candidate_pool.sample(frac=1, replace=True, random_state=rng)

        # pd.concat builds a fresh frame, so the in-place normalization and
        # indexing below never touch camp_rows or candidate_pool.
        # The camp rows go first so that .values[0] further down is the camp.
        pooled = pd.concat([camp_rows, resample])

        normalized = normalize_winsorized_data(pooled, variables_min_max, variables_max_min)
        indexed = generate_index(index_variables_update, normalized)

        # Only 'percentile' is read, so the leave-one-out min/max exposure
        # step (iterate_min_max_exposure) is not run here.
        bootstrap_exposure.append(indexed.loc[indexed["camp_name"] == camp]["percentile"].values[0])

    return bootstrap_exposure

In [None]:
def gen_camp_exposure_bootstrap_json():
    """Run the bootstrap for every camp listed in ``camp_data``.

    Returns a dict mapping each 'camp_name' value to the list produced by
    gen_normalized_exposure_indices_bootstrap for that camp.
    """
    return {
        camp: gen_normalized_exposure_indices_bootstrap(camp)
        for camp in camp_data["camp_name"].to_list()
    }


output = gen_camp_exposure_bootstrap_json()

In [None]:
# Show one histogram of the bootstrap values per camp, titled with the camp name.
for camp, samples in output.items():
    fig = px.histogram(samples, title=camp)
    fig.update_xaxes(matches=None)
    fig.show()

In [None]:
# Persist the raw bootstrap draws (camp name -> list of values) as JSON.
with open('bootstrap_sample_data_tenk.json', 'w') as json_file:
    json.dump(output, json_file)

In [None]:
# Summarize each camp's bootstrap draws as a 95% interval plus median, using
# order statistics of the sorted draws.
median = []
lower_025 = []
top_025 = []
camp_array = []

for camp, samples in output.items():
    # Positions are derived from the actual number of draws rather than a
    # fixed 10000, so the summary stays correct if the iteration count changes.
    # With 10000 draws this gives indices 249, 4999 and 9749.
    n_draws = len(samples)
    tail = round(n_draws * 0.025)
    # Clamp at 0: for small samples `tail - 1` would be -1, which Python
    # would silently interpret as "last element" (the largest value).
    lower_index = max(tail - 1, 0)
    median_index = max(round(n_draws / 2) - 1, 0)
    upper_index = max(n_draws - tail - 1, 0)

    # Sorted in place, so the lists held in `output` end up sorted as well.
    samples.sort()
    lower_025.append(samples[lower_index])
    median.append(samples[median_index])
    top_025.append(samples[upper_index])
    camp_array.append(camp)


# One (camp, lower, median, upper) tuple per camp.
bootstrap_data = list(zip(camp_array, lower_025, median, top_025))

bootstrap_df = pd.DataFrame(bootstrap_data, columns=[
    "camp_name",
    "lower_025",
    "median",
    "top_025"
])

# Width of the interval between the lower and upper bound.
bootstrap_df["total_ci"] = bootstrap_df["top_025"] - bootstrap_df["lower_025"]
bootstrap_df

In [None]:
# Write the per-camp interval summary to disk. index=False is not passed, so
# the DataFrame's row index is written alongside the data columns.
bootstrap_df.to_csv("bootstrap_summary.csv")