In [4]:
from urllib.request import urlopen
import json
import ast
import pandas as pd


## Revision Process

- [x] Build list of UNHCR IDs for each refugee camp in our sample
- [x] Generate list of queries using camps
- [x] Export database that is a matrix of camp level relevant countries
- [x] Check that the data for each camp is current to at least 2020
- [ ] Use the CSV of adjacent (bordering) countries to remove origin countries that do not border the camp's host country

Final output will be a database of camps, with the relevant countries for each camp

#### Example query for Kakuma

kakuma_id = 191

relevant populations to consider: "Registered asylum-seekers from __, Registered refugees from __ (refugee camps/centers)"

https://data.unhcr.org/population/get/origin?geo_id={kakuma_id}&population_collection=4,29

In [5]:
# Load the camp sample; the `unhcr_id` column is the geo id used to query the UNHCR portal.
CAMP_DATA_CSV = "camp_data_refugee_flows.csv"
unhcr_id_database = pd.read_csv(CAMP_DATA_CSV)
unhcr_id_database

Unnamed: 0,index,country,camp_name,longitude,latitude,unhcr_id,note
0,5,Ethiopia,Kule,34.29153,8.275403,235,Gambela region
1,10,Ethiopia,Pugnido,34.2591,7.645984,235,Gambela region
2,11,Ethiopia,Melkadida,41.720828,4.519206,800,Somali region
3,17,Ethiopia,Nguenyyiel,34.240537,8.295809,235,Gambela region
4,1,Kenya,Kakuma,34.831663,3.744967,191,
5,2,Kenya,Hagadera,40.371067,0.001314,181,
6,3,Kenya,Dagahaley,40.290157,0.186054,180,
7,4,Kenya,Ifo,40.315197,0.11604,182,
8,7,South Sudan,Yida,30.08608,10.113055,817,Unity state
9,12,South Sudan,Pamir,30.355904,9.909969,817,Unity state


In [6]:
## Because South Sudan does not have disaggregated data in the operations portal, we use
## manually sourced figures that are current as of 02/2022:
## https://data.unhcr.org/en/documents/details/90875
## Each entry is (origin country, individuals, year of data).
_MANUAL_CAMP_ORIGINS = {
    "Ajuong Thok": ("Sudan", "46167", "2022"),
    "Pamir": ("Sudan", "45507", "2022"),
    "Yida": ("Sudan", "33847", "2022"),
}

_ORIGIN_URL = "https://data.unhcr.org/population/get/origin?geo_id={unhcr_geo_id}&population_collection=4,29"
_BORDERS_URL = "https://raw.githubusercontent.com/geodatasource/country-borders/master/GEODATASOURCE-COUNTRY-BORDERS.CSV"
_EDGE_COLUMNS = ["camp_name", "unhcr_id", "origin_countries", "origin_populations", "year_of_origin_data"]


def _fetch_origin_data(unhcr_geo_id):
    """Download and decode the UNHCR origin-population JSON for one geo id."""
    # The context manager closes the HTTP response even if JSON decoding fails.
    with urlopen(_ORIGIN_URL.format(unhcr_geo_id=unhcr_geo_id)) as response:
        return json.load(response)


def extract_origin_populations(camp_database=None, border_edges=None,
                               fetch_origin_data=None, min_population_share=0.01):
    """Build one row per camp listing the neighbouring origin countries of its population.

    For every camp, the origin populations reported for the camp's UNHCR geo id
    are restricted to countries that border the camp's host country, and origins
    that do not exceed ``min_population_share`` of the remaining total are dropped.
    Camps listed in ``_MANUAL_CAMP_ORIGINS`` use the hard-coded figures instead of
    portal data.

    Parameters
    ----------
    camp_database : pandas.DataFrame, optional
        Camp table with at least the columns ``camp_name``, ``unhcr_id``,
        ``country`` and ``index``. Defaults to the notebook-level
        ``unhcr_id_database``.
    border_edges : pandas.DataFrame, optional
        Table with ``country_name`` and ``country_border_name`` columns.
        Defaults to the GeoDataSource country-borders CSV, downloaded on demand.
    fetch_origin_data : callable, optional
        ``fetch_origin_data(unhcr_geo_id)`` must return the decoded JSON payload,
        a mapping whose ``"data"`` entry is a list of records with
        ``pop_origin_name``, ``individuals`` and ``year``. Defaults to an HTTP
        request against the UNHCR data portal.
    min_population_share : float, optional
        An origin is kept only if its population is strictly greater than this
        share of the camp's (neighbour-filtered) total. Defaults to 0.01 (1%).

    Returns
    -------
    pandas.DataFrame
        ``camp_database`` merged with the list-valued columns
        ``origin_countries``, ``origin_populations`` and ``year_of_origin_data``,
        sorted by the ``index`` column. A camp with no qualifying origin gets
        empty lists rather than raising.
    """
    if camp_database is None:
        camp_database = unhcr_id_database
    if border_edges is None:
        border_edges = pd.read_csv(_BORDERS_URL)
    if fetch_origin_data is None:
        fetch_origin_data = _fetch_origin_data

    # Align the border table's country labels with the names used elsewhere in this notebook.
    border_edges = border_edges.replace({
        "Tanzania (the United Republic of)": "Tanzania",
        "Congo (the Democratic Republic of the)": "Dem. Rep. of the Congo",
    })

    payload_by_geo_id = {}  # several camps share a geo id; download each payload once
    records = []

    camp_rows = zip(
        camp_database["camp_name"].tolist(),
        camp_database["unhcr_id"].tolist(),
        camp_database["country"].tolist(),
    )
    for camp, geo_id, country in camp_rows:
        if camp in _MANUAL_CAMP_ORIGINS:
            # Manual figures replace the portal data entirely, so no request is needed.
            candidates = [_MANUAL_CAMP_ORIGINS[camp]]
        else:
            if geo_id not in payload_by_geo_id:
                payload_by_geo_id[geo_id] = fetch_origin_data(geo_id)

            # Substring match on the host country, taken literally (not as a regex);
            # rows with a missing country name never match.
            is_host = border_edges["country_name"].str.contains(country, regex=False, na=False)
            neighbours = set(border_edges.loc[is_host, "country_border_name"])

            candidates = [
                (unit["pop_origin_name"], unit["individuals"], unit["year"])
                for unit in payload_by_geo_id[geo_id]["data"]
                if unit["pop_origin_name"] in neighbours
            ]

        ## we then check the total for each and if it's not above the minimum share of the pop we remove the edge
        total_camp_population = sum(int(population) for _, population, _ in candidates)
        threshold = total_camp_population * min_population_share
        kept = [candidate for candidate in candidates if int(candidate[1]) > threshold]

        # Explicit per-column lists stay valid (empty) when nothing survives the filters.
        records.append({
            "camp_name": camp,
            "unhcr_id": geo_id,
            "origin_countries": [origin for origin, _, _ in kept],
            "origin_populations": [population for _, population, _ in kept],
            "year_of_origin_data": [year for _, _, year in kept],
        })

    # Build the frame once instead of concatenating inside the loop.
    edge_database = pd.DataFrame(records, columns=_EDGE_COLUMNS)

    return (
        pd.merge(edge_database, camp_database, on=["camp_name", "unhcr_id"])
        .sort_values(by="index")
        .reset_index(drop=True)
    )


# Build the camp -> origin-country table (this performs the UNHCR portal requests).
origin_camp_database = extract_origin_populations()

# Drop the periods from the portal's DRC label in every per-camp list of origin countries.
origin_camp_database['origin_countries'] = origin_camp_database['origin_countries'].apply(
    lambda names: [name.replace('Dem. Rep. of the Congo', 'Dem Rep of the Congo') for name in names]
)

In [7]:
# Display the merged camp/origin table built in the previous cell for visual inspection.
origin_camp_database

Unnamed: 0,camp_name,unhcr_id,origin_countries,origin_populations,year_of_origin_data,index,country,longitude,latitude,note
0,Kakuma,191,"[South Sudan, Somalia, Ethiopia, Uganda]","[100826, 38270, 6400, 1949]","[2022, 2022, 2022, 2022]",1,Kenya,34.831663,3.744967,
1,Hagadera,181,"[Somalia, Ethiopia]","[81293, 1327]","[2022, 2022]",2,Kenya,40.371067,0.001314,
2,Dagahaley,180,"[Somalia, Ethiopia]","[76282, 2176]","[2022, 2022]",3,Kenya,40.290157,0.186054,
3,Ifo,182,"[Somalia, Ethiopia]","[68456, 3375]","[2022, 2022]",4,Kenya,40.315197,0.11604,
4,Kule,235,[South Sudan],[374806],[2022],5,Ethiopia,34.29153,8.275403,Gambela region
5,Bidi Bidi,11097,[South Sudan],[209925],[2022],6,Uganda,31.382998,3.468533,Yumbe district
6,Yida,817,[Sudan],[33847],[2022],7,South Sudan,30.08608,10.113055,Unity state
7,Nduta,863,[Burundi],[76671],[2022],8,Tanzania,30.841255,-3.656156,
8,Mtendeli,863,[Burundi],[76671],[2022],9,Tanzania,30.888483,-3.427342,Using Nduta camp given proximity
9,Pugnido,235,[South Sudan],[374806],[2022],10,Ethiopia,34.2591,7.645984,Gambela region
