In [1]:
# !pip install geopy

In [2]:
import json
import pandas as pd
from collections import defaultdict
import heapq

In [3]:
file_path = "alt_geog_base.json"

with open(file_path, 'r') as file:
    data = json.load(file)

In [35]:
data[0:5]

[[{'geomaster_name': 'Balqa Governorate',
   'geomaster_id': '54',
   'admin_level': 'Region',
   'centroid_lon': '35.6667',
   'centroid_lat': '32',
   'pop_accommodation_code': 'X',
   'pop_accommodation_name': 'Not Specified',
   'pop_data_type_code': 'REG',
   'pop_data_type_name': 'Registered',
   'pop_type_code': 'REF',
   'pop_type_name': 'Refugees',
   'pop_frequency_name': 'Daily',
   'pop_frequency_code': 'D',
   'pop_frequency_cumulative': 'cumulative',
   'pop_frequency_cumulative_code': 'C',
   'pop_origin_name': 'Syrian Arab Rep.',
   'pop_origin_code': 'SYR',
   'date': '2022-12-31',
   'month': '12',
   'year': '2022',
   'population_groups': 'Registered refugees from Syrian Arab Republic',
   'population_group_id': '4772',
   'source': 'UNHCR',
   'individuals': '17363',
   'households': None,
   'male_04': '1068',
   'male_511': '1867',
   'male_1217': '1318',
   'male_1859': '4097',
   'male_60': '305',
   'male_total': None,
   'male_hhs': None,
   'female_04': '107

In [5]:
def round_to_n_sig_figs(x, n):
    return round(x, n - int(math.floor(math.log10(abs(x)))) - 1)

In [6]:
result = {}
for inner_list in data:
    for record in inner_list:
        geomaster_name = record['geomaster_name']

        if (record["admin_level"] == "Country" or record["admin_level"] == "Continent"):
            continue
                
        if geomaster_name not in result:
            result[geomaster_name] = {
                'geomaster_id': record['geomaster_id'],
                'geomaster_name': record['geomaster_name'],
                'admin_level': record['admin_level'],
                'total_population': 0,
                'nationalities': {}
            }

        nationality = record['pop_origin_name']
        individuals = int(record['individuals'])

        if nationality not in result[geomaster_name]['nationalities']:
            result[geomaster_name]['nationalities'][nationality] = {
                'population': 0,
                'records': []
            }

        result[geomaster_name]['total_population'] += individuals
        result[geomaster_name]['nationalities'][nationality]['population'] += individuals
        result[geomaster_name]['nationalities'][nationality]['records'].append(record)

for location in result.values():
    location['nationality_percentages'] = {}
    total_population = location['total_population']
    for nationality, nationality_data in location['nationalities'].items():
        if total_population > 0:
            share = nationality_data['population'] / total_population
            percentage = round_to_n_sig_figs(share * 100, 3)
        else:
            percentage = 0
        location['nationality_percentages'][nationality] = percentage


geojson = {
    'type': 'FeatureCollection',
    'features': [
        {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [0,0]
            },
            'properties': {
                'geomaster_id': location['geomaster_id'],
                'geomaster_name': location['geomaster_name'],
                'admin_level': location['admin_level'],
                'total_population': location['total_population'],
                'nationality_percentages': location['nationality_percentages'],
                'nationalities': location['nationalities']
            }
        }
        for location in result.values()
    ]
}

In [7]:
with open('reformatted_camp_data_alt.geojson', 'w') as outfile:
    json.dump(geojson, outfile)

In [33]:
total_population_all_camps = 0
num_camps = 0
num_camps_multiple_nationalities = 0
num_camps_no_population_nationality_data = 0

for location in result.values():
    num_camps += 1
    total_population_all_camps += location['total_population']
    
    num_nationalities = len(location['nationalities'])
    
    if num_nationalities == 0 or location['total_population'] in [0, None]:
        num_camps_no_population_nationality_data += 1
    else:
        num_camps_multiple_nationalities += 1

percentage_camps_multiple_nationalities = (num_camps_multiple_nationalities / num_camps) * 100
percentage_camps_no_population_nationality_data = (num_camps_no_population_nationality_data / num_camps) * 100

print("Total population of all geographies exclusive of region/country admin data:", total_population_all_camps)
print("Number of geographies:", num_camps)
print("Percentage of population in geographies with multiple nationalities:", round_to_n_sig_figs(percentage_camps_multiple_nationalities, 3),"percent")
print("Percentage of geographies with either no population or nationality data:", round_to_n_sig_figs(percentage_camps_no_population_nationality_data, 3),"percent")

Total population of all camps exclusive of region/country admin data: 12413470
Number of camps: 264
Percentage of population in camps with multiple nationalities: 87.9 percent
Percentage of camps with either no population or nationality data: 12.1 percent


In [36]:
average_population_per_camp = total_population_all_camps / num_camps

largest_camp_population = max(location['total_population'] for location in result.values())
smallest_camp_population = min(location['total_population'] for location in result.values() if location['total_population'] > 0)

nationality_populations = {}
for location in result.values():
    for nationality, nationality_data in location['nationalities'].items():
        if nationality not in nationality_populations:
            nationality_populations[nationality] = 0
        nationality_populations[nationality] += nationality_data['population']

top_5_nationalities = heapq.nlargest(5, nationality_populations, key=nationality_populations.get)

print("Average population per geographies:", round_to_n_sig_figs(average_population_per_camp, 5))
print("Largest geographies population:", largest_camp_population)
print("Smallest geographies population:", smallest_camp_population)
print("Top 5 nationalities across all geographies by total population:", top_5_nationalities)

Average population per geographies: 47021.0
Median geographies population: 1
Largest geographies population: 793035
Smallest geographies population: 1
Top 5 nationalities across all geographies by total population: ['South Sudan', 'Myanmar', 'Syrian Arab Rep.', 'Afghanistan', 'Dem. Rep. of the Congo']


In [37]:
nationality_populations = {}
nationality_highest_camp = {}

for location in result.values():
    for nationality, nationality_data in location['nationalities'].items():
        if nationality not in nationality_populations:
            nationality_populations[nationality] = 0
        nationality_populations[nationality] += nationality_data['population']

        if nationality not in nationality_highest_camp or nationality_data['population'] > nationality_highest_camp[nationality]['population']:
            nationality_highest_camp[nationality] = {
                'camp_id': location['geomaster_id'],
                'camp_name': location['geomaster_name'],
                'population': nationality_data['population']
            }

nationality_stats = {
    'Nationality': list(nationality_populations.keys()),
    'Total Population': list(nationality_populations.values()),
    'Geography ID': [nationality_highest_camp[nationality]['camp_id'] for nationality in nationality_populations],
    'Geography Name': [nationality_highest_camp[nationality]['camp_name'] for nationality in nationality_populations]
}

nationality_df = pd.DataFrame(nationality_stats)
nationality_df = nationality_df.sort_values('Total Population', ascending=False)

nationality_df

Unnamed: 0,Nationality,Total Population,Geography ID,Geography Name
9,South Sudan,3403383,802,Northern Region
46,Myanmar,1810481,11086,Cox's Bazar
0,Syrian Arab Rep.,1748470,93,Bekaa
44,Afghanistan,1286966,1687,Khyber Pakhtunkhwa
14,Dem. Rep. of the Congo,1265821,224,Western Region
8,Somalia,1144434,800,Somali
31,Central African Rep.,336737,1775,Nord-Ubangi
26,Burundi,322388,843,Kigoma
5,Eritrea,313518,169,Addis Ababa
28,Rwanda,256913,966,Nord-Kivu


In [40]:
top_25_camps = heapq.nlargest(25, result.values(), key=lambda x: x['total_population'])

camp_nationality_stats = {
    'Camp ID': [camp['geomaster_id'] for camp in top_25_camps],
    'Geography Name': [camp['geomaster_name'] for camp in top_25_camps],
    'Total Population': [camp['total_population'] for camp in top_25_camps],
    'Primary Nationality Origin': [max(camp['nationalities'], key=lambda x: camp['nationalities'][x]['population']) for camp in top_25_camps]
}

top_camps_df = pd.DataFrame(camp_nationality_stats)
top_camps_df = top_camps_df.sort_values('Total Population', ascending=False)

top_camps_df

Unnamed: 0,Camp ID,Geography Name,Total Population,Primary Nationality Origin
0,802,Northern Region,793035,South Sudan
1,1687,Khyber Pakhtunkhwa,671075,Afghanistan
2,11086,Cox's Bazar,579460,Myanmar
3,224,Western Region,578772,Dem. Rep. of the Congo
4,11775,Ukhia,494602,Myanmar
5,11784,Palong Khali,482760,Myanmar
6,235,Gambela,376981,South Sudan
7,93,Bekaa,318713,Syrian Arab Rep.
8,1683,Balochistan,314866,Afghanistan
9,301,White Nile,281628,South Sudan
