In [1]:
# !pip install geopy

In [2]:
import json
import pandas as pd
from collections import defaultdict
import heapq
import math

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="myGeocoder")

In [3]:
file_path = "extracted_base.json"

with open(file_path, 'r') as file:
    data = json.load(file)

In [24]:
data[0:5]

[[{'geomaster_name': 'Domiz 1 Camp',
   'geomaster_id': '8',
   'admin_level': 'Settlement',
   'longitude': '42.89142377999997',
   'latitude': '36.78232231',
   'pop_accommodation_code': 'C',
   'pop_accommodation_name': 'Refugee Camps/Centers',
   'pop_data_type_code': 'REG',
   'pop_data_type_name': 'Registered',
   'pop_type_code': 'REF',
   'pop_type_name': 'Refugees',
   'pop_frequency_name': 'Daily',
   'pop_frequency_code': 'D',
   'pop_frequency_cumulative': 'cumulative',
   'pop_frequency_cumulative_code': 'C',
   'pop_origin_name': 'Syrian Arab Rep.',
   'pop_origin_code': 'SYR',
   'date': '2022-12-31',
   'month': '12',
   'year': '2022',
   'population_groups': 'Registered refugees from Syrian Arab Republic (refugee camps/centers)',
   'population_group_id': '4773',
   'source': 'UNHCR',
   'individuals': '29684',
   'households': '6699',
   'male_04': '2283',
   'male_511': '3455',
   'male_1217': '1824',
   'male_1859': '6549',
   'male_60': '600',
   'male_total': '14

In [4]:
def round_to_n_sig_figs(x, n):
    return round(x, n - int(math.floor(math.log10(abs(x)))) - 1)

In [5]:
def get_country(latitude, longitude):
    location = geolocator.reverse(f"{latitude}, {longitude}", language='en')
    address = location.raw['address']
    return address.get('country', 'Unknown')

In [6]:
result = {}
for inner_list in data:
    for record in inner_list:
        geomaster_name = record['geomaster_name']

        if geomaster_name not in result:
            country = get_country(record['latitude'], record['longitude'])
            result[geomaster_name] = {
                'geomaster_id': record['geomaster_id'],
                'geomaster_name': record['geomaster_name'],
                'admin_level': record['admin_level'],
                'longitude': record['longitude'],
                'latitude': record['latitude'],
                'country': country,
                'total_population': 0,
                'nationalities': {}
            }

        nationality = record['pop_origin_name']
        individuals = int(record['individuals'])

        if nationality not in result[geomaster_name]['nationalities']:
            result[geomaster_name]['nationalities'][nationality] = {
                'population': 0,
                'records': []
            }

        result[geomaster_name]['total_population'] += individuals
        result[geomaster_name]['nationalities'][nationality]['population'] += individuals
        result[geomaster_name]['nationalities'][nationality]['records'].append(record)

for location in result.values():
    location['nationality_percentages'] = {}
    total_population = location['total_population']
    for nationality, nationality_data in location['nationalities'].items():
        if total_population > 0:
            share = nationality_data['population'] / total_population
            percentage = round_to_n_sig_figs(share * 100, 3)
        else:
            percentage = 0
        location['nationality_percentages'][nationality] = percentage


geojson = {
    'type': 'FeatureCollection',
    'features': [
        {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [float(location['longitude']), float(location['latitude'])]
            },
            'properties': {
                'geomaster_id': location['geomaster_id'],
                'geomaster_name': location['geomaster_name'],
                'admin_level': location['admin_level'],
                'country': location['country'], 
                'total_population': location['total_population'],
                'nationality_percentages': location['nationality_percentages'],
                'nationalities': location['nationalities']
            }
        }
        for location in result.values()
    ]
}

In [7]:
with open('reformatted_camp_data.geojson', 'w') as outfile:
    json.dump(geojson, outfile)

In [21]:
total_population_all_camps = 0
num_camps = 0
num_camps_multiple_nationalities = 0
num_camps_no_population_nationality_data = 0

for location in result.values():
    num_camps += 1
    total_population_all_camps += location['total_population']
    
    num_nationalities = len(location['nationalities'])
    
    if num_nationalities == 0 or location['total_population'] in [0, None]:
        num_camps_no_population_nationality_data += 1
    else:
        num_camps_multiple_nationalities += 1

percentage_camps_multiple_nationalities = (num_camps_multiple_nationalities / num_camps) * 100
percentage_camps_no_population_nationality_data = (num_camps_no_population_nationality_data / num_camps) * 100

print("Total population of all camps:", total_population_all_camps)
print("Number of camps:", num_camps)
print("Percentage of population in camps with multiple nationalities:", round_to_n_sig_figs(percentage_camps_multiple_nationalities, 3),"percent")
print("Percentage of camps with either no population or nationality data:", round_to_n_sig_figs(percentage_camps_no_population_nationality_data, 3),"percent")

Total population of all camps: 8664477
Number of camps: 211
Percentage of population in camps with multiple nationalities: 72.5 percent
Percentage of camps with no population or nationality data: 27.5 percent


In [22]:
average_population_per_camp = total_population_all_camps / num_camps

largest_camp_population = max(location['total_population'] for location in result.values())
smallest_camp_population = min(location['total_population'] for location in result.values() if location['total_population'] > 0)

nationality_populations = {}
for location in result.values():
    for nationality, nationality_data in location['nationalities'].items():
        if nationality not in nationality_populations:
            nationality_populations[nationality] = 0
        nationality_populations[nationality] += nationality_data['population']

top_5_nationalities = heapq.nlargest(5, nationality_populations, key=nationality_populations.get)

print("Average population per camp:", average_population_per_camp)
print("Largest camp population:", largest_camp_population)
print("Smallest camp population:", smallest_camp_population)
print("Top 5 nationalities across all camps by total population:", top_5_nationalities)

Average population per camp: 41063.87203791469
Largest camp population: 4183359
Smallest camp population: 1
Top 5 nationalities across all camps by total population: ['Various', 'Syrian Arab Rep.', 'Myanmar', 'Somalia', 'Burundi']


In [10]:
nationality_populations = {}
nationality_highest_camp = {}

for location in result.values():
    for nationality, nationality_data in location['nationalities'].items():
        if nationality not in nationality_populations:
            nationality_populations[nationality] = 0
        nationality_populations[nationality] += nationality_data['population']

        if nationality not in nationality_highest_camp or nationality_data['population'] > nationality_highest_camp[nationality]['population']:
            nationality_highest_camp[nationality] = {
                'camp_id': location['geomaster_id'],
                'camp_name': location['geomaster_name'],
                'country': location['country'],
                'population': nationality_data['population']
            }

nationality_stats = {
    'Nationality': list(nationality_populations.keys()),
    'Total Population': list(nationality_populations.values()),
    'Location': [nationality_highest_camp[nationality]['country'] for nationality in nationality_populations],
    'Highest Camp ID': [nationality_highest_camp[nationality]['camp_id'] for nationality in nationality_populations],
    'Highest Camp Name': [nationality_highest_camp[nationality]['camp_name'] for nationality in nationality_populations]
}

nationality_df = pd.DataFrame(nationality_stats)
nationality_df = nationality_df.sort_values('Total Population', ascending=False)

nationality_df

Unnamed: 0,Nationality,Total Population,Location,Highest Camp ID,Highest Camp Name
22,Various,4185060,Uganda,11900,Other
0,Syrian Arab Rep.,1723834,Lebanon,99,Bekaa
26,Myanmar,716238,Bangladesh,11792,Kutupalong Expansions
1,Somalia,554379,Kenya,181,Hagadera Ref. Camp
8,Burundi,369792,Tanzania,852,Kibondo
6,Dem. Rep. of the Congo,333139,Tanzania,850,Kasulu
7,South Sudan,288149,Kenya,11937,Turkana West
24,Mali,156121,Mauritania,1369,Mbéra Refugee Camp
25,Nigeria,137684,Niger,1390,Diffa Region
27,Colombia,60578,Ecuador,12024,Pichincha


In [23]:
top_25_camps = heapq.nlargest(25, result.values(), key=lambda x: x['total_population'])

camp_nationality_stats = {
    'Camp ID': [camp['geomaster_id'] for camp in top_25_camps],
    'Camp Name': [camp['geomaster_name'] for camp in top_25_camps],
    'Camp Location': [camp['country'] for camp in top_25_camps],
    'Total Population': [camp['total_population'] for camp in top_25_camps],
    'Highest Nationality': [max(camp['nationalities'], key=lambda x: camp['nationalities'][x]['population']) for camp in top_25_camps]
}

top_camps_df = pd.DataFrame(camp_nationality_stats)
top_camps_df = top_camps_df.sort_values('Total Population', ascending=False)

top_camps_df

Unnamed: 0,Camp ID,Camp Name,Camp Location,Total Population,Highest Nationality
0,11900,Other,Uganda,4183359,Various
1,99,Bekaa,Lebanon,318713,Syrian Arab Rep.
2,11792,Kutupalong Expansions,Bangladesh,253504,Myanmar
3,11937,Turkana West,Kenya,244286,South Sudan
4,81,North Lebanon,Lebanon,226508,Syrian Arab Rep.
5,191,Kakuma Refugee Camp,Kenya,196584,South Sudan
6,48,Amman,Jordan,196539,Syrian Arab Rep.
7,84,Beirut,Lebanon,180120,Syrian Arab Rep.
8,1390,Diffa Region,Niger,140203,Nigeria
9,850,Kasulu,Tanzania,129703,Dem. Rep. of the Congo
