# Data analysis and visualization

This notebook contains the code for analyzing and visualizing forced-displacement data from the UNHCR, UNRWA and IDMC datasets, together with the UN population estimates.

***

### Packages and settings

In [1]:
import pandas as pd
# Render floats with two fixed decimals so large counts are not shown in scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

*********

## Data sources

### 1. UNHCR refugees and asylum seekers estimates

In [2]:
# Load the UNHCR population table from the repository's data folder
UNHCR = pd.read_csv(filepath_or_buffer="../data/unhcr-refugees/population.csv")

In [3]:
# Show the last five rows of the UNHCR table
UNHCR.tail(n=5)

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate,Asylum-seekers,Returned refugees,IDPs of concern to UNHCR,Returned IDPss,Stateless persons,Others of concern,Other people in need of international protection,Host Community
68,2019,-,-,-,-,20414669,4148141,317181,43503362,5343793,4217774,3857179,3582202,2304506
69,2020,-,-,-,-,20661846,4184926,250951,48557439,3184118,4179331,3939756,3862102,4369021
70,2021,-,-,-,-,21327285,4616135,429234,51322623,5265622,4338192,4223095,4406432,6731133
71,2022,-,-,-,-,29429078,5442319,1356261,57321197,8324166,4428314,6008804,5217456,23957770
72,2023,-,-,-,-,31637408,6858499,1052074,63251367,5092064,4358188,5945550,5755363,26095474


### 2. UNRWA estimates

In [4]:
# Load the UNRWA table; the first 14 lines of the file are skipped before the header is read
# (presumably a metadata preamble in the raw export — TODO confirm against the CSV)
UNRWA = pd.read_csv(filepath_or_buffer="../data/unrwa-refugees/unrwa.csv", skiprows=14)

In [5]:
# Show the last five rows of the UNRWA table
UNRWA.tail(n=5)

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Total
67,2019,-,-,-,-,5629829
68,2020,-,-,-,-,5703521
69,2021,-,-,-,-,5792907
70,2022,-,-,-,-,5887353
71,2023,-,-,-,-,5936247


### 3. IDMC internally displaced people estimates

In [6]:
# Load the internally-displaced-people table (IDMC data; the variable keeps the name IMDC
# because later cells refer to it)
IMDC = pd.read_csv(filepath_or_buffer="../data/internally-displaced-idmc/idmc.csv")

In [7]:
# Show the last five rows of the internally-displaced-people table
IMDC.tail(n=5)

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Total
30,2019,-,-,-,-,45667305
31,2020,-,-,-,-,48027950
32,2021,-,-,-,-,53246765
33,2022,-,-,-,-,62500170
34,2023,-,-,-,-,68279000


### 4. UN population estimates

In [8]:
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk; chunked inference is what produces mixed-type columns (and the
# accompanying DtypeWarning) on large CSVs such as this one.
POP = pd.read_csv("../data/un-population-estimates.csv", low_memory=False)

  POP = pd.read_csv("../data/un-population-estimates.csv")


In [9]:
# Keep only the columns used later. The explicit .copy() makes POP an independent frame,
# so assigning to one of its columns afterwards does not raise SettingWithCopyWarning.
POP = POP[['Time', 'ISO3_code', 'Location', 'TPopulation1July']].copy()

In [10]:
# The population column is expressed in thousands; scale it to an absolute headcount
POP['TPopulation1July'] = POP['TPopulation1July'].mul(1000)

In [11]:
# Show the last five rows of the trimmed population table
POP.tail(n=5)

Unnamed: 0,Time,ISO3_code,Location,TPopulation1July
43467,2097,WLF,Wallis and Futuna Islands,10036.0
43468,2098,WLF,Wallis and Futuna Islands,9989.0
43469,2099,WLF,Wallis and Futuna Islands,9940.0
43470,2100,WLF,Wallis and Futuna Islands,9887.0
43471,2101,WLF,Wallis and Futuna Islands,


---

## Data wrangling

### 1. Share of world population that is displaced

The count of displaced population includes:
- Refugees and people in situation akin to refugees, under the mandate of the UNHCR
- Refugees in the Palestinian Territories, under the mandate of the UNRWA
- Asylum seekers and people in akin situations, according to the UNHCR
- Internally displaced people, excluding those due to disasters, by the IDMC estimates.

The displaced population at the end of each year is then divided by the United Nations population estimate for the 1st of July of the same year.

One important caveat is that some of these numbers differ from those in the 2023 report, despite supposedly coming from the same source. Some of the differences can reasonably be attributed to rounding to the ten-thousands place. Others cannot.

Namely, the data differs in the following entries in 2023:

- **Refugees under UNHCR's mandate**: 31,639,000 in the report x 31,637,408 in the data. (1592 less in the data)
- **Refugees under UNRWA's mandate**: 5,969,000 in the report x 5,936,247 in the data. (32753 less in the data)
- **Asylum seekers**: 6,860,000 in the report x 6,858,499 in the data.  (1501 less in the data)
- **Other people in need of protection**: 5,755,000 in the report x 5,755,363 in the data. (363 more in the data)
- **Internally displaced (IDMC)**: 68,279,000 in the report x 68,279,000 in the data. (Same values)

For the year 2023, the report also made an adjustment: it estimated that 70% of the 1.7 million people internally displaced in Gaza were already counted under the "refugees under the UNRWA's mandate" label. Thus, this share (1.7 million * 0.7) was removed from the total displaced people in the year, avoiding double counting.

When this is accounted for, the numbers we arrive at are still not the same as the report. We arrived at 117,276,517. The report says it's 117,305,000. The total difference amounts to 28,483 people (0.02% of the total reported).


In [110]:
def _yearly_columns(frame, years, columns, rename=None, year_col='Year'):
    """Return the rows of `frame` whose `year_col` is in `years`, reduced to the year
    column (renamed to 'Year') plus `columns`, with `rename` applied to the latter."""
    selected = frame.loc[frame[year_col].isin(years), [year_col, *columns]]
    return selected.rename(columns={year_col: 'Year', **(rename or {})})


def share_of_world_pop(unhcr, unrwa, imdc, pop, years=range(2014, 2024)):
    """Build the yearly table of displaced people and their share of world population.

    Parameters
    ----------
    unhcr : DataFrame
        Needs the columns 'Year', "Refugees under UNHCR's mandate", 'Asylum-seekers'
        and 'Other people in need of international protection'.
    unrwa, imdc : DataFrame
        Each needs the columns 'Year' and 'Total'.
    pop : DataFrame
        Needs the columns 'Time', 'Location' and 'TPopulation1July'; only the rows
        where Location == 'World' are used.
    years : iterable of int, optional
        Years to keep. Defaults to 2014-2023, the last decade used for comparisons.

    Returns
    -------
    DataFrame
        One row per year that is present in all four inputs, with the selected input
        columns plus 'Population', 'Displaced population' and 'Displaced per thousand'.
        The inputs are not modified.

    Side effects
    ------------
    Displays the table and writes it to
    "../output/Line chart – share of population forcibly displaced.csv".
    """
    # Valid entries from the UNHCR dataset
    unhcr_ = _yearly_columns(unhcr, years, [
        "Refugees under UNHCR's mandate",
        'Asylum-seekers',
        'Other people in need of international protection',
    ])

    # Refugees under the UNRWA authority (Palestinian territories)
    unrwa_ = _yearly_columns(unrwa, years, ['Total'],
                             rename={'Total': "Refugees under UNRWA's mandate"})

    # Internally displaced people according to the IMDC data
    imdc_ = _yearly_columns(imdc, years, ['Total'],
                            rename={'Total': "IDP's according to IMDC"})

    # World population estimate for each year
    pop_ = _yearly_columns(pop[pop.Location == 'World'], years, ['TPopulation1July'],
                           rename={'TPopulation1July': 'Population'}, year_col='Time')

    # Merge everything by year. The key is spelled out so that an unexpected shared
    # column can never silently become part of the join condition.
    data = (unhcr_.merge(unrwa_, on='Year')
                  .merge(imdc_, on='Year')
                  .merge(pop_, on='Year'))
    # "-" cells are treated as zero; everything is made numeric
    data = data.replace("-", 0).astype(float)

    # Total population displaced
    data['Displaced population'] = (
        data["Refugees under UNHCR's mandate"]
        + data['Asylum-seekers']
        + data['Other people in need of international protection']
        + data["Refugees under UNRWA's mandate"]
        + data["IDP's according to IMDC"]
    )

    # In 2023, we have to make an adjustment to account for a methodology change that was described
    # here: https://www.unhcr.org/refugee-statistics/insights/explainers/forcibly-displaced-pocs.html
    # "At the end of 2023, UNRWA estimates that 70 per cent of the 1.7 million IDPs in the Gaza Strip
    # at end-2023 were Palestine refugees under its mandate. These internally displaced refugees under
    # UNRWA's mandate are only counted once in the global forcibly displaced total."
    # So, for 2023, we remove 1.7 million * 0.7 from the total displaced population.
    gaza_idps = 1700000
    share_already_under_unrwa = 0.7
    data.loc[data['Year'] == 2023, 'Displaced population'] -= gaza_idps * share_already_under_unrwa

    # People displaced per thousand inhabitants
    data['Displaced per thousand'] = (data['Displaced population'] / data['Population']) * 1000

    # Year was turned into a float by the blanket cast above; make it an integer again
    data['Year'] = data['Year'].astype(int)

    # Display, save and return
    display(data)
    data.to_csv("../output/Line chart – share of population forcibly displaced.csv", index=False)
    return data
    
    

In [111]:
# Build the yearly displacement table from the four loaded sources
data = share_of_world_pop(unhcr=UNHCR, unrwa=UNRWA, imdc=IMDC, pop=POP)

Unnamed: 0,Year,Refugees under UNHCR's mandate,Asylum-seekers,Other people in need of international protection,Refugees under UNRWA's mandate,IDP's according to IMDC,Population,Displaced population,Displaced per thousand
0,2014,14384289.0,1794704.0,0.0,5149742.0,37877320.0,7339013419.0,59206055.0,8.07
1,2015,16110276.0,3223460.0,0.0,5241257.0,40451900.0,7426597537.0,65026893.0,8.76
2,2016,17184286.0,2729521.0,0.0,5340443.0,40220850.0,7513474238.0,65475100.0,8.71
3,2017,19940566.0,3089507.0,0.0,5442947.0,39934042.0,7599822404.0,68407062.0,9.0
4,2018,20359553.0,3501629.0,2592947.0,5545538.0,41312940.0,7683789828.0,73312607.0,9.54
5,2019,20414669.0,4148141.0,3582202.0,5629829.0,45667305.0,7764951032.0,79442146.0,10.23
6,2020,20661846.0,4184926.0,3862102.0,5703521.0,48027950.0,7840952880.0,82440345.0,10.51
7,2021,21327285.0,4616135.0,4406432.0,5792907.0,53246765.0,7909295152.0,89389524.0,11.3
8,2022,29429078.0,5442319.0,5217456.0,5887353.0,62500170.0,7975105156.0,108476376.0,13.6
9,2023,31637408.0,6858499.0,5755363.0,5936247.0,68279000.0,8045311448.0,117276517.0,14.58


In [109]:
# Inspect the Year column (values and dtype) of the resulting table
data['Year']

0   2014.00
1   2015.00
2   2016.00
3   2017.00
4   2018.00
5   2019.00
6   2020.00
7   2021.00
8   2022.00
9   2023.00
Name: Year, dtype: float64

### 2. Of all the displaced people, how many are internal, how many are in neighboring countries and how many are in other countries?

Here, we will use the 2023 data only. 

We will also need to account for the "double counting" of Palestinian refugees. To do so, we will remove 1.19 million (1.7 million * 0.7) from the refugee total, as explained above, so they are not counted twice.

In [54]:
# Pull the single 2023 row once. Calling int() on a one-element Series is deprecated in
# pandas (and slated to become a TypeError), so every scalar below is read from this row.
row_2023 = data.loc[data.Year==2023].iloc[0]

# Total displaced
total_displaced = int(row_2023['Displaced population'])

# Total refugees
unhcr_refugees = int(row_2023['Refugees under UNHCR\'s mandate'])
# UNRWA figure net of the Gaza IDPs (70% of 1.7 million) that are already counted as IDPs
unrwa_refugees = int(row_2023['Refugees under UNRWA\'s mandate'] - (1700000 * 0.7))
other_people = int(row_2023['Other people in need of international protection'])
asylum_seekers = int(row_2023['Asylum-seekers'])

# Total internally displaced population
internally_displaced = int(row_2023['IDP\'s according to IMDC'])

In [126]:
# Refugee total for 2023:
# UNHCR's mandate + UNRWA's mandate (already net of the 1.19 million double counted)
# + other people in need of international protection
total_refugees = sum((unhcr_refugees, unrwa_refugees, other_people))

In [127]:
# Sanity check: refugees, IDPs and asylum seekers must add up exactly to the total displaced
assert total_displaced == total_refugees + internally_displaced + asylum_seekers

Now we can compute the share of displaced people in each category.

In [131]:
# Container for the percentage of displaced people in each category
shares = dict()

In [139]:
# Percentage of the total displaced population in each category, rounded to two decimals
shares.update({
    'Internally displaced': round(internally_displaced / total_displaced * 100, 2),
    'Refugees and people in need of international protection': round(total_refugees / total_displaced * 100, 2),
    'Asylum seekers': round(asylum_seekers / total_displaced * 100, 2),
})

In [140]:
# The three categories must add up exactly to the total displaced population...
assert internally_displaced + total_refugees + asylum_seekers == total_displaced
# ...and the percentages derived from them must add up to 100. Each share is rounded to
# two decimals, so up to 0.005 of rounding error per entry is tolerated.
assert abs(sum(shares.values()) - 100) < 0.02, shares

In [141]:
# Display the percentage computed for each category
shares

{'Internally displaced': 58.22,
 'Refugees and people in need of international protection': 35.93,
 'Asylum seekers': 5.85}

In [134]:
# Two-column table (category label, percentage) for the pie chart.
# The columns are labelled 'index' and 0, so the CSV header row is "index,0".
to_export = pd.DataFrame(list(shares.items()), columns=['index', 0])
to_export.to_csv("../output/pie-chart-shares-each-type.csv", index=False)