# Data Cleaning, preprocessing and merging

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

## asylum_seekers.csv

In [41]:
# Load the raw asylum-seekers table from the data directory.
asylum_seekers_csv = "data/asylum_seekers.csv"
as_data = pd.read_csv(asylum_seekers_csv)

In [42]:
# Count columns that must be numeric. The labels match the DataFrame's column
# names exactly (including the 'Tota' spelling), so they must not be "corrected" here.
cols = [
    'Tota pending start-year',
    'of which UNHCR-assisted(start-year)',
    'Applied during year',
    'decisions_recognized',
    'decisions_other',
    'Rejected',
    'Otherwise closed',
    'Total decisions',
    'Total pending end-year',
    'of which UNHCR-assisted(end-year)',
]

# Convert column by column (apply's default axis=0): one vectorised to_numeric call
# per column instead of one Python-level call per row, which is what axis=1 did.
# errors='coerce' turns anything unparseable into NaN; the cast keeps every count
# column float64 regardless of whether that particular column contained a NaN.
as_data[cols] = as_data[cols].apply(pd.to_numeric, errors='coerce').astype('float64')

In [43]:
# Keep only records from 2001 onwards. Filtering with >= drops every earlier year,
# not just the single value 2000, which is what "below 2001" actually requires.
as_data = as_data[as_data['Year'] >= 2001]

# Sanity check: list the remaining years and how many rows each one has.
print(as_data['Year'].value_counts(dropna=False))

2015    11225
2016    10461
2014     9908
2013     9259
2012     8644
2011     8299
2010     7905
2009     7159
2008     7042
2007     6924
2005     6721
2006     6656
2004     6601
2003     6359
2002     5862
2001     5542
Name: Year, dtype: int64


In [44]:
# Replace every missing value in the frame (all columns) with 0.
as_data = as_data.fillna(0)

In [45]:
# Per-column count of missing values; every entry should be 0 after the fill.
as_data.isna().sum()

Year                                       0
Country / territory of asylum/residence    0
Origin                                     0
RSD procedure type / level                 0
Tota pending start-year                    0
of which UNHCR-assisted(start-year)        0
Applied during year                        0
decisions_recognized                       0
decisions_other                            0
Rejected                                   0
Otherwise closed                           0
Total decisions                            0
Total pending end-year                     0
of which UNHCR-assisted(end-year)          0
dtype: int64

In [46]:
# DataFrame.replace returns a new frame; without the assignment the substitution
# of the "*" placeholder was silently thrown away.
as_data = as_data.replace(to_replace="*", value="0")

# Show a bounded preview instead of rendering the whole table.
as_data.head(10)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
5153,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5154,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,0.0,7.0,1.0,0.0
5155,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0
5156,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0
5157,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0
5158,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,0.0,324.0,72.0,0.0
5159,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,42.0,170.0,307.0,307.0
5160,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5161,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,41.0,421.0,89.0,89.0
5162,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,577.0,577.0,376.0,40.0


In [47]:
# Renumber the rows from 0, discarding the index labels left over from filtering.
as_data = as_data.reset_index(
    drop=True,
)

In [50]:
# Persist the cleaned asylum-seekers table; the row index is not written.
cleaned_asylum_seekers_csv = "cleaned_data/cleaned_asylum_seekers.csv"
as_data.to_csv(cleaned_asylum_seekers_csv, index=False)

## demographics.csv

In [52]:
# Load the demographics table from the data directory.
demographics_csv = 'data/cleaned_demographics.csv'
demo = pd.read_csv(demographics_csv)

In [53]:
# Build the '5-17f' and '5-17m' columns by summing, row by row, three adjacent
# columns each. The source columns are selected by POSITION, so this cell relies
# on the exact column order of the loaded CSV.
# NOTE(review): presumably positions 5:8 hold the female 5-11 / 5-17 / 12-17 bands
# and -9:-6 the matching male bands — confirm against the file header.
# NOTE(review): if '5-17f' is not already a column, the first assignment appends it
# and changes which columns the negative slice below selects — keep this order.
demo['5-17f'] = demo.iloc[:, 5:8].sum(axis=1)
demo['5-17m'] = demo.iloc[:, -9:-6].sum(axis=1)

In [54]:
# Discard the finer-grained age-band columns, which are not needed any more.
unneeded_age_bands = ['5-11f', '12-17f', '5-11m', '12-17m']
demo = demo.drop(columns=unneeded_age_bands)

In [55]:
# Persist the aggregated demographics table; the row index is not written.
aggregated_demographics_csv = 'cleaned_data/cleaned_aggregated_columns_demographics.csv'
demo.to_csv(aggregated_demographics_csv, index=False)

## persons_of_concern.csv

In [56]:
# Path of the raw persons-of-concern table, followed by the load itself.
person_of_concern_data = "./data/persons_of_concern.csv"
poc_data = pd.read_csv(filepath_or_buffer=person_of_concern_data)

In [57]:
# Population columns that must be numeric.
# NOTE(review): 'Internally displaced persons (IDPs)' and 'Returned IDPs' are not
# in this list and are therefore not coerced here — confirm they load as numeric.
cols = [
    'Refugees (incl. refugee-like situations)',
    'Asylum-seekers (pending cases)',
    'Returned refugees',
    'Stateless persons',
    'Others of concern',
    'Total Population',
]

# Convert column by column (apply's default axis=0): one vectorised to_numeric call
# per column instead of one Python-level call per row, which is what axis=1 did.
# errors='coerce' turns anything unparseable into NaN; the cast keeps every listed
# column float64 regardless of whether that particular column contained a NaN.
poc_data[cols] = poc_data[cols].apply(pd.to_numeric, errors='coerce').astype('float64')

In [58]:
# Keep only the rows whose Year is later than 2000.
is_after_2000 = poc_data['Year'] > 2000
data_post_year_2001 = poc_data[is_after_2000]

In [59]:
# Replace every missing value in the frame (all columns) with 0.
data_post_year_2001 = data_post_year_2001.fillna(value=0)

In [60]:
# DataFrame.replace returns a new frame; without the assignment the substitution
# of the "*" placeholder was silently thrown away.
data_post_year_2001 = data_post_year_2001.replace(to_replace="*", value="0")

# Show a bounded preview instead of rendering the whole table.
data_post_year_2001.head(10)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,Total Population
26904,2001,Afghanistan,Afghanistan,0.0,0.0,0.0,1200000.0,0.0,0.0,0.0,1200000.0
26905,2001,Afghanistan,Iran (Islamic Rep. of),3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26906,2001,Afghanistan,Iraq,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26907,2001,Angola,Angola,0.0,0.0,0.0,202000.0,0.0,0.0,0.0,202000.0
26908,2001,Angola,Burundi,18.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0
26909,2001,Angola,Cameroon,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
26910,2001,Angola,Dem. Rep. of the Congo,11933.0,636.0,1.0,0.0,0.0,0.0,0.0,12570.0
26911,2001,Angola,Congo,51.0,227.0,0.0,0.0,0.0,0.0,0.0,278.0
26912,2001,Angola,Comoros,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0
26913,2001,Angola,Cuba,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [61]:
# Renumber the rows from 0 on the frame that is actually saved afterwards.
# Previously the re-indexed copy was bound only to `df`, so the working frame
# kept its old index labels.
data_post_year_2001 = data_post_year_2001.reset_index(drop=True)

# Keep the old name bound to the same re-indexed frame in case anything still reads it.
df = data_post_year_2001

In [62]:
# Persist the cleaned persons-of-concern table; the row index is not written.
cleaned_people_of_concern_csv = 'cleaned_data/cleaned_people_of_concern.csv'
data_post_year_2001.to_csv(cleaned_people_of_concern_csv, index=False)

## resettlement.csv

In [63]:
# Path of the raw resettlement table, followed by the load itself.
resettlement_data = "./data/resettlement.csv"
r_data = pd.read_csv(filepath_or_buffer=resettlement_data)

In [64]:
# Keep only the rows from 2001 onwards.
from_2001_onwards = r_data['Year'] >= 2001
r_data = r_data[from_2001_onwards]

In [65]:
# Swap every cell that is exactly "*" for the string "0".
r_data = r_data.replace("*", "0")

In [66]:
# Persist the cleaned resettlement table; the row index is not written.
cleaned_resettlement_csv = 'cleaned_data/cleaned_resettlement.csv'
r_data.to_csv(cleaned_resettlement_csv, index=False)

## Merging asylum_seekers.csv, persons_of_concern.csv and resettlement.csv

In [24]:
# Reload the three cleaned tables from disk, in the order they are merged below:
# persons of concern, asylum seekers, resettlement.
cleaned_paths = [
    "cleaned_data/cleaned_people_of_concern.csv",
    "cleaned_data/cleaned_asylum_seekers.csv",
    "cleaned_data/cleaned_resettlement.csv",
]
file1, file2, file3 = [pd.read_csv(path) for path in cleaned_paths]

In [40]:
# Right join: every asylum-seekers row (file2) is kept, and persons-of-concern
# columns (file1) are attached where year, country and origin all match.
join_keys = ['Year', 'Country / territory of asylum/residence', 'Origin']
merge_results = file1.merge(file2, how='right', on=join_keys)

In [41]:
# Preview the first rows only rather than rendering the whole merged table.
merge_results.head(10)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,...,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
0,2001,Afghanistan,Iran (Islamic Rep. of),3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,110.0,21.0,0.0,68.0,21.0,110.0,0.0,0.0
1,2001,Afghanistan,Iraq,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,15.0,3.0,0.0,7.0,5.0,15.0,0.0,0.0
2,2001,Angola,Burundi,18.0,3.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
3,2001,Angola,Cameroon,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,2001,Angola,Dem. Rep. of the Congo,11933.0,636.0,1.0,0.0,0.0,0.0,0.0,...,595.0,595.0,129.0,24.0,0.0,58.0,6.0,88.0,636.0,596.0
5,2001,Angola,Congo,51.0,227.0,0.0,0.0,0.0,0.0,0.0,...,235.0,235.0,5.0,1.0,0.0,12.0,0.0,13.0,227.0,227.0
6,2001,Angola,Comoros,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
7,2001,Angola,Cuba,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0
8,2001,Angola,Guinea,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
9,2001,Angola,Guinea-Bissau,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0


In [50]:
# Inner join: keep only the combinations of year, country and origin that are
# present both in the previous merge result and in the resettlement table (file3).
resettlement_keys = ['Year', 'Country / territory of asylum/residence', 'Origin']
final_results = merge_results.merge(file3, how='inner', on=resettlement_keys)

In [49]:
# Preview the first rows only rather than rendering the whole merged table.
final_results.head(10)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,...,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Value
0,2001,Australia,Afghanistan,6262.0,452.0,0.0,0.0,0.0,0.0,0.0,...,0.0,363.0,247.0,0.0,93.0,5.0,345.0,54.0,0.0,334
1,2001,Australia,Afghanistan,6262.0,452.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2161.0,1666.0,0.0,429.0,6.0,2101.0,398.0,0.0,334
2,2001,Australia,United Arab Emirates,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7
3,2001,Australia,Burundi,17.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,2.0,0.0,2.0,0.0,4.0,1.0,0.0,4
4,2001,Australia,Burundi,17.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,0.0,7.0,0.0,0.0,4
5,2001,Australia,Bosnia and Herzegovina,4882.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,3.0,0.0,0.0,2.0,5.0,5.0,0.0,373
6,2001,Australia,Bosnia and Herzegovina,4882.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,2.0,0.0,7.0,0.0,9.0,3.0,0.0,373
7,2001,Australia,China,323.0,1021.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1176.0,11.0,0.0,1001.0,56.0,1068.0,334.0,0.0,18
8,2001,Australia,China,323.0,1021.0,0.0,0.0,0.0,0.0,0.0,...,0.0,804.0,37.0,0.0,786.0,71.0,894.0,687.0,0.0,18
9,2001,Australia,Congo,22.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,15


In [75]:
# Persist the fully merged table; the row index is not written.
merged_data_csv = 'cleaned_data/merged_data.csv'
final_results.to_csv(merged_data_csv, index=False)