# Data Cleaning, preprocessing and merging

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

## asylum_seekers.csv

In [2]:
as_data = pd.read_csv("data/asylum_seekers.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
cols = ['Tota pending start-year', 'of which UNHCR-assisted(start-year)', 'Applied during year', 
        'decisions_recognized', 'decisions_other', 'Rejected', 'Otherwise closed', 'Total decisions', 
        'Total pending end-year', 'of which UNHCR-assisted(end-year)']

as_data[cols] = as_data[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [4]:
as_data = as_data[as_data.Year != 2000] # removing all the rows with year below 2001
print(as_data['Year'].value_counts(dropna=False))

2015    11225
2016    10461
2014     9908
2013     9259
2012     8644
2011     8299
2010     7905
2009     7159
2008     7042
2007     6924
2005     6721
2006     6656
2004     6601
2003     6359
2002     5862
2001     5542
Name: Year, dtype: int64


In [5]:
as_data = as_data.fillna( value = 0)

In [6]:
as_data.isnull().sum()

Year                                       0
Country / territory of asylum/residence    0
Origin                                     0
RSD procedure type / level                 0
Tota pending start-year                    0
of which UNHCR-assisted(start-year)        0
Applied during year                        0
decisions_recognized                       0
decisions_other                            0
Rejected                                   0
Otherwise closed                           0
Total decisions                            0
Total pending end-year                     0
of which UNHCR-assisted(end-year)          0
dtype: int64

In [7]:
as_data.replace(to_replace ="*", value ="0")

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
5153,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5154,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,0.0,7.0,1.0,0.0
5155,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0
5156,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0
5157,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129715,2016,United States of America,Zimbabwe,G / IN,232.0,0.0,229.0,16.0,0.0,0.0,10.0,28.0,435.0,0.0
129716,2016,United States of America,Zimbabwe,G / EO,142.0,0.0,12.0,0.0,0.0,0.0,23.0,31.0,138.0,0.0
129717,2016,South Africa,Zimbabwe,G / AR,94.0,9.0,0.0,0.0,0.0,0.0,94.0,94.0,0.0,0.0
129718,2016,South Africa,Zimbabwe,G / FI,41238.0,4124.0,7964.0,73.0,0.0,7869.0,0.0,7942.0,41260.0,0.0


In [8]:
as_data = as_data.reset_index(drop=True)

In [9]:
as_data.to_csv("cleaned_data/cleaned_asylum_seekers.csv", index=False)

## demographic.csv

In [10]:
demo = pd.read_csv('data/cleaned_demographics.csv')

In [11]:
# Sum up the columns 
demo['5-17f'] = demo.iloc[:, 5:8].sum(axis=1)
demo['5-17m'] = demo.iloc[:, -9:-6].sum(axis=1)

In [12]:
# drop the columns that are not needed
demo = demo.drop(['5-11f', '12-17f', '5-11m', '12-17m'], axis=1)

In [13]:
demo.to_csv('cleaned_data/cleaned_aggregated_columns_demographics.csv', index=False)

## persons_of_concern.csv

In [14]:
person_of_concern_data = "./data/persons_of_concern.csv"
poc_data = pd.read_csv(person_of_concern_data)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
cols = ['Refugees (incl. refugee-like situations)', 'Asylum-seekers (pending cases)', 
        'Returned refugees', 'Stateless persons', 'Others of concern', 'Total Population']

poc_data[cols] = poc_data[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [16]:
data_post_year_2001 = poc_data[poc_data['Year'] > 2000]

In [17]:
data_post_year_2001 = data_post_year_2001.fillna(0)

In [18]:
data_post_year_2001.replace(to_replace ="*", value ="0")

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,Total Population
26904,2001,Afghanistan,Afghanistan,0.0,0.0,0.0,1200000.0,0.0,0.0,0.0,1200000.0
26905,2001,Afghanistan,Iran (Islamic Rep. of),3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26906,2001,Afghanistan,Iraq,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26907,2001,Angola,Angola,0.0,0.0,0.0,202000.0,0.0,0.0,0.0,202000.0
26908,2001,Angola,Burundi,18.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...
117316,2016,Zimbabwe,Somalia,24.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0
117317,2016,Zimbabwe,Syrian Arab Rep.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117318,2016,Zimbabwe,Uganda,7.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
117319,2016,Zimbabwe,South Africa,0.0,0.0,0.0,0.0,0.0,0.0,7.0,8.0


In [19]:
df = data_post_year_2001.reset_index(drop=True)

In [20]:
data_post_year_2001.to_csv(r'cleaned_data/cleaned_people_of_concern.csv', index=False)

## resettlement.csv

In [21]:
resettlement_data = "./data/resettlement.csv"
r_data = pd.read_csv(resettlement_data)

In [22]:
r_data = r_data[r_data['Year'] >= 2001]

In [23]:
r_data = r_data.replace(to_replace ="*", value ="0")

In [24]:
r_data.to_csv('cleaned_data/cleaned_resettlement.csv', index=False)

## Adding columns to csv files

In [5]:
file1 = pd.read_csv("cleaned_data/cleaned_people_of_concern.csv")
file2 = pd.read_csv("cleaned_data/cleaned_asylum_seekers.csv")
file3 = pd.read_csv("cleaned_data/cleaned_resettlement.csv")

In [6]:
file2['Successful'] = file2['decisions_recognized']+file2['decisions_other']
file2['Unsuccessful'] = file2['Rejected']+file2['Otherwise closed']

file2 = file2.rename(columns={"Otherwise closed": "Otherwise_closed"})
file2['Otherwise_closed'] = file2.apply(lambda x: 1.0 if x.Otherwise_closed <= 0.0 else x.Otherwise_closed, axis=1)

In [7]:
file2

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful
0,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0
1,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,1.0,7.0,1.0,0.0,5.0,2.0
2,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0,1573.0,436.0
3,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0,129.0,118.0
4,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0,212.0,62.0
5,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,1.0,324.0,72.0,0.0,223.0,101.0
6,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,42.0,170.0,307.0,307.0,107.0,63.0
7,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
8,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,41.0,421.0,89.0,89.0,190.0,231.0
9,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,577.0,577.0,376.0,40.0,0.0,577.0


### Group by Origin and Target

In [8]:
file2_combine = file2.drop(['Year'], axis=1)

file2_combine = file2_combine.groupby(['Country / territory of asylum/residence', 'Origin', 'RSD procedure type / level'])

In [9]:
file2_combine = file2_combine.sum()

# resetting index 
file2_combine.reset_index(inplace = True) 

In [10]:
file2_combine

Unnamed: 0,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful
0,Afghanistan,China,U / FI,10.0,2.0,7.0,0.0,0.0,0.0,12.0,11.0,6.0,2.0,0.0,11.0
1,Afghanistan,Egypt,U / FI,0.0,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,3.0
2,Afghanistan,Eritrea,U / FI,1.0,1.0,2.0,2.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0
3,Afghanistan,Indonesia,U / FI,0.0,0.0,5.0,0.0,0.0,5.0,1.0,5.0,0.0,0.0,0.0,5.0
4,Afghanistan,Iran (Islamic Rep. of),U / AR,6.0,6.0,7.0,0.0,0.0,0.0,6.0,3.0,15.0,15.0,0.0,3.0
5,Afghanistan,Iran (Islamic Rep. of),U / FA,46.0,46.0,14.0,12.0,0.0,8.0,4.0,23.0,37.0,37.0,12.0,11.0
6,Afghanistan,Iran (Islamic Rep. of),U / FI,172.0,82.0,314.0,54.0,0.0,123.0,124.0,301.0,185.0,99.0,54.0,244.0
7,Afghanistan,Iraq,U / FA,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,3.0,3.0,0.0,0.0
8,Afghanistan,Iraq,U / FI,45.0,5.0,78.0,11.0,0.0,35.0,49.0,88.0,35.0,3.0,11.0,77.0
9,Afghanistan,Kazakhstan,U / FI,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,0.0


In [11]:
file2_combine[file2_combine['Country / territory of asylum/residence']=='Afghanistan']

Unnamed: 0,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful
0,Afghanistan,China,U / FI,10.0,2.0,7.0,0.0,0.0,0.0,12.0,11.0,6.0,2.0,0.0,11.0
1,Afghanistan,Egypt,U / FI,0.0,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,3.0
2,Afghanistan,Eritrea,U / FI,1.0,1.0,2.0,2.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0
3,Afghanistan,Indonesia,U / FI,0.0,0.0,5.0,0.0,0.0,5.0,1.0,5.0,0.0,0.0,0.0,5.0
4,Afghanistan,Iran (Islamic Rep. of),U / AR,6.0,6.0,7.0,0.0,0.0,0.0,6.0,3.0,15.0,15.0,0.0,3.0
5,Afghanistan,Iran (Islamic Rep. of),U / FA,46.0,46.0,14.0,12.0,0.0,8.0,4.0,23.0,37.0,37.0,12.0,11.0
6,Afghanistan,Iran (Islamic Rep. of),U / FI,172.0,82.0,314.0,54.0,0.0,123.0,124.0,301.0,185.0,99.0,54.0,244.0
7,Afghanistan,Iraq,U / FA,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,3.0,3.0,0.0,0.0
8,Afghanistan,Iraq,U / FI,45.0,5.0,78.0,11.0,0.0,35.0,49.0,88.0,35.0,3.0,11.0,77.0
9,Afghanistan,Kazakhstan,U / FI,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,0.0


### Calculating the acceptance rate

In [12]:
file2_combine['acceptance_rate'] = file2_combine.apply(
    lambda a: 0.0 if(a.decisions_recognized == 0.0 and a.decisions_other == 0.0 
                     and a.Rejected == 0.0 and a.Otherwise_closed <= 0.0)
            else 
            (a.decisions_recognized + a.decisions_other)/(a.decisions_recognized + a.decisions_other 
            + a.Rejected + a.Otherwise_closed), axis=1)

In [13]:
file2_combine

Unnamed: 0,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful,acceptance_rate
0,Afghanistan,China,U / FI,10.0,2.0,7.0,0.0,0.0,0.0,12.0,11.0,6.0,2.0,0.0,11.0,0.000000
1,Afghanistan,Egypt,U / FI,0.0,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,3.0,0.000000
2,Afghanistan,Eritrea,U / FI,1.0,1.0,2.0,2.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,0.400000
3,Afghanistan,Indonesia,U / FI,0.0,0.0,5.0,0.0,0.0,5.0,1.0,5.0,0.0,0.0,0.0,5.0,0.000000
4,Afghanistan,Iran (Islamic Rep. of),U / AR,6.0,6.0,7.0,0.0,0.0,0.0,6.0,3.0,15.0,15.0,0.0,3.0,0.000000
5,Afghanistan,Iran (Islamic Rep. of),U / FA,46.0,46.0,14.0,12.0,0.0,8.0,4.0,23.0,37.0,37.0,12.0,11.0,0.500000
6,Afghanistan,Iran (Islamic Rep. of),U / FI,172.0,82.0,314.0,54.0,0.0,123.0,124.0,301.0,185.0,99.0,54.0,244.0,0.179402
7,Afghanistan,Iraq,U / FA,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,3.0,3.0,0.0,0.0,0.000000
8,Afghanistan,Iraq,U / FI,45.0,5.0,78.0,11.0,0.0,35.0,49.0,88.0,35.0,3.0,11.0,77.0,0.115789
9,Afghanistan,Kazakhstan,U / FI,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,0.0,0.000000


### Encoding columns of accepted or rejected based on acceptance rate

In [14]:
file2_combine['accepted/rejected'] = file2_combine.apply(
    lambda x: 1 if(x.acceptance_rate >= 0.5) else 0, axis=1)

In [15]:
#List unique values in the df['RSD procedure type / level'] column
file2_combine["RSD procedure type / level"].unique()

array(['U / FI', 'U / AR', 'U / FA', 'G / FA', 'G / FI', 'U / RA',
       'J / FI', 'G / AR', 'G / JR', 'G / RA', 'G / SP', 'J / FA',
       'G / NA', 'G / fi', 'G / ar', 'J / AR', 'J / RA', 'U / JR',
       'G / BL', '0', 'G / TA', 'G / CA', 'G / TP', 'U / NA', 'G / EO',
       'G / IN'], dtype=object)

#### Encoding RSD procedure type / level

In [16]:
file2_combine['Encoded procedure type'] = file2_combine['RSD procedure type / level'].astype('category').cat.codes

#### Encoding Country / territory of asylum/residence

In [17]:
file2_combine['Country / territory of asylum/residence'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Anguilla',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bolivia (Plurinational State of)', 'Bonaire',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands',
       'Central African Rep.', 'Chad', 'Chile', 'China',
       'China, Hong Kong SAR', 'China, Macao SAR', 'Colombia', 'Congo',
       'Costa Rica', 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Rep.',
       "Côte d'Ivoire", 'Dem. Rep. of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Rep.', 'Ecuador', 'Egypt', 'El Salvador',
       'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece',
    

In [18]:
file2_combine['Encoded Target Country'] = file2_combine['Country / territory of asylum/residence'].astype('category').cat.codes

#### Encoding Origin

In [19]:
file2_combine['Origin'].unique()

array(['China', 'Egypt', 'Eritrea', 'Indonesia', 'Iran (Islamic Rep. of)',
       'Iraq', 'Kazakhstan', 'Kyrgyzstan', 'Liberia', 'Nigeria',
       'Pakistan', 'Palestinian', 'Russian Federation',
       'Syrian Arab Rep.', 'Tajikistan', 'Turkey', 'Uzbekistan',
       'Afghanistan', 'Algeria', 'Armenia', 'Azerbaijan', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Bosnia and Herzegovina',
       'Cameroon', 'Central African Rep.', 'Chad', 'Czech Rep.',
       'Dem. Rep. of the Congo', 'Gabon', 'Gambia', 'Ghana', 'Guinea',
       'India', 'Jordan', 'Libya', 'Malaysia', 'Mali', 'Mauritania',
       'Montenegro', 'Morocco', 'Nepal', 'Peru', 'Rep. of Moldova',
       'Romania', 'Saudi Arabia', 'Senegal',
       'Serbia and Kosovo (S/RES/1244 (1999))', 'Somalia', 'Sri Lanka',
       'Sudan', 'The former Yugoslav Republic of Macedonia', 'Tunisia',
       'Ukraine', 'Angola', 'Benin', 'Burkina Faso', 'Burundi', 'Congo',
       "Côte d'Ivoire", 'Ethiopia', 'France', 'Guinea-Bissau', 'L

In [20]:
file2_combine['Encoded Origin'] = file2_combine['Origin'].astype('category').cat.codes

### Drop Attributes

In [21]:
file2_combine = file2_combine.drop(['of which UNHCR-assisted(start-year)', 'of which UNHCR-assisted(end-year)', 
                                    'Applied during year', 'Tota pending start-year', 'Total pending end-year'], axis = 1)

In [22]:
file2_combine

Unnamed: 0,Country / territory of asylum/residence,Origin,RSD procedure type / level,decisions_recognized,decisions_other,Rejected,Otherwise_closed,Total decisions,Successful,Unsuccessful,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin
0,Afghanistan,China,U / FI,0.0,0.0,0.0,12.0,11.0,0.0,11.0,0.000000,0,22,0,41
1,Afghanistan,Egypt,U / FI,0.0,0.0,0.0,3.0,3.0,0.0,3.0,0.000000,0,22,0,62
2,Afghanistan,Eritrea,U / FI,2.0,0.0,0.0,3.0,2.0,2.0,0.0,0.400000,0,22,0,65
3,Afghanistan,Indonesia,U / FI,0.0,0.0,5.0,1.0,5.0,0.0,5.0,0.000000,0,22,0,92
4,Afghanistan,Iran (Islamic Rep. of),U / AR,0.0,0.0,0.0,6.0,3.0,0.0,3.0,0.000000,0,20,0,93
5,Afghanistan,Iran (Islamic Rep. of),U / FA,12.0,0.0,8.0,4.0,23.0,12.0,11.0,0.500000,1,21,0,93
6,Afghanistan,Iran (Islamic Rep. of),U / FI,54.0,0.0,123.0,124.0,301.0,54.0,244.0,0.179402,0,22,0,93
7,Afghanistan,Iraq,U / FA,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0,21,0,94
8,Afghanistan,Iraq,U / FI,11.0,0.0,35.0,49.0,88.0,11.0,77.0,0.115789,0,22,0,94
9,Afghanistan,Kazakhstan,U / FI,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0,22,0,101


### Exporting the cleaned and preprocessed file

In [23]:
file2_combine.to_csv('cleaned_data/cleaned_asylum_seekers_added.csv', index=False)