# Data Cleaning, preprocessing and merging

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

## asylum_seekers.csv

In [2]:
as_data = pd.read_csv("data/asylum_seekers.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
cols = ['Tota pending start-year', 'of which UNHCR-assisted(start-year)', 'Applied during year', 
        'decisions_recognized', 'decisions_other', 'Rejected', 'Otherwise closed', 'Total decisions', 
        'Total pending end-year', 'of which UNHCR-assisted(end-year)']
as_data[cols] = as_data[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [10]:
as_data = as_data[as_data.Year != 2000] # removing all the rows with year below 2001
print(as_data['Year'].value_counts(dropna=False))

2015    11225
2016    10461
2014     9908
2013     9259
2012     8644
2011     8299
2010     7905
2009     7159
2008     7042
2007     6924
2005     6721
2006     6656
2004     6601
2003     6359
2002     5862
2001     5542
Name: Year, dtype: int64


In [5]:
as_data = as_data.fillna( value = 0)

In [6]:
as_data.isnull().sum()

Year                                       0
Country / territory of asylum/residence    0
Origin                                     0
RSD procedure type / level                 0
Tota pending start-year                    0
of which UNHCR-assisted(start-year)        0
Applied during year                        0
decisions_recognized                       0
decisions_other                            0
Rejected                                   0
Otherwise closed                           0
Total decisions                            0
Total pending end-year                     0
of which UNHCR-assisted(end-year)          0
dtype: int64

In [7]:
as_data.replace(to_replace ="*", value ="0")

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
5153,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5154,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,0.0,7.0,1.0,0.0
5155,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0
5156,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0
5157,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0
5158,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,0.0,324.0,72.0,0.0
5159,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,42.0,170.0,307.0,307.0
5160,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5161,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,41.0,421.0,89.0,89.0
5162,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,577.0,577.0,376.0,40.0


In [47]:
as_data = as_data.reset_index(drop=True)

In [50]:
as_data.to_csv("cleaned_data/cleaned_asylum_seekers.csv", index=False)

## demographic.csv

In [52]:
demo = pd.read_csv('data/cleaned_demographics.csv')

In [53]:
# Sum up the columns 
demo['5-17f'] = demo.iloc[:, 5:8].sum(axis=1)
demo['5-17m'] = demo.iloc[:, -9:-6].sum(axis=1)

In [54]:
# drop the columns that are not needed
demo = demo.drop(['5-11f', '12-17f', '5-11m', '12-17m'], axis=1)

In [55]:
demo.to_csv('cleaned_data/cleaned_aggregated_columns_demographics.csv', index=False)

## persons_of_concern.csv

In [56]:
person_of_concern_data = "./data/persons_of_concern.csv"
poc_data = pd.read_csv(person_of_concern_data)

In [57]:
cols = ['Refugees (incl. refugee-like situations)', 'Asylum-seekers (pending cases)', 
        'Returned refugees', 'Stateless persons', 'Others of concern', 'Total Population']

poc_data[cols] = poc_data[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [58]:
data_post_year_2001 = poc_data[poc_data['Year'] > 2000]

In [59]:
data_post_year_2001 = data_post_year_2001.fillna(0)

In [60]:
data_post_year_2001.replace(to_replace ="*", value ="0")

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,Total Population
26904,2001,Afghanistan,Afghanistan,0.0,0.0,0.0,1200000.0,0.0,0.0,0.0,1200000.0
26905,2001,Afghanistan,Iran (Islamic Rep. of),3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26906,2001,Afghanistan,Iraq,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26907,2001,Angola,Angola,0.0,0.0,0.0,202000.0,0.0,0.0,0.0,202000.0
26908,2001,Angola,Burundi,18.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0
26909,2001,Angola,Cameroon,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
26910,2001,Angola,Dem. Rep. of the Congo,11933.0,636.0,1.0,0.0,0.0,0.0,0.0,12570.0
26911,2001,Angola,Congo,51.0,227.0,0.0,0.0,0.0,0.0,0.0,278.0
26912,2001,Angola,Comoros,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0
26913,2001,Angola,Cuba,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [61]:
df = data_post_year_2001.reset_index(drop=True)

In [62]:
data_post_year_2001.to_csv(r'cleaned_data/cleaned_people_of_concern.csv', index=False)

## resettlement.csv

In [63]:
resettlement_data = "./data/resettlement.csv"
r_data = pd.read_csv(resettlement_data)

In [64]:
r_data = r_data[r_data['Year'] >= 2001]

In [65]:
r_data = r_data.replace(to_replace ="*", value ="0")

In [66]:
r_data.to_csv('cleaned_data/cleaned_resettlement.csv', index=False)

## Adding columns to csv files

In [55]:
file1 = pd.read_csv("cleaned_data/cleaned_people_of_concern.csv")
file2 = pd.read_csv("cleaned_data/cleaned_asylum_seekers.csv")
file3 = pd.read_csv("cleaned_data/cleaned_resettlement.csv")

In [56]:
file2['Successful']=file2['decisions_recognized']+file2['decisions_other']
file2['Unsuccessful']=file2['Rejected']+file2['Otherwise closed']

### Calculating the acceptance rate

In [57]:
file2['acceptance_rate'] = file2.apply(
    lambda a: 0.0 if(a.decisions_recognized == 0.0 and a.decisions_other == 0.0 and a.Rejected == 0.0)
            else 
            (a.decisions_recognized + a.decisions_other)/(a.decisions_recognized + a.decisions_other 
            + a.Rejected), axis=1)

### Adding columns of accepted or rejected based on acceptance rate

In [58]:
file2['accepted/rejected'] = file2.apply(
    lambda x: 1 if(x.acceptance_rate >= 0.5) else 0, axis=1)

### Encoding the column

In [59]:
# This row has 0 for all the data, therefore drop the entire of this row
file2 = file2.drop([118624])

In [60]:
file2

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful,acceptance_rate,accepted/rejected
0,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.000000,0
1,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,0.0,7.0,1.0,0.0,5.0,2.0,0.714286,1
2,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0,1573.0,436.0,0.864286,1
3,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0,129.0,118.0,0.826923,1
4,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0,212.0,62.0,0.831373,1
5,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,0.0,324.0,72.0,0.0,223.0,101.0,0.688272,1
6,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,42.0,170.0,307.0,307.0,107.0,63.0,0.835938,1
7,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.000000,0
8,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,41.0,421.0,89.0,89.0,190.0,231.0,0.500000,1
9,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,577.0,577.0,376.0,40.0,0.0,577.0,0.000000,0


#### Encoding RSD procedure type / level

In [61]:
file2['RSD procedure type / level'].unique()

array(['G / AR', 'G / FI', 'U / FI', 'G / EO', 'G / IN', 'U / FA',
       'G / JR', 'G / FA', 'G / RA', 'G / NA', 'G / CA', 'J / FI',
       'U / AR', 'G / fi', 'U / JR', 'G / ar', 'J / AR', 'U / RA',
       'J / FA', 'G / BL', 'G / SP', 'J / RA', 'G / TP', 'G / TA',
       'U / NA'], dtype=object)

In [62]:
file2['Encoded procedure type'] = file2['RSD procedure type / level'].astype('category').cat.codes

#### Encoding Country / territory of asylum/residence

In [63]:
file2['Country / territory of asylum/residence'].unique()

array(['South Africa', 'Uzbekistan', 'United States of America',
       'Ukraine', 'Turkey', 'Tunisia', 'Turkmenistan', 'Tajikistan',
       'Thailand', 'Syrian Arab Rep.', 'Sweden', 'Slovenia', 'Slovakia',
       'Serbia and Kosovo (S/RES/1244 (1999))', 'Singapore',
       'Saudi Arabia', 'Russian Federation', 'Romania', 'Qatar',
       'Portugal', 'Poland', 'Oman', 'New Zealand', 'Nepal', 'Norway',
       'Netherlands', 'Malaysia', 'Mozambique', 'Malta',
       'The former Yugoslav Republic of Macedonia', 'Rep. of Moldova',
       'Latvia', 'Luxembourg', 'Lithuania', 'Sri Lanka', 'Lebanon',
       'Kuwait', 'Rep. of Korea', 'Cambodia', 'Kyrgyzstan', 'Kenya',
       'Kazakhstan', 'Japan', 'Jordan', 'Italy', 'Iceland', 'Iraq',
       'Ireland', 'India', 'Indonesia', 'Hungary', 'Croatia',
       'China, Hong Kong SAR', 'Greece', 'United Kingdom', 'France',
       'Finland', 'Estonia', 'Spain', 'Egypt', 'Denmark', 'Germany',
       'Czech Rep.', 'Cyprus', 'Cuba', "Côte d'Ivoire", 'China'

In [64]:
file2['Encoded Target Country'] = file2['Country / territory of asylum/residence'].astype('category').cat.codes

In [65]:
file2

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country
0,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.000000,0,0,156
1,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,0.0,7.0,1.0,0.0,5.0,2.0,0.714286,1,5,156
2,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,189.0,2009.0,1316.0,1316.0,1573.0,436.0,0.864286,1,21,184
3,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,91.0,247.0,164.0,0.0,129.0,118.0,0.826923,1,3,182
4,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,19.0,274.0,166.0,0.0,212.0,62.0,0.831373,1,6,182
5,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,0.0,324.0,72.0,0.0,223.0,101.0,0.688272,1,5,178
6,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,42.0,170.0,307.0,307.0,107.0,63.0,0.835938,1,21,174
7,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.000000,0,21,173
8,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,41.0,421.0,89.0,89.0,190.0,231.0,0.500000,1,21,175
9,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,577.0,577.0,376.0,40.0,0.0,577.0,0.000000,0,5,166


#### Encoding Origin

In [66]:
file2['Origin'].unique()

array(['Afghanistan', 'Angola', 'Albania', 'Andorra',
       'United Arab Emirates', 'Argentina', 'Armenia',
       'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan',
       'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas', 'Bosnia and Herzegovina',
       'Belarus', 'Belize', 'Bolivia (Plurinational State of)', 'Brazil',
       'Barbados', 'Bhutan', 'Botswana', 'Central African Rep.', 'Canada',
       'Switzerland', 'Chile', 'China', "Côte d'Ivoire", 'Cameroon',
       'Dem. Rep. of the Congo', 'Congo', 'Colombia', 'Comoros',
       'Cabo Verde', 'Costa Rica', 'Cuba', 'Cyprus', 'Czech Rep.',
       'Germany', 'Djibouti', 'Dominica', 'Denmark', 'Dominican Rep.',
       'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Western Sahara',
       'Spain', 'Estonia', 'Ethiopia', 'Finland', 'Fiji', 'France',
       'Gabon', 'United Kingdom', 'Georgia', 'Ghana', 'Guinea', 'Gambia',
       'Guinea-Bissau', 'Equatorial Guinea', 'Greece', 'Gr

In [67]:
file2['Encoded Origin'] = file2['Origin'].astype('category').cat.codes

In [68]:
file2

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,...,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year),Successful,Unsuccessful,acceptance_rate,accepted/rejected,Encoded procedure type,Encoded Target Country,Encoded Origin
0,2001,South Africa,Afghanistan,G / AR,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.000000,0,0,156,0
1,2001,South Africa,Afghanistan,G / FI,8.0,0.0,0.0,5.0,0.0,2.0,...,7.0,1.0,0.0,5.0,2.0,0.714286,1,5,156,0
2,2001,Uzbekistan,Afghanistan,U / FI,1235.0,1235.0,2090.0,1573.0,0.0,247.0,...,2009.0,1316.0,1316.0,1573.0,436.0,0.864286,1,21,184,0
3,2001,United States of America,Afghanistan,G / EO,186.0,0.0,225.0,129.0,0.0,27.0,...,247.0,164.0,0.0,129.0,118.0,0.826923,1,3,182,0
4,2001,United States of America,Afghanistan,G / IN,152.0,0.0,274.0,212.0,0.0,43.0,...,274.0,166.0,0.0,212.0,62.0,0.831373,1,6,182,0
5,2001,Ukraine,Afghanistan,G / FI,23.0,0.0,373.0,223.0,0.0,101.0,...,324.0,72.0,0.0,223.0,101.0,0.688272,1,5,178,0
6,2001,Turkey,Afghanistan,U / FI,46.0,46.0,431.0,107.0,0.0,21.0,...,170.0,307.0,307.0,107.0,63.0,0.835938,1,21,174,0
7,2001,Tunisia,Afghanistan,U / FI,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.000000,0,21,173,0
8,2001,Turkmenistan,Afghanistan,U / FI,128.0,128.0,382.0,190.0,0.0,190.0,...,421.0,89.0,89.0,190.0,231.0,0.500000,1,21,175,0
9,2001,Tajikistan,Afghanistan,G / FI,233.0,40.0,720.0,0.0,0.0,0.0,...,577.0,376.0,40.0,0.0,577.0,0.000000,0,5,166,0


### Exporting the cleaned and preprocessed file

In [69]:
file2.to_csv('cleaned_data/cleaned_asylum_seekers_added.csv', index=False)