In [1]:
import pandas as pd

In [29]:
# Load the raw records; `df` keeps the CSV contents as read.
df = pd.read_csv("RefugeeMigrant_Deaths.csv")

# Keep rows whose route is not "Unknown", drop the `quarter` and `location`
# columns, discard every row that still has a missing value, and store the
# year and the death count as integers.
has_known_route = df.route != "Unknown"
df_routes = (
    df[has_known_route]
    .drop(columns=['quarter', 'location'])
    .dropna()
    .astype({"Year": int, 'dead_and_missing': int})
)

In [30]:
# Sanity check: number of missing values left in each column (expected: all 0).
df_routes.isna().sum()

route                     0
Location                  0
CartoDB_Cause_of_death    0
cause_of_death            0
Year                      0
dead_and_missing          0
dtype: int64

#### Prepare CSV format

In [31]:
# Map the source column names onto the intermediate names used for grouping.
sankey_columns = {
    'Location': 'state',
    'Year': 'fiscal_year',
    'dead_and_missing': 'value',
    'route': 'agency',
    'CartoDB_Cause_of_death': 'bureau',
    'cause_of_death': 'program',
}

# Sum `value` for every distinct combination of the five grouping columns.
filteredsankey = (
    df_routes[list(sankey_columns)]
    .rename(columns=sankey_columns)
    .groupby(['state', 'fiscal_year', 'agency', 'bureau', 'program'])['value']
    .sum()
    .reset_index()
)
                          

In [32]:
# Give the grouping columns their final names; `value` is left unchanged.
final_column_names = {
    'state': 'location',
    'fiscal_year': 'event_year',
    'agency': 'route',
    'bureau': 'causal',
    'program': 'reason',
}
filteredsankey = filteredsankey.rename(columns=final_column_names)


In [33]:
# Write the aggregated table next to the notebook. The row index is written
# too, so the file starts with an unnamed index column.
filteredsankey.to_csv("filteredsankey.csv", index=True)

#### Prepare JSON format

In [4]:
# Build the node list: one {'name': ...} record per distinct value found in
# the route, location, year and CartoDB-cause columns (years as strings).
node_columns = ['route', 'Location', 'Year', 'CartoDB_Cause_of_death']
df_nodes = df_routes.astype({"Year": str})

# Stack the four columns end to end (column by column) and keep first occurrences.
node_names = pd.concat(
    [df_nodes[column] for column in node_columns], ignore_index=True
).unique()

df_nodes = pd.DataFrame({'name': node_names})
dict_nodes = df_nodes.to_dict('records')

In [5]:
# Build the link records: each link is {'source', 'target', 'value'} where
# `value` is the summed `dead_and_missing` for that source/target pair.
def _link_records(frame, source, target, sort_by, ascending):
    """Return the summed deaths per (source, target) pair as a list of dicts.

    The totals are sorted by `sort_by` (direction given by `ascending`); when
    one of the two columns is `Year` it is converted to a string after sorting.
    """
    totals = (
        frame.groupby([source, target])['dead_and_missing']
        .sum()
        .reset_index()
        .sort_values(by=sort_by, ascending=ascending)
    )
    if 'Year' in (source, target):
        totals = totals.astype({"Year": str})
    renamed = totals.rename(
        columns={source: 'source', target: 'target', 'dead_and_missing': 'value'}
    )
    return renamed.to_dict('records')


dict_yearRoute = _link_records(
    df_routes, 'Year', 'route', sort_by=['Year'], ascending=False)

dict_routeLocation = _link_records(
    df_routes, 'route', 'Location',
    sort_by=['route', 'Location', 'dead_and_missing'], ascending=True)

dict_locationCausalities = _link_records(
    df_routes, 'Location', 'CartoDB_Cause_of_death',
    sort_by=['Location', 'CartoDB_Cause_of_death', 'dead_and_missing'], ascending=True)

dict_causalitiesYear = _link_records(
    df_routes, 'CartoDB_Cause_of_death', 'Year',
    sort_by=['CartoDB_Cause_of_death'], ascending=False)

# Collect the link lists in chain order: route -> location -> cause -> year.
# `dict_yearRoute` is built above but is not part of the chain; the alternative was:
# dict_links = dict_yearRoute + dict_routeLocation + dict_locationCausalities
dict_links = [*dict_routeLocation, *dict_locationCausalities, *dict_causalitiesYear]


In [6]:
# Bundle links and nodes into the single structure that gets serialized.
data = dict(links=dict_links, nodes=dict_nodes)

In [7]:
import json

# Serialize the links/nodes structure to `data.json` in the working directory.
output_name = 'data.json'
with open(output_name, mode='w') as json_file:
    json.dump(data, json_file)

In [8]:
# Show the assembled {'links': [...], 'nodes': [...]} structure as the cell output.
data

{'links': [{'source': 'Apulia and Calabria Route',
   'target': 'Albania',
   'value': 63},
  {'source': 'Apulia and Calabria Route', 'target': 'Greece', 'value': 13},
  {'source': 'Apulia and Calabria Route', 'target': 'Italy', 'value': 383},
  {'source': 'Apulia and Calabria Route',
   'target': 'Southern Adriatic',
   'value': 3},
  {'source': 'Central European Route', 'target': 'Germany', 'value': 2},
  {'source': 'Central European Route', 'target': 'Hungary', 'value': 2},
  {'source': 'Central European Route', 'target': 'Lampedusa', 'value': 19},
  {'source': 'Central European Route', 'target': 'Libya', 'value': 82},
  {'source': 'Central European Route', 'target': 'Niger', 'value': 18},
  {'source': 'Central European Route', 'target': 'Sardinia', 'value': 4},
  {'source': 'Central European Route', 'target': 'Sicily', 'value': 12},
  {'source': 'Central Mediterranean Route', 'target': 'Africa', 'value': 54},
  {'source': 'Central Mediterranean Route', 'target': 'Algeria', 'value':

### Add a ratio column: each age group's share of the total migrant stock (InternationalMigrantStocks_x / InternationalMigrantStocks_y) for a given gender, destination and year

In [7]:
# Load the migrant-stock table that this section augments.
stock_csv_path = "data/International_and_totalStock_GenderAge.csv"
df = pd.read_csv(stock_csv_path)

In [8]:
# Remove the trailing space from the three known gender labels; every other
# value is kept exactly as it is.
gender_fixes = {'bothsexes ': 'bothsexes', 'female ': 'female', 'male ': 'male'}
df['Gender'] = [gender_fixes.get(label, label) for label in df['Gender']]

In [13]:
# Migrant stock summed per year, destination, gender and age group.
age_group_keys = ['Year', 'Destination', 'Gender', 'AgeGroup']
df1 = df.groupby(age_group_keys, as_index=False)['InternationalMigrantStocks'].sum()

In [14]:
# Migrant stock summed per year, destination and gender: first total per age
# group, then add the age groups together.
per_age_group = (
    df.groupby(['Year', 'Destination', 'Gender', 'AgeGroup'])['InternationalMigrantStocks']
    .sum()
    .reset_index()
)
df2 = (
    per_age_group.groupby(['Year', 'Destination', 'Gender'])['InternationalMigrantStocks']
    .sum()
    .reset_index()
)

In [15]:
# Attach the per-(year, destination, gender) total to every age-group row.
# After the merge, `_x` is the age-group stock and `_y` is the total.
merged = df1.merge(df2, how='left', on=['Year', 'Destination', 'Gender'])

# Share of the total held by each age group (a fraction, not multiplied by 100).
merged['ratio'] = merged['InternationalMigrantStocks_x'].div(merged['InternationalMigrantStocks_y'])

In [16]:
# Name the total column after its grouping keys: Year, Destination, Gender.
merged = merged.rename(columns={'InternationalMigrantStocks_y': 'Total_Y_D_G'})

In [17]:
# format_dict = { 'ratio': '{:.2%}'}
# merged.style.format(format_dict)

In [18]:
# Preview the first five rows of the ratio table.
merged.head(n=5)

Unnamed: 0,Year,Destination,Gender,AgeGroup,InternationalMigrantStocks_x,Total_Y_D_G,ratio
0,1990,Afghanistan,bothsexes,0-4,3693,57686,0.064019
1,1990,Afghanistan,bothsexes,10-14,4505,57686,0.078095
2,1990,Afghanistan,bothsexes,15-19,5370,57686,0.09309
3,1990,Afghanistan,bothsexes,20-24,6490,57686,0.112506
4,1990,Afghanistan,bothsexes,25-29,7154,57686,0.124016


In [19]:
# Bring the total and ratio columns back onto the original rows.
final = df.merge(merged, how='left', on=['Year', 'Destination', 'Gender', 'AgeGroup'])

In [20]:
# Remove the merged-in age-group sum column.
final = final.drop(columns='InternationalMigrantStocks_x')

In [21]:
# Save the augmented table without the row index.
# NOTE(review): this is the same path this section reads its input from, so the
# source file is overwritten in place; re-running the section will then start
# from the already-augmented file.
final.to_csv("data/International_and_totalStock_GenderAge.csv", index=False)

### Zero Migration Flow Creator

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # set seaborn default parameters for plots
# may have to pip install it: 
# pip install pycountry
import pycountry
import os
# Folder (relative to the working directory) used to build input and output CSV paths.
data_folder = 'data'
# Folder name for figures; not referenced in the cells visible here.
figures_folder = 'figures'
# Lookup from one-letter codes to labels; presumably matches the codes in the
# `sex` column ('b', 'f', 'm') - TODO confirm where it is used.
gender_dict = {'b': 'both', 'f': 'female', 'm': 'male'}

import warnings
# Suppress every warning for the rest of the session (this also hides pandas
# warnings such as SettingWithCopyWarning).
warnings.filterwarnings("ignore")

In [2]:
def get_country_name(country_code, alpha=True):
    """Return the pycountry name of the country identified by `country_code`.

    Parameters
    ----------
    country_code : str or int
        ISO 3166-1 alpha-3 code (e.g. ``'DEU'``) when `alpha` is True,
        otherwise the ISO 3166-1 numeric code. Numeric codes may be passed as
        integers (e.g. ``276`` or ``4``) or as strings; they are zero-padded
        to the three-character form pycountry stores (``'276'``, ``'004'``).
    alpha : bool, default True
        Selects whether `country_code` is an alpha-3 or a numeric code.

    Returns
    -------
    str
        The country's name.

    Raises
    ------
    KeyError
        If no country matches the given code.
    """
    if alpha:
        country = pycountry.countries.get(alpha_3=country_code)
    else:
        # pycountry compares against 3-character strings, so 4 must become '004'
        numeric_code = str(country_code).zfill(3)
        country = pycountry.countries.get(numeric=numeric_code)

    # `get` may return None for an unknown code; fail with a clear message
    # instead of an AttributeError on None.
    if country is None:
        code_kind = 'alpha-3' if alpha else 'numeric'
        raise KeyError(f'No country found for {code_kind} code {country_code!r}')
    return country.name

## Data preparation

See section below for the explanations of the particular choices of `demo` and `stock`.

In [3]:
# Load the gender-separated migration flows and narrow them down step by step,
# reporting the remaining row count after each filter.
file_path = os.sep.join([data_folder, 'migflows_allcountries_gender_separated_1990_2015.csv'])
df = pd.read_csv(file_path)

print(f'Number of rows in original dataset: {df.shape[0]}.')

# keep only 5-year intervals that start in 1990 or later
df = df[(df.interval == 5) & (df.year0 >= 1990)]
print(f'Number of rows after filtering time interval: {df.shape[0]}.')

# keep only rows based on the 'wpp2015' demographic data
df = df[df.demo == 'wpp2015']
print(f'Number of rows after selecting only wpp2015 demographics: {df.shape[0]}.')

# keep only rows based on the 'un15' migrant stock data
df = df[df.stock == 'un15']
print(f'Number of rows after selecting only un2015 migrant stocks: {df.shape[0]}.')

# the three filter columns are now constant, so remove them; a new frame is
# created instead of dropping in place on a filtered slice
df = df.drop(columns=['stock', 'demo', 'interval'])

# request user to choose whether to keep the rows with 0 flow (this avoids forgetting about this option);
# surrounding whitespace is ignored so that an answer such as "True " is not silently read as False
remove_zero_flows_str = input("Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.")
remove_zero_flows_bool = remove_zero_flows_str.strip().lower() == 'true'

# remove rows with flow equal to 0
if remove_zero_flows_bool:
    df = df[df.flow > 0]
    print(f'Number of rows after removing rows with flow equal to 0: {df.shape[0]}.')

# get all countries in dataset (as origin or as destination)
countries = set(df.orig.unique()).union(df.dest.unique())
print(f'There are {len(countries)} countries in MigrFlowSex dataset.')

# codes that pycountry does not know as ISO alpha-3 codes
iso_countries = {country.alpha_3 for country in pycountry.countries}
not_iso_countries = countries - iso_countries
print(f'There are {len(not_iso_countries)} not-ISO countries.')

# remove every row whose origin or destination is a not-ISO country
print("Let's remove them.")
df = df[~(df['orig'].isin(not_iso_countries) | df['dest'].isin(not_iso_countries))]
print(f'Number of rows after removing non-ISO countries {df.shape[0]}.')

# check that exactly the not-ISO codes disappeared; this also fails if an ISO
# country only ever appeared in rows paired with a not-ISO one
countries_clean = set(df.orig.unique()).union(df.dest.unique())
assert len(countries_clean) == (len(countries) - len(not_iso_countries))
print(f'There are {len(countries_clean)} countries after removal.')

Number of rows in original dataset: 10321454.
Number of rows after filtering time interval: 4901986.
Number of rows after selecting only wpp2015 demographics: 1757100.
Number of rows after selecting only un2015 migrant stocks: 590448.
Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.True
Number of rows after removing rows with flow equal to 0: 166052.
There are 202 countries in MigrFlowSex dataset.
There are 3 not-ISO countries.
Let's remove them.
Number of rows after removing non-ISO countries 162889.
There are 199 countries after removal.


### Correct the both-gender values based on the female and male flow numbers

In [4]:
# Reshape to one row per (year0, orig, dest) with one flow column per sex code.
# pivot_table averages duplicate keys by default (aggfunc='mean').
id_columns = ['year0', 'orig', 'dest', 'orig_code', 'dest_code']
df_new = pd.pivot_table(df, values='flow', index=id_columns, columns='sex').reset_index()

# Flow columns the pivot actually produced, matched by exact column name
# (a substring test would also match unrelated columns containing the letter).
flow_columns = [code for code in ('b', 'f', 'm') if code in df_new.columns]

# A sex code missing for a given pair shows up as NaN: treat it as a flow of 0
# and restore the integer type. Only the flow columns are touched; the id
# columns come from the pivot index and are left as they are.
for code in flow_columns:
    df_new[code] = df_new[code].fillna(0).astype(int)

# discard the original 'b' values (if present) and recompute them from 'f' and 'm'
df_new = df_new.drop(columns='b', errors='ignore')
df_new['b'] = df_new['f'] + df_new['m']

# convert back to the long format: one row per sex code
df_new = pd.melt(df_new, id_vars=id_columns, var_name='sex', value_name='flow')
df_new = df_new[['sex', 'year0', 'orig', 'dest', 'orig_code', 'dest_code', 'flow']]

df_new.head()

Unnamed: 0,sex,year0,orig,dest,orig_code,dest_code,flow
0,f,1990,ABW,CAN,533,124,0
1,f,1990,ABW,DEU,533,276,0
2,f,1990,ABW,FRA,533,250,0
3,f,1990,ABW,GBR,533,826,0
4,f,1990,ABW,NLD,533,528,0


In [5]:
# request user to choose whether to keep the rows with 0 flow (this avoids forgetting about this option);
# the reshaping step fills missing values with 0, so zero-flow rows can be present again here.
# Surrounding whitespace is ignored so that an answer such as "True " is not silently read as False.
remove_zero_flows_str = input("Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.")
remove_zero_flows_bool = remove_zero_flows_str.strip().lower() == 'true'

# remove rows with flow equal to 0
if remove_zero_flows_bool:
    df_new = df_new[df_new.flow > 0]
    print(f'Number of rows after removing rows with flow equal to 0: {df_new.shape[0]}.')


Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.True
Number of rows after removing rows with flow equal to 0: 167492.


In [6]:
# Save the result in the data folder; the file name records whether the
# zero-flow rows were removed.
suffix = "_without0flows" if remove_zero_flows_bool else ""
filename = f'migflows_gender_separated_1990_2015_filtered{suffix}.csv'
output_path = os.sep.join([data_folder, filename])
df_new.to_csv(output_path, index=False)