In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # set seaborn default parameters for plots
# may have to pip install it: 
# pip install pycountry
import pycountry
import os
data_folder = 'data'
figures_folder = 'figures'
gender_dict = {'b': 'both', 'f': 'female', 'm': 'male'}

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_country_name(country_code, alpha=True):
    if alpha:
        return pycountry.countries.get(alpha_3=country_code).name
    else:
        return pycountry.countries.get(numeric=country_code).name

## Data preparation

See section below for the explanations of the particular choices of `demo` and `stock`.

In [3]:
# request user to choose whether to keep the rows with 0 flow
remove_zero_flows_str = input("Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.")
if remove_zero_flows_str.lower() == 'true':
    remove_zero_flows_bool = True
else:
    remove_zero_flows_bool = False

Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.True


In [4]:
file_path = os.sep.join([data_folder, 'Abel', 'migflows_allcountries_gender_separated_1990_2015.csv'])
df = pd.read_csv(file_path)

# set whether you want to keep flows equal to zero in the dataset
remove_zero_flows_bool = True

print(f'Number of rows in original dataset: {df.shape[0]}.')

# select only time intervals of 5 years from 1990 to 2015 
df = df[(df.interval == 5) & (df.year0 >= 1990)]
print(f'Number of rows after filtering time interval: {df.shape[0]}.')

# select only the world population estimates of year 2015 
df = df[df.demo == 'wpp2015']
print(f'Number of rows after selecting only wpp2015 demographics: {df.shape[0]}.')

# select only the migrant stocks of year 2015 
df = df[df.stock == 'un15']
print(f'Number of rows after selecting only un2015 migrant stocks: {df.shape[0]}.')

# remove irrelevant and redundant columns
df.drop(columns=['stock', 'demo', 'interval'], inplace=True)

# request user to choose whether to keep the rows with 0 flow (this avoids forgetting about this option)
remove_zero_flows_str = input("Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.")
if remove_zero_flows_str.lower() == 'true':
    remove_zero_flows_bool = True
else:
    remove_zero_flows_bool = False
    
# remove rows with flow equal to 0
if remove_zero_flows_bool:
    df = df[df.flow > 0]
    print(f'Number of rows after removing rows with flow equal to 0: {df.shape[0]}.')

# get all countries in dataset
countries = set(df.orig.unique()).union(df.dest.unique())
print(f'There are {len(countries)} countries in MigrFlowSex dataset.')

iso_countries = {country.alpha_3 for country in pycountry.countries}
not_iso_countries = countries - iso_countries
print(f'There are {len(not_iso_countries)} not-ISO countries.')

# remove not-ISO countries
print(f"Let's remove them.")
df = df[~(df['orig'].isin(not_iso_countries) | df['dest'].isin(not_iso_countries))]
print(f'Number of rows after removing non-ISO countries {df.shape[0]}.')

countries_clean = set(df.orig.unique()).union(df.dest.unique())
assert len(countries_clean) == (len(countries) - len(not_iso_countries))
print(f'There are {len(countries_clean)} countries after removal.')

Number of rows in original dataset: 10321454.
Number of rows after filtering time interval: 4901986.
Number of rows after selecting only wpp2015 demographics: 1757100.
Number of rows after selecting only un2015 migrant stocks: 590448.
Do you want to remove the rows with 0 flow? Answer by 'True' or 'False'.True
Number of rows after removing rows with flow equal to 0: 166052.
There are 202 countries in MigrFlowSex dataset.
There are 3 not-ISO countries.
Let's remove them.
Number of rows after removing non-ISO countries 162889.
There are 199 countries after removal.


### correct the both gender values based on female and male flow numbers

In [5]:
df_new = pd.pivot_table(df, values = 'flow', 
                        index=['orig','dest', 'orig_code','dest_code'], columns = 'sex').reset_index()
# fill nan values with 0
df_new = df_new.fillna(0, axis=1)

# correct the type of the flow numbers
if df_new.columns.str.contains("b").any()==True:
            df_new['b'] = df_new['b'].astype(int)
        
if df_new.columns.str.contains("f").any()==True:
            df_new['f'] = df_new['f'].astype(int)
        
if df_new.columns.str.contains("m").any()==True:
            df_new['m'] = df_new['m'].astype(int)
        
# drop column 'b' to calcualte new one from 'f' and 'm' values
df_new = df_new.drop(['b'], axis=1)
df_new['b'] = df_new['f'] + df_new['m']

# convert the data frame format again
df_new = pd.melt(df_new, id_vars=['orig', 'dest', 'orig_code', 'dest_code'], var_name='sex', value_name='flow')
df_new = df_new[['sex', 'orig', 'dest', 'orig_code', 'dest_code', 'flow']]

df_new.head()

Unnamed: 0,sex,orig,dest,orig_code,dest_code,flow
0,f,ABW,AGO,533,24,0
1,f,ABW,ARE,533,784,0
2,f,ABW,ATG,533,28,0
3,f,ABW,AUS,533,36,4
4,f,ABW,AUT,533,40,0


In [6]:
# write to file
filename = f'migflows_gender_separated_1990_2015_filtered{"_without0flows" if remove_zero_flows_bool else ""}.csv'
df_new.to_csv(os.sep.join([data_folder, filename]), index=False)

### Some statistics on the dataset

#### `year0` and `sex`

In [51]:
df.columns

Index(['sex', 'year0', 'orig', 'dest', 'orig_code', 'dest_code', 'flow'], dtype='object')

In [52]:
df.groupby(['year0', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,orig,dest,orig_code,dest_code,flow
year0,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990,b,11461,11461,11461,11461,11461
1990,f,10361,10361,10361,10361,10361
1990,m,10645,10645,10645,10645,10645
1995,b,11539,11539,11539,11539,11539
1995,f,10461,10461,10461,10461,10461
1995,m,10630,10630,10630,10630,10630
2000,b,11537,11537,11537,11537,11537
2000,f,10597,10597,10597,10597,10597
2000,m,10759,10759,10759,10759,10759
2005,b,11703,11703,11703,11703,11703


#### `flow`

In [62]:
print(f'There were {100 * 166052 / 590448:.1f} % non-zero flow rows.')

There were 28.1 % non-zero flow rows.


In [50]:
df.flow.describe()

count    1.628890e+05
mean     2.349624e+03
std      2.250251e+04
min      1.000000e+00
25%      4.000000e+00
50%      2.500000e+01
75%      2.420000e+02
max      2.824460e+06
Name: flow, dtype: float64

In [21]:
## This cell was how we processed the data for the first milestone
## The print statements below show our previous cleaning results

# file_path = os.sep.join([data_folder, 'Abel', 'migflows_allcountries_gender_separated_1990_2015.csv'])
# df = pd.read_csv(file_path)
# print(f'Number of rows in original dataset: {df.shape[0]}.')

# # select only time intervals of 5 years from 1990 to 2015 
# df = df[(df.interval == 5) & (df.year0 >= 1990)]
# print(f'Number of rows after filtering time interval: {df.shape[0]}.')

# # remove irrelevant and redundant columns
# df.drop(columns=['stock', 'demo', 'interval'], inplace=True)

# # get all countries in dataset
# countries = set(df.orig.unique()).union(df.dest.unique())
# print(f'There are {len(countries)} countries in MigrFlowSex dataset.')

# iso_countries = {country.alpha_3 for country in pycountry.countries}
# not_iso_countries = countries - iso_countries
# print(f'There are {len(not_iso_countries)} not-ISO countries.')

# # remove not-ISO countries
# print(f"Let's remove them.")
# df = df[~(df['orig'].isin(not_iso_countries) | df['dest'].isin(not_iso_countries))]
# print(f'Number of rows after removing non-ISO countries {df.shape[0]}.')

# countries_clean = set(df.orig.unique()).union(df.dest.unique())
# assert len(countries_clean) == (len(countries) - len(not_iso_countries))
# print(f'There are {len(countries_clean)} countries after removal.')

## This printed:
## Number of rows in original dataset: 10321454.
## Number of rows after filtering time interval: 4901986.
## There are 204 countries in MigrFlowSex dataset.
## There are 4 not-ISO countries.
## Let's remove them.
## Number of rows after removing non-ISO countries 4755853.
## There are 200 countries after removal.

Number of rows in original dataset: 10321454.
Number of rows after filtering time interval: 4901986.
There are 204 countries in MigrFlowSex dataset.
There are 4 not-ISO countries.
Let's remove them.
Number of rows after removing non-ISO countries 4755853.
There are 200 countries after removal.


## Why did we choose these `demo` and `stock`

In [38]:
file_path = os.sep.join([data_folder, 'Abel', 'migflows_allcountries_gender_separated_1990_2015.csv'])
df = pd.read_csv(file_path)
print(f'Number of rows in original dataset: {df.shape[0]}.')

# select only time intervals of 5 years from 1990 to 2015 
df = df[(df.interval == 5) & (df.year0 >= 1990)]
print(f'Number of rows after filtering time interval: {df.shape[0]}.')

Number of rows in original dataset: 10321454.
Number of rows after filtering time interval: 4901986.


In [27]:
df.groupby('stock').count()

Unnamed: 0_level_0,demo,sex,year0,interval,orig,dest,orig_code,dest_code,flow
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
un12,1387716,1387716,1387716,1387716,1387716,1387716,1387716,1387716,1387716
un13,1354892,1354892,1354892,1354892,1354892,1354892,1354892,1354892,1354892
un15,1474892,1474892,1474892,1474892,1474892,1474892,1474892,1474892,1474892
wb11,684486,684486,684486,684486,684486,684486,684486,684486,684486


`un15` contains the most data in the period of interest. It is also the most updated one, suggesting more accurate estimates.

In [28]:
df.groupby('demo').count()

Unnamed: 0_level_0,stock,sex,year0,interval,orig,dest,orig_code,dest_code,flow
demo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
wpp2010,1507786,1507786,1507786,1507786,1507786,1507786,1507786,1507786,1507786
wpp2012,1637100,1637100,1637100,1637100,1637100,1637100,1637100,1637100,1637100
wpp2015,1757100,1757100,1757100,1757100,1757100,1757100,1757100,1757100,1757100


`wpp15` contains the most data in the period of interest. It is also the most updated one, suggesting more accurate estimates.

## Which time intervals have several `demo` and `stock`?

In [43]:
df.groupby(['year0', 'demo', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,stock,interval,orig,dest,orig_code,dest_code,flow
year0,demo,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990,wpp2010,b,150546,150546,150546,150546,150546,150546,150546
1990,wpp2010,f,150546,150546,150546,150546,150546,150546,150546
1990,wpp2010,m,150546,150546,150546,150546,150546,150546,150546
1990,wpp2012,b,155633,155633,155633,155633,155633,155633,155633
1990,wpp2012,f,155633,155633,155633,155633,155633,155633,155633
1990,wpp2012,m,155633,155633,155633,155633,155633,155633,155633
1990,wpp2015,b,155633,155633,155633,155633,155633,155633,155633
1990,wpp2015,f,155633,155633,155633,155633,155633,155633,155633
1990,wpp2015,m,155633,155633,155633,155633,155633,155633,155633
1995,wpp2010,b,150546,150546,150546,150546,150546,150546,150546


In [44]:
df.groupby(['year0', 'stock', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,demo,interval,orig,dest,orig_code,dest_code,flow
year0,stock,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990,un12,b,115643,115643,115643,115643,115643,115643,115643
1990,un12,f,115643,115643,115643,115643,115643,115643,115643
1990,un12,m,115643,115643,115643,115643,115643,115643,115643
1990,un13,b,116044,116044,116044,116044,116044,116044,116044
1990,un13,f,116044,116044,116044,116044,116044,116044,116044
1990,un13,m,116044,116044,116044,116044,116044,116044,116044
1990,un15,b,116044,116044,116044,116044,116044,116044,116044
1990,un15,f,116044,116044,116044,116044,116044,116044,116044
1990,un15,m,116044,116044,116044,116044,116044,116044,116044
1990,wb11,b,114081,114081,114081,114081,114081,114081,114081


## Make country codes and names a _json_ dictionary

In [15]:
filename = 'country_codes_and_names.json'
with open(os.path.join(data_folder, filename), 'w') as f:
    f.write('[\n')
    for i, country in enumerate(pycountry.countries):
#         line_string = ','.join(['{"numeric": "{}"'.format(country.numeric), 
        if i != len(pycountry.countries) - 1:
            last_line = '"iso_a2": "{}"}},\n'.format(country.alpha_2)
        else:
            last_line = '"iso_a2": "{}"}}\n'.format(country.alpha_2)
        line_string = ', '.join([
            '{{"numeric": "{}"'.format(country.numeric), 
#             '\{"numeric": "{}"'.format(country.numeric), 
#             f'\{"numeric": "{country.numeric}"', 
            '"name": "{}"'.format(country.name), 
#             f'"name": "{country.name}"', 
            '"iso_a3": "{}"'.format(country.alpha_3), 
#             f'"iso_a3": "{country.iso_a3}"', 
            last_line])
#             '"iso_a2": "{}"},\n'.format(country.iso_a2)])
        print(line_string)
#                                 f'"iso_a2": "{country.iso_a2}"\},\n'])
#                                 f'"iso_a2": "{country.iso_a2}"\},\n'])
        f.write(line_string)
    f.write(']')

{"numeric": "533", "name": "Aruba", "iso_a3": "ABW", "iso_a2": "AW"},

{"numeric": "004", "name": "Afghanistan", "iso_a3": "AFG", "iso_a2": "AF"},

{"numeric": "024", "name": "Angola", "iso_a3": "AGO", "iso_a2": "AO"},

{"numeric": "660", "name": "Anguilla", "iso_a3": "AIA", "iso_a2": "AI"},

{"numeric": "248", "name": "Åland Islands", "iso_a3": "ALA", "iso_a2": "AX"},

{"numeric": "008", "name": "Albania", "iso_a3": "ALB", "iso_a2": "AL"},

{"numeric": "020", "name": "Andorra", "iso_a3": "AND", "iso_a2": "AD"},

{"numeric": "784", "name": "United Arab Emirates", "iso_a3": "ARE", "iso_a2": "AE"},

{"numeric": "032", "name": "Argentina", "iso_a3": "ARG", "iso_a2": "AR"},

{"numeric": "051", "name": "Armenia", "iso_a3": "ARM", "iso_a2": "AM"},

{"numeric": "016", "name": "American Samoa", "iso_a3": "ASM", "iso_a2": "AS"},

{"numeric": "010", "name": "Antarctica", "iso_a3": "ATA", "iso_a2": "AQ"},

{"numeric": "260", "name": "French Southern Territories", "iso_a3": "ATF", "iso_a2": "TF"},

In [23]:
# csv is not a wise choice ast there are commas in the country names
filename = 'country_codes_and_names.csv'
with open(os.path.join(data_folder, filename), 'w') as f:
    line_string = f'numeric,name,iso_a3,iso_a2\n'
    f.write(line_string)
    for country in pycountry.countries:
        line_string = ','.join([country.numeric, 
                        country.name,
                        country.alpha_3,
                        country.alpha_2 + '\n'])
#         print(','.join([country.numeric, 
#                         country.name,
#                         country.alpha_3,
#                         country.alpha_2]))
        f.write(line_string)

## Explorations

In [90]:

list(pycountry.countries)[0]

Country(alpha_2='AW', alpha_3='ABW', name='Aruba', numeric='533')

In [98]:
for c in pycountry.countries:
    print(c)

Country(alpha_2='AW', alpha_3='ABW', name='Aruba', numeric='533')
Country(alpha_2='AF', alpha_3='AFG', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan')
Country(alpha_2='AO', alpha_3='AGO', name='Angola', numeric='024', official_name='Republic of Angola')
Country(alpha_2='AI', alpha_3='AIA', name='Anguilla', numeric='660')
Country(alpha_2='AX', alpha_3='ALA', name='Åland Islands', numeric='248')
Country(alpha_2='AL', alpha_3='ALB', name='Albania', numeric='008', official_name='Republic of Albania')
Country(alpha_2='AD', alpha_3='AND', name='Andorra', numeric='020', official_name='Principality of Andorra')
Country(alpha_2='AE', alpha_3='ARE', name='United Arab Emirates', numeric='784')
Country(alpha_2='AR', alpha_3='ARG', name='Argentina', numeric='032', official_name='Argentine Republic')
Country(alpha_2='AM', alpha_3='ARM', name='Armenia', numeric='051', official_name='Republic of Armenia')
Country(alpha_2='AS', alpha_3='ASM', name='American Samoa', n

In [102]:
df.dest_code[:20]

5481379    124
5481396    276
5481410    250
5481413    826
5481483    528
5481536    840
5481557     40
5481558     31
5481564    100
5481568    112
5481577    124
5481578    756
5481593    203
5481594    276
5481600    818
5481608    250
5481611    826
5481619    300
5481632    372
5481636    376
Name: dest_code, dtype: int64

In [99]:
pycountry.countries.get(numeric='040')

Country(alpha_2='AT', alpha_3='AUT', name='Austria', numeric='040', official_name='Republic of Austria')