# World Migrant Flow Diagrams
This workbook prepares Sankey diagrams of world migrant flows. You can filter by origin or destination, choose the level of detail to show (continent, subcontinent, country), and choose whether the flows are coloured by origin or by destination.

Source data: https://www.un.org/en/development/desa/population/migration/data/estimates2/estimates17.asp

Read in and preview data

In [1]:
import numpy as np
import pandas as pd
from floweaver import *

# Migrant flow matrix: the rows are destination, columns are origin
df_dest = pd.read_csv('../data/raw/migrant_flow.csv')
display(df_dest.head(), df_dest.shape)

# Per-country lookup table (continent, subcontinent and Y/N grouping flags)
df_countries = pd.read_csv('../data/raw/country_index.csv')
display(df_countries.head(), df_countries.shape)

Unnamed: 0,Year,Country,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Anguilla,Antigua and Barbuda,...,Uruguay,Uzbekistan,Vanuatu,Venezuela (Bolivarian Republic of),Viet Nam,Wallis and Futuna Islands,Western Sahara,Yemen,Zambia,Zimbabwe
0,1990.0,Burundi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1990.0,Comoros,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1990.0,Djibouti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,289.0,0.0,0.0
3,1990.0,Eritrea,0.0,0.0,0.0,0.0,0.0,247.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,82.0
4,1990.0,Ethiopia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0


(1639, 234)

Unnamed: 0,Country,Continent,Subcontinent,More Developed Regions,Less Developed Regions,Least developed countries,High-income Countries,Middle-income Countries,Upper-middle-income Countries,Lower-middle-income Countries,Low-income Countries,Sub-Saharan Africa
0,Burundi,Africa,Eastern Africa,N,Y,Y,N,N,N,N,Y,Y
1,Comoros,Africa,Eastern Africa,N,Y,Y,N,N,N,N,Y,Y
2,Djibouti,Africa,Eastern Africa,N,Y,Y,N,Y,N,Y,N,Y
3,Eritrea,Africa,Eastern Africa,N,Y,Y,N,N,N,N,Y,Y
4,Ethiopia,Africa,Eastern Africa,N,Y,Y,N,N,N,N,Y,Y


(232, 12)

In [2]:
# Filter the dataframe
def filter_df(df, filters, filter_on):
    """Keep only the rows of ``df`` whose ``filter_on`` value is in ``filters``.

    Args:
        df: DataFrame to filter.
        filters: Collection of accepted values. A falsy value (empty list,
            ``None``) means "no filter" and ``df`` is returned unchanged.
        filter_on: Name of the column the values are matched against.

    Returns:
        The filtered DataFrame, or ``df`` itself when no filter is given.
    """
    if not filters:
        return df
    keep = df[filter_on].isin(filters)
    return df[keep]

# Set the source/target and waypoints
def set_level(cont, subcont, country, cont_name, subcont_name, country_name):
    """Pick the column names used for one side of the diagram.

    Args:
        cont: Whether continents are shown (accepted for symmetry with the
            other flags; it does not influence the result).
        subcont: Whether subcontinents are shown.
        country: Whether countries are shown.
        cont_name: Column name holding the continent.
        subcont_name: Column name holding the subcontinent.
        country_name: Column name holding the country.

    Returns:
        A tuple ``(outer, middle, inner)``: the most detailed level that is
        shown, the next coarser level, and the continent column. Levels that
        are not shown fall back to the continent column.
    """
    if country:
        # Countries are the outermost nodes; subcontinents sit in between
        # only when they are shown as well.
        outer = country_name
        middle = subcont_name if subcont else cont_name
    elif subcont:
        outer, middle = subcont_name, cont_name
    else:
        outer = middle = cont_name
    return outer, middle, cont_name

# Determine at what level the groupings should be shown
def show_level(origin_cont_show, origin_subcont_show, origin_country_show,
               dest_cont_show, dest_subcont_show, dest_country_show,
               source, source_waypoint1, source_waypoint2,
               target_waypoint2, target_waypoint1, target):
    """Build the left-to-right node ordering and the bundle list.

    The number of "show" flags set on each side decides how many columns that
    side gets: one flag shows only the source/target, two flags add the first
    waypoint, three flags add the second waypoint too.

    Args:
        origin_cont_show, origin_subcont_show, origin_country_show: Flags for
            the origin side.
        dest_cont_show, dest_subcont_show, dest_country_show: Flags for the
            destination side.
        source, source_waypoint1, source_waypoint2: Node ids for the origin
            side, from outermost to innermost.
        target_waypoint2, target_waypoint1, target: Node ids for the
            destination side, from innermost to outermost.

    Returns:
        ``(ordering, bundles)`` where ``ordering`` is a list of single-node
        layers and ``bundles`` holds one ``Bundle`` from ``source`` to
        ``target`` routed through the shown waypoints.
    """
    origin_levels = sum([origin_cont_show, origin_subcont_show, origin_country_show])
    dest_levels = sum([dest_cont_show, dest_subcont_show, dest_country_show])

    # Intermediate columns in left-to-right order, each paired with the
    # condition under which it is part of the diagram.
    candidates = [
        (source_waypoint1, origin_levels >= 2),
        (source_waypoint2, origin_levels >= 3),
        (target_waypoint2, dest_levels >= 3),
        (target_waypoint1, dest_levels >= 2),
    ]

    ordering = [[source]]
    ordering.extend([node] for node, shown in candidates if shown)
    ordering.append([target])

    # Unlike the ordering, the waypoint list also drops falsy node ids.
    waypoint_list = [node for node, shown in candidates if shown and node]

    flow = Bundle(source, target, waypoints=waypoint_list)
    bundles = [flow] if flow else []

    return ordering, bundles

# TODO: Make interactive user selections
# Create the sankey diagram
def get_diagram(year=2017, colour_from=True,
                origin_cont_filter=[], origin_subcont_filter=[],  origin_country_filter=[],
                dest_cont_filter=[],   dest_subcont_filter=[],    dest_country_filter=[],
                origin_cont_show=True, origin_subcont_show=False, origin_country_show=False,
                dest_cont_show=True,   dest_subcont_show=False,   dest_country_show=False,
                save_path='../data/processed/example1.png'):
    """Build and display a Sankey diagram of migrant flows for one year.

    Reads the module-level ``df_dest`` (rows are destinations, columns are
    origins) and ``df_countries`` (per-country continent/subcontinent lookup)
    frames; neither is modified.

    Args:
        year: Value matched against the ``Year`` column of ``df_dest``.
        colour_from: ``True`` colours the flows by the destination grouping,
            ``False`` by the origin grouping.
        origin_cont_filter, origin_subcont_filter, origin_country_filter:
            Names to keep on the origin side. An empty list means no filter.
            The list defaults are only read, never mutated.
        dest_cont_filter, dest_subcont_filter, dest_country_filter: Same, for
            the destination side.
        origin_cont_show, origin_subcont_show, origin_country_show: Which
            origin levels get their own column in the diagram.
        dest_cont_show, dest_subcont_show, dest_country_show: Which
            destination levels get their own column in the diagram.
        save_path: File name handed to the widget's ``auto_save_png``. Pass
            ``None` to display the diagram without registering a PNG file.

    Returns:
        None. The widget is shown with ``display``.
    """
    # Subset by selected year
    df = df_dest[df_dest['Year'] == year].drop('Year', axis=1)

    # Melt data such that each row has the origin, destination, and number of migrants
    df = pd.melt(df, id_vars=['Country'], var_name='Origin', value_name='Migrants')
    df = df.rename(columns={'Country': 'Destination'})
    df['Migrants'] = df['Migrants'].astype(float)
    df = df[['Origin', 'Destination', 'Migrants']]

    # Merge with country index to get information about origin and destination country.
    # The second merge suffixes every lookup column with _origin / _dest.
    df = pd.merge(df, df_countries, how='left', left_on='Origin', right_on='Country')
    df = pd.merge(df, df_countries, how='left', left_on='Destination',
                  right_on='Country', suffixes=('_origin', '_dest'))

    # TODO: group income and developed (not y/n)

    size = dict(width=1200, height=1000,
                margins=dict(left=250, right=250))

    source, source_waypoint1, source_waypoint2 = set_level(
        origin_cont_show, origin_subcont_show, origin_country_show,
        'Continent_origin', 'Subcontinent_origin', 'Country_origin')
    target, target_waypoint1, target_waypoint2 = set_level(
        dest_cont_show, dest_subcont_show, dest_country_show,
        'Continent_dest', 'Subcontinent_dest', 'Country_dest')

    # Every row gets the same source/target process id (the chosen column
    # name); the individual nodes come from partitioning on that column below.
    df['source'] = source
    df['target'] = target

    column_filters = (
        (origin_cont_filter, 'Continent_origin'),
        (origin_subcont_filter, 'Subcontinent_origin'),
        (origin_country_filter, 'Country_origin'),
        (dest_cont_filter, 'Continent_dest'),
        (dest_subcont_filter, 'Subcontinent_dest'),
        (dest_country_filter, 'Country_dest'),
    )
    for wanted, column in column_filters:
        df = filter_df(df, wanted, column)
    sankey_data = Dataset(df)

    # Partition the data to get the names for each location
    source_waypoint1_part = sankey_data.partition(source_waypoint1)
    source_waypoint2_part = sankey_data.partition(source_waypoint2)
    source_part = sankey_data.partition(source)
    target_waypoint2_part = sankey_data.partition(target_waypoint2)
    target_waypoint1_part = sankey_data.partition(target_waypoint1)
    target_part = sankey_data.partition(target)

    # Entry order matters: when fewer than three levels are shown, set_level
    # returns the same column name for several roles, so keys repeat here and
    # the last value wins. The ProcessGroup entries must therefore come after
    # the Waypoint entries of their side.
    nodes = {
        source_waypoint2: Waypoint(source_waypoint2_part),
        source_waypoint1: Waypoint(source_waypoint1_part),
        source: ProcessGroup([source], source_part),
        target_waypoint2: Waypoint(target_waypoint2_part),
        target_waypoint1: Waypoint(target_waypoint1_part),
        target: ProcessGroup([target], target_part),
    }

    # Set left to right ordering and which flows to show
    ordering, bundles = show_level(origin_cont_show, origin_subcont_show, origin_country_show,
                                   dest_cont_show, dest_subcont_show, dest_country_show,
                                   source, source_waypoint1, source_waypoint2,
                                   target_waypoint2, target_waypoint1, target)

    colour_choice = target_part if colour_from else source_part
    sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=colour_choice)

    widget = weave(sdd, sankey_data, palette='Set2_8', measures='Migrants').to_widget(**size)
    if save_path is not None:
        widget = widget.auto_save_png(save_path)
    display(widget)



## Create the diagram
Select your filters and what you want to see, then run the cell below to generate the diagram.


Defaults:
```python
year=2017, # Select year of data from: 1990, 1995, 2000, 2005, 2010, 2015, 2017
colour_from=True, # True for unique colour for each destination. False for unique colour for each origin
origin_cont_filter=[], # Filter origin continents
origin_subcont_filter=[], # Filter origin subcontinents
origin_country_filter=[], # Filter origin countries
dest_cont_filter=[], # Filter destination continents
dest_subcont_filter=[], # Filter destination subcontinents
dest_country_filter=[], #Filter destination countries
origin_cont_show=True, # True to show origin continents. False otherwise.
origin_subcont_show=False, # True to show origin subcontinents. False otherwise.
origin_country_show=False, # True to show origin countries. False otherwise.
dest_cont_show=True, # True to show destination continents. False otherwise.
dest_subcont_show=False, # True to show destination subcontinents. False otherwise.
dest_country_show=False) # True to show destination countries. False otherwise.
```

The following example generates a diagram showing the countries of origin of all immigrants to the United States of America in 2017, grouped by their continent of origin and coloured by origin.

In [3]:
# Immigrants to the USA in 2017: origin countries grouped by continent,
# a single destination country column, flows coloured by origin.
get_diagram(
    year=2017,
    colour_from=False,
    origin_country_show=True,
    dest_cont_show=False,
    dest_country_show=True,
    dest_country_filter=['United States of America'],
)

SankeyWidget(groups=[{'id': 'Country_origin', 'type': 'process', 'title': '', 'nodes': ['Country_origin^Afghan…