In [114]:
import pandas as pd
import numpy as np
import math
import json
import os

# Make the project root the working directory: when the kernel was started
# from inside a "notebooks" folder, step up one level.
# os.path.basename() understands the platform's own path separator, so the
# check behaves the same under Windows ("\\") and Linux ("/").
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print(os.getcwd())

from matplotlib import pyplot as plt
import seaborn as sns

import utils

/home/lola/Documents/Études/EPFL/MA2/Data Visualization/MigrationViz


In [2]:
# Initialization: folder holding the world-level data, and the workbook
# (stock by sex, destination and origin) that is read from it.
PATH_WORLD = "data_world/"
SEX_DESTINATION_ORIGIN = f"{PATH_WORLD}undesa_pd_2020_ims_stock_by_sex_destination_and_origin.xlsx"

In [92]:
# Import table: sheet "Table 2" of the workbook, skipping the first 9 rows
# of the sheet; cells containing '..' are read as missing values.
sankey_2020_raw = pd.read_excel(SEX_DESTINATION_ORIGIN, sheet_name="Table 2", index_col=0, skiprows=9, na_values='..')

# Columns of no interest, by position (keep only continent and sub-continent)
column_index = list(range(1, 24))
columns_to_drop = sankey_2020_raw.columns[column_index]

# Built as one chain from the raw frame (no in-place mutation of a slice),
# so re-running the cell always gives the same result.
sankey_2020 = (
    sankey_2020_raw
    .iloc[22:]                      # keep only lines of interest
    .drop(columns=columns_to_drop)  # drop columns of no interest
    .reset_index(drop=True)
    # Keys that are absent after the drop are simply ignored by rename().
    .rename(columns={"Region, development group of destination": "region",
                     "Unnamed: 3": "location_code"})
)

# Remove the "*" markers and the trailing ".<n>" suffix of duplicated column
# names, then strip surrounding whitespace. The pattern is anchored on purpose:
# a character class such as [*\.1] would also delete every ".", and every "1",
# found anywhere inside a name.
sankey_2020.columns = (
    sankey_2020.columns
    .str.replace(r'\*|\.\d+$', '', regex=True)
    .str.strip()
)

# Same cleanup for the row labels; "*" is a literal here, hence regex=False
# (as a regular expression a lone "*" is not a valid pattern).
sankey_2020["region"] = (
    sankey_2020["region"]
    .str.replace("*", "", regex=False)
    .str.strip()
)

Unnamed: 0,"Region, development group of destination",Unnamed: 2,Unnamed: 3,WORLD,Sub-Saharan Africa,Northern Africa and Western Asia,Central and Southern Asia,Eastern and South-Eastern Asia,Latin America and the Caribbean,Oceania (excluding Australia and New Zealand),...,Caribbean,Central America,South America,NORTHERN AMERICA,OCEANIA,Australia and New Zealand.1,Melanesia,Micronesia,Polynesia*,OTHER
22,AFRICA,,903,25389464,19378341,2497202,147995,101658,32524,2068,...,13714,2089,16721,53563,14483,12415,146,82,1840,2515243
23,Eastern Africa,,910,7682801,6323588,728695,52982,9142,57,0,...,57,0,0,3935,373,373,0,0,0,421867
24,Middle Africa,,911,3861568,2556505,404972,0,1641,531,0,...,531,0,0,2938,10,10,0,0,0,832165
25,Northern Africa,,912,3167926,1569747,1258042,19305,50264,1845,0,...,29,1038,778,23989,1361,1361,0,0,0,92781
26,Southern Africa,,913,3125072,2035175,28309,70151,29086,13774,197,...,3442,647,9685,17196,10823,10626,121,53,23,665583
27,Western Africa,,914,7552097,6893326,77184,5557,11525,16317,1871,...,9655,404,6258,5505,1916,45,25,29,1817,502847
28,ASIA,,935,85618502,764696,18246980,32579976,21626213,414658,1948,...,501,5863,408294,538199,101725,99777,842,1106,0,4176425
29,Central Asia,,5500,5564042,41,167108,501215,103230,20,0,...,20,0,0,320,107,107,0,0,0,357511
30,Eastern Asia,,906,8975729,9420,2195,317882,7382944,331677,0,...,0,2735,328942,215678,40224,40224,0,0,0,549438
31,South-Eastern Asia,,920,10615377,5442,26811,1733636,7984983,1761,1168,...,145,227,1389,90225,45076,43908,62,1106,0,641093


In [93]:
# Column groups used to build the sankey tables.
# - Northern America is both a continent and a sub-region: it must be present
#   in the continent list and in the sub_continent list.
# - Figures for the sub-regions of Oceania other than Australia & New Zealand
#   are incomplete, so Oceania is used both as a continent and a sub_continent.
# - OTHER, representing 12 million people world-wide, needs to be taken into
#   account everywhere.
# TODO: make sure this does not cause problems with the numbers later on.
continent = ["AFRICA", "ASIA", "EUROPE", "LATIN AMERICA AND THE CARIBBEAN", "NORTHERN AMERICA", "OCEANIA", "OTHER"]
common_columns = ["region"]
continent_subregion = ["NORTHERN AMERICA", "OCEANIA", "OTHER"]

# Every remaining column, i.e. neither a continent nor a common column
sub_continent = [
    col for col in sankey_2020.columns
    if not (col in continent or col in common_columns)
]

# Add the entries that double as sub-continents:
# NORTHERN AMERICA and OCEANIA go to positions 19 and 20 ...
sub_continent[19:19] = continent_subregion[:2]
# ... and OTHER goes at the very end.
sub_continent.append(continent_subregion[-1])

In [113]:
# First try: continent level only.
# Keep the "region" column plus one column per continent, restricted to the
# rows whose region is itself one of the continents.
filt_continent = sankey_2020["region"].isin(continent)
sankey_2020_continent = sankey_2020.loc[filt_continent, common_columns + continent]
display(sankey_2020_continent)

# Long format: one (region, target, value) row per cell of the table
sankey_2020_continent_melted = sankey_2020_continent.melt(
    id_vars='region',
    var_name='target',
    value_name='value',
)

# Node names are the union of row labels and column labels, in sorted order;
# a node's index is its position in that sorted list.
nodes = sorted(
    set(sankey_2020_continent_melted['region']) | set(sankey_2020_continent_melted['target'])
)
nodes_indices = {name: position for position, name in enumerate(nodes)}

# List of nodes
nodes_list = [{'node': nodes_indices[name], 'name': name} for name in nodes]

# List of links, one per melted row.
# NOTE(review): 'source' is looked up from the "region" column, which was
# renamed from "Region, development group of destination" - confirm that this
# is the flow direction the visualisation expects.
links_list = [
    {'source': nodes_indices[region], 'target': nodes_indices[target], 'value': value}
    for region, target, value in zip(
        sankey_2020_continent_melted['region'].tolist(),
        sankey_2020_continent_melted['target'].tolist(),
        sankey_2020_continent_melted['value'].tolist(),
    )
]

sankey_2020_continent_json = {'nodes': nodes_list, 'links': links_list}
print(sankey_2020_continent_json)

Unnamed: 0,region,AFRICA,ASIA,EUROPE,LATIN AMERICA AND THE CARIBBEAN,NORTHERN AMERICA,OCEANIA,OTHER
0,AFRICA,20917565,1207631,648455,32524,53563,14483,2515243
6,ASIA,4720103,68497762,7169630,414658,538199,101725,4176425
12,EUROPE,11024274,23203976,44246425,5395924,1100304,397036,1338129
17,LATIN AMERICA AND THE CARIBBEAN,48791,402369,1355886,11297173,1293053,5630,391721
21,NORTHERN AMERICA,3268757,17549235,6869872,25535633,1088520,343625,4053153
22,OCEANIA,587673,4050511,2983395,214569,254319,1107706,182480


{'nodes': [{'node': 0, 'name': 'AFRICA'}, {'node': 1, 'name': 'ASIA'}, {'node': 2, 'name': 'EUROPE'}, {'node': 3, 'name': 'LATIN AMERICA AND THE CARIBBEAN'}, {'node': 4, 'name': 'NORTHERN AMERICA'}, {'node': 5, 'name': 'OCEANIA'}, {'node': 6, 'name': 'OTHER'}], 'links': [{'source': 0, 'target': 0, 'value': 20917565}, {'source': 1, 'target': 0, 'value': 4720103}, {'source': 2, 'target': 0, 'value': 11024274}, {'source': 3, 'target': 0, 'value': 48791}, {'source': 4, 'target': 0, 'value': 3268757}, {'source': 5, 'target': 0, 'value': 587673}, {'source': 0, 'target': 1, 'value': 1207631}, {'source': 1, 'target': 1, 'value': 68497762}, {'source': 2, 'target': 1, 'value': 23203976}, {'source': 3, 'target': 1, 'value': 402369}, {'source': 4, 'target': 1, 'value': 17549235}, {'source': 5, 'target': 1, 'value': 4050511}, {'source': 0, 'target': 2, 'value': 648455}, {'source': 1, 'target': 2, 'value': 7169630}, {'source': 2, 'target': 2, 'value': 44246425}, {'source': 3, 'target': 2, 'value': 1

In [118]:
# Export the json data to a document, next to the input data.
# The folder comes from PATH_WORLD so the output location cannot drift away
# from the one the workbook is read from.
with open(PATH_WORLD + 'sankey_2020_continent.json', 'w') as f:
    json.dump(sankey_2020_continent_json, f)

print('JSON file saved successfully')

JSON file saved successfully
