In [1]:
import pandas as pd
import numpy as np
import math
import json
import os

# When the kernel is started from inside a "notebooks" folder, move one level up
# (presumably so that relative paths such as "data_world/" resolve from the parent folder).
# os.path.basename splits on the separator of the running platform, so the same check
# works on Windows ("\\") and on Linux/macOS ("/") without editing the code.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
print(os.getcwd())

from matplotlib import pyplot as plt
import seaborn as sns

import utils

/home/lola/Documents/Études/EPFL/MA2/Data Visualization/MigrationViz


In [2]:
# Initialization
# Folder (relative to the working directory) that holds the world-level data files.
PATH_WORLD = "data_world/"
# Excel workbook read by the import cell; its name indicates stock by sex, destination and origin.
SEX_DESTINATION_ORIGIN = f"{PATH_WORLD}undesa_pd_2020_ims_stock_by_sex_destination_and_origin.xlsx"

In [3]:
# Import table
# The header sits below 9 skipped rows, the first sheet column becomes the index
# and '..' cells are read as missing values.
sankey_2020 = pd.read_excel(SEX_DESTINATION_ORIGIN, sheet_name="Table 2", index_col=0, skiprows=9, na_values='..')

# Keep only lines of interest (everything from row 22 onwards)
sankey_2020 = sankey_2020[22:]

# Drop columns of no interest (keep only continent and sub-continent):
# positions 1 to 23 are removed, position 0 (the destination label) is kept.
column_index = list(range(1, 24))
columns_to_drop = sankey_2020.columns[column_index]
# Re-assign instead of using inplace=True: the frame above is a slice of the raw table,
# and mutating a slice in place is what triggers pandas' SettingWithCopy warnings.
sankey_2020 = sankey_2020.drop(columns=columns_to_drop)

# Reset index
sankey_2020 = sankey_2020.reset_index(drop=True)

# Rename columns
# NOTE(review): "Unnamed: 3" may already be among the dropped columns, in which case
# that entry is a harmless no-op -- confirm against the workbook.
sankey_2020 = sankey_2020.rename(columns={"Region, development group of destination": "destination_region",
                                          "Unnamed: 3": "location_code"})

# Remove the "*" markers and the ".1" suffix from column names.
# The pattern is an alternation, not a character class: r'[*\.1]' would delete every
# '*', '.' and '1' character anywhere in a name instead of only the ".1" suffix.
sankey_2020.columns = sankey_2020.columns.str.replace(r'\*|\.1$', '', regex=True)
# regex=False makes "*" a literal on every pandas version (as a regex it is invalid,
# and the default value of `regex` changed in pandas 2.0).
sankey_2020["destination_region"] = sankey_2020["destination_region"].str.replace("*", "", regex=False)

# Remove leading/trailing whitespace from column names and from the region labels
sankey_2020.columns = sankey_2020.columns.str.strip()
sankey_2020["destination_region"] = sankey_2020["destination_region"].str.strip()

In [11]:
# NORTHERN AMERICA is both a continent and a sub-region: it must be present in both
# the continent list and the sub_continent list.
# Because the numbers for the sub-continents of Oceania other than Australia & New Zealand
# are incomplete, OCEANIA is used both as a sub_continent and as a continent.
# OTHER, representing 12 million people world-wide, needs to be taken into account everywhere.
# TODO Make sure it does not cause problems later number wise

continent = ["AFRICA", "ASIA", "EUROPE", "LATIN AMERICA AND THE CARIBBEAN", "NORTHERN AMERICA", "OCEANIA", "OTHER"]
common_columns = ["destination_region"]
continent_subregion = ["NORTHERN AMERICA", "OCEANIA", "OTHER"]
# Sub-regions left out of the sub_continent list
excluded_subregions = ["Micronesia", "Polynesia"]

# Every column that is neither a continent nor a common (label) column is a sub-continent
sub_continent = [x for x in sankey_2020.columns if x not in continent and x not in common_columns]
print(sub_continent)

# Drop the excluded sub-regions, then append the continents that double as sub-continents.
# Filtering + concatenation does not depend on the length or order of the column list
# (hard-coded insert positions would) and does not raise if an excluded label is absent.
sub_continent = [x for x in sub_continent if x not in excluded_subregions] + continent_subregion

print(sub_continent)

['Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Central Asia', 'Eastern Asia', 'South-Eastern Asia', 'Southern Asia', 'Western Asia', 'Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe', 'Caribbean', 'Central America', 'South America', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia']
['Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Central Asia', 'Eastern Asia', 'South-Eastern Asia', 'Southern Asia', 'Western Asia', 'Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe', 'Caribbean', 'Central America', 'South America', 'Australia and New Zealand', 'Melanesia', 'NORTHERN AMERICA', 'OCEANIA', 'OTHER']


In [5]:
# First try: continent-level flows only.
# Keep the label column plus the continent columns, then only the rows whose
# destination is itself one of the continents.
sankey_2020_continent = sankey_2020[common_columns + continent]

filt_continent = sankey_2020_continent["destination_region"].isin(continent)
sankey_2020_continent = sankey_2020_continent.loc[filt_continent]
display(sankey_2020_continent)

# Long format: one row per (destination_region, source) pair with its value
sankey_2020_continent_melted = pd.melt(
    sankey_2020_continent,
    id_vars='destination_region',
    var_name='source',
    value_name='value',
)

# Node names are the sorted union of the destination labels and the source labels
destination_names = set(sankey_2020_continent_melted['destination_region'])
source_names = set(sankey_2020_continent_melted['source'])
nodes = sorted(destination_names | source_names)

# Name -> position lookup, following the sorted order
nodes_indices = dict(zip(nodes, range(len(nodes))))

# List of nodes
nodes_list = [{'node': position, 'name': label} for label, position in nodes_indices.items()]

# List of links: each melted row becomes one link going from the column it came from
# ('source') to the row label ('destination_region'); both ends share the same node
# numbering, so a region can link to itself.
links_list = [
    {
        'source': nodes_indices[record['source']],
        'target': nodes_indices[record['destination_region']],
        'value': record['value'],
    }
    for _, record in sankey_2020_continent_melted.iterrows()
]

sankey_2020_continent_json = {'nodes': nodes_list, 'links': links_list}
print(sankey_2020_continent_json)

Unnamed: 0,destination_region,AFRICA,ASIA,EUROPE,LATIN AMERICA AND THE CARIBBEAN,NORTHERN AMERICA,OCEANIA,OTHER
0,AFRICA,20917565,1207631,648455,32524,53563,14483,2515243
6,ASIA,4720103,68497762,7169630,414658,538199,101725,4176425
12,EUROPE,11024274,23203976,44246425,5395924,1100304,397036,1338129
17,LATIN AMERICA AND THE CARIBBEAN,48791,402369,1355886,11297173,1293053,5630,391721
21,NORTHERN AMERICA,3268757,17549235,6869872,25535633,1088520,343625,4053153
22,OCEANIA,587673,4050511,2983395,214569,254319,1107706,182480


{'nodes': [{'node': 0, 'name': 'AFRICA'}, {'node': 1, 'name': 'ASIA'}, {'node': 2, 'name': 'EUROPE'}, {'node': 3, 'name': 'LATIN AMERICA AND THE CARIBBEAN'}, {'node': 4, 'name': 'NORTHERN AMERICA'}, {'node': 5, 'name': 'OCEANIA'}, {'node': 6, 'name': 'OTHER'}], 'links': [{'source': 0, 'target': 0, 'value': 20917565}, {'source': 0, 'target': 1, 'value': 4720103}, {'source': 0, 'target': 2, 'value': 11024274}, {'source': 0, 'target': 3, 'value': 48791}, {'source': 0, 'target': 4, 'value': 3268757}, {'source': 0, 'target': 5, 'value': 587673}, {'source': 1, 'target': 0, 'value': 1207631}, {'source': 1, 'target': 1, 'value': 68497762}, {'source': 1, 'target': 2, 'value': 23203976}, {'source': 1, 'target': 3, 'value': 402369}, {'source': 1, 'target': 4, 'value': 17549235}, {'source': 1, 'target': 5, 'value': 4050511}, {'source': 2, 'target': 0, 'value': 648455}, {'source': 2, 'target': 1, 'value': 7169630}, {'source': 2, 'target': 2, 'value': 44246425}, {'source': 2, 'target': 3, 'value': 1

In [6]:
# Write the continent-level nodes/links dictionary to a JSON file
json_path = './data_world/sankey_2020_continent.json'
with open(json_path, mode='w') as json_file:
    json.dump(sankey_2020_continent_json, json_file)

print('JSON file saved successfully')

JSON file saved successfully
