In [48]:
import networkx as nx
import pandas as pd
from typing import List, Dict, Tuple
import scipy as sp
import numpy as np
import math
import matplotlib.pyplot as plt

## Data gathering and cleaning

Our data is ...

Source: https://networks.skewed.de/net/fao_trade

### Data gathering

In [49]:
# target folder: where the data will be saved
target = "data/fao_trade"

# set it to True if you want to download the data
# download_data = True
download_data = False


In [50]:
if download_data:

    # data url source
    url = "https://networks.skewed.de/net/fao_trade/files/fao_trade.csv.zip"

    import urllib.request
    import zipfile
    import os

    # Create the data directory
    if not os.path.exists("data"):
        print("Creating data directory...")
        os.mkdir("data")

    # Download the data, unzip it and save it locally 
    print(f"Dowloading data from {url}...")
    f, _ = urllib.request.urlretrieve(url)
    print("Dowload OK!")

    with zipfile.ZipFile(f, 'r') as zip_ref:
        print("Extracting zip file...")
        zip_ref.extractall(target)
        print("Zip extraction OK!")

    print(f"Operation completed! Your CSV data is available at {target}")
     

### Data extraction and cleaning

Our data is made by 3 CSV files, that can be extracted into two `pandas.DataFrame` and one key-value `dict`.

1. `nodes` contains the list of countries and their attributes. Their only relevant attribute found in this dataset is the `name` of the country. The country name will be used as the node label.

2. `edges` contains all the weighted directed edges between countries(`source` and `target`). Each edge is labeled with the `layer` of the trade, that thanks to `gprops.layer_key` could be associated to a specific commodity. The weight of the edge is the value of the trade. 

    Before going on, we replace all the numerical indexes with testual values (both for countries and commodities).

3. `gprops` is a key-value which contains various meta-data, including the mapping between the `layer` and the commodity name.

NOTE: you may observe that our data does not describe a simple graph, but a multi-graph (a pair of nodes may have more than two directed edge between them). This is due to the fact that this dataset describes the trade of more than one commodity. We will see later how to deal with this.

Now we extract the data from CSV files, we clean it and then we will plot the head of the two `pandas.DataFrame` `nodes` and `edges`.

In [51]:
# --------------------------------------------------
# Extract nodes (as dataframe)

nodes = pd.read_csv(target + "/nodes.csv")
nodes.columns = nodes.columns.str.replace(" ", "")
nodes.columns = nodes.columns.str.replace("#", "")
nodes = nodes.set_index("index")

# drop redundant data
nodes = nodes.drop(["nodeLabel", "_pos"], axis="columns",)

# --------------------------------------------------
# Extract edges (as dataframe)

edges = pd.read_csv(target + "/edges.csv")
edges.columns = edges.columns.str.replace(" ", "")
edges.columns = edges.columns.str.replace("#", "")

# --------------------------------------------------
# Extract various graph properties (as dict)

gprops_df = pd.read_csv(target + "/gprops.csv", on_bad_lines='skip')
gprops_df.columns = gprops_df.columns.str.replace(" ", "")
gprops_df.columns = gprops_df.columns.str.replace("#", "")

# make a dict object out of the gprops dataframe
gprops = gprops_df.set_index("prop_name").to_dict()["value"]

# extract the layers keys (as a dict)
layer_keys = {}
for item in gprops["layer_key"] \
    .replace(" ", "").replace("\n", "") \
    .replace("{", "").replace("}", "") \
    .split("',"):

    keyvalue = item.split(":'")
    layer_keys[int(keyvalue[0])] = keyvalue[1]

gprops["layer_key"] = layer_keys

del layer_keys, gprops_df

# --------------------------------------------------
# replace numeric index with textual names

# replace country indexes in edges
edges["source"] = edges["source"].map(nodes["name"])
edges["target"] = edges["target"].map(nodes["name"])

# replace commodity indexes in edges
edges["layer"] = edges["layer"].map(gprops["layer_key"])

# set country name as node index
nodes = nodes.set_index("name")

In [52]:
nodes.head()

Afghanistan
Australia
Austria
Belgium
Brazil


In [53]:
edges.head()

Unnamed: 0,source,target,weight,layer
0,Afghanistan,Australia,6,Food_prep_nes
1,Afghanistan,France,14,Food_prep_nes
2,Afghanistan,Pakistan,1,Food_prep_nes
3,Afghanistan,United_Kingdom,13,Food_prep_nes
4,Afghanistan,United_States_of_America,3,Food_prep_nes


In [54]:
# show that there is more than one layer
edges[
    edges["source"].isin(['Afghanistan']) &
    edges["target"].isin(['Australia'])
]

Unnamed: 0,source,target,weight,layer
0,Afghanistan,Australia,6,Food_prep_nes
18,Afghanistan,Australia,1,"Flour,_wheat"
27,Afghanistan,Australia,8,Crude_materials
47,Afghanistan,Australia,8,"Fruit,_prepared_nes"
141,Afghanistan,Australia,2,"Nuts,_prepared_(exc._groundnuts)"
185,Afghanistan,Australia,3,Sugar_confectionery
211,Afghanistan,Australia,33,Raisins


In [85]:
# number of layers
len(edges["layer"].unique())

364

## Graph building

Here, in this section, we will build the graph from the data extracted in the previous section. We will also propose some solutions to deal the multi-layer problem.

### Deal the multi-layer problem

We propose three approaches to deal with the multi-layer problem:

1. First approach is a naive "filter and merge". If we are interested in analizing how the global trade works for some commodities, we can simply
   1. select only the edges that are related to the commodities we are interested in
   2. aggregate them by summing the weights of the edges between the same pair of nodes

    This way we obtain an unique aggregated directed and weighted graph, where each pair of nodes has at most two directed edges between them (one for each direction).

    If we are interested in analizing how the global trade works for all the commodities, we can simply keep all the edges and aggregate them.

2. Second approach is to build more graphs and then compare them. This approach may be useful if we want to compare differet commodities trades. Comparison may be done:
   1. comparing (normalized) topological and centrality indexes (also consiedering distribution)
   2. comparing different meso-scale structures with correlation indexes
   3. making visual comparison between matrixes plots

    We can also combine this approach with the first one to compare several "subsets" of the global trade.

3. Third approach is more peculiar. As discussed in https://www.nature.com/articles/ncomms7864, in a multi-layer graph, some layers are more similar or dissimilar than others; for this reason, aggregating dissimilar layer may introduce artificial structures that may be misleading. For this reason the article proposes an information theory based method to aggregate several layers withouth loosing too much information. Briefly, the article implement a sort of hierarchical clustering on the layers, where at each step you aggregate layers with similar import-export paths.

    This approach permits to obtain a smaller subset of relevant aggregated layers, that can be used to build a smaller number or comparable graphs.

In this notebook we will implement the first approach on the whole dataset. Then we will select some sample layers to make some comparisons.

### Build layers graphs

We start building a graph foreach layer. Then we also implement a function to merge graphs.

In [89]:
# extract the layers graphs 
def create_layers_graphs(edges: pd.DataFrame) -> Dict[str, nx.DiGraph]:
    """
    Create a dict of all graphs, one for each layer
    """

    layers_m = {}
    for layer in edges["layer"].unique():
        layers_m[layer] = nx.from_pandas_edgelist(
            edges[edges["layer"] == layer],
            source="source",
            target="target",
            edge_attr="weight",
            create_using=nx.DiGraph
        )
    return layers_m

layers_g = create_layers_graphs(edges)

In [98]:
def merge_weighted_graphs(G, H):
    """
    Merge two weighted graphs summing the weights of the edges in common.
    """

    g = nx.compose(G, H)
    edge_data = { e: G.edges[e]['weight'] + H.edges[e]['weight'] for e in G.edges & H.edges }
    nx.set_edge_attributes(g, edge_data, 'weight')
    return g

# merge all layers into a single graph
layers_names = list(layers_g.keys())
all_layers_g = layers_g[layers_names[0]]
for layer in layers_names[1:]:
    all_layers_g = merge_weighted_graphs(all_layers_g, layers_g[layer])


## Analysis on the whole dataset graph

Now we perform an analysis on the whole dataset graph (the one where all the layers are aggregated). 

This analaysis includes:
- an overview of the graph, with some simple topological indexes
- a calculation of some centrality indexes, an analysis of their distribution and a comparison between them
- ...

### Graph overview

In [107]:
n = len(all_layers_g.nodes)
m = len(all_layers_g.edges)

print(f"Nr. of nodes (Size): {n}")
print(f"Nr. of directed edges (Volume): {m}")
print(f"Total weight of directed edges/2: {sum([all_layers_g.edges[e]['weight'] for e in all_layers_g.edges])/2}")
print(f"Density: {round(m/(n*(n-1)), 4)}")


Nr. of nodes (Size): 214
Nr. of directed edges (Volume): 13736
Total weight of directed edges/2: 554747408.5
Density: 0.3013


In [111]:
nx.draw(all_layers_g)

ModuleNotFoundError: No module named 'matplotlib'

### Build a network