In [None]:
import pandas as pd
import numpy as np
from node2vec import Node2Vec
#from torch_geometric.nn import Node2Vec
from collections import defaultdict
from itertools import combinations
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from utils import *


## Country borders (land)

In [None]:
# Build the land-border graph G from the borders text file. parse_country_neighbors is not
# defined in this notebook (presumably star-imported from utils -- TODO confirm); later cells
# use G through the networkx API (nx.connected_components, has_edge / add_edge).
G = parse_country_neighbors("../data/4. Additional Data Sources/4.4 Country Borders.txt")

In [None]:
# Print every connected component of G (one set of nodes per line).
for component in nx.connected_components(G):
    print(component)

## Country borders (maritime)

In [None]:
# Load pairwise sea distances, keeping only the two country codes and the distance.
maritime = pd.read_excel("../data/4. Additional Data Sources/4.5 CERDI-seadistance.xlsx")[["iso1", "iso2", "seadistance"]]
# Keep strictly positive distances. The explicit .copy() makes `maritime` an independent
# frame, so adding the "weights" column in the next cell does not raise
# SettingWithCopyWarning (the boolean filter alone returns a slice of the loaded frame).
maritime = maritime[maritime.seadistance > 0].copy()

In [None]:
# Convert each sea distance into an edge weight with maritime_distance_to_weight (defined
# outside this notebook, presumably in utils -- TODO confirm) and store the result in a
# "weights" column.
maritime.loc[:, "weights"] = maritime_distance_to_weight(maritime.loc[:, "seadistance"])

In [None]:
# Preview the first rows of the maritime table.
maritime.head()

In [None]:
# Summary statistics of the maritime table (useful to sanity-check the derived weights).
maritime.describe()

In [None]:
# Fold the maritime links into G, handling each unordered country pair only once.
already_added = set()
for iso_a, iso_b, link_weight in zip(maritime["iso1"], maritime["iso2"], maritime["weights"]):
    # Canonical (smaller, larger) key so that A-B and B-A are recognised as the same link
    edge = tuple(sorted((iso_a, iso_b)))
    if edge in already_added:
        # Pair already processed from an earlier row
        continue
    if G.has_edge(iso_a, iso_b):
        # Already connected by a land border: the maritime weight is added on top
        G[iso_a][iso_b]["weight"] += link_weight
    else:
        # No land border between the two: create a purely maritime edge
        G.add_edge(iso_a, iso_b, weight=link_weight)
    already_added.add(edge)

In [None]:
# Print every connected component of G again, now that maritime edges have been added.
for component in nx.connected_components(G):
    print(component)

In [None]:
# Pick the connected component with the most nodes, then compute the diameter of the
# subgraph it induces (the diameter is computed on that component only, not on all of G).
largest_cc = max(nx.connected_components(G), key=len)
nx.diameter(G.subgraph(largest_cc).copy())

In [None]:
# Node2Vec: set up walks over G (walk_length=10, num_walks=200, p = q = 1, edge attribute
# 'weight' as the weight key) and fit 8-dimensional embeddings with a context window of 4.
# NOTE(review): no random seed is passed anywhere in this cell, so the embeddings are not
# expected to be identical between runs.
node2vec = Node2Vec(G, dimensions=8, walk_length=10, num_walks=200, p=1, q=1, weight_key='weight', workers=4)
model = node2vec.fit(window=4, min_count=1, batch_words=4)

In [None]:
# Sanity-check the embeddings. A notebook cell only displays its last expression, so the
# 'ARG' lookup below is evaluated but produces no visible output; only the neighbours of
# 'COD' are shown.
model.wv['ARG']  # get embedding for a country
model.wv.most_similar('COD', topn=20)  # find nearest countries

In [None]:
# Save embeddings in word2vec format to 'geo-embeddings.vec'; the path is relative, so the
# file lands in the notebook's working directory.
model.wv.save_word2vec_format('geo-embeddings.vec')  # save

## Visualization

Note on UMAP Reproducibility <br>
(https://umap-learn.readthedocs.io/en/latest/reproducibility.html) <br>
UMAP is a stochastic algorithm – it makes use of randomness both to speed up approximation steps, and to aid in solving hard optimization problems. This means that different runs of UMAP can produce different results. UMAP is relatively stable – thus the variance between runs should ideally be relatively small – but different runs may have variations nonetheless. To ensure that results can be reproduced exactly, UMAP allows the user to set a random seed state. <br>

Since version 0.4 UMAP also supports multi-threading for faster performance; when performing optimization this exploits the fact that race conditions between the threads are acceptable within certain optimization phases. Unfortunately this means that the randomness in UMAP outputs for the multi-threaded case depends not only on the random seed input, but also on race conditions between threads during optimization, over which no control can be had. This means that multi-threaded UMAP results cannot be explicitly reproduced.

In [None]:
# Get list of country names and their vectors.
# gensim >= 4.0 removed `wv.vocab` in favour of `wv.index_to_key`; older releases only
# expose `wv.vocab`, so try the new attribute first and fall back to the old one.
try:
    countries = list(model.wv.index_to_key)
except AttributeError:
    countries = list(model.wv.vocab.keys())
vectors = np.array([model.wv[c] for c in countries])

# Project the embeddings to 2-D. A fixed random_state makes the layout reproducible
# between runs; per the UMAP reproducibility notes this comes at the cost of
# multi-threaded optimisation.
reducer = umap.UMAP(n_neighbors=5, min_dist=0.9, random_state=42)
vectors_2d = reducer.fit_transform(vectors)

In [None]:
# Scatter plot of the 2-D projection with one text label per country.
fig, ax = plt.subplots(figsize=(14, 10), dpi=300)

x_coords = vectors_2d[:, 0]
y_coords = vectors_2d[:, 1]
sns.scatterplot(x=x_coords, y=y_coords, alpha=0.6, ax=ax)

# One text artist per country, anchored at its projected position
texts = [
    ax.text(x_coords[idx], y_coords[idx], label, fontsize=8)
    for idx, label in enumerate(countries)
]

ax.set_xlabel("Dimension 1", fontsize=14, fontweight='bold')
ax.set_ylabel("Dimension 2", fontsize=14, fontweight='bold')
ax.grid(True)
fig.tight_layout()

# Let adjustText push overlapping labels apart; thin gray arrows tie each moved label
# back to its point
label_arrows = dict(arrowstyle='->', color='gray', lw=0.5)
adjust_text(texts, arrowprops=label_arrows, force_text=(3, 3));
plt.show()


### Trading agreements

In [None]:
# Load the raw trading-agreements table from Excel.
trading_agreements = pd.read_excel("../data/4. Additional Data Sources/4.1. Trading agreements.xlsx")

In [None]:
# Inspect the distinct values of the "In-force Status" column before filtering on it.
trading_agreements["In-force Status"].unique()

In [None]:
# Keep only the agreements whose status is exactly 'in force'. The .copy() detaches the
# result from the unfiltered frame, so the date-column assignments in the following cell
# do not raise SettingWithCopyWarning.
trading_agreements = trading_agreements[trading_agreements["In-force Status"] == 'in force'].copy()

In [None]:
# Parse both date columns from text such as "01 Jan 2012" (day, abbreviated month, year)
# into datetimes.
for date_col in ("Date of Signature", "Date of Entry into Force"):
    trading_agreements[date_col] = pd.to_datetime(trading_agreements[date_col], format="%d %b %Y")

In [None]:
# Show the agreements that entered into force after 1 January 2012 (display only; the
# result is not assigned, so `trading_agreements` is unchanged).
trading_agreements[trading_agreements["Date of Entry into Force"] > pd.to_datetime("2012-01-01")]

In [None]:
# Preview the Membership column split on ";" (display only; nothing is stored).
trading_agreements.Membership.str.split(";")

In [None]:
# Country lookup table, read with latin1 encoding; later cells use its "country" and
# "country_id" columns to translate member names into ids.
countries_map = pd.read_csv("../data/2. Atlas/countries.csv", encoding="latin1")

In [None]:
# Number of agreements shared by each unordered pair of members, keyed by a sorted
# (name_1, name_2) tuple.
edge_weights = defaultdict(int)

# Iterate over each agreement. dropna() skips rows without a membership list, which would
# otherwise fail on `.split` because NaN is a float.
for membership in trading_agreements.Membership.dropna():
    # Strip whitespace, drop empty tokens (e.g. from a trailing ';') and de-duplicate, so
    # that one agreement contributes at most 1 to any pair and never creates a self-pair.
    # A dedicated local name is used so the notebook-level `countries` list (used by the
    # embedding plot) is not overwritten.
    members = sorted({name.strip() for name in membership.split(';')} - {""})
    # Add edges for each pair (undirected); sorting gives every pair one canonical order
    for c1, c2 in combinations(members, 2):
        edge_weights[(c1, c2)] += 1

In [None]:
# Display the accumulated (member, member) -> shared-agreement-count mapping.
# NOTE(review): this renders the entire dict; consider showing only a sample if it is large.
edge_weights

In [None]:
# Turn the pair -> count mapping into an edge list with columns weight, src, tgt.
# Building the rows directly avoids the slow `apply(pd.Series)` tuple expansion and also
# works when `edge_weights` is empty (the expansion approach fails on an empty frame).
trading_agreements = pd.DataFrame(
    [(weight, src, tgt) for (src, tgt), weight in edge_weights.items()],
    columns=["weight", "src", "tgt"],
)

Make the edge list symmetric: add every edge in both directions (src → tgt and tgt → src).

In [None]:
# Reversed copy of the edge list: same rows with the src and tgt labels swapped.
trading_agreements_ = trading_agreements.rename(columns={"src": "tgt", "tgt": "src"})

In [None]:
# Stack the original and the reversed edge lists (columns are aligned by name), so each
# pair appears once per direction; the index is renumbered from 0.
trading_agreements = pd.concat([trading_agreements, trading_agreements_], axis=0, ignore_index=True)

In [None]:
# Replace the source country name with its country_id from the lookup table. This is a
# left join, so names without a match keep their row and get a missing id.
trading_agreements = (
    trading_agreements
    .merge(countries_map[["country", "country_id"]], how="left", left_on="src", right_on="country")
    .drop(columns=["country", "src"])
    .rename(columns={"country_id": "src"})
)

In [None]:
# Replace the target country name with its country_id from the lookup table. This is a
# left join, so names without a match keep their row and get a missing id.
trading_agreements = (
    trading_agreements
    .merge(countries_map[["country", "country_id"]], how="left", left_on="tgt", right_on="country")
    .drop(columns=["country", "tgt"])
    .rename(columns={"country_id": "tgt"})
)

In [None]:
# Preview the id-based edge list.
trading_agreements.head()

In [None]:
# Write the edge list to CSV; index=False leaves the row index out of the file.
trading_agreements.to_csv("../data/trading_agreements_edges.csv", index=False)