# Prepare data for Graph NN

Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from utils import *

Load raw data

In [2]:
df = pd.concat([pd.read_stata(f"{data_paths['atlas']}/hs12_country_country_product_year_4_2012_2016.dta"),
                pd.read_stata(f"{data_paths['atlas']}/hs12_country_country_product_year_4_2017_2021.dta"),
                pd.read_stata(f"{data_paths['atlas']}/hs12_country_country_product_year_4_2022.dta")])
products = pd.read_csv(f"{data_paths['atlas']}/product_hs12.csv", dtype=str)
countries = pd.read_csv(f"{data_paths['atlas']}/location_country.csv", dtype=str)

We get a mapping from 'country_id' to 'node_id' (needed for Pytorch-Geometric)

In [146]:
nodes_index = countries[["country_id"]].reset_index().reset_index(names="node_id")
nodes_index = nodes_index[["node_id", "country_id"]]
nodes_index = {int(x.loc["country_id"]): int(x.loc["node_id"]) for i, x in nodes_index.iterrows()}

Let's use less granularity: Collapse 4-digit products to 2-digit

In [147]:
df.product_id = df.product_id.astype(str)
df = df.merge(products[["product_id", "code"]], how="left", on="product_id")
df.drop("product_id", axis=1, inplace=True)
df.rename(columns={"code": "product_id"}, inplace=True)
df.product_id = df.product_id.str[:2]
df = df.groupby(["country_id", "partner_country_id", "year", "product_id"]).agg({"export_value": "sum", "import_value": "sum", "coi": "max", "eci": "max", "pci": "max"}).reset_index()
df.head(3)

Unnamed: 0,country_id,partner_country_id,year,product_id,export_value,import_value,coi,eci,pci
0,4,8,2017,99,0.0,23355.0,-0.900028,-1.202973,0.106036
1,4,8,2018,99,37007.0,0.0,-0.798094,-1.158797,0.001044
2,4,8,2019,99,0.0,36310.0,-0.818056,-1.081313,0.27967


In [148]:
rca_drop_threshold = 0.2

In [149]:
# Include total volume imported for affecte importers
import_volumes = df[["country_id", "year", "product_id", "partner_country_id", "import_value"]].groupby(["country_id", "product_id", "year"]).agg({"import_value": "sum", "partner_country_id": "nunique"}).reset_index()
import_volumes.rename(columns={"country_id": "importer_id", "partner_country_id": "n_exporters"}, inplace=True)

lost_exporters = compute_lost_exporters(df=df, years=list(range(2012,2023)), import_volumes=import_volumes, drop_threshold=rca_drop_threshold)

Select Graph Layer :: Year and Product

In [242]:
## Select the year-product graph
year = 2022
product_id = "30"

subset = df[(df.year == year) & (df.product_id == product_id) & (df.export_value >= 0)]

## Generate the Graph Data
We need Node IDs, Node Features, Node Labels and Edge list

### Node Features & Node IDs
We start by considering 3 features: "Export Value", "COI" and "ECI"

In [243]:
nodes = subset[["country_id", "export_value", "coi", "eci"]].groupby("country_id").agg({"export_value": "sum", "coi": "max", "eci": "max"})
nodes.head(3)

Unnamed: 0_level_0,export_value,coi,eci
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,73132.0,-0.877697,-1.297693
8,1423616.0,0.079925,-0.373866
10,1512.0,-0.394352,0.962811


#### RCA as a feature

##### Compute RCAs

In [244]:
# Compute all years country_product matrices for period
compute_country_product_matrix_dict = {}

for y in df.year.unique():
    compute_country_product_matrix_dict[y] = compute_country_product_matrix(df[df.year==y], col="export_value")

rca_dict = rca(compute_country_product_matrix_dict)

In [245]:
# Country RCA for a given product
rca_dict[year][[product_id]].head(3)

product_id,30
country_id,Unnamed: 1_level_1
4,0.001059
8,0.01214
10,0.005024


##### Add RCA to node feature

In [246]:
nodes = nodes.merge(rca_dict[year][[product_id]].rename(columns={product_id: "rca"}), left_index=True, right_index=True, how="left")
print(nodes.shape)
nodes.head(3)

(233, 4)


Unnamed: 0_level_0,export_value,coi,eci,rca
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,73132.0,-0.877697,-1.297693,0.001059
8,1423616.0,0.079925,-0.373866,0.01214
10,1512.0,-0.394352,0.962811,0.005024


### Normalize the two not normalized features

In [247]:
nodes[["rca"]].idxmax()

rca    239
dtype: int32

In [248]:
scaler = StandardScaler()
nodes[["export_value"]] = scaler.fit_transform(nodes[["export_value"]])
nodes[["rca"]] = scaler.fit_transform(nodes[["rca"]])
nodes.head(3)

Unnamed: 0_level_0,export_value,coi,eci,rca
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,-0.240071,-0.877697,-1.297693,-0.2769
8,-0.23997,0.079925,-0.373866,-0.271664
10,-0.240076,-0.394352,0.962811,-0.275027


## Edge List

In [249]:
edge_list = subset[["country_id", "partner_country_id"]]#.replace(nodes_index)
edge_list.rename(columns={"country_id": "src", "partner_country_id": "tgt"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge_list.rename(columns={"country_id": "src", "partner_country_id": "tgt"}, inplace=True)


In [250]:
edge_list.head()

Unnamed: 0,src,tgt
856,4,40
1486,4,56
1764,4,76
2069,4,100
2131,4,104


## Nodes Labels
We want three labels:
In year *n*, for product *p*:
- 1 :: 'lost_exporter' -> Those countries that dropped ther importer status;
- 2 :: 'affected_importer' -> Those countries that are deemed affected as a result of a lost exporter;
- 0 :: 'not_affected' -> All other countries.

Compute lost_exporters dataframe

In [251]:
lost_exporters_subset = lost_exporters[(lost_exporters.year == year) & (lost_exporters.product_id == product_id)]
lost_exporters_subset.head(3)

Unnamed: 0,country_id,product_id,rca_drop,prev_year,year,importer_id,prev_year_export_value,export_value,prev_year_importer_volume,n_exporters_x,importer_volume,n_exporters_y,affected
14717,212,30,-0.810729,2021,2022,388,22687.0,5023.0,108498200.0,68,100126000.0,65,True
14718,212,30,-0.810729,2021,2022,499,3461.0,3891.0,139176900.0,56,114176100.0,53,False
14719,212,30,-0.810729,2021,2022,586,529628.0,147438.0,3040795000.0,148,1178769000.0,159,True


In [252]:
# Extract affected importers
affected_importers = lost_exporters_subset[["importer_id", "affected"]].rename(columns={"importer_id": "country_id", "affected": "label"})
affected_importers.loc[:, "label"] = affected_importers.loc[:, "label"].map({True: 1, False: 0}) # Map affected importers of lost exporters with correct label
affected_importers.sort_values(["label", "country_id"], inplace=True)
affected_importers.drop_duplicates(subset="country_id", keep="last", inplace=True) # Keep nodes affected

lost_exporters_unique = lost_exporters_subset.drop_duplicates(subset="country_id", keep="first")
if len(lost_exporters_unique) > 0:
    lost_exporters_unique.loc[:, "lost_exporter"] = 1 # Append a one to identify lost exporters

  affected_importers.loc[:, "label"] = affected_importers.loc[:, "label"].map({True: 1, False: 0}) # Map affected importers of lost exporters with correct label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lost_exporters_unique.loc[:, "lost_exporter"] = 1 # Append a one to identify lost exporters


In [253]:
affected_importers.head(3)

Unnamed: 0,country_id,label
14718,499,0
14717,388,1
14719,586,1


In [254]:
if len(lost_exporters_unique) > 0:
    nodes = nodes.merge(lost_exporters_unique[["country_id", "lost_exporter"]], on=["country_id"], how="left") # Finds lost exportesr
else:
    nodes["lost_exporter"] = 0
nodes = nodes.merge(affected_importers[["country_id", "label"]], on=["country_id"], how="left") # Find affected importers
nodes.fillna(0, inplace=True) # Remaining are not affected
#nodes["label"] = nodes.label_x + nodes.label_y # Unify both labels
#nodes.drop(["label_x", "label_y"], axis=1, inplace=True)
nodes.head(3)

Unnamed: 0,country_id,export_value,coi,eci,rca,lost_exporter,label
0,4,-0.240071,-0.877697,-1.297693,-0.2769,0.0,0.0
1,8,-0.23997,0.079925,-0.373866,-0.271664,0.0,0.0
2,10,-0.240076,-0.394352,0.962811,-0.275027,0.0,0.0


In [255]:
nodes.label.unique() # Make sure there is no overlapping labels (lost_exporter AND affected importer)

array([0., 1.])

In [256]:
#nodes[nodes.label == 3]

## Map 'country_id' to 'node_id'

In [257]:
#nodes.country_id = nodes.country_id.replace(nodes_index)
#nodes.rename(columns={"country_id": "node_id"}, inplace=True)
#nodes.head(3)

We get a mapping from 'country_id' to 'node_id' (needed for Pytorch-Geometric)

In [258]:
nodes.sort_values("country_id", inplace=True)
node_index = nodes.reset_index(names="ix")[["ix", "country_id"]]
node_index

Unnamed: 0,ix,country_id
0,0,4
1,1,8
2,2,10
3,3,12
4,4,16
...,...,...
228,228,876
229,229,882
230,230,887
231,231,894


In [259]:
#nodes_index = countries[["country_id"]].reset_index().reset_index(names="node_id")
#nodes_index = nodes_index[["node_id", "country_id"]]
node_index = {int(x.loc["country_id"]): int(x.loc["ix"]) for i, x in node_index.iterrows()}

In [260]:
edge_list.src = edge_list.src.map(node_index)
edge_list.tgt = edge_list.tgt.map(node_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge_list.src = edge_list.src.map(node_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge_list.tgt = edge_list.tgt.map(node_index)


## Save nodes and edge list

In [None]:
#nodes.to_csv(f"../data/nodes-{year}-{product_id}.csv", index=False)
#edge_list.to_csv(f"../data/edge_list-{year}-{product_id}.csv", index=False)

In [262]:
nodes["country_id"] = nodes.country_id.map(node_index)
nodes.rename(columns={"country_id": "node_id"}, inplace=True)

In [263]:
edge_list

Unnamed: 0,src,tgt
856,0,11
1486,0,17
1764,0,23
2069,0,29
2131,0,30
...,...,...
11323952,232,213
11325280,232,220
11325841,232,221
11326519,232,225


In [264]:
nodes.groupby("label").size()

label
0.0    230
1.0      3
dtype: int64