In [1]:
import pandas as pd
import numpy as np

## Load region-product export data

In [2]:
## load the trade panel; the product list is taken from the proximity file so that
## only the 771 products of the original product space paper are kept
trade0 = pd.read_stata(
    "data/country_sitcproduct4digit_year.dta",
    columns=["year", "location_code", "sitc_product_code", "export_value"],
)
trade0 = trade0.rename(
    columns={
        "location_code": "region",
        "sitc_product_code": "prod",
        "export_value": "export",
    }
)
## unique locations, sorted
regions = sorted(trade0["region"].unique().tolist())
## unique products: every code that appears on either side of a proximity pair
proxdf = pd.read_csv("data/sitc2_proximities.csv", dtype="str")
proxdf["proximity"] = proxdf["proximity"].astype(float)
prods = sorted(set(proxdf["commoditycode_1"]).union(proxdf["commoditycode_2"]))
trade0.head()

Unnamed: 0,year,region,prod,export
0,1988,ABW,11,0.0
1,1989,ABW,11,0.0
2,1990,ABW,11,0.0
3,1991,ABW,11,0.0
4,1992,ABW,11,0.0


In [3]:
## build the full year x region x product grid and attach the observed exports;
## combinations that are absent from the raw data get an export value of 0
years = list(range(1962, 2019))
index = pd.MultiIndex.from_product(
    [years, regions, prods], names=["year", "region", "prod"]
)
trade = (
    index.to_frame(index=False)
    .merge(trade0, how="left", on=["year", "region", "prod"])
    .fillna(0)
)
trade.head()

Unnamed: 0,year,region,prod,export
0,1962,ABW,11,0.0
1,1962,ABW,12,0.0
2,1962,ABW,13,0.0
3,1962,ABW,14,0.0
4,1962,ABW,15,0.0


In [4]:
## collapse to the 3-digit SITC level: keep the first three characters of each
## product code and sum exports within (year, region, 3-digit code)
trade = (
    trade.assign(prod=trade["prod"].str[:3])
    .groupby(["year", "region", "prod"])["export"]
    .sum()
    .reset_index()
)
trade.head()

Unnamed: 0,year,region,prod,export
0,1962,ABW,1,0.0
1,1962,ABW,11,0.0
2,1962,ABW,12,0.0
3,1962,ABW,14,0.0
4,1962,ABW,22,0.0


In [5]:
## revealed comparative advantage (RCA) and its binarized version
## rca = (export / regionsum) / (prodsum / yearsum), evaluated within each year
trade["regionsum"] = trade.groupby(["year", "region"])["export"].transform("sum")
trade["prodsum"] = trade.groupby(["year", "prod"])["export"].transform("sum")
trade["yearsum"] = trade.groupby(["year"])["export"].transform("sum")
trade["rca"] = (
    trade["export"] * trade["yearsum"] / trade["regionsum"] / trade["prodsum"]
)
## zero totals produce nan or +/-inf; all such entries are set to 0
trade["rca"] = np.where(np.isfinite(trade["rca"]), trade["rca"], 0)
## binrca is 1 only where rca is strictly greater than 1
trade["binrca"] = (trade["rca"] > 1).astype(int)

In [6]:
## pick the year from which the proximity / product space is derived (3-digit level)
## and reshape that year's slice into region x product matrices
proxyear = 1962
mcpdf = (
    trade.loc[trade["year"] == proxyear]
    .pivot(index="region", columns="prod", values="binrca")
    .fillna(0)
)
mcpmat = mcpdf.to_numpy()
rcamat = (
    trade.loc[trade["year"] == proxyear]
    .pivot(index="region", columns="prod", values="rca")
    .fillna(0)
    .to_numpy()
)
mcpmat.shape, rcamat.shape

((249, 235), (249, 235))

In [7]:
## drop regions (rows) that have no product with binrca == 1 in the chosen year
ubiquity, diversity = mcpmat.sum(axis=0), mcpmat.sum(axis=1)
mcpmat = mcpmat[diversity > 0]
mcpmat.shape

(150, 235)

In [8]:
## same filter on the continuous matrix: drop regions (rows) whose rca values sum to 0
## note: ubiquity / diversity here are sums of rca values, not counts
ubiquity, diversity = rcamat.sum(axis=0), rcamat.sum(axis=1)
rcamat = rcamat[diversity > 0]
rcamat.shape

(150, 235)

In [9]:
## calculate normalized proximity phi-M (A1), phi-P (A2), phi-C (A3)
## all three are product x product matrices built from the filtered region x product
## matrices mcpmat (binary) and rcamat (continuous)
## NOTE(review): a product with ubiquity 0 (or a zero row/column sum below) makes the
## reciprocals infinite -- confirm this cannot happen for the chosen proxyear
# column sums of mcpmat: number of regions with binrca == 1 for each product
ubiquity = mcpmat.sum(axis=0)
# row sums of mcpmat: number of products with binrca == 1 for each region
diversity = mcpmat.sum(axis=1)
# co-occurrence counts: A[i, j] = number of regions having both product i and product j
A = mcpmat.T @ mcpmat
# divide column j by ubiquity[j]
A = A / ubiquity[np.newaxis, :]
# symmetrize by taking the element-wise minimum of A and its transpose
A = np.minimum(A, A.T)
# remove self-proximity
np.fill_diagonal(A, 0)
# A is symmetric here, so its column sums equal its row sums
den = A.sum(axis=0)
# phi-M: scale row i of A by 1/den[i], so every row sums to 1
A1 = np.diag(1/den) @ A
# phi-P: product -> region -> product chain, each step divided by ubiquity / diversity
A2 = np.diag(1/ubiquity) @ mcpmat.T @ np.diag(1/diversity) @ mcpmat
# phi-C: correlations between the product columns of rcamat, mapped from [-1, 1] to [0, 1]
A3 = (1 + np.corrcoef(rcamat.T)) / 2
# den is reused: now the column sums of the (symmetric) rescaled correlation matrix
den = A3.sum(axis=0)
# scale row i of A3 by 1/den[i], so every row sums to 1
A3 = np.diag(1/den) @ A3

In [10]:
## calculate eigenvectors of different matrices
def _sorted_eig(mat):
    """Eigen-decompose `mat` and return the eigenpairs in descending eigenvalue order.

    np.linalg.eig gives no guarantee about the order of its output, while the
    cells below take column 0 as the leading eigenvector and column 1 as the
    second one. Sorting here makes that positional access valid and also makes
    the decompositions of a matrix and of its transpose line up.

    Returns (eigenvalues, eigenvectors): eigenvalues sorted descending (numpy
    orders complex values by real part, then imaginary part) and the real part
    of the matching eigenvectors as columns.
    """
    vals, vecs = np.linalg.eig(mat)
    order = np.argsort(-vals, kind="stable")
    return vals[order], np.real(vecs[:, order])


## right eigenvectors come from the matrix itself, left eigenvectors (suffix _l)
## from its transpose
eigvals1, eigvecs1 = _sorted_eig(A1)
eigvals1_l, eigvecs1_l = _sorted_eig(A1.T)

eigvals2, eigvecs2 = _sorted_eig(A2)
eigvals2_l, eigvecs2_l = _sorted_eig(A2.T)

eigvals3, eigvecs3 = _sorted_eig(A3)
eigvals3_l, eigvecs3_l = _sorted_eig(A3.T)

In [11]:
## get primary pi vector and normalize
## np.linalg.eig does not order its output, so the leading left eigenvector is
## located through its eigenvalue (largest real part) rather than assumed to sit
## in column 0; dividing by the sum makes the entries add up to 1 and removes the
## arbitrary sign of the eigenvector
pivec1 = eigvecs1_l[:, np.argmax(np.real(eigvals1_l))]
pivec1 = pivec1 / pivec1.sum()
pivec2 = eigvecs2_l[:, np.argmax(np.real(eigvals2_l))]
pivec2 = pivec2 / pivec2.sum()
pivec3 = eigvecs3_l[:, np.argmax(np.real(eigvals3_l))]
pivec3 = pivec3 / pivec3.sum()

In [12]:
## fix the arbitrary sign of each 2nd eigenvector (column 1):
## phi-P is oriented by diversity, every other vector is oriented by phi-P
def _corr_sign(a, b):
    """Sign of the Pearson correlation between vectors `a` and `b`."""
    return np.sign(np.real(np.corrcoef(a, b)[0, 1]))


## kp is a view into eigvecs2, so the in-place flip below is visible through kp too
kp = eigvecs2[:, 1]
kc = mcpmat @ kp
eigvecs2[:, 1] *= _corr_sign(diversity, kc)

## right eigenvectors follow the (already oriented) right phi-P vector
eigvecs1[:, 1] *= _corr_sign(eigvecs1[:, 1], eigvecs2[:, 1])
eigvecs3[:, 1] *= _corr_sign(eigvecs3[:, 1], eigvecs2[:, 1])

## left phi-P vector: positive inner product with its right counterpart
eigvecs2_l[:, 1] *= np.sign(eigvecs2_l[:, 1].dot(eigvecs2[:, 1]))

## remaining left eigenvectors follow the left phi-P vector
eigvecs1_l[:, 1] *= _corr_sign(eigvecs1_l[:, 1], eigvecs2_l[:, 1])
eigvecs3_l[:, 1] *= _corr_sign(eigvecs3_l[:, 1], eigvecs2_l[:, 1])

In [13]:
## normalize the vectors as illustrated in appendix
def _scale_columns(vecs, weights):
    """Divide every column v of `vecs` by sqrt(v' diag(weights) v)."""
    gram = vecs.T @ np.diag(weights) @ vecs
    return vecs @ np.diag(1 / np.sqrt(np.diag(gram)))


## right eigenvectors are weighted by pi, left eigenvectors by 1/pi
eigvecs1_norm = _scale_columns(eigvecs1, pivec1)
eigvecs1_l_norm = _scale_columns(eigvecs1_l, 1 / pivec1)
eigvecs2_norm = _scale_columns(eigvecs2, pivec2)
eigvecs2_l_norm = _scale_columns(eigvecs2_l, 1 / pivec2)
eigvecs3_norm = _scale_columns(eigvecs3, pivec3)
eigvecs3_l_norm = _scale_columns(eigvecs3_l, 1 / pivec3)

In [14]:
## attach the three pi vectors to the panel; each vector is labelled with the product
## codes of mcpdf's columns and merged on the shared 'prod' column
for _col, _vec in (("pivec_p", pivec2), ("pivec_m", pivec1), ("pivec_c", pivec3)):
    trade = trade.merge(pd.Series(_vec, index=mcpdf.columns, name=_col).reset_index())
trade.head()

Unnamed: 0,year,region,prod,export,regionsum,prodsum,yearsum,rca,binrca,pivec_p,pivec_m,pivec_c
0,1962,ABW,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898
1,1962,AFG,1,0.0,81810170.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898
2,1962,AGO,1,9682.848633,131182800.0,678265400.0,123476000000.0,0.013437,0,0.008395,0.002745,0.003898
3,1962,AIA,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898
4,1962,ALB,1,0.0,2301677.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898


In [15]:
## attach the 2nd eigenvectors to the panel: kp as 'pci', the normalized right
## vectors as pci_*, the normalized left vectors as pcil_*; each is labelled with
## the product codes of mcpdf's columns and merged on the shared 'prod' column
_pci_columns = (
    ("pci", kp),
    ("pci_m", eigvecs1_norm[:, 1]),
    ("pci_p", eigvecs2_norm[:, 1]),
    ("pci_c", eigvecs3_norm[:, 1]),
    ("pcil_m", eigvecs1_l_norm[:, 1]),
    ("pcil_p", eigvecs2_l_norm[:, 1]),
    ("pcil_c", eigvecs3_l_norm[:, 1]),
)
for _col, _vec in _pci_columns:
    trade = trade.merge(pd.Series(_vec, index=mcpdf.columns, name=_col).reset_index())
trade.head()

Unnamed: 0,year,region,prod,export,regionsum,prodsum,yearsum,rca,binrca,pivec_p,pivec_m,pivec_c,pci,pci_m,pci_p,pci_c,pcil_m,pcil_p,pcil_c
0,1962,ABW,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
1,1962,AFG,1,0.0,81810170.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
2,1962,AGO,1,9682.848633,131182800.0,678265400.0,123476000000.0,0.013437,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
3,1962,AIA,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
4,1962,ALB,1,0.0,2301677.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269


In [16]:
## write the enriched trade table to disk as gzip-compressed parquet, without the index
trade.to_parquet("data/cleaned.parquet", compression="gzip", index=False)

In [17]:
## save raw and normalized eigenvectors, columns ordered by descending eigenvalue
## np.linalg.eig gives no ordering guarantee, and the order returned for a matrix
## may differ from the one returned for its transpose; right eigenvectors are
## therefore ordered with eigvals*, left eigenvectors with their own eigvals*_l
eig_index1 = np.argsort(-eigvals1, kind="stable")
eig_index2 = np.argsort(-eigvals2, kind="stable")
eig_index3 = np.argsort(-eigvals3, kind="stable")
eig_index1_l = np.argsort(-eigvals1_l, kind="stable")
eig_index2_l = np.argsort(-eigvals2_l, kind="stable")
eig_index3_l = np.argsort(-eigvals3_l, kind="stable")

eigvecs2_norm = eigvecs2_norm[:, eig_index2]
eigvecs1_norm = eigvecs1_norm[:, eig_index1]
eigvecs3_norm = eigvecs3_norm[:, eig_index3]
eigvecs2_l_norm = eigvecs2_l_norm[:, eig_index2_l]
eigvecs1_l_norm = eigvecs1_l_norm[:, eig_index1_l]
eigvecs3_l_norm = eigvecs3_l_norm[:, eig_index3_l]
eigvecs2 = eigvecs2[:, eig_index2]
eigvecs1 = eigvecs1[:, eig_index1]
eigvecs3 = eigvecs3[:, eig_index3]
eigvecs2_l = eigvecs2_l[:, eig_index2_l]
eigvecs1_l = eigvecs1_l[:, eig_index1_l]
eigvecs3_l = eigvecs3_l[:, eig_index3_l]

## keep the eigenvalues in step with the reordered vectors, so that running this
## cell again applies the identity permutation instead of scrambling the columns
eigvals1, eigvals1_l = eigvals1[eig_index1], eigvals1_l[eig_index1_l]
eigvals2, eigvals2_l = eigvals2[eig_index2], eigvals2_l[eig_index2_l]
eigvals3, eigvals3_l = eigvals3[eig_index3], eigvals3_l[eig_index3_l]

## file suffixes: _p = phi-P (A2), _m = phi-M (A1), _c = phi-C (A3); _l = left vectors
_outputs = {
    'data/eigenvec_norm_p.npy': eigvecs2_norm,
    'data/eigenvec_norm_m.npy': eigvecs1_norm,
    'data/eigenvec_norm_c.npy': eigvecs3_norm,
    'data/eigenvec_l_norm_p.npy': eigvecs2_l_norm,
    'data/eigenvec_l_norm_m.npy': eigvecs1_l_norm,
    'data/eigenvec_l_norm_c.npy': eigvecs3_l_norm,
    'data/eigenvec_p.npy': eigvecs2,
    'data/eigenvec_m.npy': eigvecs1,
    'data/eigenvec_c.npy': eigvecs3,
    'data/eigenvec_l_p.npy': eigvecs2_l,
    'data/eigenvec_l_m.npy': eigvecs1_l,
    'data/eigenvec_l_c.npy': eigvecs3_l,
}
for _path, _vectors in _outputs.items():
    np.save(_path, _vectors)