In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
from constants import DATA_FOLDER, SAVE_FOLDER

## Load region-product export data

In [None]:
# Load the country x 4-digit SITC product x year export table, keeping only the
# columns used below and giving them shorter names.
_short_names = {
    "location_code": "region",
    "sitc_product_code": "prod",
    "export_value": "export",
}
trade0 = pd.read_stata(
    f"{DATA_FOLDER}/country_sitcproduct4digit_year.dta",
    columns=["year", *_short_names],
)
trade0 = trade0.rename(columns=_short_names)

# Sorted list of every region code that appears in the trade data
regions = trade0["region"].unique().tolist()
regions.sort()

# Product universe: every code found on either side of a pair in the proximity
# file (per the original note, the 771 products of the original product space
# paper). Codes are read as strings; only the proximity value is numeric.
proxdf = pd.read_csv(f"{DATA_FOLDER}/sitc2_proximities.csv", dtype="str")
proxdf["proximity"] = proxdf["proximity"].astype(float)
prods = sorted(
    set(proxdf["commoditycode_1"].tolist()) | set(proxdf["commoditycode_2"].tolist())
)

trade0.head()

In [None]:
# Build a balanced panel: one row for every (year, region, product) combination,
# for years 1962 through 2018 inclusive. Combinations absent from the raw data
# get an export value of 0.
years = list(range(1962, 2019))
index = pd.MultiIndex.from_product(
    [years, regions, prods], names=["year", "region", "prod"]
)
panel_keys = pd.DataFrame(index=index).reset_index()
trade = panel_keys.merge(
    trade0, how="left", on=["year", "region", "prod"]
).fillna(0)
trade.head()

In [None]:
# Collapse products to the 3-digit SITC level: truncate each product code to its
# first three characters and add up exports within (year, region, product).
trade = (
    trade.assign(prod=trade["prod"].str[:3])
    .groupby(["year", "region", "prod"], as_index=False)["export"]
    .sum()
)
trade.head()

In [None]:
# Compute RCA and its binarized version.
#   rca    = export * yearsum / regionsum / prodsum
#   binrca = 1 where rca is strictly greater than 1, else 0
export_by = lambda keys: trade.groupby(keys)["export"].transform("sum")
trade["regionsum"] = export_by(["year", "region"])  # region's total exports in the year
trade["prodsum"] = export_by(["year", "prod"])      # product's total exports in the year
trade["yearsum"] = export_by(["year"])              # all exports in the year

raw_rca = trade["export"] * trade["yearsum"] / trade["regionsum"] / trade["prodsum"]
# Zero denominators give NaN or +/-inf; both are replaced by 0.
trade["rca"] = np.where(np.isfinite(raw_rca), raw_rca, 0)
trade["binrca"] = np.where(trade["rca"] > 1, 1, 0)

## Get proximity matrix and calculate eigenvectors

In [None]:
# Year whose cross-section is used to derive the proximity matrices (3-digit level)
proxyear = 1962
base_year = trade.loc[trade["year"] == proxyear]

# Region x product tables for that year: binarized RCA (kept as a DataFrame so the
# product labels in its columns can be reused later) and raw RCA.
mcpdf = base_year.pivot(index="region", columns="prod", values="binrca").fillna(0)
mcpmat = mcpdf.to_numpy()
rcamat = (
    base_year.pivot(index="region", columns="prod", values="rca")
    .fillna(0)
    .to_numpy()
)
mcpmat.shape, rcamat.shape

In [None]:
# Mcp matrix: drop regions (rows) whose binarized-RCA entries sum to zero,
# i.e. regions with no product flagged in this year.
ubiquity = mcpmat.sum(axis=0)   # per-product column sums
diversity = mcpmat.sum(axis=1)  # per-region row sums
rows_to_keep = np.flatnonzero(diversity > 0)
mcpmat = mcpmat[rows_to_keep]
mcpmat.shape

In [None]:
# RCA matrix: drop regions (rows) whose RCA values sum to zero in this year.
# Note that `ubiquity` / `diversity` hold sums of raw RCA values here, not counts.
ubiquity = rcamat.sum(axis=0)   # per-product column sums
diversity = rcamat.sum(axis=1)  # per-region row sums
rows_to_keep = np.flatnonzero(diversity > 0)
rcamat = rcamat[rows_to_keep]
rcamat.shape

In [None]:
# Build the three row-normalized product-product matrices Phi_M, Phi_P, Phi_C.
# NOTE(review): every reciprocal below is infinite if its denominator is zero
# (e.g. a product column of mcpmat that sums to zero) -- confirm the filtered
# data has no such case.
ubiquity = mcpmat.sum(axis=0)   # per-product column sums of the binary matrix
diversity = mcpmat.sum(axis=1)  # per-region row sums of the binary matrix

# Phi_M: co-occurrence counts divided by the column product's ubiquity, then the
# element-wise minimum with the transpose, with the diagonal zeroed.
cooccurrence = mcpmat.T @ mcpmat
conditional = cooccurrence / ubiquity
Phi_M = np.minimum(conditional, conditional.T)
np.fill_diagonal(Phi_M, 0)
den = Phi_M.sum(axis=0)
Phi_M_normalized = np.diag(1 / den) @ Phi_M

# Phi_P: diag(1/ubiquity) . Mcp' . diag(1/diversity) . Mcp
inv_ubiquity = np.diag(1 / ubiquity)
inv_diversity = np.diag(1 / diversity)
Phi_P_normalized = inv_ubiquity @ mcpmat.T @ inv_diversity @ mcpmat

# Phi_C: correlations between product RCA columns, mapped from [-1, 1] to [0, 1],
# then scaled by the reciprocal of the column sums.
rca_similarity = (1 + np.corrcoef(rcamat.T)) / 2
den = rca_similarity.sum(axis=0)
Phi_C_normalized = np.diag(1 / den) @ rca_similarity

In [None]:
# Calculate right and left eigenvectors of the three normalized matrices.
#
# np.linalg.eig returns eigenvalues in no guaranteed order, and the order it
# produces for a matrix is independent of the order it produces for the
# transpose. The cells below index column 0 as the leading eigenvector and
# column 1 as the second one, for both right and left eigenvectors, so every
# decomposition is sorted here by descending eigenvalue using its OWN
# eigenvalues. That makes column k of the right and left arrays refer to the
# k-th largest eigenvalue.
def _sorted_real_eig(matrix):
    """Eigendecompose `matrix` and return real parts sorted by eigenvalue.

    Returns a tuple (eigenvalues, eigenvectors): the real parts of the
    eigenvalues in descending order, and the real parts of the eigenvectors
    with columns permuted the same way. Imaginary parts are discarded (the
    original note describes them as tiny). The sort is stable, so entries with
    equal real eigenvalue keep the order np.linalg.eig gave them.
    """
    vals, vecs = np.linalg.eig(matrix)
    vals = np.real(vals)
    order = np.argsort(-vals, kind="stable")
    return vals[order], np.real(vecs[:, order])


# Right eigenvectors
eigvals_M, eigvecs_M = _sorted_real_eig(Phi_M_normalized)
eigvals_P, eigvecs_P = _sorted_real_eig(Phi_P_normalized)
eigvals_C, eigvecs_C = _sorted_real_eig(Phi_C_normalized)

# Left eigenvectors (right eigenvectors of the transpose)
eigvals_M_left, eigvecs_M_left = _sorted_real_eig(Phi_M_normalized.T)
eigvals_P_left, eigvecs_P_left = _sorted_real_eig(Phi_P_normalized.T)
eigvals_C_left, eigvecs_C_left = _sorted_real_eig(Phi_C_normalized.T)

In [None]:
# Get the pi vector of each matrix: the left eigenvector belonging to the largest
# eigenvalue, rescaled so that its entries sum to 1.
#
# The leading eigenvector is located by its eigenvalue rather than assumed to
# sit in column 0, because np.linalg.eig does not guarantee any ordering.
def _leading_left_eigenvector(left_vals, left_vecs):
    """Return the column of `left_vecs` with the largest real eigenvalue,
    divided by the sum of its entries."""
    lead = int(np.argmax(np.real(left_vals)))
    vec = left_vecs[:, lead]
    return vec / vec.sum()


pivec_M = _leading_left_eigenvector(eigvals_M_left, eigvecs_M_left)
pivec_P = _leading_left_eigenvector(eigvals_P_left, eigvecs_P_left)
pivec_C = _leading_left_eigenvector(eigvals_C_left, eigvecs_C_left)

In [None]:
# Adjust signs of 1st eigenvectors (all 1s or -1s, per the original note) to be positive.
# Each column 0 is multiplied by the sign of its own first entry, which makes
# that entry non-negative.
# NOTE(review): columns 0 and 1 are treated as the first and second eigenvectors
# throughout this cell; that only holds if the columns are ordered by descending
# eigenvalue -- confirm.
eigvecs_M[:,0] = eigvecs_M[:,0] * np.sign(eigvecs_M[0,0])
eigvecs_P[:,0] = eigvecs_P[:,0] * np.sign(eigvecs_P[0,0])
eigvecs_C[:,0] = eigvecs_C[:,0] * np.sign(eigvecs_C[0,0])

# Adjust signs of 2nd eigenvectors to align with diversity & pci
# kp is a NumPy view of column 1 of eigvecs_P (basic slicing), so the in-place
# sign flip of that column below is also visible through kp.
kp = eigvecs_P[:,1]
# kc: for each region, the sum of kp over the products flagged in mcpmat.
# It is computed BEFORE the flip and is not updated afterwards.
kc = mcpmat @ kp
# P (right): choose the sign so that kc correlates positively with diversity.
eigvecs_P[:,1] = np.sign(np.real(np.corrcoef(diversity, kc)[0, 1]))*eigvecs_P[:,1]
# M (right): choose the sign so that it correlates positively with P's (already aligned).
eigvecs_M[:,1] = np.sign(np.real(np.corrcoef(eigvecs_M[:,1], eigvecs_P[:,1])[0, 1]))*eigvecs_M[:,1]
# P (left): choose the sign so that its dot product with P's right vector is positive.
eigvecs_P_left[:,1] = np.sign(eigvecs_P_left[:,1].dot(eigvecs_P[:,1]))*eigvecs_P_left[:,1]
# M (left): align with P's left vector via the sign of their correlation.
eigvecs_M_left[:,1] = np.sign(np.real(np.corrcoef(eigvecs_M_left[:,1], eigvecs_P_left[:,1])[0, 1]))*eigvecs_M_left[:,1]
# C (right and left): align with P's right and left vectors respectively.
eigvecs_C[:,1] = np.sign(np.real(np.corrcoef(eigvecs_C[:,1], eigvecs_P[:,1])[0, 1]))*eigvecs_C[:,1]
eigvecs_C_left[:,1] = np.sign(np.real(np.corrcoef(eigvecs_C_left[:,1], eigvecs_P_left[:,1])[0, 1]))*eigvecs_C_left[:,1]

In [None]:
# Normalize the vectors as discussed in appendix.
# Right eigenvectors are weighted by pi, left eigenvectors by 1/pi.
def _rescale_columns(vecs, weights):
    """Divide each column v of `vecs` by sqrt(v' . diag(weights) . v)."""
    gram = vecs.T @ np.diag(weights) @ vecs
    return vecs @ np.diag(1 / np.sqrt(np.diag(gram)))


eigvecs_M_norm = _rescale_columns(eigvecs_M, pivec_M)
eigvecs_M_left_norm = _rescale_columns(eigvecs_M_left, 1 / pivec_M)
eigvecs_P_norm = _rescale_columns(eigvecs_P, pivec_P)
eigvecs_P_left_norm = _rescale_columns(eigvecs_P_left, 1 / pivec_P)
eigvecs_C_norm = _rescale_columns(eigvecs_C, pivec_C)
eigvecs_C_left_norm = _rescale_columns(eigvecs_C_left, 1 / pivec_C)

## Merge export data for analysis

In [None]:
# Attach the three pi vectors to every trade row, matched on the product code.
# The product labels come from the columns of the pivoted base-year table.
pi_by_product = pd.DataFrame(
    {"pivec_p": pivec_P, "pivec_m": pivec_M, "pivec_c": pivec_C},
    index=mcpdf.columns,
).reset_index()
trade = trade.merge(pi_by_product, on="prod")
trade.head()

In [None]:
# Attach the product-level second eigenvectors to every trade row, matched on the
# product code: the unnormalized right vector of Phi_P (`pci`), then the
# normalized right (`pci_*`) and left (`pcil_*`) vectors of each matrix.
pci_by_product = pd.DataFrame(
    {
        "pci": kp,
        "pci_m": eigvecs_M_norm[:, 1],
        "pci_p": eigvecs_P_norm[:, 1],
        "pci_c": eigvecs_C_norm[:, 1],
        "pcil_m": eigvecs_M_left_norm[:, 1],
        "pcil_p": eigvecs_P_left_norm[:, 1],
        "pcil_c": eigvecs_C_left_norm[:, 1],
    },
    index=mcpdf.columns,
).reset_index()
trade = trade.merge(pci_by_product, on="prod")
trade.head()

In [None]:
# Persist the merged table (gzip-compressed parquet, without the row index)
trade.to_parquet(
    f"{SAVE_FOLDER}/cleaned.parquet",
    index=False,
    compression="gzip",
)

In [None]:
# Sort vectors by eigenvalue (largest to smallest).
#
# Right and left eigenvectors come from separate np.linalg.eig calls (on the
# matrix and on its transpose), and eig does not guarantee that the two calls
# return eigenvalues in the same order. Each family is therefore permuted with
# the ordering of its OWN eigenvalues; reusing the right-hand ordering for the
# left eigenvectors could pair a left vector with the wrong eigenvalue.
# The stable sort makes this cell a no-op when the inputs are already sorted,
# so re-running it is safe.
def _descending_order(values):
    """Indices that sort `values` by descending real part (stable)."""
    return np.argsort(-np.real(values), kind="stable")


eig_index_M = _descending_order(eigvals_M)
eig_index_P = _descending_order(eigvals_P)
eig_index_C = _descending_order(eigvals_C)
eig_index_M_left = _descending_order(eigvals_M_left)
eig_index_P_left = _descending_order(eigvals_P_left)
eig_index_C_left = _descending_order(eigvals_C_left)

# Eigenvalues (the left ones are reordered too, so they stay aligned with their
# eigenvectors if this cell is executed again)
eigvals_M = eigvals_M[eig_index_M]
eigvals_P = eigvals_P[eig_index_P]
eigvals_C = eigvals_C[eig_index_C]
eigvals_M_left = np.real(eigvals_M_left)[eig_index_M_left]
eigvals_P_left = np.real(eigvals_P_left)[eig_index_P_left]
eigvals_C_left = np.real(eigvals_C_left)[eig_index_C_left]

# Normalized eigenvectors
eigvecs_P_norm = eigvecs_P_norm[:, eig_index_P]
eigvecs_M_norm = eigvecs_M_norm[:, eig_index_M]
eigvecs_C_norm = eigvecs_C_norm[:, eig_index_C]
eigvecs_P_left_norm = eigvecs_P_left_norm[:, eig_index_P_left]
eigvecs_M_left_norm = eigvecs_M_left_norm[:, eig_index_M_left]
eigvecs_C_left_norm = eigvecs_C_left_norm[:, eig_index_C_left]

# Unnormalized eigenvectors
eigvecs_P = eigvecs_P[:, eig_index_P]
eigvecs_M = eigvecs_M[:, eig_index_M]
eigvecs_C = eigvecs_C[:, eig_index_C]
eigvecs_P_left = eigvecs_P_left[:, eig_index_P_left]
eigvecs_M_left = eigvecs_M_left[:, eig_index_M_left]
eigvecs_C_left = eigvecs_C_left[:, eig_index_C_left]
# np.save(f'{SAVE_FOLDER}/eigenvec_norm_p.npy',eigvecs_P_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_norm_m.npy',eigvecs_M_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_norm_c.npy',eigvecs_C_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_norm_p.npy',eigvecs_P_left_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_norm_m.npy',eigvecs_M_left_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_norm_c.npy',eigvecs_C_left_norm)
# np.save(f'{SAVE_FOLDER}/eigenvec_p.npy',eigvecs_P)
# np.save(f'{SAVE_FOLDER}/eigenvec_m.npy',eigvecs_M)
# np.save(f'{SAVE_FOLDER}/eigenvec_c.npy',eigvecs_C)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_p.npy',eigvecs_P_left)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_m.npy',eigvecs_M_left)
# np.save(f'{SAVE_FOLDER}/eigenvec_l_c.npy',eigvecs_C_left)

In [None]:
# Save the RCA values of every row of `trade` to study their distribution
rca_values = trade["rca"].to_numpy()
sio.savemat(f"{SAVE_FOLDER}/rcavector.mat", {"rca": rca_values})

# Save the mincop proximity matrix (Phi_M before row normalization)
sio.savemat(
    f"{SAVE_FOLDER}/mincop_proximity.mat",
    {"mincop_proximity": Phi_M},
)

In [None]:
# Compute Pearson correlations between same-rank normalized eigenvectors:
# Phi_P vs Phi_M and Phi_P vs Phi_C.
n_evecs = len(eigvecs_P_norm)


def _pearson(u, v):
    """Pearson correlation coefficient between two 1-D vectors."""
    return np.corrcoef(u, v, rowvar=False)[0, 1]


correlations_PM = [
    _pearson(eigvecs_P_norm[:, k], eigvecs_M_norm[:, k]) for k in range(n_evecs)
]
correlations_PC = [
    _pearson(eigvecs_P_norm[:, k], eigvecs_C_norm[:, k]) for k in range(n_evecs)
]

# corrcoef gives odd results for the first eigenvectors because of numerical
# precision issues, but the vectors are the same
# (np.isclose(eigvecs_P_norm[:,0], eigvecs_C_norm[:,0]).all() is True).
# Set those correlations to 1 by hand.
correlations_PM[0] = 1
correlations_PC[0] = 1

In [None]:
# Save the sorted eigenvalues of each matrix together with the eigenvector
# correlations computed above
eigenspectrum = {
    "evs_Phi_M": eigvals_M,
    "evs_Phi_P": eigvals_P,
    "evs_Phi_C": eigvals_C,
    "rho_PM": correlations_PM,
    "rho_PC": correlations_PC,
}
sio.savemat(f"{SAVE_FOLDER}/eigenspectrum.mat", eigenspectrum)