In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy.io as sio
from scipy.stats import entropy

## Load region-product data

In [2]:
# Read the cleaned region-product trade table. Later cells rely on its
# year, region, prod, export, rca, binrca, pivec_* and pci* columns.
trade = pd.read_parquet('data/cleaned.parquet')
trade.head()

Unnamed: 0,year,region,prod,export,regionsum,prodsum,yearsum,rca,binrca,pivec_p,pivec_m,pivec_c,pci,pci_m,pci_p,pci_c,pcil_m,pcil_p,pcil_c
0,1962,ABW,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
1,1962,AFG,1,0.0,81810170.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
2,1962,AGO,1,9682.848633,131182800.0,678265400.0,123476000000.0,0.013437,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
3,1962,AIA,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269
4,1962,ALB,1,0.0,2301677.0,678265400.0,123476000000.0,0.0,0,0.008395,0.002745,0.003898,-0.058055,-2.307477,-0.919122,-1.095074,-0.006335,-0.007716,-0.004269


In [3]:
# Scale constant of the log transform applied to RCA in the next cell:
# llrca = log(1 + rca/R0) / log(1 + 1/R0), so rca == 0 maps to 0 and rca == 1 to 1.
R0 = 0.115

In [4]:
# Transform RCA with R0 and build the region-level weighted averages.
# llrca = log(1 + rca/R0) / log(1 + 1/R0): rca == 0 maps to 0, rca == 1 maps to 1.
df = trade.assign(llrca=np.log(1 + trade.rca / R0) / np.log(1 + 1 / R0))
# Flag region-product pairs with a strictly positive export value.
df['bin'] = df['export'] > 0

# One suffix per weight vector column (pivec_p, pivec_m, pivec_c).
weight_suffixes = ('p', 'm', 'c')

# Per-row contribution: transformed RCA times the product weight.
for sfx in weight_suffixes:
    df[f'avgrca_part_{sfx}'] = df.llrca * df[f'pivec_{sfx}']

# Broadcast the (year, region) totals back onto every row. The aggregation is
# given by name ('sum'): passing the builtin `sum` to groupby.transform is
# deprecated in pandas 2.x and raises a FutureWarning.
by_year_region = df.groupby(['year', 'region'])
for sfx in weight_suffixes:
    df[f'avgrca_{sfx}'] = by_year_region[f'avgrca_part_{sfx}'].transform('sum')
# Sum of binrca over products for each (year, region).
df['diversity'] = by_year_region['binrca'].transform('sum')
df.avgrca_p.describe()

count    3.335355e+06
mean     3.586094e-01
std      2.806920e-01
min      0.000000e+00
25%      1.172252e-01
50%      3.212149e-01
75%      5.881455e-01
max      1.005534e+00
Name: avgrca_p, dtype: float64

In [5]:
# Relative RCA (rct_*), its per-product demeaned version (rct_demean_*) and the
# PCI-weighted projections (proj_*), one of each per weighting suffix.
# Columns are created in the same order as before: rct_*, rct_demean_*, proj_*.
suffixes = ('p', 'm', 'c')

for s in suffixes:
    avg = df[f'avgrca_{s}']
    # Rows whose regional average is not positive get 0 instead of the ratio.
    df[f'rct_{s}'] = np.where(avg > 0, df.llrca / avg, 0)

for s in suffixes:
    # Subtract the mean of rct_* taken over all rows of the same product.
    df[f'rct_demean_{s}'] = df[f'rct_{s}'] - df.groupby('prod')[f'rct_{s}'].transform('mean')

for s in suffixes:
    avg = df[f'avgrca_{s}']
    df[f'proj_{s}'] = np.where(avg > 0, df[f'pci_{s}'] * df.llrca * df[f'pivec_{s}'] / avg, 0)

# Per-row ECI contribution: PCI of products with binrca set, divided by diversity.
df['eci_part'] = np.where(df.diversity > 0, df.pci * df.binrca / df.diversity, 0)
df.head()

Unnamed: 0,year,region,prod,export,regionsum,prodsum,yearsum,rca,binrca,pivec_p,...,rct_p,rct_m,rct_c,rct_demean_p,rct_demean_m,rct_demean_c,proj_p,proj_m,proj_c,eci_part
0,1962,ABW,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,...,0.0,0.0,0.0,-0.766895,-1.179861,-1.06651,0.0,0.0,0.0,0.0
1,1962,AFG,1,0.0,81810170.0,678265400.0,123476000000.0,0.0,0,0.008395,...,0.0,0.0,0.0,-0.766895,-1.179861,-1.06651,-0.0,-0.0,-0.0,-0.0
2,1962,AGO,1,9682.848633,131182800.0,678265400.0,123476000000.0,0.013437,0,0.008395,...,0.117628,0.268855,0.202484,-0.649267,-0.911006,-0.864026,-0.000908,-0.001703,-0.000864,-0.0
3,1962,AIA,1,0.0,0.0,678265400.0,123476000000.0,0.0,0,0.008395,...,0.0,0.0,0.0,-0.766895,-1.179861,-1.06651,0.0,0.0,0.0,0.0
4,1962,ALB,1,0.0,2301677.0,678265400.0,123476000000.0,0.0,0,0.008395,...,0.0,0.0,0.0,-0.766895,-1.179861,-1.06651,-0.0,-0.0,-0.0,-0.0


In [6]:
# Collapse the product dimension: one row per (year, region) holding column sums.
# Summing the boolean `bin` column counts the products with positive exports.
summed_cols = [
    'avgrca_part_p', 'avgrca_part_m', 'avgrca_part_c',
    'proj_p', 'proj_m', 'proj_c',
    'eci_part', 'bin',
]
aggregated_names = {
    'avgrca_part_p': 'avgrca_p',
    'avgrca_part_m': 'avgrca_m',
    'avgrca_part_c': 'avgrca_c',
    'eci_part': 'eci',
}
cntryagg = (
    df.groupby(['year', 'region'])[summed_cols]
    .sum()
    .reset_index()
    .rename(columns=aggregated_names)
)
cntryagg.head()

Unnamed: 0,year,region,avgrca_p,avgrca_m,avgrca_c,proj_p,proj_m,proj_c,eci,bin
0,1962,ABW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1962,AFG,0.229441,0.075256,0.101934,-1.007255,-1.725521,-1.16299,-0.067015,39
2,1962,AGO,0.41356,0.180938,0.240246,-0.87096,-1.279847,-0.896124,-0.060397,134
3,1962,AIA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1962,ALB,0.211957,0.075734,0.103582,-0.996652,-1.301794,-0.791606,-0.079119,24


## calculate metrics for 2016

### prepare mcp of 2016

In [7]:
# Region x product table of binrca for 2016; pairs absent from the data become 0.
trade_2016 = trade.loc[trade['year'] == 2016]
mcpdf = trade_2016.pivot(index='region', columns='prod', values='binrca').fillna(0)
mcpdf.shape

(249, 235)

In [8]:
# Matrix form of the 2016 table plus its column sums (ubiquity, per product)
# and row sums (diversity, per region).
mcp = mcpdf.values
ubiquity = mcp.sum(axis=0)
diversity = mcp.sum(axis=1)
# Evaluated but not displayed: only the last expression of a cell is shown.
ubiquity.min(), diversity.min()
# Keep only the regions (rows) with at least one product.
has_products = diversity > 0
mcp = mcp[has_products, :]
mcp.shape

(233, 235)

In [9]:
# Column sums (kp, per product) and row sums (kc, per region) of the filtered matrix.
kp, kc = mcp.sum(axis=0), mcp.sum(axis=1)

### fitness

In [10]:
# Start the iteration from all-ones vectors: qp has one entry per column of mcp,
# fc one entry per row.
n_rows, n_cols = mcp.shape
qp = np.ones(n_cols)
fc = np.ones(n_rows)

In [11]:
# Twenty rounds of the coupled update. Within a round both raw vectors are
# computed from the previous round's values, then each is rescaled to mean 1.
for step in range(20):
    fc_raw = mcp @ qp
    qp_raw = 1 / (mcp.T @ (1 / fc))
    fc, qp = fc_raw / fc_raw.mean(), qp_raw / qp_raw.mean()

### ECI

In [12]:
# Region x region matrix: mcp scaled by 1/kc on rows and 1/kp on columns,
# then multiplied by its own transpose pattern (mcp.T).
inv_kc = np.diag(1 / kc)
inv_kp = np.diag(1 / kp)
mr = inv_kc @ mcp @ inv_kp @ mcp.T
mr.shape

(233, 233)

In [13]:
# Eigendecomposition of mr. np.linalg.eig does not return eigenvalues in any
# guaranteed order, while the following cell takes column 1 as the eigenvector
# of the second-largest eigenvalue. Sort the eigenpairs in descending order
# (by real part) so that positional indexing is reliable.
eigvals2, eigvecs2 = np.linalg.eig(mr)
order2 = np.argsort(-np.real(eigvals2), kind='stable')
eigvals2 = eigvals2[order2]
eigvecs2 = np.real(eigvecs2[:, order2])

In [14]:
# Take the second eigenvector column and orient it so that it correlates
# positively with kc (an eigenvector's sign is otherwise arbitrary).
second_vec = eigvecs2[:, 1]
orientation = np.sign(np.corrcoef(kc, second_vec)[0, 1])
eci = orientation * second_vec

### genepy

In [15]:
# Row-normalise mcp by kc, then column-normalise by the resulting column sums.
row_scaled = np.diag(1 / kc) @ mcp
kp_1 = row_scaled.sum(axis=0)
wcp = row_scaled @ np.diag(1 / kp_1)
# Region x region similarity with the self-similarity diagonal zeroed in place.
ncc = wcp @ wcp.T
np.fill_diagonal(ncc, 0)

In [16]:
# Eigendecomposition of ncc. np.linalg.eig gives no ordering guarantee, so
# positions 0 and 1 are not necessarily the two largest eigenvalues. Sort the
# eigenpairs in descending order before indexing. ncc is built as wcp @ wcp.T
# with a zeroed diagonal (symmetric), so the spectrum is real; taking the real
# part only strips a possible complex dtype.
eigvals, eigvecs = np.linalg.eig(ncc)
order = np.argsort(-np.real(eigvals), kind='stable')
eigvals = np.real(eigvals[order])
eigvecs = np.real(eigvecs[:, order])

# Leading eigenvector (absolute value) and second eigenvector, with their eigenvalues.
xc1 = np.absolute(eigvecs[:, 0])
xc2 = eigvecs[:, 1]
lambda1 = eigvals[0]
lambda2 = eigvals[1]

# genepy = (l1*x1^2 + l2*x2^2)^2 + 2*(l1^2*x1^2 + l2^2*x2^2)
weighted_sq = lambda1 * np.square(xc1) + lambda2 * np.square(xc2)
weighted_sq2 = lambda1**2 * np.square(xc1) + lambda2**2 * np.square(xc2)
genepy = np.square(weighted_sq) + 2 * weighted_sq2

### production ability

In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from girth import twopl_mml
# Fit girth's two-parameter model on the transposed matrix (products as rows,
# regions as columns). Presumably girth expects items x participants, which
# would make products the items and regions the participants — TODO confirm.
estimates = twopl_mml(mcp.T)

In [18]:
# List the entries returned by the fit; 'Ability' is the one used in the result table.
estimates.keys()

dict_keys(['Discrimination', 'Difficulty', 'Ability', 'LatentPDF', 'AIC', 'BIC'])

## Fixed effects

In [19]:
# Regression sample: 2016 rows with a strictly positive export value.
is_2016 = trade.year == 2016
has_export = trade['export'] > 0
fe_cols = ['region', 'prod', 'export', 'rca', 'regionsum', 'prodsum']
fedf = trade.loc[is_2016 & has_export, fe_cols].copy()
fedf.head()

Unnamed: 0,region,prod,export,rca,regionsum,prodsum
13446,ABW,1,96504.796875,0.267735,285126100.0,18439040000.0
13447,AFG,1,98623.320312,0.10286,758450700.0,18439040000.0
13448,AGO,1,1975.171631,5.8e-05,26980290000.0,18439040000.0
13450,ALB,1,72021.726562,0.027557,2067362000.0,18439040000.0
13451,AND,1,30757.896484,0.303068,80280310.0,18439040000.0


In [20]:
# Dependent variable: -log(-log(rca / (rca + 1))); the inner ratio lies in (0, 1)
# for the positive rca values kept in fedf.
rca_ratio = fedf.rca / (fedf.rca + 1)
fedf['ycp'] = -np.log(-np.log(rca_ratio))
# Export as a share of the region total and of the product total.
fedf['regionshare'] = fedf.export / fedf.regionsum
fedf['prodshare'] = fedf.export / fedf.prodsum
fedf.head()

Unnamed: 0,region,prod,export,rca,regionsum,prodsum,ycp,regionshare,prodshare
13446,ABW,1,96504.796875,0.267735,285126100.0,18439040000.0,-0.441469,0.0003384636,5.233722e-06
13447,AFG,1,98623.320312,0.10286,758450700.0,18439040000.0,-0.863858,0.0001300326,5.348616e-06
13448,AGO,1,1975.171631,5.8e-05,26980290000.0,18439040000.0,-2.277953,7.320795e-08,1.07119e-07
13450,ALB,1,72021.726562,0.027557,2067362000.0,18439040000.0,-1.286105,3.483751e-05,3.905937e-06
13451,AND,1,30757.896484,0.303068,80280310.0,18439040000.0,-0.377422,0.0003831313,1.668086e-06


In [21]:
# Two-way fixed-effects regression of ycp on region and product dummies.
# `prod` is wrapped in C(...) so it is always dummy-coded: a numeric product
# code would otherwise enter the formula as a single continuous regressor with
# one slope instead of one effect per product. Region coefficient labels
# (region[T.XXX]) are unchanged.
res = smf.ols(formula='ycp ~ region + C(prod)', data=fedf).fit()

In [22]:
# Coefficient table without the intercept (first parameter).
fecoefdf = pd.DataFrame({'fe': res.params.iloc[1:]}).reset_index()
# Pull the region code out of labels shaped like 'region[T.AFG]'. A regex is
# used instead of a fixed-position slice so that codes of any length are parsed
# and non-region terms get NaN rather than a meaningless substring.
fecoefdf['var'] = fecoefdf['index'].str.extract(r'^region\[T\.(.+)\]$', expand=False)
fecoefdf.head()

Unnamed: 0,index,fe,var
0,region[T.AFG],-0.343794,AFG
1,region[T.AGO],-1.024346,AGO
2,region[T.AIA],0.638461,AIA
3,region[T.ALB],0.0125,ALB
4,region[T.AND],0.205859,AND


In [23]:
# One row per region in fedf with its estimated effect; regions without a
# coefficient row (e.g. the omitted baseline level) get 0.
region_effects = fecoefdf[['var', 'fe']].rename(columns={'var': 'region'})
gamma_c = (
    fedf[['region']]
    .drop_duplicates()
    .merge(region_effects, how='left', on='region')
    .fillna(0)
)
gamma_c.head()

Unnamed: 0,region,fe
0,ABW,0.0
1,AFG,-0.343794
2,AGO,-1.024346
3,ALB,0.0125
4,AND,0.205859


### entropy

In [24]:
# Independent working copy with only the columns the entropy iteration needs.
tmpdf = fedf.loc[:, ['region', 'prod', 'export']].copy()
tmpdf.head()

Unnamed: 0,region,prod,export
13446,ABW,1,96504.796875
13447,AFG,1,98623.320312
13448,AGO,1,1975.171631
13450,ALB,1,72021.726562
13451,AND,1,30757.896484


In [25]:
# Number of distinct regions and products in the working table.
tmpdf.region.nunique(),tmpdf['prod'].nunique()

(233, 235)

In [26]:
# Upper bounds of the two entropies, derived from the data rather than
# hard-coded: log(#regions) bounds a product's entropy over regions and
# log(#products) bounds a region's entropy over products.
log_n_regions = np.log(tmpdf['region'].nunique())
log_n_products = np.log(tmpdf['prod'].nunique())

# Initial entropies from raw export values (scipy's entropy normalises its input).
tmpdf['hc'] = tmpdf.groupby('region')['export'].transform(entropy)
tmpdf['hp'] = tmpdf.groupby('prod')['export'].transform(entropy)
# Exports reweighted by how far the counterpart's entropy is from its upper bound.
tmpdf['xcp'] = tmpdf.export * (log_n_regions - tmpdf.hp)
tmpdf['ycp'] = tmpdf.export * (log_n_products - tmpdf.hc)
# Shares within region / within product. 'sum' is passed by name because the
# builtin `sum` is deprecated as a groupby.transform argument in pandas 2.x.
tmpdf['xcpr'] = tmpdf.xcp / tmpdf.groupby('region')['xcp'].transform('sum')
tmpdf['ycpr'] = tmpdf.ycp / tmpdf.groupby('prod')['ycp'].transform('sum')
tmpdf.head()

Unnamed: 0,region,prod,export,hc,hp,xcp,ycp,xcpr,ycpr
13446,ABW,1,96504.796875,3.210066,3.35685,202099.239581,217089.386604,0.0003205398,8.516327e-06
13447,AFG,1,98623.320312,2.44719,3.35685,206535.827084,297092.432299,0.0001210198,1.165481e-05
13448,AGO,1,1975.171631,0.660831,3.35685,4136.381792,9478.364743,6.347788e-08,3.718323e-07
13450,ALB,1,72021.726562,3.520899,3.35685,150827.074332,139627.564405,3.080489e-05,5.477532e-06
13451,AND,1,30757.896484,3.530259,3.35685,64412.834304,59342.036646,0.0003102073,2.327964e-06


In [27]:
# Loop-invariant entropy upper bounds, derived from the data instead of the
# hard-coded counts so the cell stays correct if the sample changes.
log_n_regions = np.log(tmpdf['region'].nunique())
log_n_products = np.log(tmpdf['prod'].nunique())

# 25 rounds: recompute entropies from the current shares, reweight exports,
# then renormalise the shares within region and within product.
for iteration in range(25):
    tmpdf['hc'] = tmpdf.groupby('region')['xcpr'].transform(entropy)
    tmpdf['hp'] = tmpdf.groupby('prod')['ycpr'].transform(entropy)
    tmpdf['xcp'] = tmpdf.export * (log_n_regions - tmpdf.hp)
    tmpdf['ycp'] = tmpdf.export * (log_n_products - tmpdf.hc)
    # 'sum' by name: the builtin `sum` is deprecated here in pandas 2.x.
    tmpdf['xcpr'] = tmpdf.xcp / tmpdf.groupby('region')['xcp'].transform('sum')
    tmpdf['ycpr'] = tmpdf.ycp / tmpdf.groupby('prod')['ycp'].transform('sum')

In [28]:
# Inspect the working table after the 25 iterations above.
tmpdf.head()

Unnamed: 0,region,prod,export,hc,hp,xcp,ycp,xcpr,ycpr
13446,ABW,1,96504.796875,3.326746,3.456288,192503.017473,205829.207877,0.0003184944,7.666431e-06
13447,AFG,1,98623.320312,2.597491,3.456288,196728.943723,282269.251269,0.0001268189,1.051356e-05
13448,AGO,1,1975.171631,0.607231,3.456288,3939.975123,9584.232724,5.567062e-08,3.569797e-07
13450,ALB,1,72021.726562,3.413015,3.456288,143665.394218,147397.565596,2.989973e-05,5.490053e-06
13451,AND,1,30757.896484,3.441601,3.456288,61354.337568,62068.964583,0.000295184,2.311856e-06


In [29]:
# hc is constant within a region, so de-duplicating leaves one row per region.
region_entropy = tmpdf[['region', 'hc']]
regiondf = region_entropy.drop_duplicates().sort_values(by='region')
regiondf.head()

Unnamed: 0,region,hc
13446,ABW,3.326746
13447,AFG,2.597491
13448,AGO,0.607231
70221,AIA,2.766955
13450,ALB,3.413015


## combine metrics into a result dataframe

In [30]:
# Assemble the per-region metrics. Rows follow the regions kept in mcp
# (diversity > 0); the region index is then turned into a regular column.
kept_regions = mcpdf.index[diversity > 0]
metric_columns = dict(
    fitness2016=fc,
    eci2016=eci,
    kc=kc,
    xc1=xc1,
    xc2=xc2,
    genepy=genepy,
    ability=estimates["Ability"],
)
resdf = pd.DataFrame(metric_columns, index=kept_regions).reset_index()
resdf.shape

(233, 8)

In [31]:
# 2016 regional aggregates for regions with at least one positive export.
agg_2016 = cntryagg[(cntryagg.year == 2016) & (cntryagg.bin > 0)].drop(columns=['year'])
# Attach fixed effects and entropy (inner joins on the shared columns), then the
# aggregates (left join so every region row is kept).
resdf = resdf.merge(gamma_c)
resdf = resdf.merge(regiondf)
resdf = resdf.merge(agg_2016, how='left')
resdf.head()

Unnamed: 0,region,fitness2016,eci2016,kc,xc1,xc2,genepy,ability,fe,hc,avgrca_p,avgrca_m,avgrca_c,proj_p,proj_m,proj_c,eci,bin
0,ABW,1.004032,0.024499,33,0.078176,0.021791,0.015089,0.313106,0.0,3.326746,0.458487,0.475485,0.466518,0.105968,0.117298,0.152733,0.021736,192
1,AFG,0.365737,-0.096063,25,0.042223,-0.111436,0.007655,-0.452356,-0.343794,2.597491,0.511607,0.234632,0.286582,-0.815917,-1.072287,-0.849485,-0.050744,164
2,AGO,0.070533,-0.100995,7,0.030913,-0.07078,0.003656,-1.378027,-1.024346,0.607231,0.160169,0.116978,0.135088,-0.437043,-0.387758,-0.384177,-0.039769,198
3,AIA,0.604428,0.060996,24,0.064913,0.025858,0.010484,0.2215,0.638461,2.766955,0.281158,0.304979,0.266034,0.237016,0.157759,0.369143,0.03713,67
4,ALB,0.733577,-0.039531,47,0.045679,-0.060442,0.006066,0.650443,0.0125,3.413015,0.659942,0.531238,0.513222,-0.114121,-0.304722,-0.224567,0.001374,206


In [32]:
# Pairwise correlations between all metrics, in a fixed display order.
corr_cols = [
    'kc', 'fitness2016', 'hc', 'ability', 'xc1',
    'avgrca_p', 'avgrca_m', 'avgrca_c', 'bin',
    'eci2016', 'eci', 'xc2',
    'proj_p', 'proj_m', 'proj_c',
    'genepy', 'fe',
]
resdf[corr_cols].corr()

Unnamed: 0,kc,fitness2016,hc,ability,xc1,avgrca_p,avgrca_m,avgrca_c,bin,eci2016,eci,xc2,proj_p,proj_m,proj_c,genepy,fe
kc,1.0,0.908907,0.922799,0.91072,0.716408,0.931741,0.967285,0.969524,0.698028,0.420054,0.470444,0.225476,0.406505,0.397427,0.378487,0.625309,0.324606
fitness2016,0.908907,1.0,0.794303,0.802775,0.866133,0.744436,0.886949,0.868511,0.551076,0.564106,0.583691,0.500641,0.504874,0.481109,0.49961,0.845189,0.358421
hc,0.922799,0.794303,1.0,0.958869,0.722283,0.925722,0.942871,0.943946,0.688022,0.490495,0.543543,0.221256,0.498029,0.478179,0.457501,0.593582,0.354024
ability,0.91072,0.802775,0.958869,1.0,0.779477,0.895495,0.927337,0.924193,0.684027,0.551558,0.607099,0.257679,0.542558,0.536644,0.517418,0.644847,0.362611
xc1,0.716408,0.866133,0.722283,0.779477,1.0,0.564092,0.761327,0.726598,0.396321,0.801605,0.803642,0.684278,0.723349,0.699943,0.742445,0.94787,0.485798
avgrca_p,0.931741,0.744436,0.925722,0.895495,0.564092,1.0,0.937545,0.961412,0.808057,0.250376,0.305705,-0.006503,0.261337,0.274873,0.226414,0.454005,0.212463
avgrca_m,0.967285,0.886949,0.942871,0.927337,0.761327,0.937545,1.0,0.995504,0.749586,0.474007,0.538753,0.264613,0.498864,0.492137,0.472184,0.655015,0.314116
avgrca_c,0.969524,0.868511,0.943946,0.924193,0.726598,0.961412,0.995504,1.0,0.778343,0.417803,0.478739,0.199634,0.438848,0.443048,0.413391,0.623274,0.288118
bin,0.698028,0.551076,0.688022,0.684027,0.396321,0.808057,0.749586,0.778343,1.0,-0.012318,0.1122,-0.166632,0.152788,0.20817,0.09599,0.348256,-0.207158
eci2016,0.420054,0.564106,0.490495,0.551558,0.801605,0.250376,0.474007,0.417803,-0.012318,1.0,0.952046,0.809481,0.856796,0.804323,0.876913,0.696404,0.680923


In [33]:
# Column names that will become the fields of the exported struct.
resdf.columns

Index(['region', 'fitness2016', 'eci2016', 'kc', 'xc1', 'xc2', 'genepy',
       'ability', 'fe', 'hc', 'avgrca_p', 'avgrca_m', 'avgrca_c', 'proj_p',
       'proj_m', 'proj_c', 'eci', 'bin'],
      dtype='object')

In [34]:
# Export the result table as a MATLAB struct: one field per column, each
# holding that column's values as a list.
columns_as_lists = resdf.to_dict(orient="list")
sio.savemat('data/metric2016.mat', {'struct1': columns_as_lists})