In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import csv
# Silence NumPy's divide-by-zero warnings for the whole session. seterr returns the
# previous settings; binding them to `_` keeps the cell from echoing them as output.
_= np.seterr(divide = 'ignore') 

In [2]:
## generate relatedness matrices from binary dataset
def proximity_from_binspmat(spmat,method,param=None):
    """Build a dense column-by-column relatedness matrix from a binary sparse matrix.

    Parameters
    ----------
    spmat : scipy.sparse matrix
        Binary matrix whose columns are the items being related; the result
        has shape ``(n_columns, n_columns)``.  It must be a sparse *matrix*
        (axis sums are read through ``.A1``).
    method : {'condprob', 'rca', 'pearson'}
        * ``'condprob'`` -- co-occurrence counts ``C[i, j]`` scaled to
          ``C[i, j] / (u[i]**x * u[j]**(1 - x))``, where ``u`` holds the
          column sums (floored at 1) and ``x = float(param)``.  The result is
          symmetrised with the element-wise minimum of the matrix and its
          transpose when ``x < 0.5`` and with the maximum otherwise.
        * ``'rca'`` -- off-diagonal co-occurrence counts scaled to
          ``C[i, j] * C.sum() / (s[i] * s[j])``, where ``s`` holds the column
          sums of the off-diagonal counts (floored at 1), then mapped through
          ``r / (r + 1)``.  With ``param == 'posi'`` ratios below 1 are set
          to 0 before the mapping.
        * ``'pearson'`` -- Pearson correlation between columns.  With
          ``param == 'posi'`` negative correlations are set to 0; with any
          other ``param`` the correlations are rescaled to ``(r + 1) / 2``.
    param : str or float, optional
        Method-specific option described above.  ``'condprob'`` requires a
        value accepted by ``float()``.

    Returns
    -------
    numpy.ndarray
        Square relatedness matrix with its diagonal set to 0.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'condprob':
        x = float(param)
        ubiquity = spmat.sum(axis=0).A1
        ubiquity[ubiquity<1] = 1  # empty columns would otherwise divide by zero
        # The int64 identity in the middle upcasts the product, so narrow input
        # dtypes (e.g. bool/int8) do not limit the co-occurrence counts.
        cooc = (spmat.T @ sp.identity(spmat.shape[0], dtype='int64', format='csr') @ spmat).toarray()
        row_scale = 1/np.power(ubiquity, x)
        col_scale = 1/np.power(ubiquity, 1-x)
        # Broadcasting gives the same values as diag(row) @ cooc @ diag(col)
        # without two dense n x n matrix products.
        A = (row_scale[:, None] * cooc) * col_scale[None, :]
        if x<0.5:
            A = np.minimum(A, A.T)
        else:
            A = np.maximum(A, A.T)
    elif method == 'rca':
        cooc = (spmat.T @ sp.identity(spmat.shape[0], dtype='int64', format='csr') @ spmat).toarray()
        np.fill_diagonal(cooc, 0)
        sz = cooc.sum(axis=0)
        sz[sz<1] = 1  # isolated columns would otherwise divide by zero
        inv_sz = 1/sz
        # Same values as diag(1/sz) @ cooc @ diag(1/sz) * cooc.sum(), via broadcasting.
        A = (inv_sz[:, None] * cooc) * inv_sz[None, :] * cooc.sum()
        if param == 'posi':
            A[A<1] = 0
        A = A / (A+1)
    elif method == 'pearson':
        # After the float64 cast no extra dtype promotion is needed, so the
        # cross-product is taken directly.
        spmat2 = spmat.astype(np.float64)
        n_rows = spmat2.shape[0]
        ubiquity = spmat2.sum(axis=0).A1
        centering = np.outer(ubiquity, ubiquity)/n_rows
        A = (spmat2.T @ spmat2).toarray()
        A = (A - centering) / (n_rows - 1)  # sample covariance matrix
        d = np.copy(np.diag(A))
        d[d==0]=1  # zero-variance columns: divide by 1 instead of 0
        A = A / np.sqrt(np.outer(d, d))
        if param=='posi':
            A[A<0] = 0
        else:
            A = (A+1)/2
    else:
        raise ValueError(
            f"unknown method {method!r}; expected 'condprob', 'rca' or 'pearson'"
        )
    np.fill_diagonal(A, 0)
    return A

In [3]:
## generate relatedness matrices from continuous dataset
def proximity_from_valspmat(spmat,method,param=None):
    """Build a dense column-by-column relatedness matrix from a valued sparse matrix.

    Parameters
    ----------
    spmat : scipy.sparse matrix
        Matrix whose columns are the items being related; the result has
        shape ``(n_columns, n_columns)``.  The ``'pearson'`` method needs a
        sparse *matrix* (axis sums are read through ``.A1``).
    method : {'cosine', 'pearson'}
        ``'cosine'`` uses scikit-learn's ``cosine_similarity`` on the
        transposed matrix; ``'pearson'`` computes the Pearson correlation
        between columns from the sparse cross-product.
    param : str, optional
        ``'posi'`` sets negative similarities to 0; any other value rescales
        the similarities to ``(s + 1) / 2``.

    Returns
    -------
    numpy.ndarray
        Square relatedness matrix with its diagonal set to 0.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'cosine':
        # Imported lazily so the pearson path works without scikit-learn.
        from sklearn.metrics.pairwise import cosine_similarity
        A = cosine_similarity(spmat.T)
    elif method == 'pearson':
        n_rows = spmat.shape[0]
        ubiquity = spmat.sum(axis=0).A1
        centering = np.outer(ubiquity, ubiquity)/n_rows
        A = (spmat.T @ spmat).toarray()
        A = (A - centering) / (n_rows - 1)  # sample covariance matrix
        d = np.copy(np.diag(A))
        d[d==0]=1  # zero-variance columns: divide by 1 instead of 0
        A = A / np.sqrt(np.outer(d, d))
    else:
        raise ValueError(
            f"unknown method {method!r}; expected 'cosine' or 'pearson'"
        )
    # Both methods share the same post-processing of the similarity values.
    if param=='posi':
        A[A<0] = 0
    else:
        A = (A+1)/2
    np.fill_diagonal(A, 0)
    return A

In [4]:
def proximity_from_valmcp(mcp,method,param=None):
    """Build a column-by-column relatedness matrix from a dense valued matrix.

    Parameters
    ----------
    mcp : array-like, 2-D
        Dense matrix whose columns are the items being related; the result
        has shape ``(n_columns, n_columns)``.
    method : {'pearson', 'cosine'}
        ``'pearson'`` uses ``np.corrcoef`` over columns; ``'cosine'`` uses
        scikit-learn's ``cosine_similarity`` on the transposed matrix.
    param : str, optional
        ``'posi'`` sets negative similarities to 0; any other value rescales
        the similarities to ``(s + 1) / 2``.

    Returns
    -------
    numpy.ndarray
        Square relatedness matrix with its diagonal set to 0.  NaN
        similarities (e.g. from constant columns) are replaced by 0 *before*
        the ``param`` post-processing, so they become 0.5 when rescaling.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method=='pearson':
        A = np.corrcoef(mcp,rowvar=False)
    elif method=='cosine':
        # Imported lazily so the pearson path works without scikit-learn.
        from sklearn.metrics.pairwise import cosine_similarity
        A = cosine_similarity(mcp.T)
    else:
        raise ValueError(
            f"unknown method {method!r}; expected 'pearson' or 'cosine'"
        )
    # Both methods share the same clean-up and post-processing.
    A[np.isnan(A)]=0
    if param=='posi':
        A[A<0] = 0
    else:
        A = (A+1)/2
    np.fill_diagonal(A, 0)
    return A

In [5]:
# Column/variable names. This list is not referenced by the cells below; those
# cells spell out the datasource names they use explicitly.
varlist = [
    # NOTE(review): presumably the identifier columns -- confirm
    "region", "ind",
    # names that the grid cells below feed to the continuous-data functions
    "raw", "lograw", "rca", "rca2", "pmi", "ppmi",
    "feresid", "resid", "posresid",
    # names that the grid cells below feed to the binary-data function
    "bin", "bin_rca", "bin_feresid", "bin_resid", "bin_posresid",
]

In [6]:
# Parameter values tried for each binary-data proximity method.
# 'condprob' gets the strings '0.0', '0.1', ..., '1.0'.
binparamdict = dict(
    condprob=[format(x, '0.1f') for x in np.linspace(0, 1, 11)],
    rca=["posi", "all"],
    pearson=["posi", "all"],
)

In [7]:
## create parameter combination in the grid
# One row per (datasource, method, param) combination; `pid` is the running row
# index.  The matrix-building cell further down must enumerate the combinations
# in exactly this order, because it names its output files with the same counter.
# `with` closes the file even if a write fails, and newline='' is what the csv
# module requires so that row terminators are not translated a second time.
with open('proximity/coproduction/proximity.tsv','w',newline='') as proxfile:
    writer = csv.writer(proxfile,delimiter='\t')
    writer.writerow(['pid','datasource','method','param'])
    proxid = 0
    # binary datasets: method-specific parameter lists from binparamdict
    for var in ["bin","bin_rca","bin_feresid","bin_resid","bin_posresid"]:
        for method in ['condprob','rca','pearson']:
            for param in binparamdict[method]:
                writer.writerow([proxid,var,method,param])
                proxid = proxid+1
    # continuous datasets: every one gets the same pearson/cosine x posi/all grid,
    # so the formerly separate loops are merged (row order is unchanged)
    for var in ["raw","lograw","rca","rca2","pmi","ppmi","feresid","resid","posresid"]:
        for method in ['pearson','cosine']:
            for param in ['posi','all']:
                writer.writerow([proxid,var,method,param])
                proxid = proxid+1
print(proxid)

111


In [None]:
## Build and save one relatedness matrix per parameter combination, for each training dataset.
## The .npz inputs are sparse mcp-style establishment-industry matrices; the .npy inputs are dense arrays.
## Output files are named by a running counter, in the same order as the rows of proximity.tsv.
proxid = 0
bin_combos = [(method, param)
              for method in ['condprob','rca','pearson']
              for param in binparamdict[method]]
val_combos = [(method, param)
              for method in ['pearson','cosine']
              for param in ['posi','all']]
# (datasource names, storage kind, proximity function, parameter combinations)
jobs = [
    (["bin","bin_rca","bin_feresid","bin_resid","bin_posresid"], 'spmat', proximity_from_binspmat, bin_combos),
    (["raw","lograw","rca","rca2"], 'spmat', proximity_from_valspmat, val_combos),
    (["pmi"], 'npmat', proximity_from_valmcp, val_combos),
    (["ppmi"], 'spmat', proximity_from_valspmat, val_combos),
    (["feresid","resid","posresid"], 'npmat', proximity_from_valmcp, val_combos),
]
for datasources, storage, build_proximity, combos in jobs:
    for var in datasources:
        # each dataset is loaded once and reused for all of its combinations
        if storage == 'spmat':
            data = sp.load_npz(f'coprodmat/coprod_{var}_spmat_2011.npz')
        else:
            data = np.load(f'coprodmat/coprod_{var}_npmat_2011.npy')
        for method, param in combos:
            A = build_proximity(data, method, param)
            np.save(f'proximity/coproduction/{proxid}.npy', A)
            proxid += 1

In [9]:
# Final counter value = number of matrices saved by the cell above; it should match
# the count printed after writing proximity.tsv.
print(proxid)

111
