In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import csv
_= np.seterr(divide = 'ignore') 

In [2]:
## generate relatedness matrices from binary dataset
def proximity_from_binmcp(sample,var,method,param=None,ncols=415):
    """Build a column-by-column relatedness matrix from a binary variable.

    ``sample[var].values`` is reshaped (row-major) into a matrix with
    ``ncols`` columns and a square ``ncols x ncols`` similarity matrix
    between those columns is returned, with its diagonal set to 0.

    Parameters
    ----------
    sample : mapping of column name -> object exposing ``.values``
        Typically a pandas DataFrame. The number of values in ``sample[var]``
        must be a multiple of ``ncols``.
    var : str
        Name of the column to use.
    method : {'condprob', 'rca', 'pearson'}
        * ``'condprob'``: co-occurrence counts ``C = M.T @ M`` scaled to
          ``C[i, j] / (u[i]**x * u[j]**(1 - x))`` where ``u`` is the column
          sum (clipped to at least 1) and ``x = float(param)``. The result is
          symmetrised with the element-wise minimum when ``x < 0.5`` and the
          element-wise maximum otherwise.
        * ``'rca'``: co-occurrence counts with the diagonal removed, divided
          by both marginal sums (clipped to at least 1) and multiplied by the
          grand total, then squashed with ``a / (a + 1)``. With
          ``param == 'posi'`` ratios below 1 are zeroed before squashing.
        * ``'pearson'``: Pearson correlation between columns; undefined
          (NaN) correlations are replaced by 0. With ``param == 'posi'``
          negative values are zeroed, otherwise values are rescaled by
          ``(r + 1) / 2``.
    param : str or number, optional
        Method-specific parameter (see above). For ``'condprob'`` it must be
        convertible with ``float()``.
    ncols : int, default 415
        Number of columns of the reshaped matrix. 415 is the value that was
        hard-coded originally (presumably the number of industries in the
        dataset -- confirm against the data).

    Returns
    -------
    numpy.ndarray
        ``(ncols, ncols)`` float array with a zero diagonal.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('condprob', 'rca', 'pearson'):
        # Fail early with a clear message instead of an UnboundLocalError on A.
        raise ValueError(
            f"unknown method {method!r}; expected 'condprob', 'rca' or 'pearson'"
        )
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype on every NumPy version.
    mcp = sample[var].values.reshape(-1,ncols).astype(float)
    ubiquity = mcp.sum(axis=0)
    # Columns that never occur would otherwise cause a division by zero below.
    ubiquity[ubiquity<1] = 1
    if method=='condprob':
        x = float(param)
        A = mcp.T @ mcp
        A = np.diag(1/np.power(ubiquity,x)) @ A @ np.diag(1/np.power(ubiquity,1-x))
        # Symmetrise the scaled co-occurrence matrix.
        if x<0.5:
            A = np.minimum(A, A.T)
        else:
            A = np.maximum(A, A.T)
    elif method=='rca':
        A = mcp.T @ mcp
        np.fill_diagonal(A, 0)
        sz = A.sum(axis=0)
        sz[sz<1] = 1
        # Observed co-occurrence relative to the product of the marginals.
        A = np.diag(1/sz) @ A @ np.diag(1/sz) * A.sum()
        if param == 'posi':
            A[A<1] = 0
        # Map [0, inf) onto [0, 1).
        A = A / (A+1)
    else:  # method == 'pearson'
        A = np.corrcoef(mcp,rowvar=False)
        # Constant columns have an undefined correlation.
        A[np.isnan(A)]=0
        if param=='posi':
            A[A<0] = 0
        else:
            A = (A+1)/2
    np.fill_diagonal(A, 0)
    return A

In [3]:
## generate relatedness matrices from continuous dataset
def proximity_from_valmcp(sample,var,method,param=None,ncols=415):
    """Build a column-by-column relatedness matrix from a continuous variable.

    ``sample[var].values`` is reshaped (row-major) into a matrix with
    ``ncols`` columns and a square ``ncols x ncols`` similarity matrix
    between those columns is returned, with its diagonal set to 0.

    Parameters
    ----------
    sample : mapping of column name -> object exposing ``.values``
        Typically a pandas DataFrame. The number of values in ``sample[var]``
        must be a multiple of ``ncols``.
    var : str
        Name of the column to use.
    method : {'pearson', 'cosine'}
        ``'pearson'`` uses ``np.corrcoef`` between columns; ``'cosine'`` uses
        scikit-learn's ``cosine_similarity`` between columns.
    param : str, optional
        With ``'posi'`` negative similarities are set to 0. Any other value
        rescales the similarities with ``(s + 1) / 2``; note that in this
        mode undefined (NaN) similarities, which are first replaced by 0,
        end up as 0.5.
    ncols : int, default 415
        Number of columns of the reshaped matrix. 415 is the value that was
        hard-coded originally (presumably the number of industries in the
        dataset -- confirm against the data).

    Returns
    -------
    numpy.ndarray
        ``(ncols, ncols)`` array with a zero diagonal.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('pearson', 'cosine'):
        # Fail early with a clear message instead of an UnboundLocalError on A.
        raise ValueError(
            f"unknown method {method!r}; expected 'pearson' or 'cosine'"
        )
    mcp = sample[var].values.reshape(-1,ncols)
    if method=='pearson':
        A = np.corrcoef(mcp,rowvar=False)
    else:  # method == 'cosine'
        # Imported lazily so that scikit-learn is only needed for this branch.
        from sklearn.metrics.pairwise import cosine_similarity
        A = cosine_similarity(mcp.T)
    # Both similarity measures share the same post-processing.
    A[np.isnan(A)]=0
    if param=='posi':
        A[A<0] = 0
    else:
        A = (A+1)/2
    np.fill_diagonal(A, 0)
    return A

In [4]:
# Columns requested when the metric table is loaded from parquet.
varlist = (
    # columns not used as a proximity data source in this notebook
    ['region', 'ind']
    # continuous measures (fed to proximity_from_valmcp)
    + ['raw', 'lograw', 'rca', 'rca2', 'pmi', 'ppmi', 'feresid', 'resid', 'posresid']
    # binary measures (fed to proximity_from_binmcp)
    + ['bin', 'bin_rca', 'bin_feresid', 'bin_resid', 'bin_posresid']
)

In [5]:
# Parameter values tried for each method when the data source is binary.
# 'condprob' gets the exponents 0.0, 0.1, ..., 1.0 as one-decimal strings.
binparamdict = dict(
    condprob=[format(step, '0.1f') for step in np.linspace(0, 1, 11)],
    rca=['posi', 'all'],
    pearson=['posi', 'all'],
)

In [6]:
## create parameter combination in the grid
# Write one row per (data source, method, param) combination. The running
# ``proxid`` is the id under which the matching matrix is stored on disk, so
# the loop order here must stay identical to the loop that saves the matrices.
# ``newline=''`` is what the csv module expects for files it writes to (it
# prevents doubled carriage returns on Windows); the ``with`` block guarantees
# the file is closed even if a write fails.
with open('proximity/du1st/proximity.tsv','w',newline='') as proxfile:
    writer = csv.writer(proxfile,delimiter='\t')
    writer.writerow(['pid','datasource','method','param'])
    proxid = 0
    # Binary data sources: methods and parameters come from binparamdict.
    for var in ["bin","bin_rca","bin_feresid","bin_resid","bin_posresid"]:
        for method in ['condprob','rca','pearson']:
            for param in binparamdict[method]:
                writer.writerow([proxid,var,method,param])
                proxid = proxid+1
    # Continuous data sources: two methods, each with 'posi' and 'all'.
    for var in ["raw","lograw","rca","rca2","pmi","ppmi","feresid","resid","posresid"]:
        for method in ['pearson','cosine']:
            for param in ['posi','all']:
                writer.writerow([proxid,var,method,param])
                proxid = proxid+1
# Total number of combinations written.
print(proxid)

111


In [None]:
## create relatedness matrices for each training dataset for each combination above
## NOTE(review): the previous comment here said "The .npz are sparse mcp-style
## matrices of firm-industry", but this cell reads a parquet table and writes
## dense .npy arrays -- confirm which input format is actually intended.
## The loop order below must stay identical to the cell that writes
## proximity.tsv, because each matrix is saved under its running proxid.
sampledf = pd.read_parquet('du1stdf_metric.parquet',columns=varlist)
proxid = 0
# Binary data sources.
for var in ["bin","bin_rca","bin_feresid","bin_resid","bin_posresid"]:
    for method in ['condprob','rca','pearson']:
        for param in binparamdict[method]:
            A = proximity_from_binmcp(sampledf,var,method,param)
            np.save(f'proximity/du1st/{proxid}.npy', A)
            proxid = proxid+1
# Continuous data sources.
for var in ["raw","lograw","rca","rca2","pmi","ppmi","feresid","resid","posresid"]:
    for method in ['pearson','cosine']:
        for param in ['posi','all']:
            A = proximity_from_valmcp(sampledf,var,method,param)
            np.save(f'proximity/du1st/{proxid}.npy', A)
            proxid = proxid+1

In [8]:
# Sanity check: number of matrices saved by the previous cell; it should equal
# the number of rows written to proximity.tsv (111 in the recorded run).
print(proxid)

111
