In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import xswap

import analysis

In [2]:
# Number of permuted networks generated in the permutation loop; also the
# divisor used later to average the accumulated permutation features.
n_perms = 1_000

# Flags forwarded to `analysis.edges_to_matrix` (directed) and to
# `xswap.permute_edge_list` (allow_self_loops, allow_antiparallel).
directed = False
allow_self_loops = False
allow_antiparallel = False

# 2. BioRxiv network

In [3]:
# Load the table of node pairs; `train == 1` marks the pairs that are edges
# of the training network.
df = pd.read_csv('../data/3.all_nodes/biorxiv.tsv.xz', sep='\t', compression='xz')

# Training edges as a list of (id_a, id_b) tuples.
train_pairs = df.loc[df['train'] == 1, ['id_a', 'id_b']]
edges = [tuple(pair) for pair in train_pairs.values]

# Adjacency matrix of the training network, stored column-compressed.
mat = analysis.edges_to_matrix(edges, directed=directed).tocsc()

# degree[i, j] = (sum of row i of mat) + (sum of column j of mat), built by
# tiling the row-sum column vector and the column-sum row vector to full size.
row_sums = mat.sum(axis=1)
col_sums = mat.sum(axis=0)
degree = (
    np.repeat(row_sums, mat.shape[1], axis=1)
    + np.repeat(col_sums, mat.shape[0], axis=0)
)

In [4]:
# Sanity check: no (id_a, id_b) tuple occurs twice in the training edge list.
# Note that (a, b) and (b, a) are distinct tuples, so reversed duplicates
# would not be caught by this check.
assert len(set(edges)) == len(edges)

In [5]:
# Observed features of the training network plus zero-initialised
# accumulators that the permutation loop adds to. Key order is kept because
# it determines the order in which feature columns are later merged into `df`.
feature_shape = mat.shape
feature_dict = dict(
    # Running sum of permuted adjacency matrices.
    edge_prior=scipy.sparse.csc_matrix(feature_shape),

    # Observed RWR, computed on the dense adjacency array with 0.25 as the
    # second argument (presumably the restart probability -- confirm against
    # analysis.invertible_rwr).
    # NOTE(review): the previous comment here said RWR is fastest "with
    # approximate matrix inverse on sparse matrix", yet a dense array is
    # passed -- confirm which is intended.
    rwr=analysis.invertible_rwr(mat.toarray(), 0.25),
    # Running sum of permuted RWR and count of permutations with RWR >= observed.
    mean_rwr=scipy.sparse.csc_matrix(feature_shape),
    p_rwr=scipy.sparse.csc_matrix(feature_shape),

    # Observed Jaccard feature, with the matching running sum and >= count.
    jaccard=analysis.jaccard(mat, degree),
    mean_jaccard=np.zeros(feature_shape),
    p_jaccard=np.zeros(feature_shape),
)

In [6]:
# Permutation loop. Each iteration permutes the edge list produced by the
# previous iteration (seeded with the iteration number), rebuilds the
# adjacency matrix, and adds the permuted features to the accumulators in
# `feature_dict`. The accumulators are sums/counts here; they are divided by
# `n_perms` in a later cell.
swap_options = dict(
    allow_self_loops=allow_self_loops,
    allow_antiparallel=allow_antiparallel,
)
observed_rwr = feature_dict['rwr']
observed_jaccard = feature_dict['jaccard']

perm_edges = edges.copy()
for perm_number in tqdm.tnrange(n_perms):
    perm_edges, _ = xswap.permute_edge_list(perm_edges, seed=perm_number, **swap_options)
    perm_mat = analysis.edges_to_matrix(perm_edges, directed=directed).tocsc()

    # How often each pair is an edge across the permuted networks.
    feature_dict['edge_prior'] += perm_mat

    # RWR on the permuted network: running sum, and a count of permutations
    # whose value is at least the observed one.
    perm_rwr = analysis.invertible_rwr(perm_mat.toarray(), 0.25)
    feature_dict['mean_rwr'] += perm_rwr
    feature_dict['p_rwr'] += (perm_rwr >= observed_rwr)

    # Jaccard on the permuted network, reusing the `degree` matrix of the
    # original network (presumably valid because the permutation preserves
    # node degrees -- confirm against the xswap documentation).
    perm_jaccard = analysis.jaccard(perm_mat, degree)
    feature_dict['mean_jaccard'] += perm_jaccard
    feature_dict['p_jaccard'] += (perm_jaccard >= observed_jaccard)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [7]:
# Features accumulated as sums/counts over the permutations; dividing by
# `n_perms` turns them into per-permutation means / proportions.
# NOTE: this cell divides `feature_dict` entries and adds columns to `df`, so
# it must be run exactly once after the permutation loop.
summed_features = ('edge_prior', 'mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard')

# Iterate over a snapshot of the keys because entries are reassigned below.
for feature in list(feature_dict):
    # Normalize features by the number of permutations
    if feature in summed_features:
        feature_dict[feature] /= n_perms

    # Read the value only *after* normalizing. `/=` is not guaranteed to
    # modify the object in place (Python falls back to out-of-place division
    # when in-place division is unsupported), so a reference taken before the
    # division could silently hold the un-normalized sums.
    array = feature_dict[feature]

    # Make features dense (for DataFrame)
    if scipy.sparse.issparse(array):
        array = array.toarray()

    # Reshape the square matrix into long format: one row per (id_a, id_b)
    # pair, with the matrix entry stored in a column named after the feature.
    feature_df = (
        pd.DataFrame(array)
        .reset_index()
        .melt(id_vars=['index'])
        .rename(columns={'index': 'id_a', 'variable': 'id_b', 'value': feature})
        .assign(
            id_a=lambda frame: frame['id_a'].astype(int),
            id_b=lambda frame: frame['id_b'].astype(int),
        )
        # Note <= in PPI is now < in BioRxiv, because protein can interact with itself,
        # but an author can't be their own co-author.
        .query('id_a < id_b')
    )

    # Left merge keeps every row of `df`; pairs without a match get NaN.
    df = df.merge(feature_df, how='left', on=['id_a', 'id_b'])
df.head(2)

Unnamed: 0,name_a,name_b,id_a,id_b,train,test_recon,test_new,edge_prior,rwr,mean_rwr,p_rwr,jaccard,mean_jaccard,p_jaccard
0,- The US-Venezuela Collaborative Research Group,A. Murat Eren,0,1,0,0,0,0.0,3.597993e-08,0.000144,1.0,0.0,0.003646,1.0
1,- The US-Venezuela Collaborative Research Group,A. S. M. Ashique Mahmood,0,2,0,0,0,0.0,1.25453e-07,0.000129,1.0,0.0,0.003428,1.0


In [8]:
%%time

# Write the pair table with the merged feature columns as an xz-compressed,
# tab-separated file; the DataFrame index is not written.
df.to_csv(
    '../data/4.data/biorxiv.tsv.xz',
    index=False,
    sep='\t',
    compression='xz',
)

CPU times: user 12min 44s, sys: 681 ms, total: 12min 45s
Wall time: 12min 45s
