In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import xswap

import analysis

# 1. PPI network

In [2]:
ppi_df = pd.read_csv('../data/3.all_nodes/ppi.tsv.xz', sep='\t', compression='xz')

ppi_edges = list(map(tuple, 
    ppi_df
    .query('train == 1')
    .loc[:, ['id_a', 'id_b']]
    .values
))
ppi_mat = analysis.edges_to_matrix(ppi_edges, directed=False)

degree = np.repeat(ppi_mat.sum(axis=1), ppi_mat.shape[1], axis=1) \
         + np.repeat(ppi_mat.sum(axis=0), ppi_mat.shape[0], axis=0)

In [4]:
feature_dict = {
    'edge_prior': scipy.sparse.csc_matrix(ppi_mat.shape),
    
    'rwr': analysis.invertible_rwr(ppi_mat.toarray(), 0.25),
    'mean_rwr': np.zeros(ppi_mat.shape),
    'p_rwr': np.zeros(ppi_mat.shape),
    
    'jaccard': analysis.jaccard(ppi_mat, degree),
    'mean_jaccard': np.zeros(ppi_mat.shape),
    'p_jaccard': np.zeros(ppi_mat.shape),
}

perm_edges = ppi_edges.copy()
n_perms = 1000
for i in tqdm.tnrange(n_perms):
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_self_loops=True, 
                                            allow_antiparallel=False, seed=i)
    perm_mat = analysis.edges_to_matrix(perm_edges, directed=False).tocsc()
    
    feature_dict['edge_prior'] += perm_mat
    
    perm_rwr = analysis.invertible_rwr(perm_mat.toarray(), 0.25)
    feature_dict['mean_rwr'] += perm_rwr
    feature_dict['p_rwr'] += (perm_rwr >= feature_dict['rwr'])
    
    perm_jaccard = analysis.jaccard(perm_mat, degree)
    feature_dict['mean_jaccard'] += perm_jaccard
    feature_dict['p_jaccard'] += (perm_jaccard >= feature_dict['jaccard'])
    
for feature in ['edge_prior', 'mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard']:
    feature_dict[feature] /= n_perms

In [28]:
for feature, array in feature_dict.items():
    if scipy.sparse.issparse(array):
        array = array.toarray()
    feature_df = (
        pd.DataFrame(array)
        .reset_index()
        .melt(id_vars=['index'])
        .rename(columns={'index': 'id_a', 'variable': 'id_b', 'value': feature})
        .query('source <= target')
    )
    
    ppi_df = (
        ppi_df
        .merge(feature_df, how='left', on=['id_a', 'id_b'])
    )

UndefinedVariableError: name 'source' is not defined

In [None]:
ppi_df