In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import tqdm.auto
import xswap

import analysis

In [2]:
# Number of permuted networks generated by the permutation loop.
n_perms = 1000

# Passed to analysis.edges_to_matrix when building adjacency matrices.
directed = False

# Passed to xswap.permute_edge_list when permuting the edge list.
allow_self_loops = True
allow_antiparallel = False

# 1. PPI network

In [3]:
# Load the PPI table; rows with train == 1 supply the edges of the network.
ppi_df = pd.read_csv('../data/3.all_nodes/ppi.tsv.xz', sep='\t', compression='xz')

# One (id_a, id_b) tuple per training edge.
ppi_edges = [
    tuple(pair)
    for pair in ppi_df.query('train == 1').loc[:, ['id_a', 'id_b']].values
]
ppi_mat = analysis.edges_to_matrix(ppi_edges, directed=directed)
print(ppi_mat.shape)

# degree[i, j] = (sum of row i) + (sum of column j), laid out as a full matrix
# with the same shape as ppi_mat.
n_rows, n_cols = ppi_mat.shape
row_sums = ppi_mat.sum(axis=1)
col_sums = ppi_mat.sum(axis=0)
degree = np.repeat(row_sums, n_cols, axis=1) + np.repeat(col_sums, n_rows, axis=0)

(4083, 4083)


In [None]:
# Observed features of the real network plus zero-initialised accumulators
# that the permutation loop adds to. Keys are inserted in a fixed order
# because a later cell iterates the dict to append one column per feature.
feature_dict = {}

# Running sum of permuted adjacency matrices (kept sparse).
feature_dict['edge_prior'] = scipy.sparse.csc_matrix(ppi_mat.shape)

# RWR is fastest on this network with exact matrix inverse on dense array
feature_dict['rwr'] = analysis.invertible_rwr(ppi_mat.toarray(), 0.25)
feature_dict['mean_rwr'] = np.zeros(ppi_mat.shape)
feature_dict['p_rwr'] = np.zeros(ppi_mat.shape)

feature_dict['jaccard'] = analysis.jaccard(ppi_mat, degree)
feature_dict['mean_jaccard'] = np.zeros(ppi_mat.shape)
feature_dict['p_jaccard'] = np.zeros(ppi_mat.shape)

In [None]:
# Start every run from empty accumulators so that re-executing this cell
# cannot double-count permutations. The keys already exist, so the dict's
# insertion order (and therefore the later column order) is unchanged.
feature_dict['edge_prior'] = scipy.sparse.csc_matrix(ppi_mat.shape)
for accumulator in ('mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard'):
    feature_dict[accumulator] = np.zeros(ppi_mat.shape)

perm_edges = ppi_edges.copy()
# tqdm.auto.trange replaces the deprecated tqdm.tnrange alias.
for i in tqdm.auto.trange(n_perms):
    # Each permutation is applied to the previous permuted edge list; the loop
    # index doubles as the seed so the whole sequence is reproducible.
    perm_edges, _ = xswap.permute_edge_list(
        perm_edges,
        allow_self_loops=allow_self_loops,
        allow_antiparallel=allow_antiparallel,
        seed=i,
    )
    perm_mat = analysis.edges_to_matrix(perm_edges, directed=directed).tocsc()

    # Running sum of permuted adjacency matrices (divided by n_perms later).
    feature_dict['edge_prior'] += perm_mat

    # Running sum of permuted RWR scores, and the count of permutations whose
    # score is at least the observed one.
    perm_rwr = analysis.invertible_rwr(perm_mat.toarray(), 0.25)
    feature_dict['mean_rwr'] += perm_rwr
    feature_dict['p_rwr'] += (perm_rwr >= feature_dict['rwr'])

    # `degree` was computed from the unpermuted network and is reused here;
    # presumably valid because the permutation preserves node degree - confirm.
    perm_jaccard = analysis.jaccard(perm_mat, degree)
    feature_dict['mean_jaccard'] += perm_jaccard
    feature_dict['p_jaccard'] += (perm_jaccard >= feature_dict['jaccard'])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

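Note: below, it may seem more obvious to just flatten the computed feature matrices and add them as columns to the DataFrame. Unfortunately, this would not work, as the flattened matrix has far more entries than the DataFrame, which only stores the values where `id_a` $\leq$ `id_b`. More specifically, the DataFrame keeps only the upper triangle (diagonal included) of the node-pair matrix, so each feature matrix is instead reshaped to long format, filtered to `id_a` $\leq$ `id_b`, and merged on (`id_a`, `id_b`).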

In [None]:
# Features that hold sums over permutations and must be averaged.
permutation_features = {'edge_prior', 'mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard'}

for feature, array in feature_dict.items():
    # Normalize features by the number of permutations. The division builds a
    # new object instead of modifying feature_dict in place, so re-running this
    # cell cannot silently divide the accumulators by n_perms a second time.
    # NOTE: feature_dict therefore keeps the raw sums after this cell.
    if feature in permutation_features:
        array = array / n_perms

    # Make features dense (for DataFrame)
    if scipy.sparse.issparse(array):
        array = array.toarray()

    # Reshape the matrix to long format: one row per (id_a, id_b) cell, keeping
    # only cells with id_a <= id_b.
    feature_df = (
        pd.DataFrame(array)
        .reset_index()
        .melt(id_vars=['index'])
        .rename(columns={'index': 'id_a', 'variable': 'id_b', 'value': feature})
        .assign(
            id_a=lambda df: df['id_a'].astype(int),
            id_b=lambda df: df['id_b'].astype(int),
        )
        .query('id_a <= id_b')
    )

    # Left merge keeps every ppi_df row; pairs with no matching matrix cell
    # get NaN for this feature.
    ppi_df = ppi_df.merge(feature_df, how='left', on=['id_a', 'id_b'])
ppi_df.head(2)

In [None]:
%%time

# Write the table with the merged feature columns as an xz-compressed TSV,
# omitting the DataFrame index.
ppi_df.to_csv(
    '../data/4.data/ppi.tsv.xz',
    sep='\t',
    compression='xz',
    index=False,
)