In [1]:
import itertools
import tempfile

import numpy as np
import pandas as pd
import requests
import scipy.sparse
import tqdm
import xswap

import analysis

### 01. Download graphs

In [2]:
raul_url = 'http://interactome.baderlab.org/data/Raul-Vidal(Nature_2005).psi'
lit_url = 'http://interactome.baderlab.org/data/LitBM-17.psi'


def download_table(url, timeout=60, **read_kwargs):
    """Download a delimited text file and parse it with ``pd.read_table``.

    Parameters
    ----------
    url : str
        Location of the file to fetch.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled server cannot hang
        the notebook indefinitely.
    **read_kwargs
        Forwarded unchanged to ``pd.read_table`` (e.g. ``names=...``).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; without this check an
        error page would be parsed as if it were data.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(res.content)
        # The file is re-opened by name below, so the buffered bytes must be
        # pushed to disk first; otherwise the tail of the download can be
        # missing from what pandas reads.
        tf.flush()
        return pd.read_table(tf.name, **read_kwargs)


raul_df = download_table(raul_url)
# The second file is read with the first file's column labels supplied
# explicitly via `names`.
lit_df = download_table(lit_url, names=raul_df.columns)

### 02. Extract edges

In [3]:
# Short aliases for the two interactor-identifier columns; shared by both
# tables so they are filtered identically.
id_column_aliases = {
    'Unique identifier for interactor A': 'id_a',
    'Unique identifier for interactor B': 'id_b',
}

# Keep only the identifier columns and drop rows where either side is "-".
raul_renamed_df = raul_df.rename(columns=id_column_aliases)
raul_id_columns_df = raul_renamed_df.loc[:, 'id_a':'id_b']
raul_edges_df = raul_id_columns_df.query('id_a != "-" and id_b != "-"')

# Every identifier that appears on either side of a retained row.
raul_nodes_set = set(raul_edges_df['id_a']) | set(raul_edges_df['id_b'])

# De-duplicate rows (as tuples) and let xswap translate the string
# identifiers; `raul_map` is reused below to translate the second edge list.
raul_row_tuples = raul_edges_df.apply(tuple, axis=1).tolist()
raul_str_edges = list(set(raul_row_tuples))
raul_edges, raul_map, _ = xswap.preprocessing.map_str_edges(
    raul_str_edges, bipartite=False)

# Same cleaning for the second table, then keep only rows whose two
# endpoints both occur in the first network.
lit_pairs_df = (
    lit_df
    .rename(columns=id_column_aliases)
    .loc[:, 'id_a':'id_b']
    .query('id_a != "-" and id_b != "-"')
)
source_is_known = lit_pairs_df['id_a'].apply(lambda node: node in raul_nodes_set)
target_is_known = lit_pairs_df['id_b'].apply(lambda node: node in raul_nodes_set)
lit_row_tuples = (
    lit_pairs_df
    .loc[source_is_known & target_is_known]
    .apply(tuple, axis=1)
    .tolist()
)
lit_str_edges = list(set(lit_row_tuples))

# Translate both endpoints through the mapping built for the first network.
lit_edges = [(raul_map[edge[0]], raul_map[edge[1]]) for edge in lit_str_edges]

### 03. Construct matrices and DataFrames

In [4]:
def edges_to_matrix(edges, n_nodes):
    """Build a dense ``n_nodes x n_nodes`` integer matrix from an edge list.

    Parameters
    ----------
    edges : iterable of (int, int)
        ``(row, column)`` index pairs. A pair that occurs more than once
        contributes once per occurrence, because duplicate COO entries are
        summed on conversion to a dense array.
    n_nodes : int
        Number of rows and columns of the result.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_nodes, n_nodes)``; all zeros when
        ``edges`` is empty.
    """
    edges = list(edges)
    if edges:
        # Materialise explicit index arrays: coo_matrix is documented to take
        # (data, (row, col)), and a one-shot zip iterator is not guaranteed to
        # survive being inspected more than once inside the constructor.
        rows, cols = (np.asarray(index, dtype=int) for index in zip(*edges))
    else:
        # zip(*[]) yields nothing to unpack, so handle "no edges" explicitly.
        rows = cols = np.empty(0, dtype=int)

    sp = scipy.sparse.coo_matrix(
        (np.ones(len(rows), dtype=int), (rows, cols)),
        shape=(n_nodes, n_nodes),
        dtype=int,
    )
    return sp.toarray()

In [5]:
# One matrix row/column per entry of `raul_map`.
node_count = len(raul_map)

# All ordered (source, target) index pairs with the target index varying
# fastest, i.e. the same order in which `.flatten()` walks the matrices below.
source_id, target_id = zip(*itertools.product(range(node_count), repeat=2))

raul_adj = edges_to_matrix(raul_edges, node_count)
lit_adj = edges_to_matrix(lit_edges, node_count)

# Scores for every node pair, computed on the first network only.
rwr_mat = analysis.all_pairs_rwr(raul_adj, restart_prob=0.25)

# Long-format table: one row per ordered node pair.
df = pd.DataFrame({
    'source_id': source_id,
    'target_id': target_id,
    'raul_edge': raul_adj.flatten(),
    'lit_edge': lit_adj.flatten(),
    'rwr': rwr_mat.flatten(),
})
# Rank 1 is the largest `rwr` value; ties share the smallest rank in the group.
df['rank'] = df['rwr'].rank(method='min', ascending=False)

### 04. Compute p-values through permutation

In [6]:
n_perms = 1000

# Per-pair tally of permutations whose score strictly exceeds the observed one.
indicator_mat = np.zeros(raul_adj.shape, dtype=int)

# Starting edge list: each edge stored as (smaller, larger) endpoint and
# de-duplicated.
# NOTE(review): `rwr_mat` was computed from `raul_adj`, which is built from
# `raul_edges` without this reordering -- confirm the two are comparable.
permuted_edges = list({(min(edge), max(edge)) for edge in raul_edges})

for perm_seed in tqdm.trange(n_perms):
    # Each round permutes the previous round's output, seeded by the
    # iteration number.
    permuted_edges, _ = xswap.permute_edge_list(
        permuted_edges,
        allow_self_loops=True,
        allow_antiparallel=False,
        seed=perm_seed,
    )
    permuted_adj = edges_to_matrix(permuted_edges, len(raul_map))
    perm_rwr = analysis.all_pairs_rwr(permuted_adj, restart_prob=0.25)
    indicator_mat += (perm_rwr > rwr_mat)

# Turn the tallies into the fraction of permutations that beat the observed
# score.
indicator_mat = indicator_mat / n_perms

df['p_value'] = indicator_mat.flatten()
# Rank 1 is the smallest p-value; ties share the smallest rank in the group.
df['p_value_rank'] = df['p_value'].rank(method='min', ascending=True)

100%|██████████| 1000/1000 [2:07:56<00:00,  7.68s/it] 


In [8]:
# Persist the per-pair table as a gzip-compressed, tab-separated file without
# the row index.
df.to_csv(
    'p_vs_rank.tsv.gz',
    sep='\t',
    compression='gzip',
    index=False,
)