In [19]:
import urllib
import io
import itertools
import tempfile
import re

import numpy as np
import pandas as pd
import requests
import scipy.sparse
import sklearn.metrics
import tqdm

import xswap
import analysis

%matplotlib inline

In [2]:
def edges_to_matrix(edges):
    """Build a symmetric sparse adjacency matrix from an undirected edge list.

    Parameters
    ----------
    edges : iterable of (int, int)
        Node-index pairs. Any iterable is accepted, including one-shot
        iterators such as generators or ``zip`` objects. Orientation and
        repeated pairs are ignored: every pair contributes exactly one entry
        in each direction (a self-loop contributes one diagonal entry).

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix of shape ``(n_nodes, n_nodes)``, where ``n_nodes`` is the
        largest node index plus one, holding 1.0 at every connected pair.

    Raises
    ------
    ValueError
        If ``edges`` is empty (there is no largest node index).
    """
    # Materialize once: the edges are scanned twice below, which would silently
    # exhaust a generator after the first pass.
    edges = [tuple(edge) for edge in edges]
    n_nodes = max(map(max, edges)) + 1

    # Store both orientations of every edge; the set removes repeats so no
    # entry is counted twice when the COO matrix is later summed or densified.
    symmetric_pairs = set()
    for edge in edges:
        low, high = sorted(edge)
        symmetric_pairs.add((low, high))
        symmetric_pairs.add((high, low))

    rows, cols = zip(*symmetric_pairs)
    sp_mat = scipy.sparse.coo_matrix(
        (np.ones(len(symmetric_pairs)), (rows, cols)), shape=(n_nodes, n_nodes))
    return sp_mat

## Literature-curated

'mmc2.xlsx' is Table S1 in https://doi.org/10.1016/j.cell.2014.10.050

In [3]:
# Supplementary workbook (Table S1 of https://doi.org/10.1016/j.cell.2014.10.050).
mmc2_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S0092867414014226-mmc2.xlsx'

# Load sheets '1A' and '1B' with a single read_excel call, then re-key them by
# the year labels (2010, 2013) that the rest of the notebook uses.
lit_bm_dict = pd.read_excel(mmc2_url, sheet_name=['1A', '1B'])
lit_bm_dict = dict(zip((2010, 2013), (lit_bm_dict['1A'], lit_bm_dict['1B'])))

lit_bm_dict[2013].head()

Unnamed: 0,entrez_gene_ida,entrez_gene_idb,gene_symbol_a,gene_symbol_b,pieces_of_evidence,source_databases,evidence
0,4790,79155,NFKB1,TNIP2,4,"biogrid,dip,intact,mint,hprd","14743216:MI:0004,15169888:MI:0004,15169888:MI:..."
1,7879,83547,RAB7A,RILP,6,"biogrid,dip,pdb,hprd","11179213:MI:0018,11179213:MI:0096,11448994:MI:..."
2,3932,80306,LCK,MED28,3,"intact,hprd","10656681:MI:0018,16899217:MI:0096,16899217:MI:..."
3,53840,85363,TRIM34,TRIM5,2,"biogrid,intact","16828831:MI:0018,22493164:MI:0018"
4,5705,8856,PSMC5,NR1I2,2,"bind,hprd","14702340:MI:0018,15604093:MI:0018"


## High-throughput

In [4]:
# Download and combine Marc Vidal lab's published networks
network_files = [
    'Raul-Vidal(Nature_2005).psi',
    'Venkatesan-Vidal(Nature_Methods_2009).psi',
    'Yu-Vidal(Nature_Methods_2011).psi',
    'Rolland-Vidal(Cell_2014).psi',
    'Yang-Vidal(Cell_2016).psi',
]

# Read every file, tag its rows with the file name, and concatenate once.
# (Growing a frame with pd.concat inside a loop re-copies all previous rows on
# each iteration and starts from an empty frame, which recent pandas warns about.)
ht_df = pd.concat(
    [
        pd.read_table(f'http://interactome.baderlab.org/data/{file}').assign(source=file)
        for file in network_files
    ],
    ignore_index=True,
)

# Everything after the "uniprotkb:" prefix is taken as the UniProt accession.
uniprot_pattern = re.compile(r'(?<=uniprotkb:).+')

ht_edges_df = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida',
        'Unique identifier for interactor B': 'idb'})
    # Only rows whose identifier is the "-" placeholder are removed here; an
    # identifier without the "uniprotkb:" prefix would make .group() fail below.
    .query('ida != "-" and idb != "-"')
    .filter(items=['ida', 'idb',])
    .assign(
        uniprot_a = lambda df: df['ida'].apply(lambda x: uniprot_pattern.search(x).group()),
        uniprot_b = lambda df: df['idb'].apply(lambda x: uniprot_pattern.search(x).group())
    )
    .drop_duplicates()
)

# All distinct accessions that appear on either side of an interaction.
ht_nodes = set(ht_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())

In [5]:
# Note: A major source of error arises here, because many UniProt IDs end with, for example, "-1",
#    indicating a particular isoform. These do not map to entrez_gene_id. For simplicity, all such
#    IDs are currently ignored: they are sent to the server, but no mapping is returned for them.
# NOTE(review): UniProt has replaced the legacy `uploadlists` service with a REST ID-mapping API;
#    confirm that this endpoint still responds before relying on a fresh run.

# Get a mapping of nodes in the combined network, querying in fixed-size batches.
url = 'https://www.uniprot.org/uploadlists/'
batch_size = 900
sorted_nodes = sorted(ht_nodes)  # sort once so batches are stable and non-overlapping

mapping_frames = []
# Stepping by batch_size visits exactly the non-empty batches (no empty trailing queries).
for start in range(0, len(sorted_nodes), batch_size):
    params = {
        'from': 'ACC',
        'to': 'P_ENTREZGENEID',
        'format': 'tab',
        'query': ' '.join(sorted_nodes[start:start + batch_size]),
    }
    res = requests.get(url, params=params)
    # Fail loudly on an HTTP error instead of parsing an error page as a table.
    res.raise_for_status()
    mapping_frames.append(pd.read_table(io.StringIO(res.content.decode("utf-8"))))

mapping_df = (
    pd.concat(mapping_frames, ignore_index=True)
    if mapping_frames else pd.DataFrame(columns=['From', 'To'])
)

# Apply the mapping into the dataframe. Inner merges drop every interaction for
# which either accession received no Entrez mapping.
ht_edges_df = (
    ht_edges_df
    # Start from the accession columns only, so re-running this cell does not
    # merge on top of the Entrez columns produced by a previous run.
    .filter(items=['ida', 'idb', 'uniprot_a', 'uniprot_b'])
    .drop_duplicates()
    .merge(mapping_df, left_on='uniprot_a', right_on='From')
    .merge(mapping_df, left_on='uniprot_b', right_on='From')
    .rename(columns={'To_x': 'entrez_gene_ida', 'To_y': 'entrez_gene_idb'})
    .drop(columns=['From_x', 'From_y'])
    .astype({'entrez_gene_ida': int, 'entrez_gene_idb': int})
)
ht_edges_df.head()

Unnamed: 0,ida,idb,uniprot_a,uniprot_b,entrez_gene_ida,entrez_gene_idb
0,uniprotkb:Q8IYI6,uniprotkb:A0A024R0Y4,Q8IYI6,A0A024R0Y4,149371,6871
1,uniprotkb:O15287,uniprotkb:A0A024R0Y4,O15287,A0A024R0Y4,2189,6871
2,uniprotkb:Q8WW24,uniprotkb:A0A024R0Y4,Q8WW24,A0A024R0Y4,150483,6871
3,uniprotkb:Q12874,uniprotkb:A0A024R0Y4,Q12874,A0A024R0Y4,10946,6871
4,uniprotkb:Q9Y4H4,uniprotkb:A0A024R0Y4,Q9Y4H4,A0A024R0Y4,63940,6871


In [6]:
# Entrez gene IDs present in each literature-curated table (either column).
lit_nodes_10, lit_nodes_13 = (
    set(lit_bm_dict[year].loc[:, 'entrez_gene_ida':'entrez_gene_idb'].values.ravel())
    for year in (2010, 2013)
)
lit_nodes = lit_nodes_10 | lit_nodes_13

# Entrez gene IDs present in the high-throughput network.
ht_nodes_entrez = set(ht_edges_df.loc[:, 'entrez_gene_ida':].values.ravel())

# Genes found in both sources, and every ordered pair of them (self-pairs included).
shared_nodes = sorted(lit_nodes & ht_nodes_entrez)
source, target = zip(*itertools.product(shared_nodes, repeat=2))

len(shared_nodes)

533

In [7]:
# Label every ordered pair of shared nodes with its membership in each network.
#
# Each right-hand table is reduced to its two key columns and de-duplicated
# before merging: a left merge repeats a left row once per matching right row,
# so a pair listed more than once (e.g. several UniProt accessions mapping to
# the same Entrez gene) would otherwise multiply rows and inflate the counts.
#
# Matching is orientation-sensitive: a pair is flagged only in the
# (entrez_gene_ida, entrez_gene_idb) order in which the source table stores it.
pair_cols = ['entrez_gene_ida', 'entrez_gene_idb']

edges_df = (
    pd.DataFrame
    .from_dict({
        'entrez_gene_ida': source,
        'entrez_gene_idb': target,
    })
    .merge(
        lit_bm_dict[2010][pair_cols]
        .drop_duplicates()
        .assign(lit_2010=1),
        on=pair_cols,
        how='left',
    )
    .merge(
        lit_bm_dict[2013][pair_cols]
        .drop_duplicates()
        .assign(lit_2013=1),
        on=pair_cols,
        how='left',
    )
    .merge(
        ht_edges_df[pair_cols]
        .drop_duplicates()
        .assign(ht_2014=1),
        on=pair_cols,
        how='left',
    )
    # Pairs absent from a network have no match and therefore get indicator 0.
    .fillna(0)
)

# One row per ordered node pair: the merges must not have added or removed rows.
assert len(edges_df) == len(source), 'merge changed the number of node pairs'

edges_df.head()

Unnamed: 0,entrez_gene_ida,entrez_gene_idb,lit_2010,lit_2013,ht_2014
0,60,60,1.0,0.0,1.0
1,60,71,0.0,0.0,1.0
2,60,89,0.0,0.0,0.0
3,60,185,0.0,0.0,0.0
4,60,259,0.0,0.0,0.0


In [8]:
# Column totals for everything except the two node-ID columns.
edges_df.drop(columns=list(edges_df.columns[:2])).sum()

lit_2010    121.0
lit_2013    250.0
ht_2014     752.0
dtype: float64

In [9]:
# Shape of the subset of pairs flagged in the high-throughput network but in
# neither literature-curated network.
edges_df.loc[
    edges_df['lit_2010'].eq(0)
    & edges_df['lit_2013'].eq(0)
    & edges_df['ht_2014'].eq(1)
].shape

(670, 5)

In [10]:
network_edges = dict()
network_mappings = dict()
network_matrices = dict()

# Analysis parameters, set once for both networks.
n_perms = 1000
rwr_param = 0.25  # second argument of analysis.all_pairs_rwr; presumably the restart probability — confirm

for network in ['lit_2010', 'lit_2013']:
    rwr_col = f'{network}_rwr'
    p_value_col = f'{network}_p_value'

    # Extract edge tuples; sorting each pair and taking the set leaves one
    # entry per undirected edge.
    relevant_df = edges_df.query(f'{network} == 1')
    edges = list(zip(relevant_df['entrez_gene_ida'], relevant_df['entrez_gene_idb']))
    edges = list(set(map(tuple, map(sorted, edges))))
    mapped_edges, mapping, _ = (
        xswap.preprocessing.map_str_edges(edges, bipartite=False))

    # Create adjacency matrix
    max_id = max(mapping.values())
    sp_mat = edges_to_matrix(mapped_edges)

    network_edges[network] = mapped_edges
    network_mappings[network] = mapping
    network_matrices[network] = sp_mat

    # Compute RWR
    rwr_mat = analysis.all_pairs_rwr(sp_mat, rwr_param)

    # Compute RWR p-value: the fraction of permuted networks in which a pair's
    # RWR score is strictly greater than its score in the real network.
    indicator = np.zeros_like(rwr_mat)
    perm_edges = mapped_edges.copy()
    # Each permutation starts from the previous permuted edge list, with a
    # distinct seed per step so the run is reproducible.
    # NOTE(review): tqdm.tnrange is deprecated in recent tqdm releases in favour
    # of tqdm.notebook.trange; switch once the environment's tqdm version is confirmed.
    for i in tqdm.tnrange(n_perms):
        perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_self_loops=True,
                                                allow_antiparallel=False, seed=i)
        perm_mat = edges_to_matrix(perm_edges)
        perm_rwr = analysis.all_pairs_rwr(perm_mat, rwr_param)
        indicator += (perm_rwr > rwr_mat)
    p_values = indicator / n_perms

    # Unmap RWR and add to DataFrame. The (source, target) product is generated
    # in row-major order so it lines up with the flattened matrices.
    reversed_map = {v: k for k, v in mapping.items()}
    mapped_source, mapped_target = zip(*itertools.product(range(max_id+1), range(max_id+1)))
    mapped_rwr_df = (
        pd.DataFrame
        .from_dict({
            'mapped_source': mapped_source,
            'mapped_target': mapped_target,
            rwr_col: rwr_mat.flatten(),
            p_value_col: p_values.flatten()
        })
        .assign(
            entrez_gene_ida=lambda df: df['mapped_source'].map(reversed_map),
            entrez_gene_idb=lambda df: df['mapped_target'].map(reversed_map)
        )
        .filter(items=['entrez_gene_ida', 'entrez_gene_idb', rwr_col, p_value_col])
    )
    edges_df = (
        edges_df
        # Drop results left over from a previous run of this cell; otherwise the
        # merge would emit `_x`/`_y` suffixed columns and later cells would
        # fail to find the expected column names.
        .drop(columns=[rwr_col, p_value_col], errors='ignore')
        .merge(mapped_rwr_df, on=['entrez_gene_ida', 'entrez_gene_idb'], how='left')
        # NOTE(review): this also sets the p-value of pairs that are not part of
        # this network to 0; confirm that 0 (rather than 1 or NaN) is intended.
        .fillna(0)
    )

edges_df.to_csv('p_vs_rank.tsv.gz', compression='gzip', sep='\t')

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [11]:
# Preview the first five rows, now including the RWR and p-value columns.
edges_df.head(n=5)

Unnamed: 0,entrez_gene_ida,entrez_gene_idb,lit_2010,lit_2013,ht_2014,lit_2010_rwr,lit_2010_p_value,lit_2013_rwr,lit_2013_p_value
0,60,60,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,60,71,0.0,0.0,1.0,0.0,0.395,0.0,0.0
2,60,89,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60,185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60,259,0.0,0.0,0.0,0.0,0.588,0.0,0.0


In [14]:
# AUROC of the 2010 literature-network RWR score as a predictor of high-throughput edges.
sklearn.metrics.roc_auc_score(
    y_true=edges_df['ht_2014'],
    y_score=edges_df['lit_2010_rwr'],
)

0.5492824970430713

In [15]:
# AUROC of the 2010 permutation p-value as a predictor of high-throughput edges.
# NOTE(review): roc_auc_score ranks larger scores as more positive, whereas a
# smaller p-value indicates a more significant pair, and pairs outside the
# literature network carry a filled-in p-value of 0 — confirm the intended
# scoring direction before interpreting this number.
sklearn.metrics.roc_auc_score(
    y_true=edges_df['ht_2014'],
    y_score=edges_df['lit_2010_p_value'],
)

0.5095103199460361

In [16]:
# AUROC of the 2013 literature-network RWR score as a predictor of high-throughput edges.
sklearn.metrics.roc_auc_score(
    y_true=edges_df['ht_2014'],
    y_score=edges_df['lit_2013_rwr'],
)

0.5868134081278051

In [17]:
# AUROC of the 2013 permutation p-value as a predictor of high-throughput edges.
# NOTE(review): roc_auc_score ranks larger scores as more positive, whereas a
# smaller p-value indicates a more significant pair, and pairs outside the
# literature network carry a filled-in p-value of 0 — confirm the intended
# scoring direction before interpreting this number.
sklearn.metrics.roc_auc_score(
    y_true=edges_df['ht_2014'],
    y_score=edges_df['lit_2013_p_value'],
)

0.4721691002240965