In [17]:
import itertools
import tempfile
import re

import numpy as np
import pandas as pd
import requests
import scipy.sparse
import tqdm

import xswap
import analysis

%matplotlib inline

## STRING

STRING responds with HTTP 403 when the file is fetched directly via `pd.read_table(url)`, so the files are downloaded with `requests.get` (which works) and then parsed with pandas.

In [3]:
def _download_table(url, **read_kwargs):
    """Download a gzip-compressed table from `url` and parse it with pandas.

    The payload is fetched with `requests.get`, written to a named temporary
    file, and read back by name with `pd.read_table(..., compression='gzip')`.

    Parameters
    ----------
    url : str
        Location of the gzip-compressed, delimited text file.
    **read_kwargs
        Extra keyword arguments forwarded to `pd.read_table`.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    res = requests.get(url)
    # Fail here on an HTTP error rather than trying to gunzip an error page
    res.raise_for_status()
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(res.content)
        # The file is re-opened by name below, so buffered bytes must be
        # flushed first or the reader may see a truncated file
        tf.flush()
        return pd.read_table(tf.name, compression='gzip', **read_kwargs)


# Download PPI network from STRING
string_url = 'https://stringdb-static.org/download/protein.links.v11.0/9606.protein.links.v11.0.txt.gz'
string_links_df = _download_table(string_url, sep=' ')

# Download Ensembl to UniProtKB identifier mappings
mapping_url = 'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'
mapping_df = _download_table(mapping_url, header=None)

# Create dictionary with mappings: column 2 is used as the key, and the first
# run of [A-Z0-9] characters in column 1 as the UniProt value
map_to_uniprot = (
    mapping_df
    .assign(uniprot=lambda df: df[1].apply(lambda x: re.search('[A-Z0-9]+', x).group()))
    .set_index(2)
    .loc[:, 'uniprot']
    .to_dict()
)

# Translate both endpoints of every link; rows where either endpoint has no
# mapping are dropped
string_df = (
    string_links_df
    .assign(
        uniprot_a=lambda df: df['protein1'].map(map_to_uniprot),
        uniprot_b=lambda df: df['protein2'].map(map_to_uniprot),
    )
    .filter(items=['uniprot_a', 'uniprot_b'])
    .dropna()
    # One row per (uniprot_a, uniprot_b) so a left merge on these two columns
    # cannot multiply rows
    .drop_duplicates()
)
string_nodes = set(string_df.values.flatten())

In [4]:
# Number of distinct identifiers appearing in either column of string_df
len(string_nodes)

19080

## High-throughput

In [5]:
# Download a high-throughput network published by Marc Vidal's lab
# (a single file; "Nature 2005" per the file name)
ht_url = 'http://interactome.baderlab.org/data/Raul-Vidal(Nature_2005).psi'
ht_df = pd.read_table(ht_url)

# Accession that directly follows the first "uniprotkb:" prefix of an identifier
uniprot_pattern = r'uniprotkb:([0-9A-Z]+)'

ht_edges_df = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida',
        'Unique identifier for interactor B': 'idb'})
    .filter(items=['ida', 'idb'])
    # "-" marks a missing identifier
    .query('ida != "-" and idb != "-"')
    .assign(
        # Vectorized extraction: identifiers without a uniprotkb accession
        # become NaN instead of raising AttributeError on a failed re.search
        uniprot_a=lambda df: df['ida'].str.extract(uniprot_pattern, expand=False),
        uniprot_b=lambda df: df['idb'].str.extract(uniprot_pattern, expand=False),
    )
    .filter(items=['uniprot_a', 'uniprot_b'])
    # Discard interactions where either endpoint has no UniProt accession
    .dropna()
    .drop_duplicates()
)
ht_nodes = set(ht_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())

ht_edges_df.head()

In [7]:
# Number of distinct identifiers appearing in either column of ht_edges_df
len(ht_nodes)

1510

## Combine data

In [8]:
# Nodes present in both networks, in sorted order
shared_nodes = sorted(string_nodes & ht_nodes)

# Every ordered pair of shared nodes (self-pairs included), split into
# parallel tuples of first and second members
source, target = zip(*itertools.product(shared_nodes, repeat=2))

len(shared_nodes)

1389

In [9]:
# Columns that identify a node pair in every table below
pair_keys = ['uniprot_a', 'uniprot_b']

# One row per ordered pair of shared nodes
all_pairs_df = pd.DataFrame.from_dict({
    'uniprot_a': source,
    'uniprot_b': target,
})

# Flag each network's edges with an indicator column before joining
string_flagged_df = string_df.assign(string=1)
ht_flagged_df = ht_edges_df.assign(ht_2014=1)

# Left joins keep every node pair; pairs absent from a network get NaN in
# that network's indicator, which fillna turns into 0
# NOTE(review): the join matches (uniprot_a, uniprot_b) in the stored
# orientation only — confirm that is intended for the high-throughput edges
edges_df = (
    all_pairs_df
    .merge(string_flagged_df, how='left', on=pair_keys)
    .merge(ht_flagged_df, how='left', on=pair_keys)
    .fillna(0)
)

edges_df.head()

Unnamed: 0,uniprot_a,uniprot_b,string,ht_2014
0,A4D1E9,A4D1E9,0.0,0.0
1,A4D1E9,O00144,0.0,0.0
2,A4D1E9,O00148,1.0,0.0
3,A4D1E9,O00151,1.0,0.0
4,A4D1E9,O00160,0.0,0.0


In [10]:
# Extract edge tuples
string_edges_df = edges_df.query('string == 1')

# Sorting each pair collapses (a, b) and (b, a) into one undirected edge.
# The de-duplicated set is then sorted as well: iteration order of a set of
# string tuples changes between interpreter sessions (hash randomization),
# so an unsorted list would give a different edge order on every kernel.
# Presumably the integer mapping and the seeded permutations depend on that
# order — verify against xswap.
edges = sorted({
    tuple(sorted(pair))
    for pair in zip(string_edges_df['uniprot_a'], string_edges_df['uniprot_b'])
})
mapped_edges, mapping, _ = (
    xswap.preprocessing.map_str_edges(edges, bipartite=False))

# Create adjacency matrix
max_id = max(mapping.values())  # largest integer id assigned to a node
sp_mat = analysis.edges_to_matrix(mapped_edges)

In [None]:
# Second argument passed to every all_pairs_rwr call in this cell
rwr_param = 0.25

# RWR on the observed network
rwr_mat = analysis.all_pairs_rwr(sp_mat, rwr_param)

# Permutation p-value: fraction of permuted networks whose RWR score is
# strictly greater than the observed score, element by element
n_perms = 1000
indicator = np.zeros_like(rwr_mat)
perm_edges = mapped_edges.copy()
for perm_seed in tqdm.tnrange(n_perms):
    # Each permutation starts from the previous permuted edge list and uses
    # the iteration number as its seed
    perm_edges, _ = xswap.permute_edge_list(
        perm_edges,
        allow_self_loops=True,
        allow_antiparallel=False,
        seed=perm_seed,
    )
    perm_mat = analysis.edges_to_matrix(perm_edges)
    perm_rwr = analysis.all_pairs_rwr(perm_mat, rwr_param)
    indicator += perm_rwr > rwr_mat
p_values = indicator / n_perms

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

In [11]:
# Unmap RWR and add to DataFrame
reversed_map = {v: k for k, v in mapping.items()}

# Integer (source, target) ids for every matrix cell, in the same row-major
# order that .flatten() uses below
mapped_source, mapped_target = zip(*itertools.product(range(max_id + 1), repeat=2))
mapped_rwr_df = (
    pd.DataFrame
    .from_dict({
        'mapped_source': mapped_source,
        'mapped_target': mapped_target,
        'rwr': rwr_mat.flatten(),
        'p_value': p_values.flatten()
    })
    .assign(
        uniprot_a=lambda df: df['mapped_source'].map(reversed_map),
        uniprot_b=lambda df: df['mapped_target'].map(reversed_map)
    )
    .filter(items=['uniprot_a', 'uniprot_b', 'rwr', 'p_value'])
)
edges_df = (
    edges_df
    # Drop results left by an earlier run of this cell; otherwise the merge
    # would produce rwr_x / rwr_y (and p_value_x / p_value_y) suffix columns
    .drop(columns=['rwr', 'p_value'], errors='ignore')
    .merge(mapped_rwr_df, on=['uniprot_a', 'uniprot_b'], how='left')
    # NOTE(review): this also sets p_value to 0 for pairs that have no row in
    # mapped_rwr_df — confirm that 0 is the intended placeholder
    .fillna(0)
)

edges_df.to_csv('p_vs_rank.tsv.gz', compression='gzip', sep='\t', index=False)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [12]:
# Preview the first rows of edges_df
edges_df.head()

Unnamed: 0,entrez_gene_ida,entrez_gene_idb,lit_2010,lit_2013,ht_2014,lit_2010_rwr,lit_2010_p_value,lit_2013_rwr,lit_2013_p_value
0,60,60,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,60,71,0.0,0.0,1.0,0.0,0.395,0.0,0.0
2,60,89,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60,185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60,259,0.0,0.0,0.0,0.0,0.588,0.0,0.0
