In [1]:
import urllib
import io
import tempfile
import re

import pandas as pd
import requests

## Literature-curated

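`mmc2.xlsx` is Table S1 of the paper at https://doi.org/10.1016/j.cell.2014.10.050. Its sheets `1A` and `1B` are loaded below and keyed as the 2010 and 2013 literature-curated datasets, respectively.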

In [2]:
# Read both literature-curated sheets of Table S1 in a single call, then re-key the
# result by year: sheet '1A' -> 2010, sheet '1B' -> 2013.
sheet_to_year = {'1A': 2010, '1B': 2013}
lit_sheets = pd.read_excel('../data/p_vs_rank/mmc2.xlsx', sheet_name=list(sheet_to_year))
lit_bm_dict = {year: lit_sheets[sheet] for sheet, year in sheet_to_year.items()}

lit_bm_dict[2013].head()

## High-throughput

In [7]:
# Download and combine Marc Vidal lab's published networks.
# Each PSI file is fetched over HTTP, parsed as a tab-separated table, and tagged with
# the name of the file it came from in a `source` column.
network_files = [
    'Raul-Vidal(Nature_2005).psi',
    'Venkatesan-Vidal(Nature_Methods_2009).psi',
    'Yu-Vidal(Nature_Methods_2011).psi',
    'Rolland-Vidal(Cell_2014).psi',
    'Yang-Vidal(Cell_2016).psi',
]
source_frames = []
for file in network_files:
    res = requests.get(f'http://interactome.baderlab.org/data/{file}', timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page as network data.
    res.raise_for_status()
    # Parse straight from memory: writing to a NamedTemporaryFile and re-opening it by
    # name without flushing can read a truncated file (and fails outright on Windows).
    df = pd.read_table(io.BytesIO(res.content)).assign(source=file)
    source_frames.append(df)
# Concatenate once at the end rather than growing the frame inside the loop.
ht_df = pd.concat(source_frames, ignore_index=True)

# Build the edge list: keep only the two interactor-identifier columns, discard rows
# where either interactor is the "-" placeholder, and extract the accession that
# follows the "uniprotkb:" prefix of each identifier.
uniprot_pattern = r'uniprotkb:(.+)'
ht_edges_df = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida',
        'Unique identifier for interactor B': 'idb'})
    .query('ida != "-" and idb != "-"')
    .filter(items=['ida', 'idb'])
    .assign(
        # Vectorised extraction; an identifier without the prefix yields NaN instead of
        # raising AttributeError the way `re.search(...).group()` does on a non-match.
        uniprot_a=lambda df: df['ida'].str.extract(uniprot_pattern, expand=False),
        uniprot_b=lambda df: df['idb'].str.extract(uniprot_pattern, expand=False),
    )
    # Drop edges where either endpoint has no UniProt accession so that the node set
    # below contains only strings (it is sorted later, which NaN would break).
    .dropna(subset=['uniprot_a', 'uniprot_b'])
    .drop_duplicates()
)
# All distinct UniProt accessions that appear as either endpoint of an edge.
ht_nodes = set(ht_edges_df[['uniprot_a', 'uniprot_b']].to_numpy().ravel())

In [9]:
# Note: a major source of error arises here, because many UniProt IDs end with an isoform
#    suffix (for example "-1"). Such IDs do not map to entrez_gene_id. For simplicity they are
#    currently ignored: they are still sent to the server, but no mapping is returned for them.

# Get a mapping of nodes in the combined network (UniProt accession -> Entrez gene ID).
# NOTE(review): UniProt has replaced the legacy `uploadlists` service with its REST ID-mapping
#    API; confirm this endpoint still responds before relying on a re-run of this cell.
url = 'https://www.uniprot.org/uploadlists/'
# Batch size carried over from the original code; presumably chosen to keep each GET URL
# within server limits -- TODO confirm.
batch_size = 900
sorted_nodes = sorted(ht_nodes)  # sort once so batches are stable and disjoint
mapping_frames = []
# Step through the sorted IDs in exact batches; no trailing empty queries are sent.
for start in range(0, len(sorted_nodes), batch_size):
    node_query = sorted_nodes[start:start + batch_size]
    string_query = ' '.join(node_query)
    params = {
        'from': 'ACC',
        'to': 'P_ENTREZGENEID',
        'format': 'tab',
        'query': string_query,
    }
    res = requests.get(url, params=params, timeout=60)
    # Surface HTTP failures instead of parsing an error page as a mapping table.
    res.raise_for_status()
    body = res.content.decode("utf-8")
    if not body.strip():
        # Nothing came back for this batch; an empty body cannot be parsed as a table.
        continue
    mapping_frames.append(pd.read_table(io.StringIO(body)))

# Concatenate once; fall back to an empty From/To frame when there was nothing to map.
mapping_df = (
    pd.concat(mapping_frames, ignore_index=True)
    if mapping_frames
    else pd.DataFrame(columns=['From', 'To'])
)

# Apply the mapping into the dataframe.
# Each endpoint's UniProt accession is inner-joined against the From/To mapping, so edges
# with an unmapped endpoint are dropped and an accession that maps to several Entrez IDs
# yields one row per ID.
# Restricting to the four base columns first makes this cell safe to re-run: without it,
# a second run would merge on top of the already-mapped frame and produce duplicate
# `entrez_gene_ida` column names.
edge_base_cols = ['ida', 'idb', 'uniprot_a', 'uniprot_b']
mapping_a = mapping_df[['From', 'To']].rename(
    columns={'From': 'uniprot_a', 'To': 'entrez_gene_ida'})
mapping_b = mapping_df[['From', 'To']].rename(
    columns={'From': 'uniprot_b', 'To': 'entrez_gene_idb'})
ht_edges_df = (
    ht_edges_df
    .loc[:, edge_base_cols]
    .drop_duplicates()
    .merge(mapping_a, on='uniprot_a')
    .merge(mapping_b, on='uniprot_b')
)

ht_edges_df.head()

In [12]:
def _entrez_nodes(edges):
    """Return the set of values found in the Entrez endpoint columns of `edges`.

    The slice is bounded on both sides (`entrez_gene_ida` through `entrez_gene_idb`) so
    that every frame is treated the same way and any columns that follow
    `entrez_gene_idb` never leak into the node set.
    """
    return set(edges.loc[:, 'entrez_gene_ida':'entrez_gene_idb'].to_numpy().ravel())


# Node sets for the high-throughput network and for each literature-curated dataset.
ht_nodes_entrez = _entrez_nodes(ht_edges_df)
lit_nodes_13 = _entrez_nodes(lit_bm_dict[2013])
lit_nodes_10 = _entrez_nodes(lit_bm_dict[2010])
lit_nodes = lit_nodes_10 | lit_nodes_13

# Number of genes shared between the literature-curated and high-throughput networks.
len(lit_nodes & ht_nodes_entrez)

533