In [1]:
import collections
import itertools
import tempfile
import re

import numpy as np
import pandas as pd
import requests
import scipy.sparse
import tqdm

import xswap
import analysis

%matplotlib inline

## STRING

STRING gives HTTP 403 for `pd.read_table(url)`, but `requests.get` seems to work just fine.

In [2]:
# Download PPI network from STRING.
# The archive is fetched with requests and spooled through a temporary file
# that pandas then reopens by name.
string_url = 'https://stringdb-static.org/download/protein.links.v11.0/9606.protein.links.v11.0.txt.gz'
with tempfile.NamedTemporaryFile() as tf:
    res = requests.get(string_url)
    # Stop on an HTTP error rather than passing an error page to the gzip reader
    res.raise_for_status()
    tf.write(res.content)
    # pandas opens tf.name through a separate handle, so any bytes still held
    # in this handle's buffer must reach the file before it is read
    tf.flush()
    string_df = pd.read_table(tf.name, compression='gzip', sep=' ')

# Download Ensembl to UniProtKB identifier mappings.
# The file has no header row, so columns are addressed by position afterwards.
mapping_url = 'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'
with tempfile.NamedTemporaryFile() as tf:
    res = requests.get(mapping_url)
    # Stop on an HTTP error rather than passing an error page to the gzip reader
    res.raise_for_status()
    tf.write(res.content)
    # pandas opens tf.name through a separate handle, so any bytes still held
    # in this handle's buffer must reach the file before it is read
    tf.flush()
    mapping_df = pd.read_table(tf.name, compression='gzip', header=None)

# Build a lookup keyed by the values of mapping column 2, whose value is the
# first run of capitals/digits found in the matching entry of column 1.
# When a key occurs more than once, the last row wins.
_leading_accession = re.compile('[A-Z0-9]+')
map_to_uniprot = {
    string_id: _leading_accession.search(uniprot_field).group()
    for uniprot_field, string_id in zip(mapping_df[1], mapping_df[2])
}

# Translate both endpoints of every STRING edge through map_to_uniprot and
# keep only the edges for which both endpoints could be translated.
string_df = pd.DataFrame({
    column: string_df[protein].map(map_to_uniprot)
    for column, protein in [('uniprot_a', 'protein1'), ('uniprot_b', 'protein2')]
}).dropna()

# Every identifier that appears at either end of a remaining edge
string_nodes = set(string_df['uniprot_a']).union(string_df['uniprot_b'])

string_df.head(2)

Unnamed: 0,uniprot_a,uniprot_b
1,P84085,O43307
2,P84085,O75460


In [3]:
# Number of distinct identifiers collected in string_nodes
len(string_nodes)

19080

## High-throughput, systematic network

In [8]:
ht_1_url = 'http://interactome.baderlab.org/data/Raul-Vidal(Nature_2005).psi'
ht_2_url = 'http://interactome.baderlab.org/data/Rolland-Vidal(Cell_2014).psi'

# Read both tables and stack them under one fresh integer index
ht_df = pd.concat(
    [pd.read_table(url) for url in (ht_1_url, ht_2_url)],
    ignore_index=True,
)

_uniprot_accession = re.compile('(?<=uniprotkb:)[0-9A-Z]+')


def _extract_uniprot(identifier):
    """Return the run of capitals/digits that follows 'uniprotkb:' in `identifier`."""
    return _uniprot_accession.search(identifier).group()


# Interactor identifier columns, restricted to rows where neither side is "-"
interactor_ids = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida',
        'Unique identifier for interactor B': 'idb'})
    .filter(items=['ida', 'idb'])
    .query('ida != "-" and idb != "-"')
)

# One row per distinct (uniprot_a, uniprot_b) pair; the first occurrence is kept
ht_edges_df = pd.DataFrame({
    'uniprot_a': interactor_ids['ida'].apply(_extract_uniprot),
    'uniprot_b': interactor_ids['idb'].apply(_extract_uniprot),
}).drop_duplicates()

# Every identifier that appears at either end of an edge
ht_nodes = set(ht_edges_df['uniprot_a']).union(ht_edges_df['uniprot_b'])

ht_edges_df.head(2)

Unnamed: 0,uniprot_a,uniprot_b
2211,A0A0R4J2E4,A0A024R0Y4
2214,O60573,A0A024R0Y4


In [9]:
# Number of distinct identifiers collected in ht_nodes
len(ht_nodes)

4517

## Combine data

In [10]:
# Identifiers present in both networks, in sorted order
shared_nodes = sorted(string_nodes.intersection(ht_nodes))

# Source/target columns of every ordered node pair (self-pairs included), in
# the order itertools.product(shared_nodes, shared_nodes) yields them.
# They are built directly instead of via zip(*product(...)): that form passes
# every pair to zip as a separate argument, and its two-name unpacking raises
# ValueError when there are no shared nodes.
n_shared = len(shared_nodes)
source = tuple(itertools.chain.from_iterable(
    itertools.repeat(node, n_shared) for node in shared_nodes
))
target = tuple(shared_nodes) * n_shared

len(shared_nodes)

4083