In [25]:
import itertools
import pathlib
import re

import networkx as nx
import numpy as np
import pandas as pd
import requests

# 1. Download raw files

In [2]:
data_path = pathlib.Path('../data/1.raw/')

file_to_url = {
    'ppi_string.txt.gz': ('https://stringdb-static.org/download/protein.links.v11.0/'
                          '9606.protein.links.v11.0.txt.gz'),
    
    'ppi_string_mapping.tsv.gz': ('https://string-db.org/mapping_files/uniprot/'
                                  'human.uniprot_2_string.2018.tsv.gz'),
    
    'ppi_ht_1.psi': 'http://interactome.baderlab.org/data/Raul-Vidal(Nature_2005).psi',
    'ppi_ht_2.psi': 'http://interactome.baderlab.org/data/Rolland-Vidal(Cell_2014).psi',
    
    'tftg.gmt': ('https://static-content.springer.com/esm/art%3A10.1186%2Fs12915-017-0469-0/'
                 'MediaObjects/12915_2017_469_MOESM5_ESM.gmt')
}

# for file, url in file_to_url.items():
#     with open(data_path.joinpath(file), 'wb') as f:
#         res = requests.get(url)
#         f.write(res.content)

# 2. Process files to edges

## 2.1 PPI

### 2.1.1 STRING

https://string-db.org/

The two PPI networks use different mappings. We convert STRING to UniProt identifiers.

In [3]:
# Ensembl to UniProtKB identifier mappings
mapping_df = pd.read_table(data_path.joinpath('ppi_string_mapping.tsv.gz'), 
                           compression='gzip', names=['species', 'uniprot_entry', 'string', 
                                                      'unknown_a', 'unknown_b'])

# Create dictionary with mappings
string_to_uniprot = (
    mapping_df
    .assign(uniprot=lambda df: df['uniprot_entry'].apply(lambda x: re.search('[A-Z0-9]+', x).group()))
    .set_index('string')
    .loc[:, 'uniprot']
    .to_dict()
)

# PPI network from STRING
string_edges_df = (
    pd.read_table(data_path.joinpath('ppi_string.txt.gz'), compression='gzip', sep=' ')
    .assign(
        uniprot_a=lambda df: df['protein1'].map(string_to_uniprot),
        uniprot_b=lambda df: df['protein2'].map(string_to_uniprot),
    )
    .filter(items=['uniprot_a', 'uniprot_b'])
)

# Some STRING identifiers cannot be mapped to UniProt. These appear as NA in string_df
percent_unmapped = 100 * (string_edges_df.shape[0] - string_edges_df.dropna().shape[0]) \
                   / string_edges_df.shape[0]

string_edges_df = string_edges_df.dropna()

print(f'{percent_unmapped :.3f} percent of edges had a node that could not be mapped to UniProt')

string_edges_df.head(2)

2.385 percent of edges had a node that could not be mapped to UniProt


Unnamed: 0,uniprot_a,uniprot_b
1,P84085,O43307
2,P84085,O75460


### 2.1.2 High-throughput PPI network

We use two networks from the same group, both created through high-throughput screening. Data is available for download at http://interactome.baderlab.org/download.

Rual et al. (2005) *Nature* https://www.ncbi.nlm.nih.gov/pubmed/16189514

Rolland et al. (2014) *Cell* https://www.ncbi.nlm.nih.gov/pubmed/25416956

In [4]:
ht_df = pd.concat([
    pd.read_csv(data_path.joinpath('ppi_ht_1.psi'), sep='\t'), 
    pd.read_csv(data_path.joinpath('ppi_ht_2.psi'), sep='\t')
], ignore_index=True)

ht_edges_df = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida', 
        'Unique identifier for interactor B': 'idb'})
    .filter(items=['ida', 'idb',])
    .query('ida != "-" and idb != "-"')
    .assign(
        uniprot_a = lambda df: df['ida'].apply(lambda x: re.search('(?<=uniprotkb:)[0-9A-Z]+', x).group()),
        uniprot_b = lambda df: df['idb'].apply(lambda x: re.search('(?<=uniprotkb:)[0-9A-Z]+', x).group())
    )
    .filter(items=['uniprot_a', 'uniprot_b',])
    .drop_duplicates()
)

ht_edges_df.head(2)

Unnamed: 0,uniprot_a,uniprot_b
142,O14964,A0A024R0Y4
144,O95990,A0A024R0Y4


### 2.1.3 Combined PPI network

Now, having two PPI networks both mapped to UniProt identifiers, we subset to the intersection of the two sets of nodes, using only nodes that are present in both networks. Then we map the shared nodes to IDs, unique integers from 0 to the number of shared nodes. This is done for efficiency in XSwap later on. Finally, as the edges are undirected, they are sorted so that the first ID is always <= the second ID. This ensures that we don't accidentally miss duplicates, etc.

In [5]:
string_nodes = set(string_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())
ht_nodes = set(ht_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())
shared_nodes = set(string_nodes.intersection(ht_nodes))

print(f'STRING: {len(string_nodes)} nodes\nHT: {len(ht_nodes)} nodes\n'
      f'SHARED: {len(shared_nodes)} nodes')

# Map nodes onto unique integers (for XSwap)
ppi_nodes = sorted(shared_nodes)
ppi_mapping = {name: i for name, i in zip(ppi_nodes, range(len(ppi_nodes)))}

STRING: 19080 nodes
HT: 4517 nodes
SHARED: 4083 nodes


In [6]:
%%time

np.random.seed(0)

ordered_string_edges = (
    string_edges_df
    .assign(
        mapped_a=lambda df: df['uniprot_a'].map(ppi_mapping),
        mapped_b=lambda df: df['uniprot_b'].map(ppi_mapping),
    )
    .dropna()
    .assign(
        id_a=lambda df: df.apply(lambda row: min(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        id_b=lambda df: df.apply(lambda row: max(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        test_recon=1,
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7)
    )
    .filter(items=['id_a', 'id_b', 'train', 'test_recon'])
)

ordered_ht_edges = (
    ht_edges_df
    .assign(
        mapped_a=lambda df: df['uniprot_a'].map(ppi_mapping),
        mapped_b=lambda df: df['uniprot_b'].map(ppi_mapping),
    )
    .dropna()
    .assign(
        id_a=lambda df: df.apply(lambda row: min(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        id_b=lambda df: df.apply(lambda row: max(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        test_new=1,
    )
    .filter(items=['id_a', 'id_b', 'test_new'])
)

CPU times: user 44.7 s, sys: 543 ms, total: 45.2 s
Wall time: 45.2 s


In [7]:
uniprot_a, uniprot_b = zip(*itertools.product(ppi_nodes, ppi_nodes))

ppi_df = (
    pd.DataFrame()
    .assign(
        uniprot_a=uniprot_a,
        uniprot_b=uniprot_b,
        id_a=lambda df: df['uniprot_a'].map(ppi_mapping),
        id_b=lambda df: df['uniprot_b'].map(ppi_mapping),
    )
    .query('id_a <= id_b')
    .merge(ordered_string_edges, how='left', on=['id_a', 'id_b'])
    .merge(ordered_ht_edges, how='left', on=['id_a', 'id_b'])
    .fillna(0)
    .assign(
        test_recon=lambda df: df['test_recon'].astype(int),
        test_new=lambda df: df['test_new'].astype(int),
    )
)

ppi_df.to_csv(data_path.parent.joinpath('2.edges/ppi.tsv.xz'), compression='xz',
             index=False, sep='\t')

ppi_df.head()

Unnamed: 0,uniprot_a,uniprot_b,id_a,id_b,train,test_recon,test_new
0,A0A087WT00,A0A087WT00,0,0,0,0,0
1,A0A087WT00,A0A0B4J1W7,0,1,0,0,0
2,A0A087WT00,A0AV96,0,2,0,0,0
3,A0A087WT00,A0AVK6,0,3,0,0,0
4,A0A087WT00,A0AVT1,0,4,0,0,0


In [29]:
ppi_edges = list(map(tuple, ppi_df.query('train == 1').loc[:, 'id_a':'id_b'].values))
G = nx.from_edgelist(ppi_edges)
nx.is_connected(G)

True

## 2.2 BioRxiv collaboration network

## 2.3 Transcription factor - target gene (TFTG) network

In [31]:
tftg_records = list()
with open(data_path.joinpath('tftg.gmt'), 'r') as f:
    for line in f.readlines():
        groups = line.strip().split('\t')
        tf_name, tf_entrez = groups[0].split('_')[-2:]
        method = re.match('\A([A-Za-z\-\ ]+)(?=(\ Transcriptional|\ TFTG))', groups[1]).group().lower()
        for gene in groups[2:]:
            tftg_records.append(
                (tf_name, gene, method)
            )

np.random.seed(0)
tftg_edges_df = (
    pd.DataFrame
    .from_records(tftg_records, columns=['tf_name', 'gene_name', 'method'])
    .assign(
        test_recon=lambda df: (df['method'] == 'low throughtput').astype(int),
        test_new=lambda df: (df['method'] == 'chip-seq').astype(int),
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7).astype(int),
    )
)

tfs = set(tftg_edges_df.query('train == 1').loc[:, 'tf_name'].values)
genes_only = set(tftg_edges_df.query('train == 1').loc[:, 'gene_name'].values)
genes_only = sorted(genes_only.difference(tfs))
tfs = sorted(tfs)

tf_mapping = {tf: i for i, tf in enumerate(tfs)}
gene_mapping = {gene: len(tfs) + i for i, gene in enumerate(genes_only)}
tftg_mapping = {**tf_mapping, **gene_mapping}

tftg_edges_df = (
    tftg_edges_df
    .assign(
        tf_id=lambda df: df['tf_name'].map(tftg_mapping).astype(int),
        gene_id=lambda df: df['gene_name'].map(tftg_mapping).astype(int),
    )
    .dropna()
)

print("{} unique transcription factors\n{} unique genes".format(len(set(tftg_edges_df['tf_name'])), 
                                                                len(set(tftg_edges_df['gene_name']))))

tftg_edges_df.head(2)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [34]:
set(tftg_mapping.keys()).difference(set(tftg_edges_df['tf_name']))

{'C6orf120',
 'APITD1-CORT',
 'DPYSL4',
 'KLHDC9',
 'RNF167',
 'METTL11B',
 'MLLT11',
 'NR2F1-AS1',
 'ELF3',
 'ING4',
 'RUNX1T1',
 'LRRC37A3',
 'DMKN',
 'COBLL1',
 'MRPL20',
 'P2RX4',
 'MIR5188',
 'ARRB1',
 'ZNF296',
 'C22orf34',
 'C3orf14',
 'ZNF573',
 'PROSER1',
 'MIR3610',
 'RASSF1',
 'AIG1',
 'AP3B1',
 'SF3B2',
 'ENAH',
 'C9orf91',
 'NUP50',
 'ZXDC',
 'KCNJ10',
 'MIR3661',
 'LOC642852',
 'SERPINA9',
 'FLJ10661',
 'MICALL1',
 'ZFX',
 'BCL7A',
 'CFL1P1',
 'NCAPG2',
 'LRRC49',
 'TRMT2A',
 'LOC100129973',
 'LOC100287177',
 'HS3ST3B1',
 'TYK2',
 'TBX6',
 'LOC150381',
 'RPIA',
 'RXRG',
 'BRDT',
 'PDGFC',
 'PANX1',
 'PRRC2C',
 'UBE2O',
 'DOC2A',
 'KIAA0430',
 'ARHGEF1',
 'ZNF260',
 'DYNC1I1',
 'PDHB',
 'SLC2A4RG',
 'VPS33B',
 'ZNF839',
 'BASP1',
 'NAA15',
 'BIRC7',
 'FAM89A',
 'OPA1',
 'PRKAG1',
 'BOD1',
 'EMR2',
 'C1R',
 'LINC00273',
 'LANCL2',
 'BBOX1',
 'LPCAT4',
 'RGS9',
 'SMAP1',
 'CDC37',
 'DLGAP5',
 'HAR1B',
 'MARCKSL1',
 'LOC157273',
 'PALLD',
 'CADM1',
 'CYTH1',
 'PDXDC1',
 'LYAR

In [33]:
tftg_edges_df['tf_name'].map(tftg_mapping).astype(int)

ValueError: Cannot convert non-finite values (NA or inf) to integer