In [1]:
import itertools
import pathlib
import re

import networkx as nx
import numpy as np
import pandas as pd
import requests

# 1. Download raw files

In [2]:
data_path = pathlib.Path('../data/1.raw/')
data_path.mkdir(exist_ok=True, parents=True)

file_to_url = {
    'ppi_string.txt.gz': ('https://stringdb-static.org/download/protein.links.v11.0/'
                          '9606.protein.links.v11.0.txt.gz'),
    
    'ppi_string_mapping.tsv.gz': ('https://string-db.org/mapping_files/uniprot/'
                                  'human.uniprot_2_string.2018.tsv.gz'),
    
    'ppi_ht_1.psi': 'http://interactome.baderlab.org/data/Raul-Vidal(Nature_2005).psi',
    'ppi_ht_2.psi': 'http://interactome.baderlab.org/data/Rolland-Vidal(Cell_2014).psi',
    
    'tftg.gmt': ('https://static-content.springer.com/esm/art%3A10.1186%2Fs12915-017-0469-0/'
                 'MediaObjects/12915_2017_469_MOESM5_ESM.gmt')
}

# for file, url in file_to_url.items():
#     with open(data_path.joinpath(file), 'wb') as f:
#         res = requests.get(url)
#         f.write(res.content)

# 2. Process files to edges

Processing is generally as follows: 

(Note this example is for an undirected network with self-loops)

1. Convert raw relationship data (in whatever form) to 

| source 	| target 	| network A 	| network B 	|
|--------	|--------	|-----------	|-----------	|
| A      	| B      	| 1         	| 0         	|
| A      	| C      	| 1         	| 1         	|
| B      	| C      	| 0         	| 1         	|

2. Assign 70% of Network1 edges to the training network. Map the nodes in the training network to the integers 0, ..., num(nodes)-1. If the network is undirected, ensure that `id_a` $\leq$ `id_b`. If the network is directed, index the source nodes first, (0, ..., num(source)-1), then target nodes (num(source),...). This mapping is done for the convenience of XSwap later. Results in `[network name]_edges_df`, which have the following schema:

| source 	| target 	| source_id 	| target_id 	| train 	| network A 	| network B 	|
|--------	|--------	|-----------	|-----------	|-------	|-----------	|-----------	|
| A      	| B      	| 0         	| 1         	| 0     	| 1         	| 0         	|
| A      	| C      	| 0         	| 2         	| 1     	| 1         	| 1         	|
| B      	| C      	| 1         	| 2         	| 0     	| 0         	| 1         	|

3. Take the subset of nodes that have an edge in the training network. The Cartesian product of these nodes will be the `[network_name]_df`, which have the following schema:

| source 	| target 	| source_id 	| target_id 	| train 	| network A 	| network B 	|
|--------	|--------	|-----------	|-----------	|-------	|-----------	|-----------	|
| A      	| A      	| 0         	| 0         	| 0     	| 0         	| 0         	|
| A      	| B      	| 0         	| 1         	| 0     	| 1         	| 0         	|
| A      	| C      	| 0         	| 2         	| 1     	| 1         	| 1         	|
| B      	| B      	| 1         	| 1         	| 0     	| 0         	| 0         	|
| B      	| C      	| 1         	| 2         	| 0     	| 0         	| 1         	|
| C      	| C      	| 2         	| 2         	| 0     	| 0         	| 0         	|


## 2.1 PPI

### 2.1.1 STRING

https://string-db.org/

The two PPI networks use different mappings. We convert STRING to UniProt identifiers.

In [3]:
# Ensembl to UniProtKB identifier mappings
mapping_df = pd.read_table(data_path.joinpath('ppi_string_mapping.tsv.gz'), 
                           compression='gzip', names=['species', 'uniprot_entry', 'string', 
                                                      'unknown_a', 'unknown_b'])

# Create dictionary with mappings
string_to_uniprot = (
    mapping_df
    .assign(uniprot=lambda df: df['uniprot_entry'].apply(lambda x: re.search('[A-Z0-9]+', x).group()))
    .set_index('string')
    .loc[:, 'uniprot']
    .to_dict()
)

# Load PPI network from STRING
string_edges_df = (
    pd.read_table(data_path.joinpath('ppi_string.txt.gz'), compression='gzip', sep=' ')
    .assign(
        uniprot_a=lambda df: df['protein1'].map(string_to_uniprot),
        uniprot_b=lambda df: df['protein2'].map(string_to_uniprot),
        test_recon=1,
    )
    .filter(items=['uniprot_a', 'uniprot_b', 'test_recon'])
)

# Some STRING identifiers cannot be mapped to UniProt. These appear as NA in string_df
percent_unmapped = 100 * (string_edges_df.shape[0] - string_edges_df.dropna().shape[0]) \
                   / string_edges_df.shape[0]

string_edges_df = string_edges_df.dropna()

print(f'{percent_unmapped :.3f} percent of edges had a node that could not be mapped to UniProt')

string_edges_df.head(2)

2.385 percent of edges had a node that could not be mapped to UniProt


Unnamed: 0,uniprot_a,uniprot_b,test_recon
1,P84085,O43307,1
2,P84085,O75460,1


### 2.1.2 High-throughput PPI network

We use two networks from the same group, both created through high-throughput screening. Data is available for download at http://interactome.baderlab.org/download.

Rual et al. (2005) *Nature* https://www.ncbi.nlm.nih.gov/pubmed/16189514

Rolland et al. (2014) *Cell* https://www.ncbi.nlm.nih.gov/pubmed/25416956

In [4]:
# Combine the two networks
ht_df = pd.concat([
    pd.read_csv(data_path.joinpath('ppi_ht_1.psi'), sep='\t'), 
    pd.read_csv(data_path.joinpath('ppi_ht_2.psi'), sep='\t')
], ignore_index=True)

ht_edges_df = (
    ht_df
    .rename(columns={
        'Unique identifier for interactor A': 'ida', 
        'Unique identifier for interactor B': 'idb'})
    .filter(items=['ida', 'idb',])
    .query('ida != "-" and idb != "-"')
    .assign(
        uniprot_a=lambda df: df['ida'].apply(lambda x: re.search('(?<=uniprotkb:)[0-9A-Z]+', x).group()),
        uniprot_b=lambda df: df['idb'].apply(lambda x: re.search('(?<=uniprotkb:)[0-9A-Z]+', x).group()),
        test_new=1,
    )
    .filter(items=['uniprot_a', 'uniprot_b', 'test_new'])
    .drop_duplicates()
)

ht_edges_df.head(2)

Unnamed: 0,uniprot_a,uniprot_b,test_new
142,O14964,A0A024R0Y4,1
144,O95990,A0A024R0Y4,1


### 2.1.3 Combined PPI network

Now, having two PPI networks both mapped to UniProt identifiers, we subset to the intersection of the two sets of nodes, using only nodes that are present in both networks. Then we map the shared nodes to IDs, unique integers from 0 to the number of shared nodes. This is done for efficiency in XSwap later on. Finally, as the edges are undirected, they are sorted so that the first ID is always <= the second ID. This ensures that we don't accidentally miss duplicates, etc.

In [5]:
# Only use nodes that are present in both networks
string_nodes = set(string_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())
ht_nodes = set(ht_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten())
shared_nodes = set(string_nodes.intersection(ht_nodes))

print(f'STRING: {len(string_nodes)} nodes\nHT: {len(ht_nodes)} nodes\n'
      f'SHARED: {len(shared_nodes)} nodes')

# Join DataFrames and subset to node pairs consisting only of nodes shared between both networks
np.random.seed(0)
ppi_edges_df = (
    string_edges_df
    .merge(ht_edges_df, how='outer', on=['uniprot_a', 'uniprot_b'])
    .loc[lambda df: (df['uniprot_a'].apply(lambda x: x in shared_nodes) & 
                     df['uniprot_b'].apply(lambda x: x in shared_nodes))]
    .fillna(0)
    .assign(
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7).astype(int),
        test_recon=lambda df: df['test_recon'].astype(int),
        test_new=lambda df: df['test_new'].astype(int),
    )
)

# Map nodes onto unique integers (for XSwap)
ppi_nodes = sorted(set(ppi_edges_df.loc[:, 'uniprot_a':'uniprot_b'].values.flatten()))
ppi_mapping = {name: i for name, i in zip(ppi_nodes, range(len(ppi_nodes)))}
ppi_reversed_mapping = {v: k for k, v in ppi_mapping.items()}

# Create a DF of all edges whose nodes have an edge in at least one of the networks
ppi_edges_df = (
    ppi_edges_df
    .assign(
        mapped_a=lambda df: df['uniprot_a'].map(ppi_mapping),
        mapped_b=lambda df: df['uniprot_b'].map(ppi_mapping),
    )
    # Drop node pairs with nodes not in the train network
    .dropna()
    .assign(
        # Edges are bi-directional, so make id_a <= id_b
        id_a=lambda df: df.apply(lambda row: min(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        id_b=lambda df: df.apply(lambda row: max(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        
        # Re-ordering means that UniProt IDs may now be reversed. 
        # Apply reverse mapping to ensure correctness.
        uniprot_a=lambda df: df['id_a'].map(ppi_reversed_mapping),
        uniprot_b=lambda df: df['id_b'].map(ppi_reversed_mapping),
    )
    .filter(items=['uniprot_a', 'uniprot_b', 'id_a', 'id_b', 'train', 'test_recon', 'test_new'])
    .reset_index(drop=True)
)

STRING: 19080 nodes
HT: 4517 nodes
SHARED: 4083 nodes


In [6]:
# Create the final DF of pairings for all nodes that appear in the training network
uniprot_a, uniprot_b = zip(*itertools.product(ppi_nodes, ppi_nodes))

ppi_df = (
    pd.DataFrame()
    .assign(
        uniprot_a=uniprot_a,
        uniprot_b=uniprot_b,
        id_a=lambda df: df['uniprot_a'].map(ppi_mapping),
        id_b=lambda df: df['uniprot_b'].map(ppi_mapping),
    )
    # Do not include duplicates of edges. 
    .query('id_a <= id_b')
    # Merge with DF that already satisfies id_a <= id_b
    .merge(ppi_edges_df, how='left', on=['id_a', 'id_b'])
    .fillna(0)
    .assign(
        train=lambda df: df['train'].astype(int),
        test_recon=lambda df: df['test_recon'].astype(int),
        test_new=lambda df: df['test_new'].astype(int),
    )
)

data_path.parent.joinpath('2.edges/').mkdir(exist_ok=True, parents=True)
ppi_df.to_csv(data_path.parent.joinpath('2.edges/ppi.tsv.xz'), compression='xz',
             index=False, sep='\t')

ppi_df.head(2)

Unnamed: 0,uniprot_a_x,uniprot_b_x,id_a,id_b,uniprot_a_y,uniprot_b_y,train,test_recon,test_new
0,A0A087WT00,A0A087WT00,0,0,0,0,0,0,0
1,A0A087WT00,A0A0B4J1W7,0,1,0,0,0,0,0


In [7]:
# Verify that the network is connected
ppi_edges = list(map(tuple, ppi_df.query('train == 1').loc[:, 'id_a':'id_b'].values))
G = nx.from_edgelist(ppi_edges)
nx.is_connected(G)

True

## 2.2 BioRxiv collaboration network

Rxivist full database (doi:10.5281/zenodo.2566421) at https://zenodo.org/record/2566421


I used the following query to export a copy of the Rxivist BioRxiv scrape.

```sqlite
SELECT 
    aa.article, 
    art.title,
    auth.id as author_id, 
    auth.name as author_name, 
    auth.institution, 
    art.doi, 
    art.collection, 
    art.posted
FROM prod.article_authors aa
JOIN prod.authors auth
	ON aa.author = auth.id
JOIN prod.articles art
	ON aa.article = art.id
```

## 2.3 Transcription factor - target gene (TFTG) network

Data is originally in the form:

Source: [target1, target2, ...]

and needs to first be reformatted as an edge list.

In [8]:
# Format data to edges
tftg_records = list()
with open(data_path.joinpath('tftg.gmt'), 'r') as f:
    for line in f.readlines():
        groups = line.strip().split('\t')
        tf_name, tf_entrez = groups[0].split('_')[-2:]
        method = re.match('\A([A-Za-z\-\ ]+)(?=(\ Transcriptional|\ TFTG))', groups[1]).group().lower()
        for gene in groups[2:]:
            tftg_records.append(
                (tf_name, gene, method)
            )

# Format edges to a DataFrame. Randomly assign 70% of low-throughput edges to train, with
# the withheld 30% being used for predicting reconstruction.
np.random.seed(0)
tftg_edges_df = (
    pd.DataFrame
    .from_records(tftg_records, columns=['tf_name', 'gene_name', 'method'])
    .assign(
        test_recon=lambda df: (df['method'] == 'low throughtput').astype(int),
        test_new=lambda df: (df['method'] == 'chip-seq').astype(int),
    )
    .groupby(['tf_name', 'gene_name'])[['test_recon', 'test_new']].sum()
    .reset_index()
    .assign(
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7).astype(int),
    )
)

tftg_edges_df.head(2)

Unnamed: 0,tf_name,gene_name,test_recon,test_new,train
0,AEBP2,AAGAB,0,1,0
1,AEBP2,ALDH4A1,0,1,0


In [9]:
# Create a mapping between nodes and integers. Make TFs the lowest values, and non-TF genes later
tfs = set(tftg_edges_df.query('train == 1').loc[:, 'tf_name'].values)
genes = set(tftg_edges_df.query('train == 1').loc[:, 'gene_name'].values)
genes_only = sorted(genes.difference(tfs))

tfs = sorted(tfs)
genes = sorted(genes)

tf_mapping = {tf: i for i, tf in enumerate(tfs)}
gene_mapping = {gene: len(tfs) + i for i, gene in enumerate(genes_only)}
tftg_mapping = {**tf_mapping, **gene_mapping}

print("{} unique transcription factors\n{} unique genes".format(len(tfs), len(genes)))

232 unique transcription factors
14913 unique genes


In [10]:
tf_name, gene_name = zip(*itertools.product(tfs, genes))

tftg_df = (
    pd.DataFrame()
    .assign(
        tf_name=tf_name,
        gene_name=gene_name,
        tf_id=lambda df: df['tf_name'].map(tftg_mapping),
        gene_id=lambda df: df['gene_name'].map(tftg_mapping),
    )
    .merge(tftg_edges_df, how='left', on=['tf_name', 'gene_name'])
    .fillna(0)
    .assign(
        train=lambda df: df['train'].astype(int),
        test_recon=lambda df: df['test_recon'].astype(int),
        test_new=lambda df: df['test_new'].astype(int),
    )
    .filter(items=['tf_name', 'gene_name', 'tf_id', 'gene_id', 'train', 'test_recon', 'test_new'])
)

print("{} unique transcription factors\n{} unique genes".format(len(set(tftg_df['tf_name'])), 
                                                                len(set(tftg_df['gene_name']))))

tftg_df.to_csv(data_path.parent.joinpath('2.edges/tftg.tsv.xz'), compression='xz',
                     index=False, sep='\t')

tftg_df.head(2)

232 unique transcription factors
14913 unique genes


Unnamed: 0,tf_name,gene_name,tf_id,gene_id,train,test_recon,test_new
0,AHR,A1CF,0,232,0,0,0
1,AHR,A2M,0,233,0,0,0
