In [1]:
import itertools
import os
import re

import numpy as np
import pandas as pd
import requests
import scipy.sparse
import tqdm
import tqdm.auto
import xswap

import analysis

# 01. Download raw transcription factor binding data

In [2]:
# Remote location of the raw TF-target gene sets (a .gmt file hosted by Springer
# for article 10.1186/s12915-017-0469-0) and the local path it is cached at.
tftg_url = ('https://static-content.springer.com/esm/art%3A10.1186%2Fs12915-017-0469-0/'
            'MediaObjects/12915_2017_469_MOESM5_ESM.gmt')
tftg_path = '../data/1.raw/tftg.gmt'

# Download only when the local copy is missing, so that Restart & Run All works on a
# fresh checkout while repeated runs neither hit the network nor overwrite the file.
if not os.path.exists(tftg_path):
    os.makedirs(os.path.dirname(tftg_path), exist_ok=True)
    res = requests.get(tftg_url, timeout=60)
    # Fail loudly instead of saving an HTTP error page as if it were data
    res.raise_for_status()
    with open(tftg_path, 'wb') as f:
        f.write(res.content)

# 02. Format data into edges

In [3]:
# Parse the GMT file into one (tf_entrez, tf_name, gene, method) record per
# TF -> target gene pair.
#
# Each line is tab-separated:
#   field 0  : identifier whose last two '_'-delimited pieces are the TF name and Entrez ID
#   field 1  : description that starts with the evidence method, followed by
#              ' Transcriptional' or ' TFTG'
#   field 2+ : target genes
#
# Raw string so that \A, \- and \  reach the regex engine without relying on Python
# passing unknown string escapes through (which emits a warning on newer versions).
method_pattern = re.compile(r'\A([A-Za-z\-\ ]+)(?=(\ Transcriptional|\ TFTG))')

records = list()
with open(tftg_path, 'r') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing on the unpacking below
            continue
        groups = line.strip().split('\t')
        tf_name, tf_entrez = groups[0].split('_')[-2:]
        method_match = method_pattern.match(groups[1])
        if method_match is None:
            raise ValueError('Unrecognized method description: {!r}'.format(groups[1]))
        # The lookahead is not consumed, so group() is just the method text
        method = method_match.group().lower()
        for gene in groups[2:]:
            records.append(
                (tf_entrez, tf_name, gene, method)
            )

# Seed the global NumPy RNG so the random train split below is reproducible
np.random.seed(0)
edges_df = (
    pd.DataFrame
    .from_records(records, columns=['tf_entrez', 'tf_name', 'gene_name', 'method'])
    .assign(
        # 'low throughtput' is spelled as it appears in the parsed method column
        # (see the displayed rows) -- do not "correct" it or nothing will match.
        test_recon=lambda df: (df['method'] == 'low throughtput').astype(int),
        test_new=lambda df: (df['method'] == 'chip-seq').astype(int),
        # Each test_recon edge is kept for training with probability 0.7; the RNG is
        # only consulted for test_recon rows because of the short-circuiting `and`.
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7).astype(int),
    )
)

print("{} unique transcription factors\n{} unique genes".format(len(set(edges_df['tf_name'])), 
                                                                len(set(edges_df['gene_name']))))

edges_df.head(2)

384 unique transcription factors
17155 unique genes


Unnamed: 0,tf_entrez,tf_name,gene_name,method,test_recon,test_new,train
0,10009,ZBTB33,CDKN2A,low throughtput,1,0,1
1,10009,ZBTB33,MMP7,low throughtput,1,0,0


# 03. Create train/test data

In [4]:
# Training edges as sorted (tf_name, gene_name) tuples
train_pairs = edges_df.query('train == 1').loc[:, 'tf_name':'gene_name'].values
edges = sorted(tuple(pair) for pair in train_pairs)
source, target = zip(*edges)

nodes = sorted(set(source) | set(target))

# Only a small subset of genes act as transcription factors in this data, so the
# integer ids are assigned with all TFs first (0 .. n-1) followed by the genes that
# are only ever targets. Downstream code can then slice the first n rows instead of
# working with the full square matrix.
tfs = sorted(set(source))
genes_only = sorted(set(target) - set(tfs))

mapping = {name: index for index, name in enumerate(tfs + genes_only)}
reversed_mapping = dict(zip(mapping.values(), mapping.keys()))

mapped_edges = [(mapping[tf], mapping[gene_name]) for tf, gene_name in edges]

mat = analysis.edges_to_matrix(mapped_edges, directed=True).tocsc()

# The graph is directed and only TFs have outgoing edges, so many computations can be
# restricted to the first n rows.
n = len(tfs)

# degree[i, j] = (row sum of TF i) + (row sum of TF j), as an n x n array
tf_row_sums = mat.sum(axis=1)[:n]
degree = np.repeat(tf_row_sums, n, axis=1) + np.repeat(tf_row_sums.T, n, axis=0)

In [5]:
# Compute features on the unpermuted network. The prior_empirical, mean_* and p_*
# entries start as all-zero accumulators that the permutation loop below adds to.
# NOTE(review): the (0.25, 20) arguments to analysis.rwr_approx_inv are presumably the
# restart probability and the number of iterations -- confirm against that helper.
feature_mats = {
    'prior_empirical': scipy.sparse.csc_matrix((n, mat.shape[1]), dtype=float),
    
    'rwr': analysis.rwr_approx_inv(mat, 0.25, 20)[:n],
    'mean_rwr': scipy.sparse.csc_matrix((n, mat.shape[1]), dtype=float),
    'p_rwr': scipy.sparse.csc_matrix((n, mat.shape[1]), dtype=float),
    
    # Jaccard only makes sense with respect to TFs as other genes have out-degree zero
    'jaccard': analysis.jaccard(mat[:n], degree),
    'mean_jaccard': scipy.sparse.csc_matrix((n, n), dtype=float),
    'p_jaccard': scipy.sparse.csc_matrix((n, n), dtype=float),
}

n_perms = 1000

# Each permutation starts from the previous permuted edge list (perm_edges is fed
# back in), with the iteration number used as the seed for reproducibility.
perm_edges = mapped_edges.copy()
# tqdm.auto.trange replaces the deprecated tqdm.tnrange
for perm_index in tqdm.auto.trange(n_perms):
    # Permute edges
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_antiparallel=True,
                                            allow_self_loops=False, multiplier=10, seed=perm_index)
    perm_mat = analysis.edges_to_matrix(perm_edges, directed=True).tocsc()
    # Running count of how many permuted networks contain each TF -> gene edge.
    # NOTE(review): this stays a raw count (it is not divided by n_perms here) --
    # confirm that consumers of 'prior_empirical' expect counts, not frequencies.
    feature_mats['prior_empirical'] += perm_mat[:n]

    # Compute RWR on permuted network
    perm_rwr = analysis.rwr_approx_inv(perm_mat, 0.25, 20)[:n]
    feature_mats['mean_rwr'] += perm_rwr
    # Count permutations whose value is strictly below the observed one
    feature_mats['p_rwr'] += (perm_rwr < feature_mats['rwr'])

    # Compute Jaccard similarity on permuted network.
    # The degree array from the unpermuted network is reused; presumably valid
    # because the permutation preserves node degrees -- TODO confirm.
    perm_jac = analysis.jaccard(perm_mat[:n], degree)
    feature_mats['mean_jaccard'] += perm_jac
    feature_mats['p_jaccard'] += (perm_jac < feature_mats['jaccard'])
    # Release the per-iteration matrices before the next permutation
    del perm_mat, perm_rwr, perm_jac

del degree

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [6]:
# Flatten every feature matrix into one column of a long table with one row per
# (TF, gene) pair, attach labels and degrees, and write the result to disk.
n_targets = mat.shape[1]
output_columns = ['source', 'target', 'mapped_source', 'mapped_target', 'out_degree', 'in_degree',
                  'train', 'test_recon', 'test_new', 'prior_empirical', 'rwr', 'mean_rwr', 'p_rwr',
                  'jaccard', 'mean_jaccard', 'p_jaccard']

feature_columns = {}
for feature, values in feature_mats.items():
    # Work on a dense ndarray regardless of how the feature was stored
    values = values.toarray() if scipy.sparse.issparse(values) else np.array(values)

    # Jaccard matrices are square (TF x TF); zero-pad the columns out to all targets
    if feature in ('jaccard', 'p_jaccard', 'mean_jaccard'):
        values = np.pad(values, ((0, 0), (0, n_targets - n)), 'constant')

    if feature in ('p_rwr', 'p_jaccard'):
        # Accumulated value counts permutations below the observed value; turn it
        # into the fraction of permutations that were NOT below it
        values = (n_perms - values) / n_perms
    elif feature in ('mean_rwr', 'mean_jaccard'):
        # Accumulated sum over permutations -> mean
        values /= n_perms

    feature_columns[feature] = values.flatten()

# Row-major (source, target) index pairs, matching the order produced by flatten()
mapped_pairs = list(itertools.product(range(n), range(n_targets)))
pair_features = pd.DataFrame(feature_columns).assign(
    mapped_source=[src for src, _ in mapped_pairs],
    mapped_target=[tgt for _, tgt in mapped_pairs],
)

# NOTE(review): the left merge keeps one output row per matching edges_df row, so a
# (tf_name, gene_name) pair listed more than once in edges_df would be repeated here
# -- confirm whether that is intended.
df = (
    pair_features
    .assign(
        source=lambda pairs: pairs['mapped_source'].map(reversed_mapping),
        target=lambda pairs: pairs['mapped_target'].map(reversed_mapping),
    )
    .merge(edges_df, left_on=['source', 'target'], right_on=['tf_name', 'gene_name'], how='left')
    # Pairs without a row in edges_df get 0 for train / test_recon / test_new
    .fillna(0)
    .assign(
        # Degrees are the number of training edges leaving each source / entering each target
        out_degree=lambda pairs: pairs.groupby('source')['train'].transform('sum').astype(int),
        in_degree=lambda pairs: pairs.groupby('target')['train'].transform('sum').astype(int),
    )
    .filter(items=output_columns)
)

df.to_csv('../data/tftf_result.tsv.xz', index=False, sep='\t', compression='xz')

df.head()

Unnamed: 0,source,target,mapped_source,mapped_target,out_degree,in_degree,train,test_recon,test_new,prior_empirical,rwr,mean_rwr,p_rwr,jaccard,mean_jaccard,p_jaccard
0,AHR,AHR,0,0,5,6,0.0,0.0,0.0,0.0,0.25,0.2500001,0.049,1.0,1.0,1.0
1,AHR,AR,0,1,5,3,0.0,0.0,0.0,0.0,2.039234e-10,5.473652e-09,0.027,0.0,0.000581,1.0
2,AHR,ARNT,0,2,5,5,0.0,0.0,0.0,0.0,1.089836e-11,4.891516e-06,0.049,0.064516,0.000533,0.001
3,AHR,ATF1,0,3,5,9,0.0,0.0,0.0,1.0,3.171561e-09,3.766199e-05,0.029,0.008197,0.000672,0.08
4,AHR,ATF2,0,4,5,10,0.0,0.0,0.0,2.0,2.609309e-08,7.505623e-05,0.019,0.035714,0.00075,0.021
