In [1]:
import collections
import csv
import itertools
import re
import tempfile
import time

import networkx as nx
import numpy as np
import pandas as pd
import requests
import scipy.sparse
import tqdm
import tqdm.auto
import xswap

import analysis

# 01. Download raw transcription factor binding data

In [2]:
# Remote location of the raw transcription factor / target gene sets (.gmt file).
# The URL points at a supplementary file of the article with
# DOI 10.1186/s12915-017-0469-0.
tftg_url = (
    'https://static-content.springer.com/esm/'
    'art%3A10.1186%2Fs12915-017-0469-0/MediaObjects/'
    '12915_2017_469_MOESM5_ESM.gmt'
)

# Local copy of that file; this is the path the parsing cell opens.
tftg_path = '../data/transcription_factor_raw.gmt'

# The download step is commented out. Uncomment it to fetch the file
# from `tftg_url` and write it to `tftg_path`.
# with open(tftg_path, 'wb') as f:
#     res = requests.get(tftg_url)
#     f.write(res.content)

# 02. Format data into edges

In [3]:
# Parse the tab-separated .gmt file into one record per (TF, gene) pair.
# For every line:
#   * field 0 is split on '_' and its last two tokens are used as
#     (tf_name, tf_entrez);
#   * field 1 is a description whose leading words (everything before
#     " Transcriptional" or " TFTG") are lower-cased and stored as `method`;
#   * fields 2.. are gene names.

# Raw string: the pattern contains `\A`, `\-` and `\ `, which are invalid
# escape sequences in a normal string literal. Compiled once instead of
# being re-parsed for every line.
method_pattern = re.compile(r'\A([A-Za-z\-\ ]+)(?=(\ Transcriptional|\ TFTG))')

records = list()
with open(tftg_path, 'r') as f:
    # Iterate lazily over the file rather than materializing it with readlines().
    for line_number, line in enumerate(f, start=1):
        stripped = line.strip()
        if not stripped:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        groups = stripped.split('\t')
        tf_name, tf_entrez = groups[0].split('_')[-2:]
        method_match = method_pattern.match(groups[1])
        if method_match is None:
            # Fail with context instead of an AttributeError on None.
            raise ValueError(
                'Could not extract method from description {!r} on line {} of {}'.format(
                    groups[1], line_number, tftg_path)
            )
        method = method_match.group().lower()
        records.extend((tf_entrez, tf_name, gene, method) for gene in groups[2:])

edges_df = pd.DataFrame.from_records(records, columns=['tf_entrez', 'tf_name', 'gene_name', 'method'])

print("{} unique transcription factors\n{} unique genes".format(edges_df['tf_name'].nunique(),
                                                                edges_df['gene_name'].nunique()))

edges_df.head(2)

384 unique transcription factors
17155 unique genes


Unnamed: 0,tf_entrez,tf_name,gene_name,method
0,10009,ZBTB33,CDKN2A,low throughtput
1,10009,ZBTB33,MMP7,low throughtput


# 03. Create train/test data

In [4]:
# Keep only rows whose method label is "low throughtput" (spelling as it
# appears in the parsed data) and turn them into sorted (tf_name, gene_name) tuples.
low_throughput_df = edges_df.query('method == "low throughtput"')
edges = sorted(map(tuple, low_throughput_df.loc[:, 'tf_name':'gene_name'].values))

# Every distinct name that appears on either end of an edge, sorted.
nodes = sorted({node for edge in edges for node in edge})

In [5]:
# Build a directed graph from the (source, target) edge tuples.
g = nx.from_edgelist(edges, create_using=nx.DiGraph())

g_nodes, g_edges = sorted(g.nodes), sorted(g.edges)

# Round-trip check: the graph holds exactly the node and edge lists built
# above (this also implies `edges` contains no duplicate pairs).
assert g_nodes == nodes
assert g_edges == edges

In [6]:
# Is the graph connected when edge direction is ignored?
nx.is_weakly_connected(g)

True

In [7]:
# Can every node reach every other node while following edge direction?
nx.is_strongly_connected(g)

False

In [8]:
# Map the string-named edges through xswap's preprocessing helper and keep
# the inverse of the returned mapping so mapped values can be translated
# back to the original names.
mapped_edges, mapping, _ = xswap.preprocessing.map_str_edges(edges, bipartite=False)
reversed_mapping = {mapped: name for name, mapped in mapping.items()}
mat = analysis.edges_to_matrix(mapped_edges, directed=True)

# Pairwise degree matrix: degree[i, j] = row_sum[i] + row_sum[j].
# NOTE(review): both terms use the row sums (axis=1) of `mat`; no column
# sums are involved. Confirm this is the degree that analysis.jaccard expects
# for a directed network.
row_sums = mat.sum(axis=1)
degree = (
    np.repeat(row_sums, mat.shape[1], axis=1)
    + np.repeat(row_sums.T, mat.shape[0], axis=0)
)


def _empty_accumulator(dtype):
    """Return an all-zero sparse CSC matrix with the same shape as `mat`."""
    return scipy.sparse.csc_matrix(mat.shape, dtype=dtype)


# Features of the unpermuted network plus zero-initialised accumulators that
# the permutation loop fills in. Key order is preserved on purpose: it
# determines the column order when the features are later put in a DataFrame.
feature_mats = {
    'prior_empirical': _empty_accumulator(np.float32),

    'rwr': analysis.rwr_approx_inv(mat, 0.25, 20),
    'mean_rwr': _empty_accumulator(np.float32),
    'p_rwr': _empty_accumulator(np.float16),

    'jaccard': analysis.jaccard(mat, degree),
    'mean_jaccard': _empty_accumulator(np.float32),
    'p_jaccard': _empty_accumulator(np.float16),
}

In [9]:
# Number of successive permutations used to build the empirical features.
n_perms = 2

# Accumulate permutation statistics into `feature_mats`.
# NOTE: this cell is not idempotent. It adds into the accumulators in place
# and deletes `degree` at the end, so re-running it requires re-running the
# cell that creates `degree` and `feature_mats` first.
perm_edges = mapped_edges.copy()
# `tqdm.tnrange` is a deprecated shim that emits TqdmDeprecationWarning;
# `tqdm.auto.trange` is the supported replacement.
for i in tqdm.auto.trange(n_perms):
    # Permute edges. Each round starts from the previous round's permuted
    # edge list, and the loop index is used as the seed.
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_antiparallel=True,
                                            allow_self_loops=False, multiplier=10, seed=i)
    perm_mat = analysis.edges_to_matrix(perm_edges, directed=True)
    feature_mats['prior_empirical'] += perm_mat

    # Compute RWR on permuted network (same parameters as the unpermuted call)
    perm_rwr = analysis.rwr_approx_inv(perm_mat, 0.25, 20)
    feature_mats['mean_rwr'] += perm_rwr
    # Count the permutations in which the permuted value is below the observed one
    feature_mats['p_rwr'] += (perm_rwr < feature_mats['rwr'])

    # Compute Jaccard similarity on permuted network
    perm_jac = analysis.jaccard(perm_mat, degree)
    feature_mats['mean_jaccard'] += perm_jac
    feature_mats['p_jaccard'] += (perm_jac < feature_mats['jaccard'])
    # Release the per-permutation matrices before the next round
    del perm_mat, perm_rwr, perm_jac

del degree

# Normalize and format all features.
# (n_perms - count) / n_perms is the fraction of permutations whose value was
# NOT below the observed value.
# NOTE(review): p_rwr is transformed through `.data` only, whereas p_jaccard
# is transformed as a whole matrix. If p_rwr is still sparse here, entries
# that are not stored in `.data` keep the value 0 instead of becoming 1.
# Confirm that this asymmetry is intended.
feature_mats['p_rwr'].data = (n_perms - feature_mats['p_rwr'].data) / n_perms
feature_mats['p_jaccard'] = (n_perms - feature_mats['p_jaccard']) / n_perms
feature_mats['mean_rwr'] /= n_perms
feature_mats['mean_jaccard'] /= n_perms

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [None]:
# Flatten every feature matrix into one DataFrame column (one row per matrix cell).
df = pd.DataFrame()

# Iterate over a snapshot of the keys because entries are popped from the
# dict inside the loop, which drops the dict's reference to each matrix.
for feature in list(feature_mats):
    values = feature_mats.pop(feature)
    # Sparse matrices are densified first; anything else is copied into an ndarray.
    dense = values.toarray() if scipy.sparse.issparse(values) else np.array(values)
    df[feature] = dense.flatten()