In [1]:
import numpy as np
import pandas as pd
import requests

import analysis

# 1. Low-throughput edges

### TRRUST

Han, H., Cho, J. W., Lee, S., Yun, A., Kim, H., Bae, D., … Lee, I. (2017). TRRUST v2: an expanded reference database of human and mouse transcriptional regulatory interactions. *Nucleic acids research*, 46(D1), D380–D386. doi:10.1093/nar/gkx1013

In [2]:
# Column names for the TRRUST table; name_a is TF, name_b is gene
lt_columns = ['name_a', 'name_b', 'function', 'pubmed']

lt_url = 'https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv'
# The file's first line is a data record (the cached copy is re-read with explicit
# names and row 0 is AATF/BAX), so header=None keeps it as data instead of
# letting pandas consume it as column labels.
lt_raw = pd.read_csv(lt_url, sep='\t', header=None, names=lt_columns)
# Cache a local, header-less copy so the re-read below sees the same layout as the download
lt_raw.to_csv('../data/1.raw/tftg_lt.tsv.xz', compression='xz', index=False, header=False, sep='\t')

# Reload from the local cache
lt_raw = pd.read_csv('../data/1.raw/tftg_lt.tsv.xz', names=lt_columns, sep='\t')
lt_raw.head()

Unnamed: 0,name_a,name_b,function,pubmed
0,AATF,BAX,Repression,22909821
1,AATF,CDKN1A,Unknown,17157788
2,AATF,KLK3,Unknown,23146908
3,AATF,MYC,Activation,20549547
4,AATF,TP53,Unknown,17157788


In [3]:
# One row per distinct (TF, gene) pair from the low-throughput table,
# flagged with test_recon = 1
lt_edges_df = (
    lt_raw
    .drop_duplicates(subset=['name_a', 'name_b'])
    .loc[:, ['name_a', 'name_b']]
    .assign(test_recon=1)
)
lt_edges_df.head()

Unnamed: 0,name_a,name_b,test_recon
0,AATF,BAX,1
1,AATF,CDKN1A,1
2,AATF,KLK3,1
3,AATF,MYC,1
4,AATF,TP53,1


In [4]:
# (number of distinct name_a values, number of distinct name_b values)
tuple(len(set(lt_edges_df[column])) for column in ('name_a', 'name_b'))

(795, 2492)

# 2. High-throughput edges

Lachmann, A., Xu, H., Krishnan, J., Berger, S. I., Mazloom, A. R., & Ma'ayan, A. (2010). ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. *Bioinformatics* (Oxford, England), 26(19), 2438–2444. doi:10.1093/bioinformatics/btq466

In [5]:
ht_url = 'http://amp.pharm.mssm.edu/static/hdfs/harmonizome/data/cheappi/gene_attribute_edges.txt.gz'
# low_memory=False infers each column's dtype from the whole file at once. The
# default chunked inference raises a DtypeWarning here because the line right
# after the header is not a data record (it is skipped on the re-read below),
# which leaves some columns with mixed types.
ht_raw = pd.read_csv(ht_url, sep='\t', low_memory=False)
# Cache a local copy of the download
ht_raw.to_csv('../data/1.raw/tftg_ht.tsv.xz', compression='xz', sep='\t', index=False)

# Reload from the cache, skipping the line after the header
# (presumably a second, descriptive header row -- confirm against the file)
ht_raw = pd.read_csv('../data/1.raw/tftg_ht.tsv.xz', sep='\t', skiprows=[1,])
ht_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,source,source_desc,source_id,target,target_desc,target_id,weight
0,DLGAP1,na,9229,KLF2,na,10365,1.0
1,DTNB,na,1838,KLF2,na,10365,1.0
2,BHLHE40,na,8553,KLF2,na,10365,1.0
3,RPS6KA1,na,6195,KLF2,na,10365,1.0
4,PXN,na,5829,KLF2,na,10365,1.0


In [6]:
# One row per distinct (target, source) pair from the high-throughput table,
# flagged with test_new = 1
ht_edges_df = (
    ht_raw
    # name_a is TF, name_b is gene
    .rename(columns={'target': 'name_a', 'source': 'name_b'})
    .drop_duplicates(subset=['name_a', 'name_b'])
    .loc[:, ['name_a', 'name_b']]
    .assign(test_new=1)
)
ht_edges_df.head()

Unnamed: 0,name_a,name_b,test_new
0,KLF2,DLGAP1,1
1,KLF2,DTNB,1
2,KLF2,BHLHE40,1
3,KLF2,RPS6KA1,1
4,KLF2,PXN,1


In [7]:
# (number of distinct name_a values, number of distinct name_b values)
tuple(len(set(ht_edges_df[column])) for column in ('name_a', 'name_b'))

(199, 21585)

# 3. Merge datasets

In [8]:
np.random.seed(0)

# Probability with which each low-throughput (test_recon) edge is kept in the training network
train_fraction = 0.7

# Per-edge degree columns used to filter edges below
degree_columns = ['ht_out_degree', 'lt_out_degree', 'ht_in_degree', 'lt_in_degree']

tftg_edges_df = (
    ht_edges_df
    .merge(lt_edges_df, on=['name_a', 'name_b'], how='outer')
    # After the outer merge an edge found in only one source has a missing
    # indicator for the other source; treat that as 0.
    .fillna(0)

    # For every edge, the degree of its source node (name_a) and of its target node
    # (name_b) in the high-throughput (test_new) and low-throughput (test_recon) networks.
    .assign(
        ht_out_degree=lambda df: df.groupby('name_a')['test_new'].transform('sum'),
        lt_out_degree=lambda df: df.groupby('name_a')['test_recon'].transform('sum'),
        ht_in_degree=lambda df: df.groupby('name_b')['test_new'].transform('sum'),
        lt_in_degree=lambda df: df.groupby('name_b')['test_recon'].transform('sum'),
    )
    # Filter to nodes that have at least one edge in both the high- and low-throughput
    # networks: keep an edge only when all four degrees are nonzero. Columns are
    # selected by name so the filter does not depend on column order.
    .loc[lambda df: df[degree_columns].ne(0).all(axis=1)]
    .filter(items=['name_a', 'name_b', 'test_recon', 'test_new'])
    .assign(
        # Drop edges to create training network. A random number is drawn only for
        # rows with test_recon set (short-circuit `and`), so the split depends on
        # the seed above and on row order.
        train=lambda df: df['test_recon'].apply(lambda x: x and (np.random.rand() < train_fraction)).astype(int),
        test_recon=lambda df: df['test_recon'].astype(int),
        test_new=lambda df: df['test_new'].astype(int),
    )
)

# Shape before restricting to nodes that appear in the training network
print(tftg_edges_df.shape)

# Create a mapping from nodes in training network to integers. Remove nodes not having an
# edge in the training network.
tftg_train_edges = set(map(tuple, 
    tftg_edges_df
    .query('train == 1')
    .loc[:, ['name_a', 'name_b']]
    .values
))

# Create a mapping. Map TFs to 0, ..., num_tfs - 1 and non-TF genes to num_tfs, num_tfs + 1, ...
tfs = {edge[0] for edge in tftg_train_edges}
genes = {edge[1] for edge in tftg_train_edges}

# Nodes that only ever appear as targets in the training network
genes_only = genes.difference(tfs)

# Sorting makes the integer ids deterministic for a given training network
tf_mapping = {tf: i for i, tf in enumerate(sorted(tfs))}
gene_mapping = {gene: (i+len(tfs)) for i, gene in enumerate(sorted(genes_only))}
mapping = {**tf_mapping, **gene_mapping}

tftg_edges_df = (
    tftg_edges_df
    .assign(
        id_a=lambda df: df['name_a'].map(mapping),
        id_b=lambda df: df['name_b'].map(mapping),
    )
    # Unmapped nodes are not present in the training network and map to NA. Drop these.
    .dropna(subset=['id_a', 'id_b'])
    # With the NA rows gone the ids can be stored as integers
    .astype({'id_a': int, 'id_b': int})
    .filter(items=['name_a', 'name_b', 'id_a', 'id_b', 'train', 'test_recon', 'test_new'])
    .reset_index(drop=True)
)

# Shape after restricting to nodes that appear in the training network
print(tftg_edges_df.shape)


tftg_edges_df.to_csv('../data/2.edges/tftg.tsv.xz', compression='xz', sep='\t', index=False)

tftg_edges_df.head()

(52455, 5)
(31493, 7)


Unnamed: 0,name_a,name_b,id_a,id_b,train,test_recon,test_new
0,KLF2,BHLHE40,64,252,0,0,1
1,KLF2,KLF4,64,65,0,0,1
2,KLF2,DMRT1,64,491,0,0,1
3,KLF2,HOXB13,64,713,0,0,1
4,KLF2,MFSD2A,64,896,0,0,1


In [9]:
# Build the node-pair table with the project helper, treating edges as directed
# and allowing self-loops.
# NOTE(review): presumably this expands the edge list to all node pairs -- confirm in `analysis`.
tftg_df = analysis.process_edges_to_full_network(
    tftg_edges_df,
    mapping,
    directed=True,
    allow_loop=True,
)

print(tftg_df.shape)

# Sanity check: the most frequent (name_a, name_b) pair occurs exactly once
pair_counts = tftg_df.groupby(['name_a', 'name_b']).size()
assert pair_counts.max() == 1

tftg_df.to_csv('../data/3.all_nodes/tftg.tsv.xz', compression='xz', sep='\t', index=False)

(190771, 7)
