In [1]:
import pandas as pd

In [2]:
# File names of the two per-cell-line edge lists and their field delimiter.
GM12878 = 'EC-003-NET.edgeList_TSS_GM12878.tsv'
K562 = 'EC-003-NET.edgeList_TSS_K562.tsv'
SEP = '\t'

# Both files are read with header=None, so the five columns are labelled
# explicitly once the frames are loaded.
gm12878_df, k562_df = (
    pd.read_csv('../data/' + file_name, sep=SEP, header=None)
    for file_name in (GM12878, K562)
)
for frame in (k562_df, gm12878_df):
    frame.columns = ['cell_type', 'source', 'target', 'type', 'weight']


In [3]:
# Display the (rows, columns) dimensions of the K562 edge table.
k562_df.shape

(954968, 5)

In [4]:
# Display the (rows, columns) dimensions of the GM12878 edge table.
gm12878_df.shape

(506074, 5)

In [5]:
# Count the distinct values in each column of the GM12878 edge table.
gm12878_df.nunique()

cell_type         1
source          101
target        16553
type              1
weight       220493
dtype: int64

In [6]:
# Distinct names found in the 'source' column of each cell line's edge table;
# these are reported below as that cell line's TFs.
gm12878_tf = {*gm12878_df['source']}
k562_tf = {*k562_df['source']}
print(f'Number of TF in GM12878: {len(gm12878_tf)}')
print(f'Number of TF in K562: {len(k562_tf)}')

Number of TF in GM12878: 101
Number of TF in K562: 209


In [7]:
# TFs that appear as a source in both K562 and GM12878.
common_tf = list(k562_tf & gm12878_tf)
print(f'Number of common TF: {len(common_tf)}')

Number of common TF: 69


In [18]:
# Put the shared TF names in alphabetical order and give each one a
# zero-based integer id equal to its position in that ordering.
common_tf = sorted(common_tf)
name2id = {tf_name: idx for idx, tf_name in enumerate(common_tf)}
name2id

{'ATF3': 0,
 'BCLAF1': 1,
 'BHLHE40': 2,
 'CBX5': 3,
 'CEBPB': 4,
 'CEBPZ': 5,
 'CHD1': 6,
 'CHD2': 7,
 'CTCF': 8,
 'E2F4': 9,
 'EGR1': 10,
 'ELF1': 11,
 'ELK1': 12,
 'EP300': 13,
 'ETS1': 14,
 'ETV6': 15,
 'EZH2': 16,
 'FOS': 17,
 'GABPA': 18,
 'HDGF': 19,
 'IKZF1': 20,
 'JUNB': 21,
 'JUND': 22,
 'MAFK': 23,
 'MAX': 24,
 'MAZ': 25,
 'MEF2A': 26,
 'MLLT1': 27,
 'MTA2': 28,
 'MXI1': 29,
 'MYC': 30,
 'NBN': 31,
 'NFE2': 32,
 'NFYA': 33,
 'NFYB': 34,
 'NR2C2': 35,
 'NRF1': 36,
 'PML': 37,
 'POLR2A': 38,
 'POLR2AphosphoS2': 39,
 'POLR2AphosphoS5': 40,
 'POLR3G': 41,
 'RAD21': 42,
 'RCOR1': 43,
 'REST': 44,
 'RFX5': 45,
 'SIN3A': 46,
 'SIX5': 47,
 'SMAD5': 48,
 'SMC3': 49,
 'SP1': 50,
 'SPI1': 51,
 'SRF': 52,
 'STAT5A': 53,
 'SUZ12': 54,
 'TAF1': 55,
 'TARDBP': 56,
 'TBL1XR1': 57,
 'TBP': 58,
 'UBTF': 59,
 'USF1': 60,
 'USF2': 61,
 'YBX1': 62,
 'YY1': 63,
 'ZBED1': 64,
 'ZBTB33': 65,
 'ZBTB40': 66,
 'ZNF143': 67,
 'ZNF274': 68}

In [8]:
# Keep only edges whose source AND target are both in the shared TF list.
gm12878_both_common = (
    gm12878_df['source'].isin(common_tf) & gm12878_df['target'].isin(common_tf)
)
k562_both_common = (
    k562_df['source'].isin(common_tf) & k562_df['target'].isin(common_tf)
)
gm12878_tf2tf = gm12878_df[gm12878_both_common]
k562_tf2tf = k562_df[k562_both_common]

In [19]:
# Build a new frame rather than mutating the filtered slice in place:
# sorting with inplace=True and assigning columns on a boolean-indexed slice
# can raise SettingWithCopyWarning and leaves hidden state across cells.
# Steps: order edges by (source, target), attach the integer ids from
# name2id, then arrange the columns for export.
gm12878_tf2tf = (
    gm12878_tf2tf
    .sort_values(by=['source', 'target'], ascending=True)
    .assign(
        # An explicit lookup keeps the KeyError if a name has no id,
        # instead of silently producing NaN.
        source_id=lambda df: df['source'].map(lambda name: name2id[name]),
        target_id=lambda df: df['target'].map(lambda name: name2id[name]),
    )
    .loc[:, ['cell_type', 'source_id', 'source', 'target_id', 'target', 'weight', 'type']]
)
gm12878_tf2tf

Unnamed: 0,cell_type,source_id,source,target_id,target,weight,type
5841,GM12878,0,ATF3,2,BHLHE40,88.392564,TSS
6536,GM12878,0,ATF3,24,MAX,315.000000,TSS
6293,GM12878,0,ATF3,29,MXI1,98.760086,TSS
6401,GM12878,0,ATF3,32,NFE2,161.798194,TSS
6079,GM12878,0,ATF3,58,TBP,144.000000,TSS
...,...,...,...,...,...,...,...
496372,GM12878,67,ZNF143,64,ZBED1,1545.632722,TSS
496323,GM12878,67,ZNF143,65,ZBTB33,1000.000000,TSS
486330,GM12878,67,ZNF143,66,ZBTB40,1000.000000,TSS
491321,GM12878,67,ZNF143,67,ZNF143,1000.000000,TSS


In [20]:
# Build a new frame rather than mutating the filtered slice in place:
# sorting with inplace=True and assigning columns on a boolean-indexed slice
# can raise SettingWithCopyWarning and leaves hidden state across cells.
# Steps: order edges by (source, target), attach the integer ids from
# name2id, then arrange the columns for export.
k562_tf2tf = (
    k562_tf2tf
    .sort_values(by=['source', 'target'], ascending=True)
    .assign(
        # An explicit lookup keeps the KeyError if a name has no id,
        # instead of silently producing NaN.
        source_id=lambda df: df['source'].map(lambda name: name2id[name]),
        target_id=lambda df: df['target'].map(lambda name: name2id[name]),
    )
    .loc[:, ['cell_type', 'source_id', 'source', 'target_id', 'target', 'weight', 'type']]
)

k562_tf2tf

Unnamed: 0,cell_type,source_id,source,target_id,target,weight,type
8063,K562,0,ATF3,0,ATF3,1000.000000,TSS
8543,K562,0,ATF3,2,BHLHE40,713.545787,TSS
11234,K562,0,ATF3,3,CBX5,337.835895,TSS
13473,K562,0,ATF3,4,CEBPB,280.966863,TSS
12126,K562,0,ATF3,8,CTCF,1269.849304,TSS
...,...,...,...,...,...,...,...
410286,K562,67,ZNF143,65,ZBTB33,238.000000,TSS
402157,K562,67,ZNF143,66,ZBTB40,246.000000,TSS
406210,K562,67,ZNF143,67,ZNF143,1077.087601,TSS
409665,K562,67,ZNF143,68,ZNF274,1113.904796,TSS


In [21]:
# Write each TF-to-TF edge table to its own CSV file, omitting the row index.
for edge_table, csv_path in (
    (gm12878_tf2tf, '../data/GM12878_tf2tf.csv'),
    (k562_tf2tf, '../data/K562_tf2tf.csv'),
):
    edge_table.to_csv(csv_path, index=False)