# Joing Training prep
1. find common tf
2. rename 
3. merge common tf with ppi

**Note that PPI is undirected 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
path = Path('../../data/')

ppi = pd.read_csv(path / 'biogrid.hc.tsv', sep='\t', header=None)
gm = pd.read_csv(path / 'EC-003-NET.edgeList_TSS_GM12878.tsv', sep='\t', header=None)
k = pd.read_csv(path / 'EC-003-NET.edgeList_TSS_K562.tsv', sep='\t', header=None)

gm.columns = ['cell_type', 'source', 'target', 'type', 'weight']
k.columns = ['cell_type', 'source', 'target', 'type', 'weight']

ppi.columns = ['source', 'target', 'type', 'dummy']
ppi = ppi[['source', 'target', 'type']]
ppi['weight'] = 'NA'


print(f'Shape of GM12878: {gm.shape}')
print(f'Shape of K562: {k.shape}')
print(f'Shape of PPI: {ppi.shape}')

Shape of GM12878: (506074, 5)
Shape of K562: (954968, 5)
Shape of PPI: (21038, 4)


Check for naming compatibility

In [3]:
ppi_nodes = set(ppi['source']).union(set(ppi['target']))
print(f'Number of nodes in PPI: {len(ppi_nodes)}')

gm_nodes = set(gm['target'])
k_nodes = set(k['target'])
print(f'Number of gene nodes in GM: {len(gm_nodes)}')
print(f'Number of gene nodes in K: {len(k_nodes)}')

print(f'Number of common nodes in GM&PPI: {len(gm_nodes.intersection(ppi_nodes))}')
print(f'Number of common nodes in K&PPI: {len(k_nodes.intersection(ppi_nodes))}')

Number of nodes in PPI: 8752
Number of gene nodes in GM: 16553
Number of gene nodes in K: 17894
Number of common nodes in GM&PPI: 8015
Number of common nodes in K&PPI: 8169


From number of common nodes, we could assume that they are using the same naming system

**Transform PPI to undirected graph by swapping its source and target**

In [4]:
ppi_reverse = ppi[['target', 'source', 'type']]
ppi_reverse.columns = ['source', 'target', 'type']
ppi_undirected = pd.concat([ppi, ppi_reverse])
ppi_undirected.sort_values(['source', 'target'], ascending=True)
# ppi_undirected.reset_index(inplace=True)


Unnamed: 0,source,target,type,weight
20013,@UBC,TRA,PPI,
0,A1CF,APOBEC1,PPI,
1,A1CF,SYNCRIP,PPI,
2,A2M,AMBP,PPI,
3,A2M,APOE,PPI,
...,...,...,...,...
14070,ZYX,NEDD8,PPI,
20846,ZYX,UBC,PPI,
20957,ZYX,VASP,PPI,
20847,ZZEF1,UBC,PPI,


**Find common tf**

In [5]:
gm12878_tf = set(gm['source'])
k562_tf = set(k['source'])

common_tf = list(k562_tf.intersection(gm12878_tf))

gm_tf2tf = gm[gm['source'].isin(common_tf)]
gm_tf2tf = gm_tf2tf[gm_tf2tf['target'].isin(common_tf)]
k_tf2tf = k[k['source'].isin(common_tf)]
k_tf2tf = k_tf2tf[k_tf2tf['target'].isin(common_tf)]

gm_tf2tf.reset_index()
gm_tf2tf = gm_tf2tf[['cell_type', 'source', 'target', 'type', 'weight']]
k_tf2tf.reset_index()
k_tf2tf = k_tf2tf[['cell_type', 'source', 'target', 'type', 'weight']]

# rename


gm_tf2tf['source'] = gm_tf2tf['source'].map(lambda x: x + '_gm' if x in gm12878_tf else x)
gm_tf2tf['target'] = gm_tf2tf['target'].map(lambda x: x + '_gm' if x in gm12878_tf else x)
k_tf2tf['source'] = k_tf2tf['source'].map(lambda x: x + '_k' if x in k562_tf else x)
k_tf2tf['target'] = k_tf2tf['target'].map(lambda x: x + '_k' if x in k562_tf else x)

In [6]:
# gm_tf2tf.to_csv(path / 'gm_common_renamed.csv', index=False)
# k_tf2tf.to_csv(path / 'k_common_renamed.csv', index=False)

In [7]:
# ppi_undirected.to_csv(path / 'ppi_undirected.csv', index=False)

## Merged version of GM, K, and PPI

In [10]:
print(f'K shape: {k_tf2tf.shape}')
print(f'GM shape: {gm_tf2tf.shape}')
print(f'PPI shape: {ppi_undirected.shape}')

# print(f'K + PPI: {k_tf2tf.shape[0] + ppi_undirected.shape[0]}')
print(f'GM + K + PPI: {gm_tf2tf.shape[0] + k_tf2tf.shape[0] + ppi_undirected.shape[0]}')

K shape: (2408, 5)
GM shape: (2196, 5)
PPI shape: (42076, 4)
GM + K + PPI: 46680


In [11]:
ppi_k = ppi_undirected.copy(deep=True)
ppi_k['source'] = ppi_k['source'].map(lambda x: x + '_k' if x in k562_tf else x)
ppi_k['target'] = ppi_k['target'].map(lambda x: x + '_k' if x in k562_tf else x)
ppi_k['cell_type'] = 'K562'

ppi_gm = ppi_undirected.copy(deep=True)
ppi_gm['source'] = ppi_gm['source'].map(lambda x: x + '_gm' if x in gm12878_tf else x)
ppi_gm['target'] = ppi_gm['target'].map(lambda x: x + '_gm' if x in gm12878_tf else x)
ppi_gm['cell_type'] = 'GM12878'

k_merged_renamed = pd.concat([k_tf2tf, ppi_k], axis=0)
# k_merged_renamed.reset_index(inplace=True)
# k_merged_renamed.drop_duplicates(inplace=True)

merged_renamed = pd.concat([gm_tf2tf, k_merged_renamed], axis=0)
merged_renamed.reset_index(inplace=True)
merged_renamed.drop_duplicates(inplace=True)

print('After removing duplicates')
# print(f'K + PPI: {k_merged_renamed.shape}')
print(f'GM + K + PPI: {merged_renamed.shape}')
print('No duplicates row observed')

After removing duplicates
GM + K + PPI: (46680, 6)
No duplicates row observed


In [12]:
merged_renamed

Unnamed: 0,index,cell_type,source,target,type,weight
0,5646,GM12878,ATF3_gm,ZBTB40_gm,TSS,373
1,5676,GM12878,ATF3_gm,YBX1_gm,TSS,196.371
2,5841,GM12878,ATF3_gm,BHLHE40_gm,TSS,88.3926
3,6079,GM12878,ATF3_gm,TBP_gm,TSS,144
4,6293,GM12878,ATF3_gm,MXI1_gm,TSS,98.7601
...,...,...,...,...,...,...
46675,21033,K562,ZNF434,ZNF434,PPI,
46676,21034,K562,ZNF446,ZNF434,PPI,
46677,21035,K562,ZNF496,ZNF446,PPI,
46678,21036,K562,ZSCAN16,ZNF446,PPI,


In [13]:
# gm_merged_renamed.to_csv(path / 'gm_ppi_common_renamed.csv', index=False)
merged_renamed.to_csv(path / 'joint_renamed.csv', index=False)