# Joint Training prep
1. find common tf
2. rename TF nodes with a cell-type suffix (`_gm` for GM12878, `_k` for K562)
3. merge common tf with ppi

**Note that PPI is undirected.**

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Directory holding the raw edge lists, relative to this notebook.
path = Path('../../data/')


def _read_headerless_tsv(filename, column_names):
    """Read a tab-separated file that has no header row and label its columns."""
    table = pd.read_csv(path / filename, sep='\t', header=None)
    table.columns = column_names
    return table


# The PPI file has four columns; the last one is labelled 'dummy' and discarded.
ppi = _read_headerless_tsv('biogrid.hc.tsv', ['source', 'target', 'type', 'dummy'])
ppi = ppi[['source', 'target', 'type']]

# Both TSS edge lists share the same five-column layout.
gm = _read_headerless_tsv(
    'EC-003-NET.edgeList_TSS_GM12878.tsv',
    ['cell_type', 'source', 'target', 'type', 'weight'],
)
k = _read_headerless_tsv(
    'EC-003-NET.edgeList_TSS_K562.tsv',
    ['cell_type', 'source', 'target', 'type', 'weight'],
)

print(f'Shape of GM12878: {gm.shape}')
print(f'Shape of K562: {k.shape}')
print(f'Shape of PPI: {ppi.shape}')

Shape of GM12878: (506074, 5)
Shape of K562: (506074, 5)
Shape of PPI: (21038, 3)


In [3]:
# Every node that appears at either end of a PPI edge.
ppi_nodes = set(ppi['source']) | set(ppi['target'])
print(f'Number of nodes in PPI: {len(ppi_nodes)}')

# Target-side nodes of each TSS edge list.
gm_nodes, k_nodes = set(gm['target']), set(k['target'])

# GM12878 targets that never occur as a source in the same edge list.
gm_ppi = gm_nodes - set(gm['source'])
# gm_ppi.intersection(ppi_nodes)



# print(f'Number of gene nodes in GM: {len(gm_nodes)}')
# print(f'Number of gene nodes in K: {len(k_nodes)}')

# print(f'Number of common nodes in GM&PPI: {len(gm_nodes.intersection(ppi_nodes))}')
# print(f'Number of common nodes in K&PPI: {len(k_nodes.intersection(ppi_nodes))}')

Number of nodes in PPI: 8752




**Transform PPI to undirected graph by swapping its source and target**

In [4]:
# Mirror image of the PPI edge list: swap the endpoint labels, keep the column order.
ppi_reverse = ppi.rename(columns={'source': 'target', 'target': 'source'})[['source', 'target', 'type']]

# Stack both directions, order by endpoints, renumber the rows and add a
# placeholder weight column (PPI edges have no weight in the input).
ppi_undirected = (
    pd.concat([ppi, ppi_reverse])
    .sort_values(['source', 'target'], ascending=True)
    .reset_index(drop=True)
    .assign(weight='NA')
)
ppi_undirected

Unnamed: 0,source,target,type,weight
0,@UBC,TRA,PPI,
1,A1CF,APOBEC1,PPI,
2,A1CF,SYNCRIP,PPI,
3,A2M,AMBP,PPI,
4,A2M,APOE,PPI,
...,...,...,...,...
42071,ZYX,NEDD8,PPI,
42072,ZYX,UBC,PPI,
42073,ZYX,VASP,PPI,
42074,ZZEF1,UBC,PPI,


**Find common tf and all tf**

In [5]:
# Transcription factors are the nodes that appear as `source` in each TSS edge list.
gm12878_tf = set(gm['source'])
k562_tf = set(k['source'])

# TFs present in both cell types, in at least one, and in exactly one of them.
common_tf = k562_tf & gm12878_tf
# The union has to span BOTH cell types. Uniting K562 with itself made
# `all_tf` equal to `k562_tf`, so GM12878-only TFs never reached `xor_tf`.
all_tf = k562_tf | gm12878_tf
xor_tf = all_tf - common_tf
print(f'Common TFs: {len(common_tf)}')
print(f'All TFs: {len(all_tf)}')
print(f'Exclusive(XOR) TFs: {len(xor_tf)}')

# Create a look up table containing all common tfs for future use (if necessary)
path = Path('../../data/')

# Sets are unordered; sorting keeps the row order of the tables stable between runs.
common_tf_df = pd.DataFrame(sorted(common_tf), columns=['tf'])
all_tf_df = pd.DataFrame(sorted(all_tf), columns=['tf'])
xor_tf_df = pd.DataFrame(sorted(xor_tf), columns=['tf'])

# common_tf_df.to_csv(path / 'common_tf.csv', index=False)
# all_tf_df.to_csv(path / 'all_tf.csv', index=False)
# xor_tf_df.to_csv(path / 'xor_tf.csv', index=False)


# Keep only edges whose source node belongs to common_tf.
# `.loc[...]` followed by `reset_index(drop=True)` yields independent frames with a
# fresh 0..n-1 index, so later column assignments do not operate on a slice of
# `gm` / `k` (the original `reset_index()` calls discarded their result).
edge_cols = ['cell_type', 'source', 'target', 'type', 'weight']
gm_tf2gene = gm.loc[gm['source'].isin(common_tf), edge_cols].reset_index(drop=True)
k_tf2gene = k.loc[k['source'].isin(common_tf), edge_cols].reset_index(drop=True)

Common TFs: 101
All TFs: 101
Exclusive(XOR) TFs: 0


Check that the source nodes of both edge lists now include only common TFs

In [6]:
import collections

gm_source = set(gm_tf2gene['source'])
k_source = set(k_tf2gene['source'])


def compare(x, y):
    """Return True when both iterables hold the same items with the same counts."""
    return collections.Counter(x) == collections.Counter(y)


if compare(gm_source, k_source):
    print('The two edgelist has exactly the same types of source nodes')
else:
    print('The two edgelist does not have same types of source nodes')

The two edgelist has exactly the same types of source nodes


At this stage, all the `source` nodes are common TFs. What we need to do next is:

1. rename all `source` nodes
2. identify `target` nodes, they should be classified into three classes
    1. gene -- no operation needed
    2. TF but not part of common_tf -- remove?
    3. TF and part of common_tf -- rename

In [7]:
# rename ALL source nodes (add '_gm' or '_k')
# ALL source nodes are TFs
gm_tf2gene['source_renamed'] = gm_tf2gene['source'].map(lambda x: x + '_gm')
k_tf2gene['source_renamed'] = k_tf2gene['source'].map(lambda x: x + '_k')

# Rename target nodes which are TFs AND part of common_tf
gm_tf2gene['target_renamed'] = gm_tf2gene['target'].map(lambda x: x + '_gm' if x in common_tf else x)
k_tf2gene['target_renamed'] = k_tf2gene['target'].map(lambda x: x + '_k' if x in common_tf else x)

# Remove target nodes which are TFs BUT NOT part of common_tf
gm_tf2gene = gm_tf2gene[~gm_tf2gene['target'].isin(xor_tf)]
k_tf2gene = k_tf2gene[~k_tf2gene['target'].isin(xor_tf)]

# Check for correctness
gm_is_renamed = gm_tf2gene['target_renamed'].map(lambda x: x[-3:] == '_gm')
k_is_renamed = k_tf2gene['target_renamed'].map(lambda x: x[-2:] == '_k')
print('Renamed GM12878 target nodes: ', collections.Counter(gm_is_renamed))
print('Renamed K562 target nodes: ', collections.Counter(k_is_renamed))

Renamed GM12878 target nodes:  Counter({False: 506074})
Renamed K562 target nodes:  Counter({False: 506074})


In [8]:
# Check all xor tfs are removed
print(len(set(gm_tf2gene['target_renamed']).intersection(xor_tf)))
print(len(set(k_tf2gene['target_renamed']).intersection(xor_tf)))

0
0


In [9]:
# Display the GM12878 edge list, including the helper columns
# 'source_renamed' and 'target_renamed' added in the previous step.
gm_tf2gene

Unnamed: 0,cell_type,source,target,type,weight,source_renamed,target_renamed
0,GM12878,ATF2,HES4,TSS,674.551267,ATF2_gm1,HES4
1,GM12878,ATF2,ISG15,TSS,261.000000,ATF2_gm1,ISG15
2,GM12878,ATF2,AGRN,TSS,190.035403,ATF2_gm1,AGRN
3,GM12878,ATF2,ACAP3,TSS,226.268091,ATF2_gm1,ACAP3
4,GM12878,ATF2,PUSL1,TSS,241.000000,ATF2_gm1,PUSL1
...,...,...,...,...,...,...,...
506069,GM12878,JUND,G6PD,TSS,401.905464,JUND_gm1,G6PD
506070,GM12878,JUND,VAMP7,TSS,823.020345,JUND_gm1,VAMP7
506071,GM12878,JUND,CRLF2,TSS,1000.000000,JUND_gm1,CRLF2
506072,GM12878,JUND,ASMTL,TSS,1363.003897,JUND_gm1,ASMTL


In [10]:
# Clean up the DataFrame and save
gm_tf2gene['source'] = gm_tf2gene['source_renamed']
k_tf2gene['source'] = k_tf2gene['source_renamed']
gm_tf2gene['target'] = gm_tf2gene['target_renamed']
k_tf2gene['target'] = k_tf2gene['target_renamed']
gm_tf2gene.drop(['source_renamed', 'target_renamed'], axis=1, inplace=True)
k_tf2gene.drop(['source_renamed', 'target_renamed'], axis=1, inplace=True)

gm_tf2gene.to_csv(path / 'gm_common_renamed.csv', index=False)
k_tf2gene.to_csv(path / 'k_common_renamed.csv', index=False)

In [11]:
# Write the undirected PPI edge list to disk without the row index.
ppi_undirected.to_csv(path / 'ppi_undirected.csv', index=False)

## Merged version of GM, K, and PPI

In [12]:
# Report the size of each edge list before merging.
for label, frame in (('K', k_tf2gene), ('GM', gm_tf2gene), ('PPI', ppi_undirected)):
    print(f'{label} shape: {frame.shape}')

# print(f'K + PPI: {k_tf2gene.shape[0] + ppi_undirected.shape[0]}')
# Row count to expect after concatenation if nothing is dropped.
total_edges = len(gm_tf2gene) + len(k_tf2gene) + len(ppi_undirected)
print(f'GM + K + PPI: {total_edges}')

K shape: (506074, 5)
GM shape: (506074, 5)
PPI shape: (42076, 4)
GM + K + PPI: 1054224


In [13]:
# Display the undirected PPI edge list that is about to be merged.
ppi_undirected

Unnamed: 0,source,target,type,weight
0,@UBC,TRA,PPI,
1,A1CF,APOBEC1,PPI,
2,A1CF,SYNCRIP,PPI,
3,A2M,AMBP,PPI,
4,A2M,APOE,PPI,
...,...,...,...,...
42071,ZYX,NEDD8,PPI,
42072,ZYX,UBC,PPI,
42073,ZYX,VASP,PPI,
42074,ZZEF1,UBC,PPI,


In [14]:
# NOTE: an earlier, commented-out variant that built per-cell-type renamed copies
# of the PPI edges lived here; it was dead code and has been removed.

# PPI rows carry no cell type, so they get the placeholder string 'NA'.
ppi_undirected['cell_type'] = 'NA'
needed_cols = ['cell_type', 'source', 'target', 'type', 'weight']

merged_renamed = pd.concat(
    [gm_tf2gene[needed_cols], k_tf2gene[needed_cols], ppi_undirected[needed_cols]]
)
merged_renamed['cell_type'] = merged_renamed['cell_type'].astype(object)

# Deduplicate on the edge columns BEFORE the old row labels become a column.
# Previously `reset_index` ran first, so the resulting 'index' column took part
# in the comparison; because PPI row labels are unique, duplicate PPI edges
# could never be detected.
merged_renamed = merged_renamed.drop_duplicates(subset=needed_cols)
# Keep the pre-merge row labels in an 'index' column, as before.
merged_renamed = merged_renamed.reset_index()

print('After removing duplicates')
# print(f'K + PPI: {k_merged_renamed.shape}')
print(f'GM + K + PPI: {merged_renamed.shape}')


After removing duplicates
GM + K + PPI: (1054224, 6)


No duplicates observed

In [15]:
# Display the merged GM12878 + K562 + PPI edge list.
merged_renamed

Unnamed: 0,index,cell_type,source,target,type,weight
0,0,GM12878,ATF2_gm1,HES4,TSS,674.551
1,1,GM12878,ATF2_gm1,ISG15,TSS,261
2,2,GM12878,ATF2_gm1,AGRN,TSS,190.035
3,3,GM12878,ATF2_gm1,ACAP3,TSS,226.268
4,4,GM12878,ATF2_gm1,PUSL1,TSS,241
...,...,...,...,...,...,...
1054219,42071,,ZYX,NEDD8,PPI,
1054220,42072,,ZYX,UBC,PPI,
1054221,42073,,ZYX,VASP,PPI,
1054222,42074,,ZZEF1,UBC,PPI,


In [16]:
# Disabled experiment: cast the two label columns to pandas' 'string' dtype.
# merged_renamed.loc[:,'cell_type'] = merged_renamed.loc[:, 'cell_type'].astype('string')
# merged_renamed.loc[:,'type'] = merged_renamed.loc[:, 'type'].astype('string')

# Show the dtype of every column in the merged edge list.
merged_renamed.dtypes

index         int64
cell_type    object
source       object
target       object
type         object
weight       object
dtype: object

In [17]:
# gm_merged_renamed.to_csv(path / 'gm_ppi_common_renamed.csv', index=False)
# Write only the five edge-list columns; the helper 'index' column and the
# DataFrame's row index are left out of the file.
output_cols = ['cell_type', 'source', 'target', 'type', 'weight']
merged_renamed.to_csv(path / 'joint_renamed_v2.csv', columns=output_cols, index=False)