# Data Preprocessing
## 1 Combine tf2gene and PPI to a single dataset
**Note that PPI is undirected 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
path = Path('../../data/')

ppi = pd.read_csv(path / 'biogrid.hc.tsv', sep='\t', header=None)
gm = pd.read_csv(path / 'EC-003-NET.edgeList_TSS_GM12878.tsv', sep='\t', header=None)
k = pd.read_csv(path / 'EC-003-NET.edgeList_TSS_K562.tsv', sep='\t', header=None)

gm.columns = ['cell_type', 'source', 'target', 'type', 'weight']
k.columns = ['cell_type', 'source', 'target', 'type', 'weight']

ppi.columns = ['source', 'target', 'type', 'dummy']
ppi = ppi[['source', 'target', 'type']]
ppi['weight'] = 'NA'


print(f'Shape of GM12878: {gm.shape}')
print(f'Shape of K562: {k.shape}')
print(f'Shape of PPI: {ppi.shape}')

Shape of GM12878: (506074, 5)
Shape of K562: (954968, 5)
Shape of PPI: (21038, 4)


Check for naming compatibility

In [3]:
ppi_nodes = set(ppi['source']).union(set(ppi['target']))
print(f'Number of nodes in PPI: {len(ppi_nodes)}')

gm_nodes = set(gm['target'])
k_nodes = set(k['target'])
print(f'Number of target nodes in GM: {len(gm_nodes)}')
print(f'Number of target nodes in K: {len(k_nodes)}')

print(f'Number of common nodes in GM&PPI: {len(gm_nodes.intersection(ppi_nodes))}')
print(f'Number of common nodes in K&PPI: {len(k_nodes.intersection(ppi_nodes))}')

Number of nodes in PPI: 8752
Number of gene nodes in GM: 16553
Number of gene nodes in K: 17894
Number of common nodes in GM&PPI: 8015
Number of common nodes in K&PPI: 8169


From number of common nodes, we could assume that they are using the same naming system

**Transform PPI to undirected graph by swapping its source and target**

In [4]:
ppi_reverse = ppi[['target', 'source', 'type']]
ppi_reverse.columns = ['source', 'target', 'type']
ppi_undirected = pd.concat([ppi, ppi_reverse])
ppi_undirected.sort_values(['source', 'target'], ascending=True)
# ppi_undirected.reset_index(inplace=True)
ppi_undirected

Unnamed: 0,source,target,type,weight
0,A1CF,APOBEC1,PPI,
1,A1CF,SYNCRIP,PPI,
2,A2M,AMBP,PPI,
3,A2M,APOE,PPI,
4,AAAS,UBC,PPI,
...,...,...,...,...
21033,ZNF434,ZNF434,PPI,
21034,ZNF446,ZNF434,PPI,
21035,ZNF496,ZNF446,PPI,
21036,ZSCAN16,ZNF446,PPI,


### Ver. 1
1. preserve tf2gene's directed nature
2. preserve all nodes

In [5]:
# GM12878
ppi_undirected['cell_type'] = 'GM12878'
gm_ppi = pd.concat([gm, ppi_undirected])
gm_ppi = gm_ppi.reset_index()[['cell_type', 'source', 'target', 'type', 'weight']]
# Remove duplicate rows
print(f'Shape of GM_PPI (with duplicates): {gm_ppi.shape}')
gm_ppi.drop_duplicates(inplace=True)
print(f'Shape of GM_PPI (without duplicates): {gm_ppi.shape}')


# K562
ppi_undirected['cell_type'] = 'K562'
k_ppi = pd.concat([k, ppi_undirected])
k_ppi = k_ppi.reset_index()[['cell_type', 'source', 'target', 'type', 'weight']]
# Remove duplicate rows
print(f'Shape of K_PPI (with duplicates): {k_ppi.shape}')
k_ppi.drop_duplicates(inplace=True)
print(f'Shape of K_PPI (without duplicates): {k_ppi.shape}')

Shape of GM_PPI (with duplicates): (548150, 5)
Shape of GM_PPI (without duplicates): (545113, 5)
Shape of K_PPI (with duplicates): (997044, 5)
Shape of K_PPI (without duplicates): (991451, 5)


In [6]:
k_ppi[k_ppi['source'] == k_ppi['target']]

Unnamed: 0,cell_type,source,target,type,weight
5345,K562,ATF1,ATF1,TSS,546
8063,K562,ATF3,ATF3,TSS,1000
14838,K562,BACH1,BACH1,TSS,672.292
19344,K562,BHLHE40,BHLHE40,TSS,2903.73
34171,K562,CCNT2,CCNT2,TSS,163
...,...,...,...,...,...
997029,K562,ZHX1,ZHX1,PPI,
997032,K562,ZNF174,ZNF174,PPI,
997038,K562,ZNF408,ZNF408,PPI,
997039,K562,ZNF434,ZNF434,PPI,


In [7]:
# gm_ppi.to_csv(path/'GM12878_PPI_ver1.csv', index=False)

In [8]:
# k_ppi.to_csv(path/'K562_PPI_ver1.csv', index=False)

#### Sample out a smaller subgraph for experiment
1. Sample out some 1/10 tfs from the two tf2gene datasets
2. find all tf2gene edges related to the sampled tfs, sample out 1/10 of them, record the genes
3. find all PPI edges related to the recorded genes in Step 2

In [9]:
np.random.seed(0)

# sample the tfs
gm_ntf = gm['source'].nunique()
gm_tf = list(set(gm['source']))
gm_tf_nodes = set(np.random.choice(gm_tf, int(gm_ntf // 10)))
gm_tf2gene_samples = gm_ppi[gm_ppi['source'].isin(gm_tf_nodes)]

k_ntf = k['source'].nunique()
k_tf = list(set(k['source']))
k_tf_nodes = set(np.random.choice(k_tf, int(k_ntf // 10)))
k_tf2gene_samples = k_ppi[k_ppi['source'].isin(k_tf_nodes)]

print(f'[GM12878] Number of sampled tf: {len(gm_tf_nodes)}')
print(f'[K562] Number of sampled tf: {len(k_tf_nodes)}')


# select the related genes
gm_tf2gene_targets = list(set(gm_tf2gene_samples['target']))
gm_tf2gene_targets = set(np.random.choice(gm_tf2gene_targets, int(len(gm_tf2gene_targets) // 10)))

k_tf2gene_targets = list(set(k_tf2gene_samples['target']))
k_tf2gene_targets = set(np.random.choice(k_tf2gene_targets, int(len(k_tf2gene_targets) // 10)))

gm_tf2gene_samples = gm_tf2gene_samples[gm_tf2gene_samples['target'].isin(gm_tf2gene_targets)]
k_tf2gene_samples = k_tf2gene_samples[k_tf2gene_samples['target'].isin(k_tf2gene_targets)]
                      
print(f'[GM12878] Number of sampled tf2gene edges: {len(gm_tf2gene_samples)}')
print(f'[K562] Number of sampled gene target nodes: {len(k_tf2gene_samples)}')

# gm_gene_samples = gm[gm['source'].isin(gm_tf_nodes)]
# k_gene_samples = k[k['source'].isin(k_tf_nodes)]

# gm_gene_targets = list(set(gm_gene_samples['target']))
# gm_gene_targets = set(np.random.choice(gm_gene_targets, int(len(gm_gene_targets) // 10)))

# k_gene_targets = list(set(k_gene_samples['target']))
# k_gene_targets = set(np.random.choice(k_gene_targets, int(len(k_gene_targets) // 10)))

# gm_gene_samples = gm[gm['source'].isin(gm_tf_nodes) & gm['target'].isin(gm_gene_targets)]
# k_gene_samples = k[k['source'].isin(k_tf_nodes) & k['target'].isin(k_gene_targets)]
                      
# print(f'[GM12878] Number of sampled gene target nodes: {len(gm_gene_targets)}')
# print(f'[K562] Number of sampled gene target nodes: {len(k_gene_targets)}')


# find PPI edges
new_ppi = gm_ppi[gm_ppi['type'] == 'PPI']

gm_gene2gene_samples = new_ppi[new_ppi['source'].isin(gm_tf2gene_targets) | new_ppi['target'].isin(gm_tf2gene_targets)]
k_gene2gene_samples = new_ppi[new_ppi['source'].isin(k_tf2gene_targets) | new_ppi['target'].isin(k_tf2gene_targets)]

print(f'[GM12878] Number of sampled PPI link: {len(gm_gene2gene_samples)}')
print(f'[K562] Number of sampled PPI link: {len(k_gene2gene_samples)}')


# concat the dataframes

gm_ppi_samples = pd.concat([gm_tf2gene_samples, gm_gene2gene_samples])
k_ppi_samples = pd.concat([k_tf2gene_samples, k_gene2gene_samples])

print(f'Shape of gm_ppi_samples: {gm_ppi_samples.shape}')
print(f'Shape of k_ppi_samples: {k_ppi_samples.shape}')


[GM12878] Number of sampled tf: 9
[K562] Number of sampled tf: 19
[GM12878] Number of sampled tf2gene edges: 4055
[K562] Number of sampled gene target nodes: 9312
[GM12878] Number of sampled PPI link: 5352
[K562] Number of sampled PPI link: 5998
Shape of gm_ppi_samples: (9407, 5)
Shape of k_ppi_samples: (15310, 5)


In [13]:
gm_ppi_samples[['cell_type', 'source', 'target', 'type', 'weight']].to_csv(path / 'GM12878_PPI_sample_ver1.csv', index=False)
k_ppi_samples[['cell_type', 'source', 'target', 'type', 'weight']].to_csv(path / 'K562_PPI_sample_ver1.csv', index=False)