# Convert Hetionet v1.0 into an adjacency table with edge counts

In [1]:
import pandas

In [2]:
# Load the Hetionet v1.0 edge list (gzipped, tab-separated SIF file).
# The URL embeds a specific commit hash of the dhimmel/hetionet repository.
repo_url = 'https://github.com/dhimmel/hetionet/raw/46f2d918caaec98e17613869911231154afbf255/'
url = repo_url + 'hetnet/tsv/hetionet-v1.0-edges.sif.gz'
sif_df = pandas.read_csv(url, sep='\t', compression='gzip')
# Display the final two rows as a quick check of the parsed columns
sif_df.iloc[-2:]

Unnamed: 0,source,metaedge,target
2250195,Anatomy::UBERON:0002048,AeG,Gene::8843
2250196,Anatomy::UBERON:0002240,AeG,Gene::2124


In [3]:
# Drop the expression edges: keep only rows whose metaedge is not one of
# AeG, AuG, or AdG.
remove_metaedges = {'AeG', 'AuG', 'AdG'}
sif_df = sif_df[~sif_df['metaedge'].isin(remove_metaedges)]

In [4]:
# Five highest-degree nodes: stack the source and target columns so each
# node is tallied once per edge endpoint it occupies, then count occurrences.
(
    pandas.concat([sif_df['source'], sif_df['target']])
    .value_counts()
    .head(5)
    .reset_index()
)

Unnamed: 0,index,0
0,Gene::7316,9317
1,Gene::351,2329
2,Gene::5111,2081
3,Gene::991,2012
4,Pathway::PC7_7439,1956


In [5]:
# Number of remaining edges for each metaedge, most frequent first
sif_df['metaedge'].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,GpBP,559504
1,Gr>G,265672
2,GiG,147164
3,CcSE,138944
4,GpMF,97222
5,GpPW,84372
6,GpCC,73566
7,GcG,61690
8,CdG,21102
9,CuG,18756


In [6]:
# Build the adjacency table: one row per unordered node pair with the number
# of edges connecting that pair.
#
# Each (source, target) pair is sorted so that A-B and B-A collapse into the
# same row; as a result the 'source' and 'target' columns of count_df hold the
# lexicographically smaller and larger node identifier, not edge direction.
#
# GroupBy.size() counts rows per group natively, which avoids invoking a
# Python-level function once per group (as .apply(len) does) and avoids
# relying on the result column being auto-named 0.
count_df = (
    pandas.DataFrame.from_records(
        map(sorted, zip(sif_df.source, sif_df.target)),
        columns=['source', 'target'])
    .groupby(['source', 'target'])
    .size()
    .reset_index(name='edge_count')
)

In [7]:
# Show the last three rows of the adjacency table
count_df.iloc[-3:]

Unnamed: 0,source,target,edge_count
1519877,Gene::9997,Pathway::PC7_8229,1
1519878,Gene::9997,Pathway::PC7_8339,1
1519879,Gene::9997,Pathway::WP3286_r84312,1


In [8]:
# How many node pairs are connected by 1, 2, 3, ... edges
count_df['edge_count'].value_counts().reset_index()

Unnamed: 0,index,edge_count
0,1,1516073
1,2,3792
2,3,15


In [9]:
# Fraction of node pairs at each edge count (normalized counts sum to 1;
# multiply by 100 for percentages)
count_df['edge_count'].value_counts(normalize=True).reset_index()

Unnamed: 0,index,edge_count
0,1,0.997495
1,2,0.002495
2,3,1e-05


In [11]:
# Write the adjacency table as a bzip2-compressed TSV, omitting the row index
count_df.to_csv(
    'adjacency.tsv.bz2',
    sep='\t',
    index=False,
    compression='bz2',
)