In [1]:
import hetmatpy.hetmat
import networkx as nx
import pandas as pd

# PPI

In [2]:
ppi_edges = pd.read_csv('../../data/2.edges/ppi.tsv.xz', sep='\t')

In [3]:
(
    ppi_edges
    .groupby('id_a')
    [['train', 'test_recon', 'test_new']]
    .sum()
    .reset_index()
    .merge(
        ppi_edges
        .groupby('id_b')
        [['train', 'test_recon', 'test_new']]
        .sum()
        .reset_index(),
        left_on='id_a', right_on='id_b', how='outer'
    )
    .assign(
        train=lambda df: df['train_x'] + df['train_y'],
        test_recon=lambda df: df['test_recon_x'] + df['test_recon_y'],
        test_new=lambda df: df['test_new_x'] + df['test_new_y']
    )
    .fillna(0)
    .loc[:, ['train', 'test_recon', 'test_new']]
    .astype(bool)
    .sum(axis=0)
)

train         3992
test_recon    3992
test_new      3916
dtype: int64

In [4]:
ppi_edges[['train', 'test_recon', 'test_new']].sum()

train         255522
test_recon    364743
test_new       12913
dtype: int64

# bioRxiv

In [5]:
biorxiv_edges = pd.read_csv('../../data/2.edges/biorxiv.tsv.xz', sep='\t')

In [6]:
(
    biorxiv_edges
    .groupby('id_a')
    [['train', 'test_recon', 'test_new']]
    .sum()
    .reset_index()
    .merge(
        biorxiv_edges
        .groupby('id_b')
        [['train', 'test_recon', 'test_new']]
        .sum()
        .reset_index(),
        left_on='id_a', right_on='id_b', how='outer'
    )
    .assign(
        train=lambda df: df['train_x'] + df['train_y'],
        test_recon=lambda df: df['test_recon_x'] + df['test_recon_y'],
        test_new=lambda df: df['test_new_x'] + df['test_new_y']
    )
    .fillna(0)
    .loc[:, ['train', 'test_recon', 'test_new']]
    .astype(bool)
    .sum(axis=0)
)

train         4587
test_recon    4615
test_new      4615
dtype: int64

In [7]:
biorxiv_edges[['train', 'test_recon', 'test_new']].sum()

train         30686
test_recon    43691
test_new      44963
dtype: int64

# TF-TG

In [8]:
tftg_edges = pd.read_csv('../../data/2.edges/tftg.tsv.xz', sep='\t')

In [9]:
(
    tftg_edges
    .groupby('id_a')
    [['train', 'test_recon', 'test_new']]
    .sum()
    .reset_index()
    .astype(bool)
    .sum(axis=0)
)

id_a          143
train         142
test_recon    144
test_new      144
dtype: int64

In [10]:
(
    tftg_edges
    .groupby('id_b')
    [['train', 'test_recon', 'test_new']]
    .sum()
    .reset_index()
    .astype(bool)
    .sum(axis=0)
)

id_b          1417
train         1396
test_recon    1406
test_new      1417
dtype: int64

In [11]:
tftg_edges[['train', 'test_recon', 'test_new']].sum()

train          2689
test_recon     3496
test_new      29177
dtype: int64

# Hetionet

In [2]:
hetmat = hetmatpy.hetmat.HetMat('../hetionet-prior/hetionet-v1.0.hetmat/')

In [22]:
metapaths = hetmat.metagraph.extract_all_metapaths(1, True)
metaedges = sorted(i[0].abbrev for i in metapaths)

for metaedge in metaedges:
    source, target, adj_mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dense_threshold=1)
    if adj_mat.nnz > 2000:
        if adj_mat.shape[0] != adj_mat.shape[1] or (adj_mat != adj_mat.T).nnz:
            print(metaedge, f'Source: {len(source)}, Target: {len(target)}', adj_mat.nnz)
        else:
            print(metaedge, f'Source: {len(source)}, Target: {len(target)}', adj_mat.nnz / 2)

AdG Source: 402, Target: 20945 102240
AeG Source: 402, Target: 20945 526407
AlD Source: 402, Target: 137 3602
AuG Source: 402, Target: 20945 97848
BPpG Source: 11381, Target: 20945 559504
CCpG Source: 1391, Target: 20945 73566
CbG Source: 1552, Target: 20945 11571
CcSE Source: 1552, Target: 5734 138944
CdG Source: 1552, Target: 20945 21102
CrC Source: 1552, Target: 1552 6486.0
CuG Source: 1552, Target: 20945 18756
DaG Source: 137, Target: 20945 12623
DdG Source: 137, Target: 20945 7623
DpS Source: 137, Target: 438 3357
DuG Source: 137, Target: 20945 7731
G<rG Source: 20945, Target: 20945 265672
GcG Source: 20945, Target: 20945 61690.0
GiG Source: 20945, Target: 20945 147164.0
GpMF Source: 20945, Target: 2884 97222
GpPW Source: 20945, Target: 1822 84372
