## Discussion section

Most of the values computed in this notebook are used in the discussion section.

In [1]:
import hetmatpy.hetmat
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
from plotnine import *

In [2]:
# Build one summary record per metaedge: the adjacency-matrix dimensions and the
# number of distinct row sums / column sums (recorded as source / target degrees).
hetmat = hetmatpy.hetmat.HetMat('../../data/task1/hetionet-v1.0.hetmat/')

# extract_all_metapaths yields metapaths; only the first metaedge of each is kept.
# NOTE(review): presumably (1, True) means "max length 1, exclude inverts" -- confirm
# against the hetmatpy/hetnetpy documentation.
metaedges = [
    metapath[0]
    for metapath in hetmat.metagraph.extract_all_metapaths(1, True)
]

rows = []
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for metaedge in tqdm.notebook.tqdm(metaedges):
    # Only the matrix is needed here; the first two return values are discarded.
    # NOTE(review): dense_threshold=1 presumably keeps the matrix sparse unless it
    # is completely dense -- confirm.
    _, _, mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dense_threshold=1)

    # np.array(...).flatten() turns the result of .sum(axis=...) into a flat 1-D array.
    rowsums = np.array(mat.sum(axis=1)).flatten()
    n_unique_source_degree = len(set(rowsums))

    colsums = np.array(mat.sum(axis=0)).flatten()
    n_unique_target_degree = len(set(colsums))

    row = {
        'metaedge': metaedge.abbrev,
        'source_nodes': mat.shape[0],
        'target_nodes': mat.shape[1],
        'source_degrees': n_unique_source_degree,
        'target_degrees': n_unique_target_degree,
    }
    rows.append(row)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




In [3]:
# Join the per-metaedge node/degree counts with the calibration metrics, then add
# the geometric mean of the source and target values for degrees and for nodes.
df = pd.merge(
    pd.DataFrame(rows),
    pd.read_csv('../../data/task1/hetionet_calibration_metrics.csv'),
    on='metaedge',
).assign(
    mean_num_degrees=lambda merged: (
        merged['source_degrees'] * merged['target_degrees']
    ) ** 0.5,
    mean_num_nodes=lambda merged: (
        merged['source_nodes'] * merged['target_nodes']
    ) ** 0.5,
)

In [4]:
# Lowest AUROC of the "xswap_prior" feature on the "full" network, restricted to
# rows whose mean_num_nodes exceeds 2500.
(
    df
    .loc[
        (df['feature'] == "xswap_prior")
        & (df['mean_num_nodes'] > 2500)
        & (df['network'] == "full"),
        'auroc',
    ]
    .min()
)

0.9492872016964718

In [5]:
# For each metaedge on the "full" network, find the feature(s) whose AUROC equals
# that metaedge's maximum AUROC, then count how often each feature comes out on top.
# A vectorized groupby-transform mask replaces groupby.apply + reset_index + filter;
# tied maxima within a metaedge are all kept.
(
    df
    .query('network == "full"')
    .loc[
        lambda full: full['auroc'] == full.groupby('metaedge')['auroc'].transform('max'),
        'feature',
    ]
    .value_counts()
)

xswap_prior       12
scaled_degree      6
analytic_prior     2
Name: feature, dtype: int64