In [1]:
import pathlib
from typing import List, Tuple, TypeVar

import hetmatpy.hetmat
import numpy as np
import numpy
import pandas as pd 
import scipy.sparse
import sklearn.metrics
import tqdm
import xswap

In [3]:
# Number of XSwap permutations requested for every prior computed in this notebook.
n_perms = 100

# Open the HetMat stored in the `hetionet-v1.0.hetmat/` directory (path is relative
# to the notebook's working directory).
hetmat = hetmatpy.hetmat.HetMat('hetionet-v1.0.hetmat/')

# NOTE(review): presumably the first argument is the maximum metapath length, so
# this yields the single-edge metapaths with inverse duplicates left out — confirm
# against the hetmatpy/hetnetpy documentation.
metapaths = hetmat.metagraph.extract_all_metapaths(1, exclude_inverts=True)

# Keep only the first element of each metapath.
metaedges = [
    single_edge_path[0]
    for single_edge_path in metapaths
]

In [4]:
def _sample_edges(edges, fraction):
    """Return a random subset of `edges` holding exactly int(fraction * len(edges)) edges.

    Indices are drawn WITHOUT replacement from NumPy's global random state, so the
    subset has exactly the requested size. (Drawing with replacement and then
    de-duplicating keeps only about 1 - exp(-fraction) of the edges: ~39% for 0.5
    and ~18% for 0.2.)

    Parameters
    ----------
    edges : sequence
        Indexable collection of edges; only `len()` and integer indexing are used.
    fraction : float
        Fraction of the edges to keep, between 0 and 1.

    Returns
    -------
    list
        The sampled edges, in the same relative order as in `edges`.
    """
    n_sampled = int(fraction * len(edges))
    sampled_indices = np.random.choice(len(edges), size=n_sampled, replace=False)
    # Sorting keeps the sampled edges in their original relative order.
    return [edges[i] for i in np.sort(sampled_indices)]


# For every metaedge, write `priors/<abbrev>.csv.gz` with one row per
# (source_id, target_id) pair and, for the original network and two subsampled
# networks, the XSwap occurrence values and the edge indicator.
for metaedge in tqdm.tqdm_notebook(metaedges):
    print(metaedge, flush=True)
    prior_path = pathlib.Path(f'priors/{metaedge.abbrev}.csv.gz')
    if prior_path.exists():
        # Already computed by an earlier run; the file on disk acts as the cache.
        continue
    # `to_csv` does not create missing directories, so make sure `priors/` exists.
    prior_path.parent.mkdir(parents=True, exist_ok=True)

    # Generate matrix and infer metaedge properties from it
    source, target, mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dense_threshold=1, dtype=bool)
    shape = mat.shape
    if mat.shape[0] != mat.shape[1]:
        # Can't be symmetric if non-square
        symmetric = False
    else:
        # Symmetric if square and every value is equal to its value in the transposed matrix
        symmetric = (mat != mat.T).nnz == 0
    include_self_loops = mat.diagonal().sum() > 0

    # Create edges from hetmat matrix
    original_edges = xswap.network_formats.matrix_to_edges(mat, include_reverse_edges=(not symmetric))
    del mat

    # Sample 50% and 20% of the edges for reconstruction
    name_to_edges = {
        'original': original_edges,
        'sample_50': _sample_edges(original_edges, 0.5),
        'sample_20': _sample_edges(original_edges, 0.2),
    }
    del original_edges

    # Setup DataFrame with edges in each network (minimize memory use).
    # uint16 holds ids up to 65535; fall back to uint32 for larger node sets so the
    # ids never exceed the dtype's range.
    id_dtype = np.uint16 if max(shape) <= np.iinfo(np.uint16).max else np.uint32
    # Row-major enumeration of all (source, target) pairs, matching the order of the
    # default (C-order) `.flatten()` calls below.
    prior_df = pd.DataFrame({
        'source_id': np.repeat(np.arange(shape[0], dtype=id_dtype), shape[1]),
        'target_id': np.tile(np.arange(shape[1], dtype=id_dtype), shape[0]),
    })

    # Compute XSwap prior for original and two sampled networks
    for name, network_edges in name_to_edges.items():
        prior_df[f'permuted_edges_{name}'] = xswap.prior.compute_xswap_occurrence_matrix(
            network_edges, n_permutations=n_perms, shape=shape, allow_antiparallel=(not symmetric),
            allow_self_loops=include_self_loops).toarray().flatten()

        prior_df[f'edge_{name}'] = xswap.network_formats.edges_to_matrix(
            network_edges, add_reverse_edges=symmetric, shape=shape, dtype=int, sparse=False).flatten()
    del name_to_edges

    # Write to a temporary name and rename afterwards, so an interrupted run cannot
    # leave a truncated file that the `exists()` check above would skip next time.
    tmp_path = prior_path.with_name(prior_path.name + '.tmp')
    prior_df.to_csv(tmp_path, index=False, compression='gzip')
    tmp_path.replace(prior_path)
    del prior_df

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

Anatomy - localizes - Disease
Anatomy - downregulates - Gene
Anatomy - expresses - Gene
Anatomy - upregulates - Gene
Biological Process - participates - Gene
Cellular Component - participates - Gene
Compound - resembles - Compound
Compound - palliates - Disease
Compound - treats - Disease
Compound - binds - Gene
Compound - downregulates - Gene
Compound - upregulates - Gene
Compound - includes - Pharmacologic Class
Compound - causes - Side Effect
Disease - resembles - Disease
Disease - associates - Gene
Disease - downregulates - Gene
Disease - upregulates - Gene
Disease - presents - Symptom
Gene - covaries - Gene
Gene - interacts - Gene
Gene < regulates < Gene
Gene - participates - Molecular Function
Gene - participates - Pathway

