In [1]:
import pathlib
from typing import List, Tuple, TypeVar

import hetmatpy.hetmat
import numpy as np
import numpy
import pandas as pd 
import scipy.sparse
import sklearn.metrics
import tqdm
import xswap

In [2]:
def compute_xswap_occurrence_matrix(edge_list: List[Tuple[int, int]],
                                    n_permutations: int,
                                    shape: Tuple[int, int],
                                    allow_self_loops: bool = False,
                                    allow_antiparallel: bool = False,
                                    swap_multiplier: float = 10,
                                    initial_seed: int = 0):
    """
    Count, for every node pair, in how many XSwap permutations an edge appears.

    The XSwap prior is the probability of a node pair having an edge between
    them in degree-preserving permutations of a network. Dividing the returned
    counts by `n_permutations` gives the empirical prior, which can be
    considered as the probability of an edge existing between two nodes given
    only the network's degree sequence.

    Parameters
    ----------
    edge_list : List[Tuple[int, int]]
        Edge list representing the graph whose XSwap edge priors are to be
        computed. Tuples contain integer values representing nodes. No value
        should be greater than C++'s `INT_MAX`, in this case 2_147_483_647.
        An empty edge list is accepted and yields an all-zero matrix.
    n_permutations : int
        The number of permuted networks used to compute the empirical XSwap prior
    shape : Tuple[int, int]
        Shape of the returned matrix. Passed explicitly so that nodes without
        edges still get a row/column.
    allow_self_loops : bool
        Whether to allow edges like (0, 0). In the case of bipartite graphs,
        such an edge represents a connection between two distinct nodes, while
        in other graphs it may represent an edge from a node to itself, in which
        case an edge may or may not be meaningful depending on context.
    allow_antiparallel : bool
        Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case
        of bipartite graphs, these edges represent two connections between four
        distinct nodes, while for other graphs, these may be connections between
        the same two nodes. When False, each permuted edge is also counted at
        its reversed position.
    swap_multiplier : float
        The number of edge swap attempts is determined by the product of the
        number of existing edges and multiplier. For example, if five edges are
        passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer
        products will be rounded down to the nearest integer.
    initial_seed : int
        Random seed that will be passed to the C++ Mersenne Twister 19937 random
        number generator. `initial_seed` will be used for the first permutation,
        and the seed used for each subsequent permutation will be incremented by
        one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then
        the two permutations will pass seeds 0 and 1, respectively.

    Returns
    -------
    edge_counter : scipy.sparse.csc_matrix
        Adjacency matrix with entries equal to the number of permutations in
        which a given edge appeared

    Raises
    ------
    ValueError
        If `edge_list` contains the same edge more than once.
    """
    if len(edge_list) != len(set(edge_list)):
        raise ValueError("Edge list contained duplicate edges. "
                         "XSwap does not support multigraphs.")

    edge_counter = scipy.sparse.csc_matrix(shape, dtype=int)

    # A network without edges has no edges in any permutation. Returning here
    # also avoids calling max() on an empty sequence below.
    if len(edge_list) == 0:
        return edge_counter

    num_swaps = int(swap_multiplier * len(edge_list))

    max_source = max(edge[0] for edge in edge_list)
    max_target = max(edge[1] for edge in edge_list)
    # After sorting, max_source holds the smaller and max_target the larger of
    # the two maxima. The original note says this is done to ensure enough
    # space is allocated for the backend's bitset.
    # NOTE(review): the backend's sizing logic is not visible here -- confirm
    # against the xswap backend before changing this ordering.
    max_source, max_target = sorted([max_source, max_target])

    for i in range(n_permutations):
        # Each permutation gets its own seed (initial_seed, initial_seed + 1, ...)
        permuted_edges, stats = xswap._xswap_backend._xswap(
            edge_list, [], max_source, max_target, allow_self_loops,
            allow_antiparallel, num_swaps, initial_seed + i)
        permuted_matrix = edges_to_matrix(
            permuted_edges, add_reverse_edges=(not allow_antiparallel),
            shape=shape, dtype=int, sparse=True)
        edge_counter += permuted_matrix

    return edge_counter


def matrix_to_edges(matrix: numpy.ndarray, include_reverse_edges: bool=True):
    """
    Turn a (bi)adjacency matrix into a list of (row, column) edges.

    This is the inverse operation of `edges_to_matrix`.

    Parameters
    ----------
    matrix : numpy.ndarray
        Adjacency matrix or biadjacency matrix of a network
    include_reverse_edges : bool
        If True, every stored position is returned, so both (0, 1) and (1, 0)
        can appear. If False, only positions with source <= target are kept.
        Leave this `True` for biadjacency matrices, where row and column
        indices refer to different sets of nodes.

    Returns
    -------
    edge_list : List[Tuple[int, int]]
        One (source, target) pair per entry of the matrix's COO representation.
    """
    coo = scipy.sparse.coo_matrix(matrix)
    index_pairs = zip(coo.row, coo.col)

    if include_reverse_edges:
        return list(index_pairs)

    # Keep only the positions on or above the diagonal
    return [(row, col) for row, col in index_pairs if row <= col]


def edges_to_matrix(edge_list: List[Tuple[int, int]], add_reverse_edges: bool,
                    shape: Tuple[int, int], dtype: TypeVar=bool, sparse: bool=True):
    """
    Convert edge list to (bi)adjacency matrix. Inverse of `matrix_to_edges`.

    Parameters
    ----------
    edge_list : List[Tuple[int, int]]
        (source, target) index pairs. May be empty, in which case an all-zero
        matrix of the requested shape is returned.
    add_reverse_edges : bool
        Whether to include the reverse of edges in the matrix. For example,
        if `edge_list = [(1, 0)]` and `add_reverse_edges = True`, then the
        returned matrix has `matrix[1, 0]` = `matrix[0, 1]` = 1. Else, the matrix
        only has `matrix[1, 0]` = 1. If a biadjacency matrix is desired, then
        set `add_reverse_edges = False`.
    shape : Tuple[int, int]
        Shape of the matrix to be returned. Allows edges to be converted to
        a matrix even when there are nodes without edges.
    dtype : data-type
        Dtype of the returned matrix. For example, `int`, `bool`, `float`, etc.
    sparse : bool
        Whether a sparse matrix should be returned. If `False`, returns a dense
        numpy.ndarray

    Returns
    -------
    matrix : scipy.sparse.csc_matrix or numpy.ndarray
    """
    # Materialize the row/column indices as arrays instead of handing scipy a
    # one-shot `zip` iterator; this also makes an empty edge list well-defined.
    rows = numpy.array([edge[0] for edge in edge_list], dtype=numpy.int64)
    cols = numpy.array([edge[1] for edge in edge_list], dtype=numpy.int64)

    matrix = scipy.sparse.csc_matrix(
        (numpy.ones(len(rows)), (rows, cols)), dtype=dtype, shape=shape,
    )

    if add_reverse_edges:
        # `> 0` collapses entries that were doubled by the transpose (e.g.
        # self-loops) back to a single edge, but it always yields a boolean
        # matrix, so cast back to the dtype the caller asked for.
        matrix = ((matrix + matrix.T) > 0).astype(dtype)

    if not sparse:
        matrix = matrix.toarray()

    return matrix

In [3]:
# Number of permuted networks generated per edge set when computing priors
n_perms = 100

# Open the on-disk HetMat found at this relative path
hetmat = hetmatpy.hetmat.HetMat('hetionet-v1.0.hetmat/')

# NOTE(review): presumably `1` limits the search to length-1 metapaths and
# `exclude_inverts=True` keeps one orientation of each -- confirm in hetmatpy.
metapaths = hetmat.metagraph.extract_all_metapaths(1, exclude_inverts=True)

# The first element of every metapath is used as the metaedge to process
metaedges = [path[0] for path in metapaths]

In [4]:
# Fraction of the original edges kept in each down-sampled network
sample_fractions = {'sample_50': 0.5, 'sample_20': 0.2}
# Fixed seed so that re-running the cell draws the same edge samples
sampling_seed = 0

for metaedge in tqdm.tqdm_notebook(metaedges):
    print(metaedge, flush=True)
    prior_path = pathlib.Path(f'priors_2/{metaedge.abbrev}.csv.gz')
    if prior_path.exists():
        # Result already on disk from an earlier run; skip the recomputation
        continue
    # Create the output directory up front so the write at the end cannot fail
    # on a missing folder after all the permutations have been computed
    prior_path.parent.mkdir(parents=True, exist_ok=True)

    # Generate matrix and infer metaedge properties from it
    source, target, mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dense_threshold=1, dtype=bool)
    shape = mat.shape
    if shape[0] != shape[1]:
        # Can't be symmetric if non-square
        symmetric = False
    else:
        # Symmetric if square and every value is equal to its value in the transposed matrix
        symmetric = (mat != mat.T).nnz == 0
    include_self_loops = mat.diagonal().sum() > 0

    # Create edges from hetmat matrix. For symmetric matrices only one
    # direction of each edge is kept (source <= target).
    original_edges = matrix_to_edges(mat, include_reverse_edges=(not symmetric))
    del mat
    name_to_edges = {'original': original_edges}

    # Sample edges for reconstruction. Indices are drawn WITHOUT replacement so
    # that each sample holds exactly the stated fraction of the edges, and are
    # sorted so the sampled edge order is deterministic. The generator is
    # re-seeded per metaedge so that skipping already-computed metaedges does
    # not change which edges are drawn for the remaining ones.
    rng = np.random.RandomState(sampling_seed)
    n_edges = len(original_edges)
    for sample_name, fraction in sample_fractions.items():
        kept_indices = np.sort(
            rng.choice(n_edges, size=int(fraction * n_edges), replace=False))
        name_to_edges[sample_name] = [original_edges[i] for i in kept_indices]
        del kept_indices

    # Setup DataFrame with one row per (source, target) position of the matrix,
    # in row-major order. Use the smallest unsigned dtype able to hold the ids
    # (minimize memory use).
    prior_df = pd.DataFrame({
        'source_id': np.repeat(np.arange(shape[0], dtype=np.min_scalar_type(shape[0])), shape[1]),
        'target_id': np.tile(np.arange(shape[1], dtype=np.min_scalar_type(shape[1])), shape[0]),
    })

    # Compute XSwap prior for original and two sampled networks
    for name, network_edges in name_to_edges.items():
        prior_df[f'permuted_edges_{name}'] = compute_xswap_occurrence_matrix(
            network_edges, n_permutations=n_perms, shape=shape, allow_antiparallel=(not symmetric),
            allow_self_loops=include_self_loops).toarray().flatten()

        # Store the edge indicator as 0/1 for every metaedge. The cast keeps
        # the column numeric even if `edges_to_matrix` hands back a boolean
        # matrix for symmetric networks.
        prior_df[f'edge_{name}'] = edges_to_matrix(
            network_edges, add_reverse_edges=symmetric, shape=shape,
            dtype=np.uint8, sparse=False).flatten().astype(np.uint8, copy=False)

    del name_to_edges, original_edges

    prior_df.to_csv(prior_path, index=False, compression='gzip')
    del prior_df

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

Anatomy - localizes - Disease
Anatomy - downregulates - Gene
Anatomy - expresses - Gene
Anatomy - upregulates - Gene
Biological Process - participates - Gene
Cellular Component - participates - Gene
Compound - resembles - Compound
Compound - palliates - Disease
Compound - treats - Disease
Compound - binds - Gene
Compound - downregulates - Gene
Compound - upregulates - Gene
Compound - includes - Pharmacologic Class
Compound - causes - Side Effect
Disease - resembles - Disease
Disease - associates - Gene
Disease - downregulates - Gene
Disease - upregulates - Gene
Disease - presents - Symptom
Gene - covaries - Gene
Gene - interacts - Gene
Gene < regulates < Gene
Gene - participates - Molecular Function
Gene - participates - Pathway

