In [1]:
import os

import hetmatpy.hetmat
import numpy as np
import pandas as pd
import tqdm

import xswap

In [2]:
# Open the Hetionet v1.0 HetMat stored on disk, relative to this notebook.
hetmat_path = '../../data/task1/hetionet-v1.0.hetmat/'
hetmat = hetmatpy.hetmat.HetMat(hetmat_path)

In [3]:
# Ask the metagraph for its metapaths with argument 1 (presumably the maximum
# path length -- confirm against the metagraph API), skipping inverted
# duplicates, then keep the first element of each metapath as its metaedge.
single_edge_metapaths = hetmat.metagraph.extract_all_metapaths(1, exclude_inverts=True)
metaedges = [metapath[0] for metapath in single_edge_metapaths]

In [4]:
# For every sufficiently large metaedge, compute XSwap priors twice: once on
# the full edge list and once on a random 70% subsample of the edges. Each
# result is written as a gzipped TSV named after the metaedge abbreviation.
MIN_EDGES = 2000        # metaedges whose adjacency matrix has fewer nonzeros are skipped
N_PERMUTATIONS = 100    # passed as n_permutations to every prior computation
KEEP_FRACTION = 0.7     # fraction of edges retained in the sampled network
SAMPLING_SEED = 0       # seed used for the edge subsample of every metaedge
FULL_PRIOR_DIR = '../../data/task1/full_priors'
SAMPLED_PRIOR_DIR = '../../data/task1/sampled_priors'
# One dtype mapping shared by both prior computations so that the full and
# sampled outputs have the same column types.
PRIOR_DTYPES = {'id': np.uint16, 'degree': np.uint32,
                'edge': bool, 'xswap_prior': float}

# DataFrame.to_csv does not create missing parent directories.
for output_dir in (FULL_PRIOR_DIR, SAMPLED_PRIOR_DIR):
    os.makedirs(output_dir, exist_ok=True)

for metaedge in tqdm.tqdm_notebook(metaedges):
    # Only use metaedges with >= MIN_EDGES edges
    _, _, adj_mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dtype=bool, dense_threshold=1)
    if adj_mat.nnz < MIN_EDGES:
        continue

    # Determine how to treat metaedge
    square = adj_mat.shape[0] == adj_mat.shape[1]
    if square:
        # Antiparallel edges are allowed only when the matrix is asymmetric;
        # self loops only when the diagonal has at least one nonzero entry.
        allow_antiparallel = bool((adj_mat != adj_mat.T).nnz)
        allow_self_loops = bool(adj_mat.diagonal().sum())
    else:
        allow_antiparallel = True
        allow_self_loops = True

    # Compute XSwap prior on the full edge list
    edge_list = xswap.network_formats.matrix_to_edges(adj_mat, include_reverse_edges=allow_antiparallel)
    prior_df = xswap.prior.compute_xswap_priors(edge_list, n_permutations=N_PERMUTATIONS,
                                                shape=adj_mat.shape,
                                                allow_self_loops=allow_self_loops,
                                                allow_antiparallel=allow_antiparallel,
                                                dtypes=PRIOR_DTYPES)

    prior_df.to_csv(f'{FULL_PRIOR_DIR}/{metaedge.abbrev}.tsv.gz', sep='\t',
                    compression='gzip', index=False)
    del prior_df  # release the full prior before building the sampled one

    # Remove 30% of edges. A fresh RandomState seeded with SAMPLING_SEED draws
    # the same sample as reseeding the global generator would, but leaves the
    # global NumPy random state untouched.
    rng = np.random.RandomState(SAMPLING_SEED)
    edge_choices = rng.choice(len(edge_list), replace=False,
                              size=int(KEEP_FRACTION * len(edge_list)))
    sampled_edges = [edge_list[i] for i in edge_choices]

    # Compute XSwap prior on the sampled edge list, with the same settings
    # (including dtypes) as the full computation.
    sampled_prior_df = xswap.prior.compute_xswap_priors(sampled_edges, n_permutations=N_PERMUTATIONS,
                                                        shape=adj_mat.shape,
                                                        allow_self_loops=allow_self_loops,
                                                        allow_antiparallel=allow_antiparallel,
                                                        dtypes=PRIOR_DTYPES)
    sampled_prior_df.to_csv(f'{SAMPLED_PRIOR_DIR}/{metaedge.abbrev}.tsv.gz', sep='\t',
                            compression='gzip', index=False)
    del sampled_prior_df, sampled_edges, edge_choices, edge_list

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))


