In [1]:
import itertools
import json
import pathlib
import random

import numpy as np
import pandas as pd
import scipy.sparse

import hetmatpy.hetmat
import xswap

## Create HetMat for PPI

### Setup directories

In [2]:
# Root directory of the on-disk HetMat; create it along with its `nodes` and
# `edges` subdirectories (no error if they already exist).
ppi_hetmat_path = pathlib.Path('../data/ppi_hetmat/')
ppi_hetmat_path.mkdir(parents=True, exist_ok=True)
for subdirectory in ('nodes', 'edges'):
    (ppi_hetmat_path / subdirectory).mkdir(exist_ok=True)

### Set metagraph.json

In [3]:
# One node type (Protein) joined by two undirected ("both") edge types:
# the full interaction set and the pruned interaction set.
metagraph = dict(
    metanode_kinds=['Protein'],
    metaedge_tuples=[
        ['Protein', 'Protein', 'interacts', 'both'],
        ['Protein', 'Protein', 'interacts-pruned', 'both'],
    ],
    kind_to_abbrev={
        'Protein': 'P',
        'interacts': 'i',
        'interacts-pruned': 'p',
    },
)
# Serialize next to the nodes/edges directories as metagraph.json.
with (ppi_hetmat_path / 'metagraph.json').open('w') as metagraph_file:
    json.dump(metagraph, metagraph_file, indent=2)

### Set nodes

In [4]:
# Build the Protein node table with columns `position`, `identifier` and a
# placeholder `name`, ordered by `position`, and write it as a TSV.
column_renames = {'mapped': 'position', 'original': 'identifier'}
mapping_df = pd.read_csv('../data/ppi_mapping.csv').filter(items=list(column_renames))
mapping_df = mapping_df.rename(columns=column_renames).assign(name='unknown')
mapping_df = mapping_df.sort_values('position').reset_index(drop=True)
mapping_df.to_csv(ppi_hetmat_path / 'nodes/Protein.tsv', sep='\t', index=False)

### Set edges

In [5]:
num_nodes = len(mapping_df)
edges = xswap.preprocessing.load_processed_edges('../data/ppi_edges.csv')

# Orient every edge so that source <= target and drop duplicates. The result is
# sorted because the seeded sample below depends on list order, and the iteration
# order of a set is an interpreter implementation detail, not a guarantee.
edges = sorted(set(map(tuple, map(sorted, edges))))

# Hold out 20% of the edges (seeded for reproducibility); the remaining 80%
# form the "pruned" metaedge.
sample_size = int(0.2 * len(edges))
random.seed(0)
sampled_edges = random.sample(edges, sample_size)
pruned_edges = sorted(set(edges) - set(sampled_edges))

In [6]:
def save_edges(edges, metaedge_name):
    """
    Save an edge list as a symmetric sparse adjacency matrix in the PPI HetMat.

    Parameters
    ----------
    edges : sequence of (int, int)
        (source, target) node-position pairs. Each pair gets weight 1; the
        matrix is then mirrored so both (i, j) and (j, i) are stored.
    metaedge_name : str
        File stem; the matrix is written to
        ``edges/{metaedge_name}.sparse.npz`` under ``ppi_hetmat_path``.

    Notes
    -----
    Reads the notebook-level ``num_nodes`` (matrix dimension) and
    ``ppi_hetmat_path`` (output root).
    """
    mat = scipy.sparse.coo_matrix((np.ones(len(edges)), (zip(*edges))), shape=(num_nodes, num_nodes))
    # The input holds each pair once, so mirror it across the diagonal. The
    # diagonal is subtracted once so self-loops are not counted twice. A sparse
    # diagonal keeps the whole computation sparse; a dense `np.diag` would
    # allocate a full num_nodes x num_nodes array.
    mat = mat + mat.T - scipy.sparse.diags(mat.diagonal())
    mat = scipy.sparse.csc_matrix(mat)
    mat.eliminate_zeros()  # never store explicit zeros
    scipy.sparse.save_npz(ppi_hetmat_path.joinpath(f'edges/{metaedge_name}.sparse.npz'), mat)

save_edges(edges, 'PiP')
save_edges(pruned_edges, 'PpP')

## Generate permutations of `PpP` metaedge

In [7]:
class xswap_hetmat(hetmatpy.hetmat.HetMat):
    """
    HetMat subclass whose ``permute_graph`` uses ``xswap.permute_edge_list``.

    ``permute_graph`` permutes and saves only the metaedge abbreviated ``PpP``.
    """

    def permute_graph(self, num_new_permutations=None, namer=None, start_from=None,
                      multiplier=10, seed=0):
        """
        Generate and save permutations of the HetMat adjacency matrices.

        Parameters
        ----------
        num_new_permutations : int
            The number of new, permuted HetMats to generate. Must be an integer:
            the ``None`` default is passed straight to ``range`` and raises
            ``TypeError``.
        namer : generator
            Yields the names of new permutations. Cannot pass names of existing permutations
        start_from : str
            Name of permutation to use as starting point. For multiple permutations,
            the first permutation starts from start_from, and future permutations
            continue from the previous one.
        multiplier : int
            How many attempts to make when cross-swapping edges.
        seed : int
            Random seed for generating new permutations. Incremented by one
            after every permutation.

        Returns
        -------
        pandas.DataFrame
            The statistics returned by ``xswap.permute_edge_list`` for each
            permutation (one row each), with added ``metaedge``, ``abbrev``
            and ``permutation`` columns.
        """
        if namer is None:
            # If no namer given, continue increasing names by one for new permutations
            namer = (f'{x:03}' for x in itertools.count(start=1))

        # Only need to permute the pruned metaedge. Look it up on this instance
        # rather than on a notebook-level variable, so the method works no
        # matter what the caller named its HetMat.
        metaedge = self.metagraph.get_metaedge('PpP')

        stat_dfs = list()
        for _ in range(num_new_permutations):
            permutation_name = next(namer)
            new_hetmat = hetmatpy.hetmat.initialize_permutation_directory(self, permutation_name)

            # Resolve the starting point: the unpermuted graph on the first pass
            # (when none was given), otherwise the named permutation.
            if start_from is None:
                start_from = self
            elif isinstance(start_from, str):
                start_from = self.permutations[start_from]
            assert isinstance(start_from, hetmatpy.hetmat.HetMat)

            _, _, original_matrix = start_from.metaedge_to_adjacency_matrix(
                metaedge, dense_threshold=1)
            original_matrix_coo = scipy.sparse.coo_matrix(original_matrix)
            # NOTE(review): every stored nonzero entry becomes an edge. If the
            # stored matrix holds both (i, j) and (j, i) for one undirected
            # edge, both are handed to xswap as separate edges. Confirm this is
            # intended in combination with allow_antiparallel=False.
            edges = list(zip(original_matrix_coo.row, original_matrix_coo.col))
            permuted_edges, stats = xswap.permute_edge_list(
                edges, allow_antiparallel=False, allow_self_loops=False, multiplier=multiplier,
                seed=seed
            )
            assert permuted_edges != edges
            permuted_matrix_coo = scipy.sparse.coo_matrix((np.ones(len(edges)), zip(*permuted_edges)),
                                                         shape=original_matrix.shape)
            permuted_matrix = scipy.sparse.csc_matrix(permuted_matrix_coo)
            path = new_hetmat.get_edges_path(metaedge, file_format=None)
            hetmatpy.hetmat.save_matrix(permuted_matrix, path)

            stat_df = pd.DataFrame([stats])
            stat_df['metaedge'] = metaedge
            stat_df['abbrev'] = metaedge.get_abbrev()
            stat_df['permutation'] = permutation_name
            stat_dfs.append(stat_df)

            # Chain permutations: the next one starts from the one just created,
            # with a fresh seed.
            start_from = permutation_name
            seed += 1
            self.permutations[permutation_name] = new_hetmat
        return pd.concat(stat_dfs)

In [None]:
%%time

# Open the HetMat written above and generate 1000 chained permutations named
# '0001' through '1000'; keep the per-permutation statistics as a TSV.
hetmat = xswap_hetmat('../data/ppi_hetmat/')
permutation_names = (f'{i:04d}' for i in range(1, 1001))
permutation_info_df = hetmat.permute_graph(
    num_new_permutations=1000,
    namer=permutation_names,
)
permutation_info_df.to_csv('../data/permutation_info.tsv', sep='\t', index=False)

In [None]:
# Preview the first rows of the statistics table returned by `permute_graph`.
permutation_info_df.head()