In [1]:
import csv
import itertools
import json
import lzma
import pathlib
import random

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse

import hetmatpy.degree_weight
import hetmatpy.hetmat
import xswap

## Create HetMat for PPI

### Setup directories

In [2]:
# Root directory of the on-disk HetMat, followed by the node and edge
# subdirectories that later cells write into.
ppi_hetmat_path = pathlib.Path('../data/ppi_hetmat/')
for hetmat_dir in (
    ppi_hetmat_path,
    ppi_hetmat_path.joinpath('nodes'),
    ppi_hetmat_path.joinpath('edges'),
):
    # exist_ok keeps the cell re-runnable once the directories exist.
    hetmat_dir.mkdir(exist_ok=True)

### Set metagraph.json

In [3]:
# One metanode kind (Protein) and two Protein-Protein metaedges with direction
# "both": the full interaction network and the pruned one.
metagraph = {
    "metanode_kinds": ["Protein"],
    "metaedge_tuples": [
        ["Protein", "Protein", "interacts", "both"],
        ["Protein", "Protein", "interacts-pruned", "both"],
    ],
    "kind_to_abbrev": {
        "Protein": "P",
        "interacts": "i",
        "interacts-pruned": "p",
    },
}

# Serialize first, then write the text in one go.
metagraph_text = json.dumps(metagraph, indent=2)
with open(ppi_hetmat_path.joinpath('metagraph.json'), 'w') as metagraph_file:
    metagraph_file.write(metagraph_text)

### Set nodes

In [4]:
# Build the Protein node table from the mapping file: keep the two mapping
# columns, rename them to position/identifier, add a placeholder name, and
# order the rows by position.
node_column_names = {'mapped': 'position', 'original': 'identifier'}

raw_mapping_df = pd.read_csv('../data/mappings/protein-protein.csv')
mapping_df = raw_mapping_df.filter(items=list(node_column_names))
mapping_df = mapping_df.rename(columns=node_column_names).assign(name='unknown')
mapping_df = mapping_df.sort_values('position').reset_index(drop=True)

protein_nodes_path = ppi_hetmat_path.joinpath('nodes/Protein.tsv')
mapping_df.to_csv(protein_nodes_path, sep='\t', index=False)

### Set edges

In [5]:
num_nodes = len(mapping_df)
edges = xswap.preprocessing.load_processed_edges('../data/processed/protein-protein.csv')

# Orient every edge so that source <= target. Edges that are already oriented
# come first (in their original order); reversed edges are flipped and appended.
# The resulting list order matters because the list is sampled with a fixed seed.
oriented_edges = [edge for edge in edges if edge[0] <= edge[1]]
flipped_edges = [(edge[1], edge[0]) for edge in edges if edge[0] > edge[1]]
edges = oriented_edges + flipped_edges

In [6]:
# Hold out a seeded 20% sample of the edges; whatever is left forms the
# pruned network.
random.seed(0)
sample_size = int(0.2 * len(edges))
sampled_edges = random.sample(edges, sample_size)

all_edge_set = set(edges)
sampled_edge_set = set(sampled_edges)
pruned_edges = list(all_edge_set - sampled_edge_set)

def save_edges(edges, metaedge_name):
    """
    Save an edge list as a symmetric sparse adjacency matrix in the HetMat.

    Parameters
    ----------
    edges : sized iterable of (source, target)
        Node positions of each edge. Each undirected edge should appear once;
        repeated pairs are summed by the COO constructor.
    metaedge_name : str
        Abbreviation used for the file name, `edges/{metaedge_name}.sparse.npz`.

    Notes
    -----
    Reads the notebook-level `num_nodes` (matrix dimension) and
    `ppi_hetmat_path` (output root). The matrix stays sparse throughout, so
    memory scales with the number of edges rather than num_nodes ** 2.
    """
    # Unpack the coordinates into two explicit sequences rather than handing
    # scipy a one-shot zip iterator, and tolerate an empty edge list.
    if len(edges) > 0:
        rows, cols = zip(*edges)
    else:
        rows, cols = (), ()
    mat = scipy.sparse.coo_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(num_nodes, num_nodes),
    )
    # Mirror across the diagonal to make the matrix symmetric. Self-loops lie
    # on the diagonal and would be counted twice by mat + mat.T, so subtract
    # the diagonal once, using a sparse diagonal to avoid densifying.
    mat = mat + mat.T - scipy.sparse.diags(mat.diagonal())
    mat = scipy.sparse.csc_matrix(mat)
    mat.eliminate_zeros()
    scipy.sparse.save_npz(ppi_hetmat_path.joinpath(f'edges/{metaedge_name}.sparse.npz'), mat)

# Write the full network (PiP) first, then the pruned network (PpP).
for metaedge_edges, metaedge_abbrev in ((edges, 'PiP'), (pruned_edges, 'PpP')):
    save_edges(metaedge_edges, metaedge_abbrev)

## Compute features

In [7]:
# Open the HetMat written above and build a networkx graph of the pruned
# network from its adjacency matrix.
hetmat = hetmatpy.hetmat.HetMat(ppi_hetmat_path)

# NOTE(review): dense_threshold=0 presumably yields a dense numpy array, which
# nx.from_numpy_array requires -- confirm against the hetmatpy documentation.
ppp_adjacency = hetmat.metaedge_to_adjacency_matrix('PpP', dense_threshold=0)
row, col, ppp_matrix = ppp_adjacency
graph = nx.from_numpy_array(ppp_matrix)

In [9]:
# DWPC over the pruned network for the two-step and three-step metapaths.
# Both calls share the same keyword arguments; `source` and `target` end up
# holding the values returned by the second call.
dwpc_options = {'dense_threshold': 1}
source, target, dwpc_matrix_2 = hetmatpy.degree_weight.dwpc(
    hetmat, 'PpPpP', **dwpc_options
)
source, target, dwpc_matrix_3 = hetmatpy.degree_weight.dwpc(
    hetmat, 'PpPpPpP', **dwpc_options
)

In [10]:
# NOTE: source_target, jaccard_feature and cn_feature are one-shot iterators.
# Once something has consumed them, this cell must be re-run before they can
# be iterated again.

# (source, target) pairs in row-major order, matching the flattened matrices below.
source_target = itertools.product(source, target)

# Jaccard coefficient for every ordered pair of node positions; keep only the score.
jaccard_feature = (
    score
    for _u, _v, score in nx.jaccard_coefficient(
        graph, itertools.product(range(num_nodes), repeat=2)
    )
)

# Number of common neighbors for every ordered pair of node positions.
cn_feature = (
    sum(1 for _neighbor in nx.common_neighbors(graph, i, j))
    for i, j in itertools.product(range(num_nodes), repeat=2)
)

# Flatten each matrix into one value per (row, column) pair.
dwpc_2_feature, dwpc_3_feature = (
    dwpc_matrix.toarray().flatten()
    for dwpc_matrix in (dwpc_matrix_2, dwpc_matrix_3)
)
edge_original, edge_pruned = (
    hetmat.metaedge_to_adjacency_matrix(metaedge, dense_threshold=0)[2].flatten()
    for metaedge in ('PiP', 'PpP')
)

features = [
    jaccard_feature,
    cn_feature,
    dwpc_2_feature,
    dwpc_3_feature,
    edge_original,
    edge_pruned,
]

In [11]:
def row_generator(source_target_index, features_list):
    """
    Lazily build one output row per (source, target) pair.

    Parameters
    ----------
    source_target_index : iterable of (source, target)
        Pairs in the order the rows should be emitted.
    features_list : sequence of iterables
        One iterable per feature column. The k-th item of each iterable is the
        feature value for the k-th pair.

    Yields
    ------
    tuple
        (source, target, feature_1, ..., feature_m)

    Raises
    ------
    ValueError
        If any feature iterable is exhausted before the pairs are. A bare
        next() would raise StopIteration inside the generator, which
        Python 3.7+ turns into an opaque RuntimeError.
    """
    feat_iter = zip(*features_list)
    exhausted = object()  # sentinel: feature values themselves may be falsy
    for position, (source, target) in enumerate(source_target_index):
        feature_values = next(feat_iter, exhausted)
        if feature_values is exhausted:
            raise ValueError(
                f'features ran out at row {position}: '
                'fewer feature values than (source, target) pairs'
            )
        yield (source, target, *feature_values)

In [12]:
%%time

# Stream every feature row into an xz-compressed TSV. Rows are produced lazily
# by row_generator, so the full table is never held in memory.
feature_columns = ['source', 'target', 'jaccard', 'cn', 'dwpc_2', 'dwpc_3',
                   'edge_original', 'edge_pruned']
with lzma.open('../data/edge_predict/ppi_full_features.tsv.xz', mode='wt', newline='') as xz_file:
    tsv_writer = csv.writer(xz_file, delimiter='\t')
    tsv_writer.writerow(feature_columns)
    row_gen = row_generator(source_target, features)
    # writerows consumes the generator directly, so no notebook-level loop
    # variable is created and the `row` node index defined earlier is not
    # overwritten.
    tsv_writer.writerows(row_gen)

CPU times: user 14min 15s, sys: 10.2 s, total: 14min 25s
Wall time: 14min 25s


In [13]:
# Read the compressed feature table back in and preview its first rows.
features_path = '../data/edge_predict/ppi_full_features.tsv.xz'
features_df = pd.read_csv(features_path, sep='\t')
features_df.head(n=5)

Unnamed: 0,source,target,jaccard,cn,dwpc_2,dwpc_3,edge_original,edge_pruned
0,1,1,1.0,34,0.0,0.026542,1.0,0.0
1,1,10,0.0,0,0.0,0.0,0.0,0.0
2,1,100,0.128205,5,0.049555,0.003729,0.0,0.0
3,1,1000,0.0,0,0.0,0.0,0.0,0.0
4,1,1001,0.0,0,0.0,0.0,0.0,0.0
