In [None]:
import csv
import itertools
import json
import lzma
import pathlib
import random

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse

import hetmatpy.degree_weight
import hetmatpy.hetmat
import xswap

In [None]:
# Location of the on-disk PPI HetMat, relative to this notebook's working directory.
# It is passed to hetmatpy.hetmat.HetMat when the network is loaded.
ppi_hetmat_path = '../data/ppi_hetmat'

## Compute features

In [None]:
# Open the HetMat stored at `ppi_hetmat_path`.
hetmat = hetmatpy.hetmat.HetMat(ppi_hetmat_path)
# Read the 'PpP' metaedge as (source ids, target ids, adjacency matrix).
# NOTE(review): dense_threshold=0 presumably forces a dense numpy array, which
# nx.from_numpy_array below requires -- confirm against the hetmatpy docs.
source, target, ppp_matrix = hetmat.metaedge_to_adjacency_matrix('PpP', dense_threshold=0)
# Build a NetworkX graph from the adjacency matrix; it is used for the
# Jaccard and common-neighbour features computed in a later cell.
graph = nx.from_numpy_array(ppp_matrix)

In [None]:
# Compute DWPC matrices for the metapaths 'PpPpP' and 'PpPpPpP'.
# Note that `source` and `target` are rebound by each call; the values left after
# the second call are the ones later used to label the rows of the feature table.
# NOTE(review): dense_threshold=1 presumably keeps the results sparse, since a
# later cell calls .toarray() on them -- confirm against the hetmatpy docs.
source, target, dwpc_matrix_2 = hetmatpy.degree_weight.dwpc(hetmat, 'PpPpP', dense_threshold=1)
source, target, dwpc_matrix_3 = hetmatpy.degree_weight.dwpc(hetmat, 'PpPpPpP', dense_threshold=1)

In [None]:
# `num_nodes` was previously never defined anywhere in the notebook, so this cell
# failed with a NameError on a fresh kernel. `graph` was built from the square PpP
# adjacency matrix with nx.from_numpy_array, so its nodes are the integer
# positions 0..num_nodes-1.
num_nodes = graph.number_of_nodes()

# Every feature below must enumerate ordered node pairs (i, j) in row-major order:
# that is the order produced by ndarray.flatten() and by
# itertools.product(source, target) in row_generator.

# nx.jaccard_coefficient yields (u, v, coefficient) triples; keep only the coefficient.
# This is a one-shot generator: re-run this cell before consuming it a second time.
jaccard_feature = nx.jaccard_coefficient(graph, itertools.product(range(num_nodes), repeat=2))
jaccard_feature = (feat[2] for feat in jaccard_feature)

# Number of common neighbours for each ordered pair (also a one-shot generator).
cn_feature = (
    len(list(nx.common_neighbors(graph, i, j)))
    for i, j in itertools.product(range(num_nodes), repeat=2)
)

# Matrix-valued features, flattened row-major into one value per ordered pair.
dwpc_2_feature = dwpc_matrix_2.toarray().flatten()
dwpc_3_feature = dwpc_matrix_3.toarray().flatten()
edge_original = hetmat.metaedge_to_adjacency_matrix('PiP', dense_threshold=0)[2].flatten()
edge_pruned = hetmat.metaedge_to_adjacency_matrix('PpP', dense_threshold=0)[2].flatten()

# Guard against silently misaligned columns: each flattened matrix must supply
# exactly one value per ordered node pair.
for flat_feature in (dwpc_2_feature, dwpc_3_feature, edge_original, edge_pruned):
    assert flat_feature.size == num_nodes ** 2, 'feature length does not match the number of node pairs'

# Column order must match the header written alongside these features:
# jaccard, cn, dwpc_2, dwpc_3, edge_original, edge_pruned.
features = [jaccard_feature, cn_feature, dwpc_2_feature, dwpc_3_feature, edge_original, edge_pruned]

In [None]:
def row_generator(source, target, features_list):
    """Yield one output row per (source, target) pair.

    Pairs are enumerated with ``itertools.product(source, target)``, i.e. all
    targets for the first source, then all targets for the second source, and
    so on. Each feature iterable must supply its values in that same order.

    Parameters
    ----------
    source, target : iterable
        Node identifiers for the first and second columns of each row.
    features_list : iterable of iterables
        One iterable per feature column. They are consumed in lockstep, one
        value from each per pair. One-shot iterators (generators) are accepted
        but are exhausted by this call.

    Yields
    ------
    tuple
        ``(s, t, feature_0, feature_1, ...)`` for each pair.

    Raises
    ------
    RuntimeError
        If any feature iterable runs out before every pair has been emitted.
        Surplus feature values beyond the last pair are ignored.
    """
    feature_rows = zip(*features_list)
    for s, t in itertools.product(source, target):
        try:
            values = next(feature_rows)
        except StopIteration:
            # A StopIteration escaping a generator body is converted by Python
            # (PEP 479) into an uninformative "generator raised StopIteration";
            # report the actual problem instead.
            raise RuntimeError(
                f'features exhausted before source-target pair ({s!r}, {t!r}); '
                'a feature iterable is shorter than the number of pairs '
                'or was already consumed'
            ) from None
        yield (s, t, *values)

In [None]:
%%time

# Make sure the output directory exists before writing into it.
features_dir = pathlib.Path('../data/edge_predict/')
features_dir.mkdir(parents=True, exist_ok=True)

# Stream every (source, target, features...) row into an xz-compressed TSV.
# newline='' leaves line-ending handling to the csv module.
with lzma.open(features_dir / 'ppi_full_features.tsv.xz', mode='wt', newline='') as xz_file:
    tsv_writer = csv.writer(xz_file, delimiter='\t')
    # Header row first; its order mirrors the order of the `features` list.
    tsv_writer.writerow(
        ['source', 'target', 'jaccard', 'cn', 'dwpc_2', 'dwpc_3', 'edge_original', 'edge_pruned']
    )
    row_gen = row_generator(source, target, features)
    tsv_writer.writerows(row_gen)

In [12]:
# Read the compressed TSV back in (compression is inferred from the .xz suffix)
# and show the first rows as a sanity check.
features_df = pd.read_csv('../data/edge_predict/ppi_full_features.tsv.xz', sep='\t')
features_df.head()

Unnamed: 0,source,target,jaccard,cn,dwpc_2,dwpc_3,edge_original,edge_pruned
0,1,1,1.0,34,0.0,0.022885,1.0,0.0
1,1,10,0.0,0,0.0,0.0,0.0,0.0
2,1,100,0.1,4,0.046576,0.003332,0.0,0.0
3,1,1000,0.0,0,0.0,0.0,0.0,0.0
4,1,1001,0.0,0,0.0,0.0,0.0,0.0
