In [1]:
import hetmatpy.hetmat
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.metrics
import tqdm

In [2]:
def normalize(matrix):
    """
    Row-normalize an adjacency matrix.

    Parameters
    ----------
    matrix : numpy.ndarray or scipy.sparse matrix
        Two-dimensional adjacency matrix. Sparse input is densified
        internally with ``toarray()``.

    Returns
    -------
    numpy.ndarray or scipy.sparse.csc_matrix
        ``matrix`` with every row divided by its row sum. Rows whose sum is
        zero are divided by one instead, so they come back unchanged. Dense
        input yields a dense array; sparse input yields a CSC matrix.
    """
    came_in_sparse = scipy.sparse.issparse(matrix)
    dense = matrix.toarray() if came_in_sparse else matrix

    # Per-row totals as an (n_rows, 1) column so the division below
    # broadcasts across each row.
    totals = dense.sum(axis=1).reshape(dense.shape[0], 1)
    # Avoid dividing by zero for rows without any entries.
    totals[totals == 0] = 1

    scaled = np.divide(dense, totals)
    return scipy.sparse.csc_matrix(scaled) if came_in_sparse else scaled

    
def rwr(normalized_adjacency, start_index, restart_prob, convergence_threshold=1e-6):
    """
    Random walk with restart (RWR) from a single seed node.

    Iterates the row-vector update

        p(t+1) = (1 - r) * p(t) @ W + r * p(0)

    where ``W`` is ``normalized_adjacency``, ``r`` is ``restart_prob`` and
    ``p(0)`` puts all probability mass on ``start_index``. Iteration stops
    once the L1 distance between two successive vectors is no larger than
    ``convergence_threshold``.

    Parameters
    ----------
    normalized_adjacency : square matrix
        Transition matrix; must support ``row_vector @ normalized_adjacency``.
    start_index : int
        Index of the seed node.
    restart_prob : float
        Probability of jumping back to the seed at each step.
    convergence_threshold : float, optional
        Upper bound on the L1 change between iterations at which to stop.

    Returns
    -------
    Array of shape ``(1, n)`` holding the walk's distribution at convergence,
    where ``n`` is ``normalized_adjacency.shape[0]``.

    Notes
    -----
    No iteration cap is applied: the loop relies on the update converging.
    """
    # Setup start position: all mass on the seed node
    p_0 = np.zeros((1, normalized_adjacency.shape[0]))
    p_0[0, start_index] = 1
    p_t = p_0.copy()

    # Iterate RWR until convergence
    norm_difference = 1
    while norm_difference > convergence_threshold:
        p_t_1 = (1 - restart_prob) * p_t @ normalized_adjacency + restart_prob * p_0
        # p_t is a 2-D (1, n) array, and np.linalg.norm(x, 1) on a 2-D array is
        # the matrix 1-norm (max absolute column sum), which for a single row
        # is just the largest entry. Sum the absolute differences explicitly to
        # get the vector L1 norm the threshold is meant for.
        norm_difference = np.abs(p_t_1 - p_t).sum()
        p_t = p_t_1
    return p_t


def all_pairs_rwr(adjacency, restart_prob, convergence_threshold=1e-6):
    """
    Run random walk with restart seeded at every node of a network.

    Parameters
    ----------
    adjacency : matrix
        Adjacency matrix (expected to be square); it is passed through
        ``normalize`` before walking.
    restart_prob : float
        Restart probability forwarded to ``rwr``.
    convergence_threshold : float, optional
        Convergence threshold forwarded to ``rwr``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``adjacency.shape`` whose row ``i`` is the output of
        ``rwr`` seeded at node ``i``.
    """
    transition = normalize(adjacency)
    n_seeds = adjacency.shape[0]
    scores = np.zeros(adjacency.shape)

    # One independent walk per seed; each fills its own row of the result.
    for seed in range(n_seeds):
        scores[seed, :] = rwr(
            transition,
            seed,
            restart_prob,
            convergence_threshold=convergence_threshold,
        )
    return scores

In [3]:
# Handle to the hetmat stored at ../data/ppi_hetmat/; the 'PpP' adjacency
# matrices and the permutations used in later cells are read through it.
hetmat = hetmatpy.hetmat.HetMat('../data/ppi_hetmat/')

### 1. Compute edge priors

In [4]:
%%time
sum_perm_edges = None

for name, permat in hetmat.permutations.items():
    source, target, perm_adj = permat.metaedge_to_adjacency_matrix('PpP', dense_threshold=0)
    
    if sum_perm_edges is None:
        sum_perm_edges = perm_adj
    else:
        sum_perm_edges += perm_adj

edge_prior = sum_perm_edges / len(hetmat.permutations)

np.save('../data/edge_predict/prior_edge.npy', edge_prior)

CPU times: user 23.1 s, sys: 17.6 s, total: 40.6 s
Wall time: 40.6 s


### 2. Compute RWR on the network

In [5]:
%%time

source, target, pruned_matrix = hetmat.metaedge_to_adjacency_matrix('PpP', dense_threshold=1)
ppi_rwr = all_pairs_rwr(pruned_matrix, 0.5)

np.save('../data/edge_predict/RWR_PpP.npy', ppi_rwr_sparse)

CPU times: user 7.2 s, sys: 144 ms, total: 7.34 s
Wall time: 7.34 s


### 3. Compute RWR on permuted networks

In [6]:
# Average the all-pairs RWR matrix over every permutation of the hetmat
# (restart probability 0.5) and save the result.
sum_perm_rwr = None
for name, permat in tqdm.tqdm(hetmat.permutations.items()):
    source, target, perm_adj = permat.metaedge_to_adjacency_matrix('PpP', dense_threshold=1)
    rwr_mat = all_pairs_rwr(perm_adj, 0.5)

    if sum_perm_rwr is not None:
        # Add this permutation's scores to the running total in place.
        sum_perm_rwr += rwr_mat
    else:
        # First permutation: its matrix becomes the accumulator.
        sum_perm_rwr = rwr_mat

mean_rwr = sum_perm_rwr / len(hetmat.permutations)
np.save('../data/edge_predict/prior_RWR.npy', mean_rwr)

100%|██████████| 1000/1000 [1:34:01<00:00,  5.64s/it]


### 4. Combine data

In [7]:
def get_prior(pandas_row, prior_matrix):
    """
    Look up the matrix entry for the node pair named in a row.

    The first two values of ``pandas_row`` are taken as the source and target
    node names. Their positions in the notebook-level ``source`` and
    ``target`` sequences give the row and column index into ``prior_matrix``.

    Parameters
    ----------
    pandas_row : sequence
        Row whose first two entries are the source and target node names.
    prior_matrix : matrix
        Matrix indexed as ``prior_matrix[source_position, target_position]``.

    Returns
    -------
    The entry of ``prior_matrix`` for that (source, target) pair.

    Raises
    ------
    ValueError
        From ``.index`` when a name is absent, if ``source``/``target`` are
        lists (assumed from the ``.index`` usage).
    """
    row_label, col_label = pandas_row[:2]
    position = (source.index(row_label), target.index(col_label))
    return prior_matrix[position]

In [10]:
# Load the training pairs, then add one column per matrix by looking up each
# row's (source, target) pair with get_prior.
train_df = pd.read_table('../data/edge_predict/train_conditionals.tsv')

# `args` hands the matrix to get_prior as its second positional argument.
train_df['rwr'] = train_df.apply(get_prior, axis=1, args=(ppi_rwr,))
train_df['edge_prior'] = train_df.apply(get_prior, axis=1, args=(edge_prior,))
train_df['rwr_prior'] = train_df.apply(get_prior, axis=1, args=(mean_rwr,))

In [11]:
# Preview the first rows, including the rwr, edge_prior and rwr_prior columns.
train_df.head()

Unnamed: 0,source,target,jaccard,cn,dwpc_2,dwpc_3,edge_original,edge_pruned,lr_confidence,lr_prob,rwr,edge_prior,rwr_prior
0,2236,2237,0.0,0,0.0,0.0,1.0,0.0,-0.514505,0.374138,0.0,0.0,0.0
1,2774,2302,0.0,0,0.0,0.0,1.0,0.0,-0.514505,0.374138,0.0,0.0,0.0
2,2407,450,1.0,1,0.166667,0.0,1.0,0.0,6.15754,0.997887,0.023882,0.0,4.4e-05
3,808,16,0.0,0,0.0,0.002594,1.0,0.0,1.066292,0.743891,0.000206,0.002,0.000248
4,1156,1975,0.0,0,0.0,0.000573,1.0,0.0,-0.165037,0.458834,4.3e-05,0.005,0.000246


### 5. Compute and compare AUROC scores

In [12]:
# AUROC of the RWR score from the unpermuted hetmat (`rwr`), with
# `edge_original` as the label.
sklearn.metrics.roc_auc_score(train_df['edge_original'], train_df['rwr'])

0.7363337863363567

In [13]:
# AUROC of the permutation-averaged RWR score (`rwr_prior`), with
# `edge_original` as the label.
sklearn.metrics.roc_auc_score(train_df['edge_original'], train_df['rwr_prior'])

0.653279021302566