In [1]:
import sys

import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import tqdm.auto
import xswap

sys.path.insert(0, '../')

import analysis

In [2]:
# Options forwarded to xswap.permute_edge_list for every edge swap
allow_self_loops = True
allow_antiparallel = False

# Passed to analysis.edges_to_matrix when adjacency matrices are built
directed = False

# Number of successive permutations generated for each network; also the
# divisor used to turn the accumulated permutation sums into means/fractions
n_perms = 1_000

# 1. PPI network

In [None]:
# Which `analysis` routine computes the RWR feature for each network column.
# The two networks 'train' and 'test_recon' share the same routine; 'test_new'
# uses `rwr_approx_inv` instead.
net_to_rwr_function = dict(
    train=analysis.invertible_rwr,
    test_recon=analysis.invertible_rwr,
    test_new=analysis.rwr_approx_inv,
)

In [3]:
# Compute observed and permutation-derived features for every node pair
# (id_a <= id_b) of each PPI network, and stack them into `full_features_df`.
ppi_df = pd.read_csv('../../data/3.all_nodes/ppi.tsv.xz', sep='\t', compression='xz')

# Second positional argument given to every RWR call below. Kept in one place so
# the observed and the permuted networks are always scored with the same value.
# NOTE(review): presumably the restart probability -- confirm in `analysis`.
rwr_param = 0.25

# Features accumulated as sums over the permutations; divided by n_perms afterwards
permutation_features = ['edge_prior', 'mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard']

# One features frame per network; concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration
network_feature_frames = []

for network in ['train', 'test_recon', 'test_new']:
    rwr_func = net_to_rwr_function[network]

    # Edges of this network: rows of ppi_df whose `network` column equals 1
    edges = list(map(tuple,
        ppi_df
        .query(f'{network} == 1')
        .loc[:, ['id_a', 'id_b']]
        .values
    ))
    mat = analysis.edges_to_matrix(edges, directed=directed)
    print(network, mat.shape)

    # Entry (i, j) is (sum of row i) + (sum of column j) of the adjacency matrix
    degree_matrix = np.repeat(mat.sum(axis=1), mat.shape[1], axis=1) \
                    + np.repeat(mat.sum(axis=0), mat.shape[0], axis=0)

    # RWR is fastest on this network with exact matrix inverse on dense array
    feature_dict = {
        'edge_prior': scipy.sparse.csc_matrix(mat.shape),
        'rwr': rwr_func(mat.toarray(), rwr_param),
        'mean_rwr': np.zeros(mat.shape),
        'p_rwr': np.zeros(mat.shape),
        'jaccard': analysis.jaccard(mat, degree_matrix),
        'mean_jaccard': np.zeros(mat.shape),
        'p_jaccard': np.zeros(mat.shape),
    }

    # Each permutation starts from the previous one, so the edge list drifts
    # progressively further from the observed network; seed=i keeps runs repeatable.
    perm_edges = edges.copy()
    for i in tqdm.auto.trange(n_perms):
        perm_edges, _ = xswap.permute_edge_list(
            perm_edges,
            allow_self_loops=allow_self_loops,
            allow_antiparallel=allow_antiparallel,
            seed=i
        )
        perm_mat = analysis.edges_to_matrix(perm_edges, directed=directed).tocsc()

        # Sum of permuted adjacency matrices (becomes a mean after normalization)
        feature_dict['edge_prior'] += perm_mat

        # Running sum of permuted RWR, and count of permutations whose value is
        # at least as large as the observed one
        perm_rwr = rwr_func(perm_mat.toarray(), rwr_param)
        feature_dict['mean_rwr'] += perm_rwr
        feature_dict['p_rwr'] += (perm_rwr >= feature_dict['rwr'])

        # The observed network's degree_matrix is reused for the permuted network.
        # NOTE(review): relies on the permutation preserving node degree -- confirm.
        perm_jaccard = analysis.jaccard(perm_mat, degree_matrix)
        feature_dict['mean_jaccard'] += perm_jaccard
        feature_dict['p_jaccard'] += (perm_jaccard >= feature_dict['jaccard'])

    # Post-process features
    # Positions of all pairs with id_a <= id_b, ordered by id_b and then id_a.
    # id_a indexes matrix rows and id_b matrix columns. Extracting these entries
    # directly replaces a dense n x n melt / filter / merge per feature.
    n_rows, n_cols = mat.shape
    id_b, id_a = np.tril_indices(n_cols, m=n_rows)

    feature_columns = {}
    for feature in list(feature_dict):
        array = feature_dict[feature]

        # Normalize features by the number of permutations. The normalized value
        # is bound explicitly (and stored back) rather than relying on `/=`
        # happening to modify the same object in place.
        if feature in permutation_features:
            array = array / n_perms
            feature_dict[feature] = array

        # Make features dense plain ndarrays so that fancy indexing yields 1-D columns
        if scipy.sparse.issparse(array):
            dense = array.toarray()
        else:
            dense = np.asarray(array)

        feature_columns[feature] = dense[id_a, id_b]

    network_features_df = pd.DataFrame({'id_a': id_a, 'id_b': id_b, **feature_columns})
    # `network` sits right after the first feature column, matching the column
    # order this table has always had
    network_features_df.insert(3, 'network', network)

    network_feature_frames.append(network_features_df)

full_features_df = pd.concat(network_feature_frames)

train (4083, 4083)


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


test_recon (4083, 4083)


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


test_new (4083, 4083)


  diagonal = np.array(matrix.sum(axis=1)).flatten() ** (-1/2)
  return D@matrix@D


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))






In [4]:
%%time

ppi_features_df = (
    ppi_df
    .melt(
        id_vars=['id_a', 'id_b'],
        value_vars=['train', 'test_recon', 'test_new'],
        var_name='network', value_name='edge'
    )
    .merge(full_features_df, on=['id_a', 'id_b', 'network'], how='left')
)
    
ppi_features_df.to_csv('../../data/4.data/ppi.tsv.xz', sep='\t', compression='xz', index=False)

CPU times: user 38min 47s, sys: 6.39 s, total: 38min 53s
Wall time: 38min 53s
