In [1]:
import collections
import itertools

import numpy as np
import pandas as pd
import tqdm
import xswap

import analysis

In [2]:
# Load the full edge table.
df = pd.read_table('edges_df.tsv.gz')

# Flag a reproducible 70% sample of the rows with string == 1 as training edges.
merge_keys = ['uniprot_a', 'uniprot_b', 'string', 'ht_2014']
sampled_string_edges = df.query('string == 1').sample(frac=0.7, random_state=0)
train_flagged = sampled_string_edges.assign(train=1)

# Right-merge the flagged sample back onto the full table so every original row
# is kept. Rows that were not sampled come back with a missing `train` value;
# fillna(0) (applied to every column) turns that into train == 0.
merged_edges = train_flagged.merge(df, on=merge_keys, how='right')
edges_df = (
    merged_edges
    .fillna(0)
    .reset_index(drop=True)
    .filter(items=['uniprot_a', 'uniprot_b', 'train', 'string', 'ht_2014'])
)

edges_df.head()

Unnamed: 0,uniprot_a,uniprot_b,train,string,ht_2014
0,Q96KB5,P61964,1.0,1.0,0.0
1,O43251,Q9BTD8,1.0,1.0,0.0
2,Q9H8Y8,Q14088,1.0,1.0,0.0
3,Q96PP8,P48775,1.0,1.0,0.0
4,P08670,Q96QU8,1.0,1.0,0.0


In [3]:
# Sanity check: share of string == 1 edges left out of training (expected 0.3)

n_held_out = edges_df.query('train == 0 and string == 1').shape[0]
n_kept = edges_df.query('train == 1 and string == 1').shape[0]
n_held_out / (n_held_out + n_kept)

0.3

In [4]:
# Extract edge tuples from the training rows. Sorting each pair gives both
# orientations of an edge the same canonical form, and the set drops duplicates.
string_edges_df = edges_df.query('train == 1')
edges = zip(string_edges_df['uniprot_a'], string_edges_df['uniprot_b'])
# sorted() instead of list(): the iteration order of a set of strings changes
# between interpreter sessions (hash randomization), so a plain list(set(...))
# would hand a differently ordered edge list to everything downstream -- including
# the seeded permutations -- on every kernel restart.
edges = sorted(set(map(tuple, map(sorted, edges))))
# map_str_edges returns three values; only the first two are used here.
mapped_edges, mapping, _ = (
    xswap.preprocessing.map_str_edges(edges, bipartite=False))

# Create adjacency matrix
sp_mat = analysis.edges_to_matrix(mapped_edges)

# Create source, target degree matrices
# degree[i, j] = (sum of row i) + (sum of column j), materialised as a full
# matrix with the same shape as sp_mat.
degree = np.repeat(sp_mat.sum(axis=1), sp_mat.shape[1], axis=1) \
       + np.repeat(sp_mat.sum(axis=0), sp_mat.shape[0], axis=0)

In [5]:
# Second argument passed to analysis.invertible_rwr for both the real and the
# permuted networks (presumably the restart probability -- TODO confirm).
rwr_param = 0.25

# Compute features on unpermuted network
# Product of the adjacency matrix with itself, computed once and reused in the
# numerator and denominator of the Jaccard ratio.
sp_mat_sq = sp_mat@sp_mat
feature_mats = {
    'prior_empirical': np.zeros(sp_mat.shape),

    'rwr': analysis.invertible_rwr(sp_mat, rwr_param),
    'mean_rwr': np.zeros(sp_mat.shape),
    'p_rwr': np.zeros(sp_mat.shape),

    'jaccard': sp_mat_sq / (degree - sp_mat_sq),
    'mean_jaccard': np.zeros(sp_mat.shape),
    'p_jaccard': np.zeros(sp_mat.shape),
}

# Accumulate permutation-derived features: the empirical edge prior, plus the
# mean and the empirical p-value of each feature (RWR and Jaccard).
n_perms = 1000
perm_edges = mapped_edges.copy()
# NOTE(review): newer tqdm releases deprecate tqdm.tnrange in favour of
# tqdm.notebook.trange; left unchanged here so the import cell is untouched.
for i in tqdm.tnrange(n_perms):
    # Permute edges. Each permutation starts from the previous one, and the
    # iteration index doubles as the seed.
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_self_loops=True,
                                            allow_antiparallel=False, seed=i)
    perm_mat = analysis.edges_to_matrix(perm_edges)
    feature_mats['prior_empirical'] += perm_mat

    # Compute RWR on permuted network
    perm_rwr = analysis.invertible_rwr(perm_mat, rwr_param)
    feature_mats['mean_rwr'] += perm_rwr
    # Count permutations whose value is at least as large as the observed one
    feature_mats['p_rwr'] += (perm_rwr >= feature_mats['rwr'])

    # Compute Jaccard similarity on permuted network. The degree matrix of the
    # unpermuted network is reused (assumes permutation leaves degrees unchanged).
    A2 = perm_mat@perm_mat
    perm_jac = A2 / (degree - A2)
    feature_mats['mean_jaccard'] += perm_jac
    feature_mats['p_jaccard'] += (perm_jac >= feature_mats['jaccard'])

# Normalize features to number of permutations
for feature in ['mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard', 'prior_empirical']:
    feature_mats[feature] = feature_mats[feature] / n_perms

# Flatten every matrix in row-major order: position i * n + j holds entry (i, j)
feature_dict = {k: np.array(v).flatten() for k, v in feature_mats.items()}

# Node indices matching the flattened matrices, and the index -> name lookup
max_id = max(mapping.values())
reversed_map = {v: k for k, v in mapping.items()}
# Same (source, target) enumeration as itertools.product(range(n), range(n)),
# built as integer arrays instead of n**2 Python tuples.
node_ids = np.arange(max_id + 1)
feature_dict['mapped_source'] = np.repeat(node_ids, len(node_ids))
feature_dict['mapped_target'] = np.tile(node_ids, len(node_ids))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [6]:
# One row per (source, target) cell of the feature matrices, with the integer
# node indices translated back to their original identifiers. The index columns
# already live in feature_dict, so they are not rebuilt here.
mapped_rwr_df = (
    pd.DataFrame
    .from_dict(feature_dict)
    .assign(
        uniprot_a=lambda feats: feats['mapped_source'].map(reversed_map),
        uniprot_b=lambda feats: feats['mapped_target'].map(reversed_map)
    )
    .filter(items=['uniprot_a', 'uniprot_b', 'prior_empirical',
                   'rwr', 'mean_rwr', 'p_rwr',
                   'jaccard', 'mean_jaccard', 'p_jaccard'])
)

# Dictionary of node to degree. A Counter is used because Series.map falls back
# to the mapping's __missing__ default (0) for keys that are absent, instead of NaN.
# NOTE(review): degrees are summed over rows where the node appears in `uniprot_a`
# only, and the same lookup is then applied to `uniprot_b`. The network built from
# these edges is treated as undirected, so if edges_df lists each pair in a single
# orientation these values undercount the true degree -- confirm against the
# layout of edges_df.
uniprot_to_degree = collections.Counter(edges_df.groupby('uniprot_a')['train'].sum().to_dict())

df = (
    edges_df
    .merge(mapped_rwr_df, on=['uniprot_a', 'uniprot_b'], how='left')
    # Pairs with no matching feature row, and any missing feature values, become 0
    .fillna(0)
    .assign(
        source_degree=lambda pairs: pairs['uniprot_a'].map(uniprot_to_degree),
        target_degree=lambda pairs: pairs['uniprot_b'].map(uniprot_to_degree),
        # Geometric mean of the two endpoint degrees
        mean_degree=lambda pairs: np.sqrt(pairs['source_degree'] * pairs['target_degree']),
    )
    .filter(['uniprot_a', 'uniprot_b', 'source_degree', 'target_degree', 'mean_degree', 'train',
             'string', 'ht_2014', 'prior_empirical', 'rwr', 'mean_rwr', 'p_rwr', 'jaccard',
             'mean_jaccard', 'p_jaccard',])
)

df.to_csv('p_vs_rank.tsv.gz', compression='gzip', sep='\t', index=False)
df.head()

Unnamed: 0,uniprot_a,uniprot_b,source_degree,target_degree,mean_degree,train,string,ht_2014,prior_empirical,rwr,mean_rwr,p_rwr,jaccard,mean_jaccard,p_jaccard
0,Q96KB5,P61964,137.0,139.0,137.996377,1.0,1.0,0.0,0.331,0.001224,0.000479,0.104,0.136943,0.107004,0.044
1,O43251,Q9BTD8,64.0,42.0,51.845926,1.0,1.0,0.0,0.075,0.002575,0.000215,0.0,0.162162,0.034287,0.0
2,Q9H8Y8,Q14088,59.0,81.0,69.130312,1.0,1.0,0.0,0.08,0.002608,0.000258,0.0,0.134615,0.057784,0.0
3,Q96PP8,P48775,35.0,31.0,32.939338,1.0,1.0,0.0,0.019,0.004824,0.000115,0.016,0.0,0.024507,1.0
4,P08670,Q96QU8,121.0,87.0,102.60117,1.0,1.0,0.0,0.256,0.001239,0.000378,0.246,0.063197,0.081829,0.911
