In [1]:
import sys

import numpy as np
import pandas as pd
import sklearn.metrics
import tqdm
import xswap

# Put the parent directory at the front of the module search path before
# importing `analysis`, so that the import below can be resolved from there.
# NOTE(review): `analysis` is presumably a project-local module one directory
# up from this notebook — confirm. The relative path depends on the kernel's
# working directory.
sys.path.insert(0, '../')

import analysis

In [2]:
# Load the tab-separated edge table.
edges_df = pd.read_csv('../../data/task3/3.all_nodes/ppi.tsv.xz', sep='\t')

# Keep only the rows flagged `test_recon == 1` and turn each row's endpoints
# into an (id_a, id_b) tuple.
recon_pairs = edges_df.loc[edges_df['test_recon'] == 1, ['id_a', 'id_b']].values
edges = [tuple(pair) for pair in recon_pairs]

In [3]:
# Adjacency matrix of the selected edges, with the reverse of every edge added.
mat = xswap.network_formats.edges_to_matrix(edges, add_reverse_edges=True, shape=(4083, 4083))

# degree_matrix[i, j] = (sum of row i) + (sum of column j) of `mat`.
row_sums = mat.sum(axis=1).reshape((mat.shape[0], 1))
col_sums = mat.sum(axis=0).reshape((1, mat.shape[1]))
degree_matrix = (
    np.repeat(row_sums, mat.shape[0], axis=1)
    + np.repeat(col_sums, mat.shape[1], axis=0)
)

# Unpermuted values

In [4]:
# Number of successive permutations to accumulate.
num_perms = 100

# Work on a copy so the original edge list stays untouched.
perm_edges = edges.copy()

# indicator[i, j] counts in how many of the permuted networks the pair (i, j)
# is connected. Allocate it from the matrix shape directly instead of
# densifying `mat` only to read its shape.
indicator = np.zeros(mat.shape, dtype=int)

for i in tqdm.tnrange(num_perms):
    # Each permutation starts from the previous permuted edge list, not from
    # the original network.
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_self_loops=True, allow_antiparallel=False)
    # Dense adjacency matrix of the permuted network, same shape as `mat`.
    perm_mat = xswap.network_formats.edges_to_matrix(perm_edges, add_reverse_edges=True,
                                                     shape=mat.shape, sparse=False)
    indicator += perm_mat

HBox(children=(IntProgress(value=0), HTML(value='')))




In [7]:
# Build one row per (id_a, id_b) cell of `mat`, in row-major order, holding the
# true edge label, a permutation-derived edge prior and five link-prediction
# features computed on the unpermuted network.

n_rows, n_cols = mat.shape

# Row / column sums of the adjacency matrix as flat 1-D arrays. Repeating and
# tiling these gives the per-pair degree columns without building dense
# n x n intermediates.
row_degree = np.asarray(mat.sum(axis=1)).ravel()
col_degree = np.asarray(mat.sum(axis=0)).ravel()

prior_df = (
    pd.DataFrame({
        'id_a': np.repeat(np.arange(n_rows), n_cols),
        'id_b': np.tile(np.arange(n_cols), n_rows),
        'edge': mat.toarray().flatten(),
        'source_degree': np.repeat(row_degree, n_cols),
        'target_degree': np.tile(col_degree, n_rows),
        'indicator': indicator.flatten(),
    })
    .assign(
        # Aggregate only the `indicator` column (not every column of the frame)
        # and use the string aggregation names rather than the builtin `sum`.
        # dgp_edges: total permuted-edge count over all pairs that share this
        #            (source_degree, target_degree) combination.
        # num_dgp:   number of pairs sharing that combination.
        dgp_edges=lambda df: (
            df.groupby(['source_degree', 'target_degree'])['indicator'].transform('sum')
        ),
        num_dgp=lambda df: (
            df.groupby(['source_degree', 'target_degree'])['indicator'].transform('count')
        ),
        # NOTE(review): `indicator` is summed over all permutations, so this
        # ratio is a mean edge count per pair and can exceed 1. Divide by the
        # number of permutations if a probability is intended. Left unscaled
        # here so the values written to the TSV do not change; the AUROC is
        # unaffected by the scale.
        edge_prior=lambda df: df['dgp_edges'] / df['num_dgp'],
    )
    # Drop the intermediate columns before attaching the (large) feature columns.
    .filter(items=['id_a', 'id_b', 'edge', 'edge_prior'])
    .assign(
        jaccard=np.array(analysis.jaccard(mat, degree_matrix)).flatten(),
        preferential_attachment=analysis.preferential_attachment_index(mat).flatten(),
        rwr=np.array(analysis.invertible_rwr(mat.toarray(), 0.25)).flatten(),
        resource_allocation=analysis.resource_allocation_index(mat).flatten(),
        adamic=analysis.adamic_adar_index(mat).flatten(),
    )
    # Fix the column order of the saved table.
    .filter(items=['id_a', 'id_b', 'edge', 'edge_prior', 'adamic', 'jaccard',
                   'preferential_attachment', 'resource_allocation', 'rwr'])
)

prior_df.to_csv('../../data/ppi_feature_values.tsv', sep='\t', index=False)

prior_df.head()

Unnamed: 0,id_a,id_b,edge,edge_prior,adamic,jaccard,preferential_attachment,resource_allocation,rwr
0,0,0,False,0.688017,14.827277,1.0,5625,0.546692,0.251102
1,0,1,False,0.05303,0.0,0.0,225,0.0,1.3e-05
2,0,2,False,0.80303,0.426866,0.01227,6750,0.018679,8.3e-05
3,0,3,False,2.403409,0.704774,0.013841,16350,0.014245,8.4e-05
4,0,4,False,3.848485,0.348226,0.004706,26400,0.006774,9.3e-05


In [11]:
# AUROC of every feature column of `prior_df` against the true `edge` label.
# Each column is scored directly; melting all features into one long frame
# would materialise (number of features) x (number of node pairs) rows only
# to split them again by feature.
feature_columns = ['edge_prior', 'adamic', 'jaccard', 'preferential_attachment',
                   'resource_allocation', 'rwr']
edge_labels = prior_df['edge'].values

auroc_df = pd.DataFrame([
    {
        'feature': feature,
        'auroc': sklearn.metrics.roc_auc_score(edge_labels, prior_df[feature].values),
    }
    # Alphabetical order, matching the row order a groupby on the feature
    # name would produce.
    for feature in sorted(feature_columns)
])

auroc_df.to_csv('../../data/unpermuted_auroc_value.tsv', sep='\t', index=False)

auroc_df.head()

Unnamed: 0,feature,auroc
0,adamic,0.884594
1,edge_prior,0.796931
2,jaccard,0.891467
3,preferential_attachment,0.79903
4,resource_allocation,0.900402


# Permuted values

In [7]:
num_perms = 100

# Labels come from the unpermuted network; features come from permuted networks.
true_edges = mat.toarray().flatten()
perm_edges = edges.copy()

# Feature name -> function turning a permuted adjacency matrix into a flat
# vector of per-pair scores. Insertion order fixes the column order of the
# resulting rows.
feature_scorers = {
    'adamic': lambda network: analysis.adamic_adar_index(network).flatten(),
    'jaccard': lambda network: np.array(analysis.jaccard(network, degree_matrix)).flatten(),
    'preferential_attachment': lambda network: analysis.preferential_attachment_index(network).flatten(),
    'resource_allocation': lambda network: analysis.resource_allocation_index(network).flatten(),
    'rwr': lambda network: np.array(analysis.invertible_rwr(network.toarray(), 0.25)).flatten(),
}

rows = list()
for i in tqdm.tnrange(num_perms):
    # Permute the previous permuted edge list again and rebuild its matrix.
    perm_edges, _ = xswap.permute_edge_list(perm_edges, allow_self_loops=True, allow_antiparallel=False)
    perm_mat = xswap.network_formats.edges_to_matrix(perm_edges, add_reverse_edges=True, shape=(4083, 4083))

    # One AUROC per feature: how well scores from the permuted network
    # separate the true edges of the original network.
    row = {
        feature: sklearn.metrics.roc_auc_score(true_edges, scorer(perm_mat))
        for feature, scorer in feature_scorers.items()
    }
    rows.append(row)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [8]:
# One row per permutation, one column per feature key of the dicts in `rows`.
permuted_auroc_df = pd.DataFrame(rows)

permuted_auroc_df.to_csv('../../data/permuted_auroc_values.tsv', sep='\t', index=False)

permuted_auroc_df.head()