In [1]:
import pathlib

import numpy as np
import pandas as pd
import scipy.stats
import sklearn.metrics
import tqdm

In [2]:
priors_dir = pathlib.Path('priors_2/')
prior_files = priors_dir.glob('*.csv.gz')

# prior_files = [file for file in prior_files if file.name.split('.')[0] == 'AlD']

In [3]:
def add_dgp_trim(df, edge_column, permuted_edges_column, num_perms):
    """
    Add degree-grouped permutation (DGP) information, and delete as many superfluous columns
    as possible (to use minimal memory)
    """
    df['source_degree'] = (df.groupby('source_id')
                             .transform(sum)[edge_column]
                             .astype(np.uint32))
    del df['source_id']

    df['target_degree'] = (df.groupby('target_id')
                             .transform(sum)[edge_column]
                             .astype(np.uint32))
    del df['target_id']

    df['dgp_edges'] = (df.groupby(['source_degree', 'target_degree'])
                         .transform(sum)[permuted_edges_column]
                         .astype(np.uint32))
    del df[permuted_edges_column]

    df['num_dgp'] = num_perms * (df.groupby(['source_degree', 'target_degree'])
                                   .transform('count')[edge_column])
    
    df['xswap_prior'] = df['dgp_edges'] / df['num_dgp']
    del df['dgp_edges'], df['num_dgp']

    
def compute_auroc_correlation(df, network, num_train_edges):
    """
    Compute the analytic prior and degree product, as well as AUROC values for the
    XSwap prior, the analytic prior, and the degree product. Delete the DataFrame (to
    save memory)
    """
    # Extract features from DataFrame to dictionary of numpy arrays (saves memory)
    features = dict()
    for feature_name in ['source_degree', 'target_degree', 'edge_original']:
        features[feature_name] = df[feature_name].values.astype(np.uint32)
        del df[feature_name]
    features['xswap_prior'] = df['xswap_prior'].values
    del df['xswap_prior']
    del df
    
    features['analytic_prior'] = features['source_degree'] * features['target_degree'] / (
        features['source_degree'] * features['target_degree'] + num_train_edges
        - features['source_degree'] - features['target_degree'] + 1)

    features['degree_product'] = features['source_degree'] * features['target_degree']
    del features['source_degree'], features['target_degree']
    
    # Compute summary values and add row
    row = {
        'metaedge': metaedge,
        'network': network,
        
        # AUROC
        'xswap_auc': sklearn.metrics.roc_auc_score(features['edge_original'], 
                                                   features['xswap_prior']),
        'analytic_auc': sklearn.metrics.roc_auc_score(features['edge_original'], 
                                                      features['analytic_prior']),
        'degree_product_auc': sklearn.metrics.roc_auc_score(features['edge_original'], 
                                                            features['degree_product']),
        # Pearson correlation
        'xswap_analytic_pearson': scipy.stats.pearsonr(features['xswap_prior'],
                                                       features['analytic_prior'])[0],
        'xswap_degree_pearson': scipy.stats.pearsonr(features['xswap_prior'],
                                                     features['degree_product'])[0],
        'analytic_degree_pearson': scipy.stats.pearsonr(features['analytic_prior'],
                                                        features['degree_product'])[0],
        # Spearman (rank) correlation
        'xswap_analytic_spearman': scipy.stats.spearmanr(features['xswap_prior'],
                                                         features['analytic_prior'])[0],
        'xswap_degree_spearman': scipy.stats.spearmanr(features['xswap_prior'],
                                                       features['degree_product'])[0],
        'analytic_degree_spearman': scipy.stats.spearmanr(features['analytic_prior'],
                                                          features['degree_product'])[0],
        # Mean absolute error
        'xswap_mae': sklearn.metrics.mean_absolute_error(features['edge_original'],
                                                         features['xswap_prior']),
        'analytic_mae': sklearn.metrics.mean_absolute_error(features['edge_original'],
                                                            features['analytic_prior']),
        'degree_mae': sklearn.metrics.mean_absolute_error(features['edge_original'],
                                                          features['degree_product']),
    }
    return row

In [4]:
rows = list()

for filename in tqdm.tqdm_notebook(list(prior_files)):
    metaedge = filename.name.split('.')[0]
    print(metaedge, flush=True)
    
    ### ORIGINAL NETWORK
    original_df = pd.read_csv(filename, usecols=['source_id', 'target_id', 'edge_original', 
                                                'permuted_edges_original'], dtype=np.uint32)
    
    n_edges = original_df['edge_original'].sum()
    
    add_dgp_trim(original_df, 'edge_original', 'permuted_edges_original', 100)
    row = compute_auroc_correlation(original_df, 'Original', n_edges)
    
    rows.append(row)
    
    ### SAMPLED NETWORKS
    for frac in ['50', '20']:
        sampled_df = pd.read_csv(filename, 
                                 usecols=['source_id', 'target_id', 'edge_original', 
                                          f'edge_sample_{frac}', f'permuted_edges_sample_{frac}'], 
                                 dtype=np.uint32)
        
        n_sampled_edges = sampled_df[f'edge_sample_{frac}'].sum()
        
        add_dgp_trim(sampled_df, f'edge_sample_{frac}', f'permuted_edges_sample_{frac}', 100)      
        
        # Only want to test for edges that didn't exist in the sampled network
        sampled_df = sampled_df.query(f'edge_sample_{frac} == 0')
        
        row = compute_auroc_correlation(sampled_df, f'Sample_{frac}', n_sampled_edges)
        
        rows.append(row)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

AdG
GpPW
CtD
CcSE
GcG
CpD
AuG
CiPC
CCpG
DrD
GpMF
CbG
BPpG
DdG
G<rG
AeG
GiG
CuG
CrC
AlD
CdG
DpS
DuG
DaG



In [5]:
auroc_df = pd.DataFrame.from_records(rows, columns=[
    'metaedge', 'network', 
    'xswap_auc', 'analytic_auc', 'degree_product_auc', 
    'xswap_analytic_pearson', 'xswap_degree_pearson', 'analytic_degree_pearson', 
    'xswap_analytic_spearman', 'xswap_degree_spearman', 'analytic_degree_spearman', 
    'xswap_mae', 'analytic_mae', 'degree_mae'])

auroc_df.to_csv("hetionet_auroc.csv", index=False)
auroc_df.head()

Unnamed: 0,metaedge,network,xswap_auc,analytic_auc,degree_product_auc,xswap_analytic_pearson,xswap_degree_pearson,analytic_degree_pearson,xswap_analytic_spearman,xswap_degree_spearman,analytic_degree_spearman,xswap_mae,analytic_mae,degree_mae
0,AdG,Original,0.989389,0.989156,0.989144,0.988825,0.989314,0.981232,0.999995,0.999994,1.0,0.016103,0.015864,1241.455
1,AdG,Sample_50,0.945352,0.945276,0.94525,0.996354,0.998329,0.993783,0.999999,0.999999,1.0,0.009741,0.00959,2635646.0
2,AdG,Sample_20,0.862589,0.862584,0.862573,0.998364,0.999323,0.997422,1.0,1.0,1.0,0.010892,0.010867,10770040.0
3,GpPW,Original,0.950966,0.951013,0.951023,0.993401,0.938277,0.938616,0.999966,0.999968,0.999998,0.004119,0.004033,186.5361
4,GpPW,Sample_50,0.9102,0.910155,0.910157,0.996377,0.981707,0.979949,0.999967,0.999982,0.999991,0.002114,0.002097,385385.0
