In [None]:
import pathlib

import numpy as np
import pandas as pd
import sklearn.metrics
import tqdm

# Snapshot of the names present in the namespace right after the imports.
# NOTE(review): nothing in the visible cells reads `original`; presumably it is
# meant for telling notebook-created variables apart later — confirm or remove.
original = set(dir())

In [None]:
def dataframe_to_summary_metrics(df, metaedge=None):
    """
    Summarize three edge-probability features of `df` into per-feature metrics.

    The features evaluated, in order, are:

    * ``xswap_prior``    -- read directly from the column of the same name
    * ``analytic_prior`` -- ``d_s * d_t / (d_s * d_t - d_s - d_t + m + 1)`` where
      ``d_s``/``d_t`` are ``source_degree``/``target_degree`` and ``m`` is
      ``df['edge'].sum()``
    * ``scaled_degree``  -- ``d_s * d_t`` divided by its maximum over the frame

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``edge``, ``source_degree``, ``target_degree``
        and ``xswap_prior``.
        **Mutated in place to save memory**: every column except ``edge`` is
        deleted by the time the function returns.
    metaedge : str, optional
        Label stored in every returned record. When omitted, the function falls
        back to a module/notebook-level variable named ``metaedge`` (the
        historical behaviour), so existing one-argument calls keep working.

    Returns
    -------
    list of dict
        One record per feature with the keys ``feature``, ``metaedge``, ``cal``,
        ``ref``, ``auroc`` and ``brier`` (``brier == cal + ref``).

    Raises
    ------
    NameError
        If `metaedge` is not passed and no global ``metaedge`` exists.
    """
    if metaedge is None:
        # Backward compatibility: earlier versions read `metaedge` as a hidden global
        # set by the calling loop. Resolve it up front so a missing label fails fast
        # instead of after the first expensive groupby.
        try:
            metaedge = globals()['metaedge']
        except KeyError:
            raise NameError(
                "metaedge was not passed and no global 'metaedge' is defined"
            ) from None

    nrow = len(df)

    # Save test edges
    y_true = df['edge'].values

    # Compute features and hold in memory as numpy arrays (less memory use than pandas column)
    source_degree = df['source_degree'].values
    target_degree = df['target_degree'].values
    degree_product = source_degree * target_degree
    analytic_prior = degree_product / (degree_product - source_degree
                                       - target_degree + df['edge'].sum() + 1)
    del source_degree, target_degree
    del df['source_degree'], df['target_degree']

    metrics = list()
    for feature in ['xswap_prior', 'analytic_prior', 'scaled_degree']:
        # Only one feature column lives in the frame at any time
        if feature == 'analytic_prior':
            df['analytic_prior'] = analytic_prior
            del analytic_prior
        elif feature == 'scaled_degree':
            df['scaled_degree'] = degree_product / degree_product.max()
            del degree_product

        # Number of rows and fraction of edges for each distinct feature value
        grouped = df.groupby(feature)['edge'].agg(['count', 'mean'])

        # Scored edges for later AUROC computation. (So df[feature] can be deleted ASAP)
        y_score = df[feature].values
        del df[feature]

        predicted = grouped.index.values
        count = grouped['count'].values
        fraction_edges = grouped['mean'].values
        del grouped

        # Summarize feature values into single statistics. `cal` and `ref` are the
        # calibration and refinement terms of the Brier score, grouped by predicted value.
        row = {
            'feature': feature,
            'metaedge': metaedge,
            'cal': (count * (predicted - fraction_edges) ** 2).sum() / nrow,
            'ref': (count * fraction_edges * (1 - fraction_edges)).sum() / nrow,
            'auroc': sklearn.metrics.roc_auc_score(y_true, y_score),
        }
        row['brier'] = row['cal'] + row['ref']
        metrics.append(row)
        del y_score, row, predicted, count, fraction_edges
    return metrics

In [None]:
# Metrics for two tasks per metaedge:
#   * full network    -- features scored against the edges of the full-network file
#   * sampled network -- node pairs that are non-edges in the sampled file, scored
#                        against whether they are edges in the full-network file
full_network_all_metrics = list()
sampled_network_all_metrics = list()

full_prior_path = pathlib.Path('full_priors/')
sampled_prior_path = pathlib.Path('sampled_priors/')
prior_paths = sorted(full_prior_path.glob('*.tsv.gz'))

# Only features and outcomes are loaded (save memory)
prior_columns = ['edge', 'source_degree', 'target_degree', 'xswap_prior']

for prior_path in tqdm.tqdm_notebook(prior_paths):
    # NOTE: `metaedge` must stay a notebook-level name; dataframe_to_summary_metrics
    # picks the label up from it.
    metaedge = prior_path.name.split('.')[0]
    print(metaedge, flush=True)

    prior_df = pd.read_csv(prior_path, sep='\t', usecols=prior_columns)

    # The call strips every column except 'edge' from prior_df
    full_network_all_metrics.extend(dataframe_to_summary_metrics(prior_df))
    original_edges = prior_df['edge'].values.astype(bool)
    del prior_df

    # original_edges is attached positionally, so the sampled file is assumed to list
    # node pairs in the same order as the full file (a length mismatch raises in assign).
    sampled_df = (
        pd.read_csv(sampled_prior_path / f'{metaedge}.tsv.gz', sep='\t',
                    usecols=prior_columns)
        .assign(original_edges=original_edges)
        .query('edge == 0')
        .drop('edge', axis=1)
        .rename(columns={'original_edges': 'edge'})
    )
    del original_edges
    sampled_network_all_metrics.extend(dataframe_to_summary_metrics(sampled_df))
    del sampled_df

## Third task: Translating between degree sequences

In [None]:
third_task_all_metrics = list()

full_prior_path = pathlib.Path('../../data/4.data/')
prior_paths = sorted(full_prior_path.glob('*.tsv.xz'))

for prior_path in tqdm.tqdm_notebook(prior_paths):
    # NOTE: `metaedge` must stay a notebook-level name; dataframe_to_summary_metrics
    # picks the label up from it. 'X.tsv.xz' -> 'X'
    metaedge = prior_path.with_suffix('').stem
    print(metaedge, flush=True)

    # Keep only the two test networks; the training rows are not evaluated here
    test_networks_df = (
        pd.read_csv(prior_path, sep='\t', usecols=['id_a', 'id_b',
                                                   'network', 'edge', 'edge_prior'])
        .query('network != "train"')
        .rename(columns={'edge_prior': 'xswap_prior'})
    )

    # Outcome to predict: edges of the "test_new" network.
    # NOTE(review): attached positionally below, which assumes "test_new" rows are in
    # the same (id_a, id_b) order as "test_recon" rows — confirm against the data files.
    new_network_edges = test_networks_df.query('network == "test_new"')['edge'].values

    prior_df = (
        test_networks_df
        .query('network == "test_recon"')
        .reset_index(drop=True)
        .assign(edge_other=new_network_edges)
        .drop(['network'], axis=1)
        .assign(
            # Degrees come from the "test_recon" edges. Select 'edge' before transforming
            # so the id and prior columns are not needlessly aggregated as well.
            source_degree=lambda df: df.groupby('id_a')['edge'].transform('sum'),
            target_degree=lambda df: df.groupby('id_b')['edge'].transform('sum'),
        )
        .drop(['id_a', 'id_b', 'edge'], axis=1)
        .rename(columns={'edge_other': 'edge'})
    )
    del test_networks_df, new_network_edges
    third_task_all_metrics.extend(dataframe_to_summary_metrics(prior_df))

In [None]:
# One frame of metric records per task
full_metrics = pd.DataFrame.from_records(full_network_all_metrics)
sampled_metrics = pd.DataFrame.from_records(sampled_network_all_metrics)
third_task_metrics = pd.DataFrame.from_records(third_task_all_metrics)

# Tag each frame with the task it came from, then stack them
labelled_metrics = [
    frame.assign(network=label)
    for label, frame in (
        ('full', full_metrics),
        ('sampled', sampled_metrics),
        ('other', third_task_metrics),
    )
]
all_metrics = pd.concat(labelled_metrics)

# Column order of the exported table
output_columns = ['network', 'metaedge', 'feature', 'cal', 'ref', 'brier', 'auroc']
all_metrics.to_csv('hetionet_calibration_metrics.csv', index=False, columns=output_columns)