In [None]:
from experimenter import *
from utils import *
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm_notebook
from tabulate import tabulate
# Short alias for `mixed_combinations_map` (presumably supplied by one of the
# star imports above — confirm which module defines it).
mcm = mixed_combinations_map
# Drop 'HDPa' from the 'PA' entry. `mcm` is an alias, not a copy, so this
# mutates the imported object in place; the membership guard keeps the cell
# safe to run more than once.
if 'HDPa' in mcm['PA']:
    mcm['PA'].remove('HDPa')

In [None]:
def to_mean_std(dfs, _round=True):
    """Collapse repeated result frames into 'mean $\\pm$ std' strings.

    The objects in ``dfs`` are concatenated, their index is moved into an
    ``index`` column, and all rows that share an index label are aggregated
    column by column into a single formatted string.

    Args:
        dfs: list of pandas objects to concatenate (one per repetition).
        _round: when True the mean and the standard deviation are passed
            through ``round(..., 1)`` before formatting; either way both are
            rendered with ``"%.1f"``.

    Returns:
        A DataFrame indexed by the original index labels whose cells are
        strings of the form ``'<mean> $\\pm$ <std>'``. ``np.std`` is called
        with its default arguments.
    """
    def _summarise(values):
        center = np.mean(values)
        spread = np.std(values)
        if _round:
            center = round(center, 1)
            spread = round(spread, 1)
        return '{} $\\pm$ {}'.format("%.1f" % center, "%.1f" % spread)

    stacked = pd.concat(dfs).reset_index()
    return stacked.groupby('index').agg(_summarise)

def get_overfit_score_df(train_df, test_df):
    """Return the relative train-to-test performance drop, in percent.

    Computed element-wise as ``(train_df - test_df) / train_df * 100``.

    Args:
        train_df: training scores.
        test_df: test scores, aligned with ``train_df``.

    Returns:
        A new frame of the same layout holding the percentage gap.
    """
    return train_df.sub(test_df).divide(train_df).multiply(100)

def get_data_split_name(d, s, mode='full'):
    """Build the '<dataset> (<split>)' label used for table rows.

    Args:
        d: dataset name.
        s: split-mode name; in the 'abbr' and 'idx' modes only its first
            character is used.
        mode: 'full' keeps both names as given, 'abbr' replaces the dataset
            name with ``get_data_abbr(d)``, 'idx' replaces it with
            ``get_data_idx(d)``.

    Returns:
        The formatted label string.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
            Falling through and returning None would silently produce
            unnamed rows downstream.
    """
    if mode == 'full':
        return '{} ({})'.format(d, s)
    if mode == 'abbr':
        return '{} ({})'.format(get_data_abbr(d), s[0])
    if mode == 'idx':
        return '{} ({})'.format(get_data_idx(d), s[0])
    raise ValueError(
        "unknown mode {!r}; expected 'full', 'abbr' or 'idx'".format(mode))

def _leading_number(cell):
    """Return the number a table cell is ranked by, or NaN if it has none.

    Strings are ranked by their first whitespace-separated token, which for
    the 'mean $\\pm$ std' cells built by ``to_mean_std`` is the mean.
    """
    if isinstance(cell, str):
        tokens = cell.split()
        cell = tokens[0] if tokens else ''
    try:
        return float(cell)
    except (TypeError, ValueError):
        return float('nan')


def get_latex_table(df, file_name = None, bold_best=None, col_mode = 'math', ascending = True):
    """Render ``df`` as a LaTeX tabular, optionally bolding the best cell per row.

    Args:
        df: table to render; cells may be numbers or formatted strings.
        file_name: forwarded to ``DataFrame.to_latex`` as its first argument;
            None makes ``to_latex`` return the LaTeX source as a string.
        bold_best: 'per_row' wraps the best cell(s) of every row in a LaTeX
            bold command; 'per_col' is accepted but currently does nothing;
            any other value leaves the cells untouched.
        col_mode: how column headers are wrapped: 'math' (``$...$``),
            'sf' (textsf) or 'tt' (texttt).
        ascending: when True the largest value of a row counts as best,
            otherwise the smallest.

    Returns:
        Whatever ``DataFrame.to_latex`` returns for the given ``file_name``.

    Raises:
        ValueError: if ``col_mode`` is not one of the supported values.
    """
    header_templates = {
        'math': '${}$',
        'sf': '\\textsf{{{}}}',
        'tt': '\\texttt{{{}}}',
    }
    if col_mode not in header_templates:
        raise ValueError(
            "unknown col_mode {!r}; expected one of 'math', 'sf', 'tt'".format(col_mode))
    template = header_templates[col_mode]
    table_df = df.rename(columns={c: template.format(c) for c in df.columns})

    if bold_best == 'per_row':
        # Work on object dtype so bolded strings can sit next to numeric cells.
        table_df = table_df.astype(object)
        # Positional access keeps this correct when index labels repeat.
        for pos in range(table_df.shape[0]):
            cells = table_df.iloc[pos, :]
            # Compare numerically: comparing the formatted strings directly
            # would rank '9.5 ...' above '10.2 ...'.
            scores = cells.map(_leading_number)
            best = scores.max() if ascending else scores.min()
            table_df.iloc[pos, :] = [
                '\\textbf{{{}}}'.format(cell) if score == best else cell
                for cell, score in zip(cells, scores)
            ]
    # bold_best == 'per_col' is intentionally left as a no-op (not implemented).

    return table_df.to_latex(file_name, escape=False, column_format = 'l'+'c'*df.shape[1])

## (i) Data description
data_description.tex

In [None]:
# data_names = get_data_names()
# headers = ['Name', 'Abbr', '|V|', '|F|']
# data_names[]

## (ii) All data, macro classifiers, AUC scores: 12 x 5
macro_feat_perf_auc.tex

In [None]:
def _stack_perf(output, perf_key, raw_col):
    """Join one performance column per entry of ``output`` into a single frame.

    For every key ``k`` of ``output`` the frame ``output[k][perf_key]`` has its
    ``raw_col`` column renamed to ``k``; the renamed frames are then placed
    side by side (concatenated on the transposed frames and transposed back).
    """
    return pd.concat(
        [output[k][perf_key].rename(columns={raw_col: k}).T for k in output]
    ).T


def read_macro_feat_results(data_name, split_mode, n_iters=5,
                            base_path='/home2/e1-313-15477'):
    """Run the macro-feature xgboost classification repeatedly and summarise it.

    Args:
        data_name: dataset name written to ``params['data_params']['data_name']``.
        split_mode: split mode written to ``params['data_params']['split_mode']``.
        n_iters: number of repetitions; each one sets ``params['iter_var']``
            to the repetition index. Defaults to the previously hard-coded 5.
        base_path: value written to ``params['data_params']['base_path']``.
            Defaults to the previously hard-coded location.

    Returns:
        ``(train_df, test_df, overfit_df)``: the per-repetition train scores,
        test scores and overfit scores (see ``get_overfit_score_df``), each
        reduced with ``to_mean_std`` to 'mean $\\pm$ std' strings.
    """
    params = get_default_params()
    data_params = params['data_params']
    data_params['data_name'] = data_name
    data_params['split_mode'] = split_mode
    data_params['base_path'] = base_path

    train_dfs, test_dfs, overfit_dfs = [], [], []
    for i in range(n_iters):
        params['iter_var'] = i
        # Feature groups are rebuilt on every pass, as before; whether
        # perform_GWH_classification mutates them is not visible from here.
        G_feats = default_lp_cols
        W_feats = ['w_{}'.format(c) for c in default_lp_cols]
        H_feats = default_hyper_cols
        # Readable column labels for each feature combination
        # (presumably `output` is keyed by these tuples — confirm).
        col_map = {tuple(G_feats): 'macro-G',
                   tuple(H_feats): 'macro-H',
                   tuple(W_feats): 'macro-W',
                   tuple(G_feats + H_feats): 'macro-GH',
                   tuple(W_feats+H_feats): 'macro-WH'}
        output = perform_GWH_classification(params, G_feats, W_feats, H_feats, 'xgboost')

        train_df = _stack_perf(output, 'train_perf', 'xgboost_train').rename(columns=col_map)
        test_df = _stack_perf(output, 'test_perf', 'xgboost_test').rename(columns=col_map)

        train_dfs.append(train_df)
        test_dfs.append(test_df)
        overfit_dfs.append(get_overfit_score_df(train_df, test_df))

    return to_mean_std(train_dfs), to_mean_std(test_dfs), to_mean_std(overfit_dfs)

In [None]:
# Collect the macro-feature summaries for every (dataset, split) pair.
data_names = get_data_names()
# data_names = ['email-Enron', 'contact-high-school']
split_modes = ['structural', 'temporal']
# Each dict is keyed by the (dataset, split) tuple.
train_dfs, test_dfs, overfit_dfs = {}, {}, {}
iterator = list(product(data_names, split_modes))
for d, s in tqdm_notebook(iterator):
    key = (d, s)
    train_dfs[key], test_dfs[key], overfit_dfs[key] = read_macro_feat_results(d, s)

In [None]:
# Metric row to tabulate; the alternatives listed by the original cell were
# 'auc', 'p@+', 'r@+' and 'p@100'.
metric = 'p@100'


def _metric_rows(result_dfs, metric):
    """Stack the `metric` row of every (dataset, split) frame into one table.

    Rows are ordered split-major (all datasets of the first split, then the
    next) and labelled with the abbreviated '<dataset> (<split initial>)' name.
    """
    frames = []
    for s, d in product(split_modes, data_names):
        row = result_dfs[(d, s)].loc[metric, :].rename(get_data_split_name(d, s, 'abbr'))
        frames.append(pd.DataFrame(row).T)
    return pd.concat(frames)


def _mean_values(formatted_df):
    """Parse the leading mean of every 'mean $\\pm$ std' cell into a float."""
    return formatted_df.apply(
        lambda col: col.map(lambda cell: float(str(cell).partition(' ')[0])))


final_train_df = _metric_rows(train_dfs, metric)
final_test_df = _metric_rows(test_dfs, metric)
final_overfit_df = _metric_rows(overfit_dfs, metric)

# Rank on the numeric means: ranking the formatted strings directly orders
# them lexicographically (e.g. '9.5 ...' above '10.2 ...').
print(tabulate(_mean_values(final_test_df).rank(axis=1, ascending=False), headers='keys', tablefmt='psql'))
print(get_latex_table(final_test_df, bold_best = 'per_row', col_mode='tt'))

## (iv) All data, micro classifiers, rank-freq scores: 12 x 5


micro_feat_rank_perf_auc.tex

In [None]:
# See results_preparer.ipynb (bare file name commented out so this cell runs without a NameError).

## (v) 1 data, micro classifiers, AUC scores: 10 x 10
micro_feat_perf_auc_{data_name}.tex

## (vi) All data, standalone, rank-freq scores: 12 x 6
standalone_rank_perf_auc.tex

In [None]:
def read_standalone_results(data_name, split_mode, metric, n_iters=5,
                            base_path='/home2/e1-313-15477'):
    """Summarise standalone link-prediction scores for one dataset and split.

    Runs ``perform_link_prediction`` ``n_iters`` times, reduces the collected
    ``lp_results['perf']`` frames with ``to_mean_std`` and reshapes the
    ``metric`` row into one table row per entry of ``default_lp_cols``.

    Args:
        data_name: dataset name written to ``params['data_params']['data_name']``.
        split_mode: split mode written to ``params['data_params']['split_mode']``.
        metric: index label of the performance row to extract.
        n_iters: number of repetitions; each one sets ``params['iter_var']``
            to the repetition index. Defaults to the previously hard-coded 5.
        base_path: value written to ``params['data_params']['base_path']``.
            Defaults to the previously hard-coded location.

    Returns:
        A DataFrame with one row per base predictor and columns relabelled,
        by position, to the stand-G / stand-W / stand-H variants below.
    """
    params = get_default_params()
    data_params = params['data_params']
    data_params['data_name'] = data_name
    data_params['split_mode'] = split_mode
    data_params['base_path'] = base_path

    dfs = []
    for i in range(n_iters):
        params['iter_var'] = i
        _, lp_results = perform_link_prediction(params['data_params'],
            params['lp_data_params'],
            params['lp_params'],
            params['iter_var'])
        dfs.append(lp_results['perf'])
    df = to_mean_std(dfs)

    GWH_cols = ['stand-G', 'stand-W', 'stand-H\\textsubscript{max}', 'stand-H\\textsubscript{avg}', 'stand-H\\textsubscript{L1}', 'stand-H\\textsubscript{L2}']
    df_list = []
    for c in default_lp_cols:
        # Base score, its 'w_'-prefixed counterpart, then every entry of
        # mcm[c] except the first.
        cols = [c, 'w_' + c] + mcm[c][1:]
        row = df.loc[metric, cols].rename(c)
        df1 = pd.DataFrame(row).T
        # Positional relabelling: zip stops at the shorter of the two lists.
        df1 = df1.rename(columns=dict(zip(df1.columns, GWH_cols)))
        df_list.append(df1)
    return pd.concat(df_list)

In [None]:
# data_names = get_data_names()
data_names = ['email-Enron', 'contact-high-school', 'tags-math-sx', 'threads-math-sx','NDC-substances','coauth-DBLP']
split_modes = ['structural', 'temporal']
dfs = []
iterator = list(product(split_modes, data_names))
for s, d in tqdm_notebook(iterator):
    perf_df = read_standalone_results(d, s, 'r@+')
    # The cells are 'mean $\pm$ std' strings; rank on the parsed means,
    # because ranking the strings themselves orders them lexicographically
    # (e.g. '9.5 ...' above '10.2 ...').
    mean_df = perf_df.apply(
        lambda col: col.map(lambda cell: float(str(cell).partition(' ')[0])))
    rank_df = mean_df.rank(axis=1, ascending=False)
    # Mean and spread of each column's rank across the base predictors,
    # laid out as a single row labelled with the abbreviated dataset/split name.
    rank_summary = to_mean_std([rank_df.loc[i, :] for i in rank_df.index])
    rank_summary = rank_summary.loc[rank_df.columns, :].rename(
        columns={0: get_data_split_name(d, s, mode='abbr')}).T
    dfs.append(rank_summary)
table_df = pd.concat(dfs)

In [None]:
# Rank the columns within each row of the summary table and show them as text.
column_ranks = table_df.rank(axis=1, ascending=True)
print(tabulate(column_ranks, headers='keys', tablefmt='psql'))
# ascending=False makes get_latex_table bold the smallest entry of each row.
latex_source = get_latex_table(table_df, bold_best = 'per_row', col_mode='tt', ascending=False)
print(latex_source)

## (vii) 1 data, standalone, AUC scores: 10 x 12
standalone_perf_auc_{data_name}.tex

In [None]:
# Standalone AUC table for a single dataset, structural rows first.
data_name = 'coauth-DBLP'
df1 = read_standalone_results(data_name, 'structural', 'auc')
df2 = read_standalone_results(data_name, 'temporal', 'auc')
# Tag every row label with the split initial so both halves can be stacked.
structural_rows = df1.rename(index=lambda c: '{} ({})'.format(c, 's'))
temporal_rows = df2.rename(index=lambda c: '{} ({})'.format(c, 't'))
df = pd.concat([structural_rows, temporal_rows])
print(get_latex_table(df, bold_best = 'per_row', col_mode='tt'))

## (viii) All data, all metrics, best performing (performance): 12 x 4
metric_best_performing.tex