In [None]:
from experimenter import *
from utils import *
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
# Short alias for the feature-combination map (presumably supplied by one of the star imports above).
mcm = mixed_combinations_map
# Drop 'HDPa' from the 'PA' entry of the shared map. This mutates the imported object in place;
# the membership check makes re-running this cell a no-op.
if 'HDPa' in mcm['PA']:
    mcm['PA'].remove('HDPa')

In [None]:
def prepare_clf_columns(lp_col, cols):
    """Map each feature-combination column key to a ``micro-*_<lp_col>`` label.

    The label family is chosen from the length and content of the key:
    a 1-element key equal to ``lp_col`` is ``G``, a 1-element key whose entry
    starts with ``'w_'`` is ``W``, a 4-element key is ``H``, a 5-element key
    containing ``lp_col`` is ``GH``, and anything else is ``WH``.

    Args:
        lp_col: name of the link-prediction column the labels are suffixed with.
        cols: iterable of sized, indexable column keys (used as dict keys).

    Returns:
        dict mapping every key in ``cols`` to its label string.
    """
    def _template_for(key):
        size = len(key)
        if size == 1:
            if key[0] == lp_col:
                return 'micro-G_{}'
            if key[0].startswith('w_'):
                return 'micro-W_{}'
        elif size == 4:
            return 'micro-H_{}'
        elif size == 5 and lp_col in key:
            return 'micro-GH_{}'
        # Everything that matched none of the rules above is treated as W+H.
        return 'micro-WH_{}'

    return {key: _template_for(key).format(lp_col) for key in cols}


def to_mean_std(dfs):
    """Collapse repeated tables into one table of ``mean $\\pm$ std`` strings.

    The inputs are concatenated, grouped by their index label (exposed as the
    ``'index'`` column by ``reset_index``), and every group of values is
    rendered as mean and population standard deviation, each to one decimal.

    Args:
        dfs: list of pandas objects sharing index labels.

    Returns:
        DataFrame indexed by the distinct index labels, holding strings.
    """
    def _summarise(values):
        mean_txt = "%.1f" % round(np.mean(values), 1)
        std_txt = "%.1f" % round(np.std(values), 1)
        return '{} $\\pm$ {}'.format(mean_txt, std_txt)

    stacked = pd.concat(dfs).reset_index()
    # Each cell becomes the list of values observed for that index label.
    summary = stacked.groupby('index').agg(list)
    for col in summary.columns:
        summary[col] = summary[col].apply(_summarise)
    return summary

def get_overfit_score_df(train_df, test_df):
    """Return the train-to-test drop as a percentage of the train score.

    Computed element-wise as ``(train - test) / train * 100``.

    Args:
        train_df: DataFrame of train scores.
        test_df: DataFrame of test scores with matching labels.

    Returns:
        DataFrame of percentages with the same labels.
    """
    gap = train_df - test_df
    return gap / train_df * 100


def get_perf_table(data_name, split_mode, n_iters=5, base_path='/home2/e1-313-15477'):
    """Run link prediction and G/W/H classification repeatedly and summarise the scores.

    Each iteration calls ``perform_link_prediction`` once and
    ``perform_GWH_classification`` (classifier ``'xgboost'``) once per column
    in ``default_lp_cols``. The per-iteration tables are then reduced with
    ``to_mean_std``.

    Args:
        data_name: stored under ``params['data_params']['data_name']``.
        split_mode: stored under ``params['data_params']['split_mode']``.
        n_iters: number of iterations; the iteration index is passed on as
            ``iter_var``. Defaults to 5, the previously hard-coded value.
        base_path: stored under ``params['data_params']['base_path']``.
            Defaults to the previously hard-coded location.

    Returns:
        Tuple ``(df, train_df, test_df, overfit_df)`` of string tables built by
        ``to_mean_std``: the link-prediction ``'perf'`` table, the classifier
        train table, the classifier test table, and the overfit percentage
        table from ``get_overfit_score_df``.

    Raises:
        ValueError: if ``n_iters`` is less than 1 (there would be nothing to
            aggregate).
    """
    if n_iters < 1:
        raise ValueError('n_iters must be at least 1, got {!r}'.format(n_iters))

    params = get_default_params()
    params['data_params']['data_name'] = data_name
    params['data_params']['split_mode'] = split_mode
    params['data_params']['base_path'] = base_path
    dfs, train_dfs, test_dfs, overfit_dfs = [], [], [], []

    for i in range(n_iters):
        params['iter_var'] = i
        _, lp_results = perform_link_prediction(params['data_params'],
                                                params['lp_data_params'],
                                                params['lp_params'],
                                                params['iter_var'])
        interim_train_dfs = []
        interim_test_dfs = []
        # tqdm_notebook is the progress bar this notebook imports explicitly.
        for lp_col in tqdm_notebook(default_lp_cols):
            G_feats = [lp_col]
            W_feats = ['w_{}'.format(lp_col)]
            # The first entry of mcm[lp_col] is skipped; the rest are used as H features.
            H_feats = mcm[lp_col][1:]
            output = perform_GWH_classification(params, G_feats, W_feats, H_feats, 'xgboost')
            # One column per key of `output`, labelled with that key.
            train_df = pd.concat([output[k]['train_perf'].rename(columns={'xgboost_train': k}).T
                                  for k in output]).T
            test_df = pd.concat([output[k]['test_perf'].rename(columns={'xgboost_test': k}).T
                                 for k in output]).T
            # Replace the raw keys with 'micro-*_<lp_col>' labels (same map for train and test).
            clf_cols_map = prepare_clf_columns(lp_col, train_df.columns)
            train_df = train_df.rename(columns=clf_cols_map)
            test_df = test_df.rename(columns=clf_cols_map)
            interim_train_dfs.append(train_df.T)
            interim_test_dfs.append(test_df.T)

        # Put the columns of every lp_col side by side for this iteration.
        train_df = pd.concat(interim_train_dfs).T
        test_df = pd.concat(interim_test_dfs).T
        overfit_df = get_overfit_score_df(train_df, test_df)

        train_dfs.append(train_df)
        test_dfs.append(test_df)
        overfit_dfs.append(overfit_df)
        dfs.append(lp_results['perf'])

    df = to_mean_std(dfs)
    train_df = to_mean_std(train_dfs)
    test_df = to_mean_std(test_dfs)
    overfit_df = to_mean_std(overfit_dfs)
    return df, train_df, test_df, overfit_df


def _select_metric_rows(table, metric, col_groups, new_names):
    """Build one row per (label, columns) pair from the ``metric`` row of ``table``."""
    frames = []
    for label, cols in col_groups:
        row = table.loc[metric, cols]
        row.name = label
        frame = pd.DataFrame(row).T
        # Columns are renamed positionally; zip stops at the shorter of the two lists.
        frame = frame.rename(columns=dict(zip(frame.columns, new_names)))
        frames.append(frame)
    return pd.concat(frames)


def reformat_tables(df, train_df, test_df, overfit_df, metric):
    """Reshape the four performance tables into one row per link-prediction column.

    For every ``c`` in ``default_lp_cols`` the ``metric`` row of each input is
    sliced to the columns belonging to ``c`` and relabelled with generic names,
    so the rows can be stacked.

    Args:
        df: standalone table with columns ``c``, ``'w_' + c`` and ``mcm[c][1:]``.
        train_df: classifier train table with ``'micro-*_<c>'`` columns.
        test_df: classifier test table with ``'micro-*_<c>'`` columns.
        overfit_df: overfit table with ``'micro-*_<c>'`` columns.
        metric: index label of the row to extract from every table.

    Returns:
        Tuple of four DataFrames (standalone, train, test, overfit), each
        indexed by the entries of ``default_lp_cols``.
    """
    GWH_cols = ['G', 'W', 'H_{max}', 'H_{avg}', 'H_{L1}', 'H_{L2}']
    base_cols = ['micro-G', 'micro-W', 'micro-H', 'micro-GH', 'micro-WH']

    standalone_groups = [(c, [c, 'w_' + c] + mcm[c][1:]) for c in default_lp_cols]
    clf_groups = [(c, ['{}_{}'.format(x, c) for x in base_cols]) for c in default_lp_cols]

    return (_select_metric_rows(df, metric, standalone_groups, GWH_cols),
            _select_metric_rows(train_df, metric, clf_groups, base_cols),
            _select_metric_rows(test_df, metric, clf_groups, base_cols),
            _select_metric_rows(overfit_df, metric, clf_groups, base_cols))


def _leading_number(cell):
    """Return the value a table cell is compared by when picking the best entry.

    String cells (such as the mean/std strings built by ``to_mean_std``) are
    compared by their first whitespace-separated token parsed as a float;
    non-string cells are returned unchanged.
    """
    if isinstance(cell, str):
        return float(cell.split()[0])
    return cell


def get_latex_table(df, file_name = None, bold_best=None, col_mode = 'math', ascending = True):
    """Render ``df`` as a LaTeX table, optionally bolding the best cell of each row.

    Args:
        df: table to render; it is not modified.
        file_name: passed to ``DataFrame.to_latex`` as the output buffer/path.
        bold_best: ``'per_row'`` wraps the best cell(s) of every row in
            ``\\textbf{}``. ``'per_col'`` is accepted but does nothing, and
            any other value disables bolding.
        col_mode: header wrapping: ``'math'`` (``$...$``), ``'sf'``
            (``\\textsf``) or ``'tt'`` (``\\texttt``).
        ascending: when true the row maximum is the best cell, otherwise the
            row minimum.

    Returns:
        Whatever ``DataFrame.to_latex`` returns for the given ``file_name``.

    Raises:
        ValueError: if ``col_mode`` is not one of the supported modes.
    """
    header_templates = {
        'math': '${}$',
        'sf': '\\textsf{{{}}}',
        'tt': '\\texttt{{{}}}',
    }
    if col_mode not in header_templates:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError('Unknown col_mode {!r}; expected one of {}'.format(
            col_mode, sorted(header_templates)))
    template = header_templates[col_mode]
    table_df = df.rename(columns = {c: template.format(c) for c in df.columns})

    if bold_best == 'per_row':
        for i in table_df.index:
            row = table_df.loc[i, :]
            try:
                # Compare 'mean ... std' strings by their numeric mean, not as text
                # (as text, '9.5 ...' would beat '10.2 ...').
                keys = row.map(_leading_number)
            except (ValueError, IndexError):
                # Cells without a leading number: fall back to comparing raw values.
                keys = row
            best = keys.max() if ascending else keys.min()
            table_df.loc[i, :] = pd.Series(
                ['\\textbf{{{}}}'.format(cell) if key == best else cell
                 for cell, key in zip(row, keys)],
                index=row.index)
    return table_df.to_latex(file_name, escape=False, column_format = 'l'+'c'*df.shape[1])

def get_data_split_name(d, s, mode='full'):
    """Build a ``'<dataset> (<split>)'`` display label.

    Args:
        d: dataset name.
        s: split mode name.
        mode: ``'full'`` uses ``d`` and ``s`` verbatim; ``'abbr'`` uses
            ``get_data_abbr(d)`` and the first character of ``s``; ``'idx'``
            uses ``get_data_idx(d)`` and the first character of ``s``.

    Returns:
        The label string, or ``None`` for any other ``mode``.
    """
    if mode == 'full':
        dataset_part, split_part = d, s
    elif mode == 'abbr':
        dataset_part, split_part = get_data_abbr(d), s[0]
    elif mode == 'idx':
        dataset_part, split_part = get_data_idx(d), s[0]
    else:
        return None
    return '{} ({})'.format(dataset_part, split_part)


In [None]:
from tabulate import tabulate
from itertools import product
# Datasets covered by the sweep; each one is paired with every split mode below.
data_names = [
    'email-Enron',
    'contact-high-school',
    'NDC-substances',
    'tags-math-sx',
    'threads-math-sx',
    'coauth-DBLP',
]
split_modes = ['structural', 'temporal']
# Every (data_name, split_mode) pair, with the dataset varying slowest.
data_splits = [(name, split) for name in data_names for split in split_modes]
# Metrics to tabulate. Currently disabled alternatives: 'auc', 'p@+', 'r@+'.
metrics = ['p@100']
# Collect, for every dataset/split pair and metric, one row holding the
# "mean rank +/- std" of each classifier column across the link-prediction columns.
dfs = []
for d, s in tqdm_notebook(data_splits):
    base_folder = 'tables/perf/{}_{}/'.format(d, s)
    # Created up front so per-split LaTeX exports (get_latex_table(..., file_name)) have a target.
    mkdir_p(base_folder)
    try:
        df, train_df, test_df, overfit_df = get_perf_table(d, s)
    except FileNotFoundError:
        # A FileNotFoundError from get_perf_table skips this pair instead of aborting the sweep.
        print('File Not Found')
        continue
    for m in metrics:
        perf_table_df, train_perf_table_df, test_perf_table_df, overfit_table_df = reformat_tables(
            df, train_df, test_df, overfit_df, m)
        # The cells are 'mean $\pm$ std' strings produced by to_mean_std; rank on the
        # numeric mean so that e.g. '100.0 ...' is not ordered below '98.5 ...' as text.
        test_mean_df = test_perf_table_df.apply(
            lambda col: col.map(lambda cell: float(cell.split()[0])))
        test_rank_df = test_mean_df.rank(axis=1, ascending=False)
        print(d, s, m)
        print('Classifier Test: ')
        print(tabulate(test_rank_df, headers='keys', tablefmt='psql'))
        # Kept under its own name: reusing `df` here would clobber the table that
        # reformat_tables needs for the next metric.
        rank_summary_df = (
            to_mean_std([test_rank_df.loc[i, :] for i in test_rank_df.index])
            .loc[test_rank_df.columns, :]
            .rename(columns={0: get_data_split_name(d, s, mode='abbr')})
            .T
        )
        dfs.append(rank_summary_df)
table_df = pd.concat(dfs)


In [None]:
# Show how the classifier columns rank within each dataset/split row of the summary table.
# NOTE(review): table_df is built from to_mean_std output, whose cells are 'mean $\pm$ std'
# strings, so this rank orders them as text rather than numerically — confirm that is intended.
print(tabulate(table_df.rank(axis=1, ascending=True), headers='keys', tablefmt='psql'))
# LaTeX version of the same table; ascending=False makes get_latex_table bold the row minimum.
print(get_latex_table(table_df, bold_best = 'per_row', col_mode='tt', ascending=False))

In [None]:
# Classifier test table for the 'auc' metric on one dataset, under both split modes.
data_name = 'coauth-DBLP'
# reformat_tables returns (standalone, train, test, overfit); only the test table is kept.
# NOTE(review): each line calls get_perf_table again, repeating the full experiment loop for this dataset.
_, _, test_df1, _ = reformat_tables(*get_perf_table(data_name, 'structural'), 'auc')
_, _, test_df2, _ = reformat_tables(*get_perf_table(data_name, 'temporal'), 'auc')

# Suffix every row label with '(s)' or '(t)' so the two splits can be stacked in one table.
test_df = pd.concat([test_df1.rename(index={c: '{} ({})'.format(c, 's') for c in test_df1.index}),
           test_df2.rename(index={c: '{} ({})'.format(c, 't') for c in test_df2.index})])
# ascending=True makes get_latex_table bold the row maximum.
print(get_latex_table(test_df, bold_best = 'per_row', col_mode='tt', ascending=True))