In [None]:
import argparse
import sys
import os
from collections import defaultdict
from pprint import pprint

from tqdm import tqdm

sys.path.append('../')
from src.utils import get_data_abbr, mkdir_p, get_base_path, get_library_path
import pandas as pd
# from scipy.stats import kendalltau
import numpy as np

# Put the directory reported by get_library_path() and its "hynetworkx"
# subdirectory on the module search path before the project imports below run.
library_path = get_library_path()
sys.path.append(library_path)
sys.path.append(os.path.join(library_path, "hynetworkx"))

from src.data_preparer import filter_size, prepare_lp_data, get_time_filter_params
from src.hypergraph_link_predictor import get_hypergraph_scores, hypergraph_score_abbr_map, all_hypergraph_score_names
from src.link_predictor import get_perf_df
from src.linkpred_predictor import get_linkpred_scores, predictor_abbr_map, all_predictor_names
from src.supervised_link_predictor import classify
from src.experimenter import perform_GWH_classification
from src.incidence_matrix import parse_benson_incidence_matrix as parse_S
#from scr.tables_generator.ipynb import to_mean_std,get_overfit_score_df,get_data_split_name,get_latex_table,read_macro_feat_results,read_standalone_results

from joblib import Memory

# Data root as reported by the project helper.
# NOTE(review): the next cell reassigns base_path to a hard-coded string before
# it is read anywhere, so this value is not the one that ends up in data_params.
base_path = get_base_path()

In [None]:
# Hard-coded Google Drive location; this replaces the value obtained from
# get_base_path() in the previous cell.
base_path = '/content/gdrive/My Drive/Colab Notebooks/data/'
# NOTE(review): data_params, lp_data_params and lp_params are all assigned a
# second time further down in this same cell, so the three dicts below are not
# the values in effect once the cell has finished running.
data_params = {'data_name': 'contact-high-school',
               'base_path': base_path,
               'split_mode': 'structural',
               'max_size_limit': 10}
lp_data_params = {'rho': 0.2,
                  'neg_factor': 5,
                  'neg_mode': 'random',
                  'weighted_flag': False}
lp_params = {'linkpred_indices': None,  # Say None for all scores
             'hypergraph_score_indices': None,  # Say None for all scores
             }
# Classifier settings; no other code visible in this notebook reads them.
classifier_params = {'features': None,
                     'classifier': 'xgboost'}
column_s = ['classifier']
# Dataset / split selections. data_names and split_modes are assigned again,
# with the same values, in the cell that builds the results table.
data_names = ['women']
split_modes = ['structural']
weighted_flags = [False, True]

# Abbreviations of the pairwise (graph) link-prediction scores.
lp_cols = ['AA', 'AS', 'CN', 'Cos', 'PA', 'JC', 'MxO', 'MnO', 'NM', 'Prn']
# Metric labels; later cells use such labels to select rows of the performance
# table (e.g. 'auc', 'p@+').
metrics = ['ap', 'auc', 'p@+', 'p@10', 'p@100', 'r@+', 'r@10', 'r@100']
# Abbreviations of the hypergraph scores, grouped one line per base score.
hyper_cols = ['HAAM', 'HAAa', 'HAAl1', 'HAAl2',
              'HASM', 'HASa', 'HASl1', 'HASl2',
              'HCNM', 'HCNa', 'HCNl1', 'HCNl2',
              'HCosM', 'HCosa', 'HCosl1', 'HCosl2',
              'HDPa', 'HPM', 'HPa', 'HPl1', 'HPl2',
              'HJCM', 'HJCa', 'HJCl1', 'HJCl2',
              'HmaxoM', 'HmaxoA', 'Hmaxol1', 'Hmaxol2',
              'HminoM', 'HminoA', 'Hminol1', 'Hminol2',
              'HNMM', 'HNMa', 'HNMl1', 'HNMl2',
              'HPearM', 'HPeara', 'HPearl1', 'HPearl2',
              ]

combined_tables = {}
# For each graph score: the score itself followed by its hypergraph variants.
# 'PA' carries one extra variant ('HDPa') compared with the other entries, and
# 'Kz' / 'HKz' do not appear in lp_cols or hyper_cols above.
mixed_combinations_map = {'AA': ['AA', 'HAAM', 'HAAa', 'HAAl1', 'HAAl2'],
                          'AS': ['AS', 'HASM', 'HASa', 'HASl1', 'HASl2'],
                          'CN': ['CN', 'HCNM', 'HCNa', 'HCNl1', 'HCNl2'],
                          'Cos': ['Cos', 'HCosM', 'HCosa', 'HCosl1', 'HCosl2'],
                          'PA': ['PA', 'HDPa', 'HPM', 'HPa', 'HPl1', 'HPl2'],
                          'JC': ['JC', 'HJCM', 'HJCa', 'HJCl1', 'HJCl2'],
                          'Kz': ['Kz', 'HKz'],
                          'MxO': ['MxO', 'HmaxoM', 'HmaxoA', 'Hmaxol1', 'Hmaxol2'],
                          'MnO': ['MnO', 'HminoM', 'HminoA', 'Hminol1', 'Hminol2'],
                          'NM': ['NM', 'HNMM', 'HNMa', 'HNMl1', 'HNMl2'],
                          'Prn': ['Prn', 'HPearM', 'HPeara', 'HPearl1', 'HPearl2'], }


# Second assignment of data_params in this cell: this is the dict that remains
# bound to the name after the cell has run.
data_params={'data_name': 'email-Enron',
                                      'base_path': base_path,
                                      'split_mode': 'temporal',
                                      'max_size_limit': 10}
default_lp_cols = ['AA', 'AS', 'CN', 'Cos', 'PA', 'JC', 'MxO', 'MnO', 'NM', 'Prn']
# Invert predictor_abbr_map (name -> abbreviation) into abbreviation -> name,
# then translate the abbreviations above into full predictor names.
abbr_pred_map = {a: p for p, a in predictor_abbr_map.items()}
default_lp_names = [abbr_pred_map[a] for a in default_lp_cols]

# Second assignment of lp_data_params; unlike the first one it carries no
# 'weighted_flag' key.
lp_data_params={'rho': 0.2,
                                         'neg_factor': 5,
                                         'neg_mode': 'random'}
# Position of each selected predictor inside all_predictor_names.
default_lp_indices = [all_predictor_names.index(p) for p in default_lp_names]
default_hyper_cols = ['HAAM', 'HAAa', 'HAAl1', 'HAAl2',
                      'HASM', 'HASa', 'HASl1', 'HASl2',
                      'HCNM', 'HCNa', 'HCNl1', 'HCNl2',
                      'HCosM', 'HCosa', 'HCosl1', 'HCosl2',
                      'HDPa', 'HPM', 'HPa', 'HPl1', 'HPl2',
                      'HJCM', 'HJCa', 'HJCl1', 'HJCl2',
                      'HmaxoM', 'HmaxoA', 'Hmaxol1', 'Hmaxol2',
                      'HminoM', 'HminoA', 'Hminol1', 'Hminol2',
                      'HNMM', 'HNMa', 'HNMl1', 'HNMl2',
                      'HPearM', 'HPeara', 'HPearl1', 'HPearl2',
                      ]
# Same inversion / name lookup / index lookup as above, for the hypergraph scores.
hyg_abbr_pred_map = {a: p for p, a in hypergraph_score_abbr_map.items()}
default_hyg_names = [hyg_abbr_pred_map[a] for a in default_hyper_cols]
default_hyper_indices = [all_hypergraph_score_names.index(p) for p in default_hyg_names]


# Second assignment of lp_params: explicit index lists instead of None.
lp_params={'linkpred_indices': default_lp_indices,
                                    'hypergraph_score_indices': default_hyper_indices}
iter_var= 0

In [None]:
from scipy.sparse import csr_matrix, triu, hstack, find

def parse_s_women(path="../../out.opsahl-southernwomen"):
    """Build a node-by-hyperedge incidence matrix from a two-column edge list.

    The file is read as space-separated values without a header row. Column 0
    holds 1-based node (woman) ids and column 1 holds event ids. Every event
    becomes one hyperedge made of the nodes listed for it; events whose member
    sets are identical are collapsed into a single hyperedge. Hyperedge columns
    are ordered by ascending event id.

    Parameters
    ----------
    path : str, optional
        Location of the edge list. Defaults to the relative path the notebook
        has always used, so existing zero-argument calls keep working.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        Shape ``(n_nodes, n_hyperedges)`` with ``n_nodes`` equal to the largest
        node id; entry ``(v, e)`` is 1 when node ``v + 1`` belongs to hyperedge ``e``.
    times : numpy.ndarray
        One zero per hyperedge (the file carries no usable timestamps here).
    """
    # comment='%' skips '%'-prefixed header lines instead of parsing them as data.
    df = pd.read_csv(path, header=None, sep=' ', comment='%')
    df = df[[0, 1]]
    number_of_women = int(df[0].max())

    # Collect members per event. Only events that actually occur in the file
    # are recorded, so a gap in the event ids cannot yield an empty hyperedge.
    members_by_event = defaultdict(set)
    for woman, event in zip(df[0], df[1]):
        members_by_event[event].add(woman)

    seen_hyperedges = set()
    rows = []
    cols = []
    for event in sorted(members_by_event):
        hyperedge = frozenset(members_by_event[event])
        if hyperedge in seen_hyperedges:
            continue
        column = len(seen_hyperedges)
        seen_hyperedges.add(hyperedge)
        # Node ids are 1-based in the file, matrix rows are 0-based.
        rows.extend(woman - 1 for woman in sorted(hyperedge))
        cols.extend([column] * len(hyperedge))

    m = len(seen_hyperedges)
    n = number_of_women
    matrix = csr_matrix(([1] * len(rows), (rows, cols)), shape=(n, m))
    return matrix, np.zeros(m, dtype=int)

# NOTE(review): everything from here down to mixed_combinations_map repeats,
# with identical values, assignments already made in the configuration cell
# above; re-running this cell also resets combined_tables and
# mixed_combinations_map to their initial contents.
default_lp_cols = ['AA', 'AS', 'CN', 'Cos', 'PA', 'JC', 'MxO', 'MnO', 'NM', 'Prn']
# abbreviation -> predictor name, then names -> positions in all_predictor_names.
abbr_pred_map = {a: p for p, a in predictor_abbr_map.items()}
default_lp_names = [abbr_pred_map[a] for a in default_lp_cols]
default_lp_indices = [all_predictor_names.index(p) for p in default_lp_names]

metrics = ['ap', 'auc', 'p@+', 'p@10', 'p@100', 'r@+', 'r@10', 'r@100']
default_hyper_cols = ['HAAM', 'HAAa', 'HAAl1', 'HAAl2',
                      'HASM', 'HASa', 'HASl1', 'HASl2',
                      'HCNM', 'HCNa', 'HCNl1', 'HCNl2',
                      'HCosM', 'HCosa', 'HCosl1', 'HCosl2',
                      'HDPa', 'HPM', 'HPa', 'HPl1', 'HPl2',
                      'HJCM', 'HJCa', 'HJCl1', 'HJCl2',
                      'HmaxoM', 'HmaxoA', 'Hmaxol1', 'Hmaxol2',
                      'HminoM', 'HminoA', 'Hminol1', 'Hminol2',
                      'HNMM', 'HNMa', 'HNMl1', 'HNMl2',
                      'HPearM', 'HPeara', 'HPearl1', 'HPearl2',
                      ]
# Same lookup chain for the hypergraph scores.
hyg_abbr_pred_map = {a: p for p, a in hypergraph_score_abbr_map.items()}
default_hyg_names = [hyg_abbr_pred_map[a] for a in default_hyper_cols]
default_hyper_indices = [all_hypergraph_score_names.index(p) for p in default_hyg_names]

combined_tables = {}
# Graph score -> [graph score, hypergraph variants...].
mixed_combinations_map = {'AA': ['AA', 'HAAM', 'HAAa', 'HAAl1', 'HAAl2'],
                          'AS': ['AS', 'HASM', 'HASa', 'HASl1', 'HASl2'],
                          'CN': ['CN', 'HCNM', 'HCNa', 'HCNl1', 'HCNl2'],
                          'Cos': ['Cos', 'HCosM', 'HCosa', 'HCosl1', 'HCosl2'],
                          'PA': ['PA', 'HDPa', 'HPM', 'HPa', 'HPl1', 'HPl2'],
                          'JC': ['JC', 'HJCM', 'HJCa', 'HJCl1', 'HJCl2'],
                          'Kz': ['Kz', 'HKz'],
                          'MxO': ['MxO', 'HmaxoM', 'HmaxoA', 'Hmaxol1', 'Hmaxol2'],
                          'MnO': ['MnO', 'HminoM', 'HminoA', 'Hminol1', 'Hminol2'],
                          'NM': ['NM', 'HNMM', 'HNMa', 'HNMl1', 'HNMl2'],
                          'Prn': ['Prn', 'HPearM', 'HPeara', 'HPearl1', 'HPearl2'], }


def perform_link_prediction1(data_params, lp_data_params, lp_params=None, iter_var=0):
    """
    Run one link-prediction experiment on the hypergraph from parse_s_women().

    data_params: kept for signature compatibility; the dataset always comes
        from parse_s_women(), so the contents of this dict are not read.
    lp_data_params: {'rho', 'neg_factor', 'neg_mode'}, forwarded to
        prepare_lp_data.
    lp_params: optional {'linkpred_indices', 'hypergraph_score_indices'}.
        A missing dict or a missing key results in None being passed to the
        corresponding scorer (the notebook's config comments say None stands
        for "all scores").
    iter_var: kept for signature compatibility; not used.

    returns: (lp_data, lp_results) where lp_data is whatever prepare_lp_data
        returned and lp_results is {'scores': DataFrame, 'perf': DataFrame}.
        'scores' holds the unweighted, weighted ('w_' prefixed) and hypergraph
        score columns plus a 0/1 'label' column; 'perf' is the output of
        get_perf_df for those columns.
    """
    rho, neg_factor, neg_mode = [lp_data_params[key] for key in
                                 ('rho', 'neg_factor', 'neg_mode')]

    S, times = parse_s_women()
    # The second argument is passed as True here; presumably it is the
    # weighted flag of prepare_lp_data -- TODO confirm against its signature.
    weighted_lp_data = prepare_lp_data(S, True, times, rho, neg_factor, neg_mode)

    # Tolerate lp_params=None as well as a dict that only sets one of the keys.
    lp_params = lp_params or {}
    linkpred_indices = lp_params.get('linkpred_indices')
    hypergraph_score_indices = lp_params.get('hypergraph_score_indices')

    # Graph scores are computed twice on the same data, once with the flag on
    # and once with it off; the "on" columns get a 'w_' prefix so both sets can
    # live in one frame.
    weighted_linkpred_scores_df = get_linkpred_scores(weighted_lp_data, True, linkpred_indices)
    unweighted_linkpred_scores_df = get_linkpred_scores(weighted_lp_data, False, linkpred_indices)
    unweighted_linkpred_cols = list(unweighted_linkpred_scores_df.columns)
    cols_map = {c: 'w_{}'.format(c) for c in unweighted_linkpred_cols}
    weighted_linkpred_scores_df = weighted_linkpred_scores_df.rename(columns=cols_map)
    weighted_linkpred_cols = list(weighted_linkpred_scores_df.columns)

    hyg_scores_df = get_hypergraph_scores(weighted_lp_data, hypergraph_score_indices)
    hyg_scores_cols = list(hyg_scores_df.columns)

    # Join the three score tables on their shared index.
    scores_df = pd.merge(unweighted_linkpred_scores_df, weighted_linkpred_scores_df,
                         left_index=True, right_index=True)
    scores_df = pd.merge(scores_df, hyg_scores_df, left_index=True, right_index=True)

    # An index entry is labelled 1 when it is a non-zero (row, col) position of
    # the 'A_test_pos' matrix, 0 otherwise.
    pos_pairs = set(zip(*weighted_lp_data['A_test_pos'].nonzero()))
    scores_df['label'] = scores_df.index.map(lambda x: int(x in pos_pairs))

    perf_df = get_perf_df(scores_df, unweighted_linkpred_cols + weighted_linkpred_cols, hyg_scores_cols)
    return weighted_lp_data, \
           {'scores': scores_df, 'perf': perf_df}



# Single run with the notebook-level parameters: `a` receives the prepared
# link-prediction data, `b` the dict holding the 'scores' and 'perf' frames.
a,b=perform_link_prediction1(data_params, lp_data_params, lp_params=lp_params, iter_var=0)

In [None]:
# Performance table of the single run above.
c=b['perf']
# c[['AA','w_AA', 'HAAM', 'HAAa', 'HAAl1', 'HAAl2']].rank(axis=1, ascending=False).T.sort_values('p@+', ascending=True).head(20)
# Show the 'AA' family (plain, 'w_' prefixed and the four hypergraph variants)
# with one row per score, ordered by the 'p@+' column.
c[['AA', 'w_AA','HAAM', 'HAAa', 'HAAl1', 'HAAl2']].T.sort_values('p@+', ascending=True).head(20)


In [None]:
# NOTE: `c` is rebound here from the performance table to the raw scores table.
c=b['scores']
# Scores of the 'AA' family together with the 0/1 ground-truth label.
d=c[['AA','w_AA', 'HAAM', 'HAAa', 'HAAl1', 'HAAl2','label']]
d

In [None]:
# Rank the values within each column of `d` (largest value gets rank 1); the
# 'label' column is ranked the same way as the score columns.
d.rank(axis=0, ascending=False)
# d

In [None]:
def read_standalone_results(data_name, split_mode, metric, n_runs=5):
    """Repeat the link-prediction run and tabulate one metric per score family.

    Reads the notebook-level names data_params, lp_data_params, lp_params,
    default_lp_cols and mcm, and calls perform_link_prediction1 and
    to_mean_std, all of which must exist by the time this function is called.

    Parameters
    ----------
    data_name, split_mode :
        Stored under 'data_name' / 'split_mode' in the parameter dict handed
        to perform_link_prediction1.
    metric :
        Row label of the aggregated performance table to extract (e.g. 'auc').
    n_runs : int, optional
        Number of repeated runs that are aggregated; defaults to 5.

    Returns
    -------
    pandas.DataFrame
        One row per entry of default_lp_cols. Each row holds the aggregated
        "mean $\\pm$ std" strings for the columns [c, 'w_' + c] + mcm[c][1:],
        renamed positionally to the six 'stand-*' labels. If a family has more
        than six columns, the surplus ones keep their original names.
    """
    # Build a private copy so the notebook-level data_params is not modified
    # as a side effect of calling this function.
    run_params = dict(data_params,
                      data_name=data_name,
                      split_mode=split_mode,
                      base_path='/home2/e1-313-15477')

    perf_dfs = []
    for run_idx in range(n_runs):
        _, lp_results = perform_link_prediction1(run_params,
                                                 lp_data_params,
                                                 lp_params,
                                                 run_idx)
        perf_dfs.append(lp_results['perf'])
    df = to_mean_std(perf_dfs)

    GWH_cols = ['stand-G', 'stand-W', 'stand-H\\textsubscript{max}', 'stand-H\\textsubscript{avg}', 'stand-H\\textsubscript{L1}', 'stand-H\\textsubscript{L2}']
    df_list = []
    for c in default_lp_cols:
        # Graph score, its 'w_' counterpart, then the hypergraph variants
        # (mcm[c][0] is the graph score itself, hence the [1:]).
        cols = [c, 'w_' + c] + mcm[c][1:]
        row = df.loc[metric, cols].rename(c)
        df1 = row.to_frame().T
        df1 = df1.rename(columns=dict(zip(df1.columns, GWH_cols)))
        df_list.append(df1)
    return pd.concat(df_list)

In [None]:
def to_mean_std(dfs, _round=True):
    """Collapse several equally-labelled frames/series into "mean $\\pm$ std" strings.

    The inputs are concatenated, their index is moved into a column called
    'index', and the rows are grouped by that column. Every remaining column of
    every group is then replaced by one string built from np.mean and np.std
    of the group's values.

    dfs: list of pandas DataFrames or Series sharing index labels.
    _round: when truthy the mean and std are passed through round(..., 1)
        before being formatted; otherwise they are formatted directly. Both
        paths use the "%.1f" format, so the text shows one decimal either way.

    returns: DataFrame indexed by the original labels, holding the strings.
    """
    def _summarise_rounded(values):
        mean_text = "%.1f" % round(np.mean(values), 1)
        std_text = "%.1f" % round(np.std(values), 1)
        return '{} $\\pm$ {}'.format(mean_text, std_text)

    def _summarise_plain(values):
        mean_text = "%.1f" % np.mean(values)
        std_text = "%.1f" % np.std(values)
        return '{} $\\pm$ {}'.format(mean_text, std_text)

    summarise = _summarise_rounded if _round else _summarise_plain
    stacked = pd.concat(dfs).reset_index()
    return stacked.groupby('index').agg(summarise)
    
def get_data_split_name(d, s, mode='full'):
    """Return the label used for a (dataset, split) combination.

    d: dataset name.
    s: split-mode name.
    mode: 'full' -> "<d> (<s>)"
          'abbr' -> the fixed string 'wom', regardless of d and s
          'idx'  -> "<get_data_idx(d)> (<first element of s>)"
          any other value -> None

    NOTE(review): get_data_idx is neither defined nor imported in the visible
    part of this notebook, so mode='idx' would raise NameError unless the name
    is supplied elsewhere -- confirm.
    """
    template = '{} ({})'
    if mode == 'abbr':
        return 'wom'
    if mode == 'full':
        return template.format(d, s)
    if mode == 'idx':
        return template.format(get_data_idx(d), s[0])
    return None

In [None]:
from itertools import product
from tqdm import tqdm_notebook
from tabulate import tabulate
# `mcm` is a second name for mixed_combinations_map, not a copy, so the
# removal below also changes mixed_combinations_map itself.
mcm = mixed_combinations_map
# Drop 'HDPa' so that 'PA' is left with four hypergraph variants like the other
# score families; read_standalone_results renames exactly six columns per family.
if 'HDPa' in mcm['PA']:
    mcm['PA'].remove('HDPa')
data_names = ['women']
split_modes = ['structural']
dfs = []
iterator = list(product(split_modes, data_names))
for s, d in tqdm_notebook(iterator):
    # Table of "mean $\pm$ std" strings for the 'auc' row, one row per score family.
    df = read_standalone_results(d, s, 'auc')
    # NOTE(review): the cells of df are the formatted strings produced by
    # to_mean_std, so this ranking compares text rather than numbers --
    # confirm that this is what is intended.
    df = df.rank(axis=1, ascending=False)
#     print(to_mean_std([rank_df.loc[i, :] for i in rank_df.index]))
    # Aggregate the per-family ranks into one "mean $\pm$ std" string per
    # column, restore the original column order, label the result with the
    # abbreviated dataset/split name and turn it into a single row.
    df = to_mean_std([df.loc[i, :] for i in df.index]).loc[df.columns, :].rename(columns={0: get_data_split_name(d, s, mode='abbr')}).T
    dfs.append(df)
table_df = pd.concat(dfs)

In [None]:
# Display the aggregated rank table (one row per dataset/split combination).
table_df

In [None]:
# Rank the columns of table_df within each row and print the result as a
# psql-style text table.
# NOTE(review): table_df holds "mean $\pm$ std" strings, so this ranking is
# again a comparison of text -- confirm.
print(tabulate(table_df.rank(axis=1, ascending=True), headers='keys', tablefmt='psql'))
#print(get_latex_table(table_df, bold_best = 'per_row', col_mode='tt', ascending=False))

In [None]:
from src.data_preparer import S_to_A, S_to_B, incidence_to_hyperedges

In [None]:
# Re-read the incidence matrix (rows = nodes, columns = hyperedges) and show
# the row sums, i.e. the number of hyperedges each node belongs to.
S, times = parse_s_women()
S.sum(axis=1)

In [None]:
# Column sums of the incidence matrix, i.e. the number of nodes in each hyperedge.
S.sum(axis=0)

In [None]:
# Shape of the 'S_train' entry of the prepared link-prediction data
# (presumably the training incidence matrix -- confirm against prepare_lp_data).
a['S_train'].shape

In [None]:
# Number of items incidence_to_hyperedges returns for the 'S_train' matrix.
len(incidence_to_hyperedges(a['S_train']))