In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import json 
import numpy as np
import seaborn as sns
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Evaluation settings shared by the cells below.
year = 2017  # selects the co-citation, recommendation and embedding files
k_list = [10, 20, 30]  # cut-offs k used for every '@k' metric
methods = [  # recommenders whose precomputed lists are loaded
    'graphsage',
    'specter',
    'combsage',
    'tfidf',
]

In [None]:
# Read the co-citation file (one JSON object per line) and merge all objects
# into a single mapping before building the frame.
data = {}
skipped_lines = 0
with open(f'co_citations/{year}.json', 'r') as infile:
    # Iterate the handle lazily instead of materialising the file with readlines().
    for json_line in infile:
        if not json_line.strip():
            continue  # blank line: nothing to parse
        try:
            data.update(json.loads(json_line))
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (malformed line); together
            # with TypeError it also covers a line that parses to something
            # dict.update() cannot merge. Loading stays best-effort, but
            # KeyboardInterrupt / MemoryError are no longer swallowed.
            skipped_lines += 1
if skipped_lines:
    print(f'Skipped {skipped_lines} unparseable line(s) in co_citations/{year}.json')

co_cit_df = pd.DataFrame(data)
del data  # the merged dict is no longer needed once the frame exists

In [None]:
# One recommendation table per method, read from recommendations/<method>_<year>.json.
recs = {
    method: pd.read_json(f'recommendations/{method}_{year}.json')
    for method in methods
}

In [None]:
# Paper metadata stored as JSON lines, indexed by paper id.
final_df = (
    pd.read_json('meta_data.jsonl', lines=True)
    .set_index('paperId')
)

In [None]:
# Random baseline: for every query draw k papers uniformly from the papers
# published up to and including `year`.
# A single seeded RandomState is shared by all draws so that Restart & Run All
# reproduces the baseline while each query still gets a different sample.
RANDOM_SEED = 0
rng = np.random.RandomState(RANDOM_SEED)

# The name is kept for compatibility; the cut-off now follows `year` instead of
# a hard-coded 2017.
papers_2017 = final_df[final_df['year'] <= year].reset_index()['paperId']

random_recs = {}
for query in tqdm(recs['specter'].columns):
    random_recs[query] = {
        f'@{k}': set(papers_2017.sample(k, random_state=rng)) for k in k_list
    }

In [None]:
# Register the random baseline next to the real methods (queries as columns, '@k' as rows).
recs['random'] = pd.DataFrame(data=random_recs)

In [None]:
def eval_recommendation(rec_df):
    """Score recommendations against the co-citation ground truth.

    Parameters
    ----------
    rec_df : pandas.DataFrame
        Indexed by query id, with one ``'@{k}'`` column per cut-off in the
        module-level ``k_list``; each cell is an iterable of recommended ids.

    Returns
    -------
    pandas.DataFrame
        One row per column of the module-level ``co_cit_df`` and, for each k,
        the columns ``prec@k`` (hits / k), ``recall@k`` (hits / number of
        relevant items) and ``ret@k`` (the set of hits). Precision and recall
        are NaN for queries without any relevant item.

    Raises
    ------
    KeyError
        If a query of ``co_cit_df`` is missing from ``rec_df``.
    """
    scores = {}
    for query in tqdm(co_cit_df.columns):
        # Neither the ground truth nor the query's row depends on k, so both
        # are looked up once per query instead of once per cut-off.
        relevant = set(co_cit_df[query].dropna().index)
        query_recs = rec_df.loc[query]

        row = {}
        for k in k_list:
            retrieved = set(query_recs[f'@{k}']).intersection(relevant)
            if relevant:
                precision = len(retrieved) / k
                recall = len(retrieved) / len(relevant)
            else:
                # No ground truth for this query: the metrics are undefined.
                precision, recall = np.nan, np.nan
            row[f'prec@{k}'] = precision
            row[f'recall@{k}'] = recall
            row[f'ret@{k}'] = retrieved
        scores[query] = row
    return pd.DataFrame(scores).T

In [None]:
# Evaluate every method; the tables are transposed so that queries become the row index.
res = {
    emb: eval_recommendation(rec_table.transpose())
    for emb, rec_table in recs.items()
}

In [None]:
# Mean precision/recall per method; the 'ret@k' columns hold sets and are left out.
numeric_col = [name for name in res['graphsage'].columns if not ('ret' in name)]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Hit rate@k: share of evaluable queries (those with a defined precision) for
# which at least one recommended paper is relevant.
hitrate = {}
for emb in res:
    h = {}
    for k in k_list:
        prec = res[emb][f'prec@{k}']
        # Count evaluable queries from this precision column only. A
        # frame-wide dropna() would also drop rows that are NaN in unrelated
        # columns (e.g. metrics added by later cells), so re-running this cell
        # would silently change the denominator.
        n_evaluable = int(prec.notna().sum())
        n_hits = int((prec > 0).sum())
        h[f'@{k}'] = n_hits / n_evaluable if n_evaluable else np.nan
    hitrate[emb] = h

In [None]:
# Hit rates for the configured methods only (cut-offs as rows, methods as columns).
pd.DataFrame(hitrate).loc[:, methods]

In [None]:
# Same table, transposed (methods as rows) and rendered as LaTeX source.
print(
    pd.DataFrame(hitrate)
    .loc[:, methods]
    .transpose()
    .style.to_latex()
)

In [None]:
def mean_div(recs, attr = 'scibert_cls'):
    """Return 1 minus the mean pairwise cosine similarity of the items in ``recs``.

    The vectors are read from column ``attr`` of the module-level ``final_df``.
    The mean is taken over the full similarity matrix returned by
    ``cosine_similarity``, which includes each item's similarity to itself.
    Returns NaN when ``recs`` is empty.
    """
    if len(recs) == 0:
        return np.nan
    vectors = np.vstack(final_df.loc[list(recs), attr].values)
    return 1 - cosine_similarity(vectors).mean()
def mean_nov(recs, q_index, attr = 'scibert_cls'):
    """Return 1 minus the mean cosine similarity between a query and ``recs``.

    The query vector (row ``q_index``) and the item vectors are both read from
    column ``attr`` of the module-level ``final_df``. Returns NaN when ``recs``
    is empty.
    """
    if len(recs) == 0:
        return np.nan
    item_vectors = np.vstack(final_df.loc[list(recs), attr].values)
    # cosine_similarity expects 2-D input, so the query becomes a single-row matrix.
    query_vector = np.array(final_df.loc[q_index, attr]).reshape(1, -1)
    return 1 - cosine_similarity(query_vector, item_vectors).mean()

In [None]:
# Diversity of each full recommendation list (default embedding column), per method and cut-off.
for emb in ('combsage', 'graphsage', 'specter'):
    rec_lists = recs[emb].T  # queries as rows, '@k' as columns
    for k in k_list:
        res[emb].loc[:, f'div@{k}'] = rec_lists[f'@{k}'].apply(mean_div)

In [None]:
# The baselines get no diversity/novelty columns, so drop them from the results.
# pop(..., None) keeps the cell idempotent: a plain `del` raises KeyError when
# the cell is run a second time.
for baseline in ('random', 'tfidf'):
    res.pop(baseline, None)

In [None]:
# Mean of every diversity column, one row per remaining method.
numeric_col = [name for name in res['graphsage'].columns if 'div' in name]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Novelty of each full recommendation list relative to its query (default embedding column).
for emb in ('combsage', 'graphsage', 'specter'):
    rec_lists = recs[emb].T  # queries as rows; row.name is the query id
    for k in k_list:
        res[emb].loc[:, f'nov@{k}'] = rec_lists.apply(
            lambda row: mean_nov(row[f'@{k}'], row.name), axis=1
        )

In [None]:
# Mean of every novelty column, one row per method.
numeric_col = [name for name in res['graphsage'].columns if 'nov' in name]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Load the DeepWalk embeddings for the evaluation year.
with open(f'embeddings/deepwalk_{year}.json', mode='r') as infile:
    dw = json.loads(infile.read())

In [None]:
# Attach the embeddings as a 'deepwalk' column; the Series is aligned on the frame's index.
# NOTE(review): presumably `dw` maps paper ids to vectors - confirm against the embedding file.
final_df.loc[:, 'deepwalk'] = pd.Series(data=dw)

In [None]:
def cohen_d(x,y):
    """Return Cohen's d effect size between two samples.

    Computes ``(mean(x) - mean(y)) / s_pooled`` where ``s_pooled`` is the
    square root of the pooled sample variance (``ddof=1``), weighted by each
    sample's degrees of freedom.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples; their sizes may differ.

    Returns
    -------
    float
        Positive when ``x`` has the larger mean.
    """
    # Named n_x / n_y rather than nx / ny so the notebook's `networkx as nx`
    # alias is not shadowed inside the function.
    n_x, n_y = len(x), len(y)
    dof = n_x + n_y - 2
    # Use the sample variance directly instead of squaring the standard deviation.
    pooled_var = ((n_x - 1) * np.var(x, ddof=1) + (n_y - 1) * np.var(y, ddof=1)) / dof
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)

In [None]:
# From here on `methods` is rebound to the three embedding-based recommenders.
methods = [
    'combsage',
    'graphsage',
    'specter',
]

In [None]:
# Diversity of each full recommendation list, measured on the 'deepwalk' column.
for emb in methods:
    rec_lists = recs[emb].T  # queries as rows, '@k' as columns
    for k in k_list:
        res[emb].loc[:, f'div_dw@{k}'] = rec_lists[f'@{k}'].apply(mean_div, attr='deepwalk')

In [None]:
# Mean of every column whose name contains 'div' (now including the DeepWalk variants).
numeric_col = [name for name in res['graphsage'].columns if 'div' in name]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Novelty of each full recommendation list relative to its query, on the 'deepwalk' column.
for emb in methods:
    rec_lists = recs[emb].T  # queries as rows; row.name is the query id
    for k in k_list:
        res[emb].loc[:, f'nov_dw@{k}'] = rec_lists.apply(
            lambda row: mean_nov(row[f'@{k}'], row.name, attr='deepwalk'), axis=1
        )

In [None]:
# Median (this table uses the median, not the mean) of every column whose name contains 'nov'.
numeric_col = [name for name in res['graphsage'].columns if 'nov' in name]
pd.DataFrame({emb: scores[numeric_col].median() for emb, scores in res.items()}).transpose()

In [None]:
# Diversity restricted to the correctly retrieved items ('ret@k' sets), default embedding column.
for emb in ('combsage', 'graphsage', 'specter'):
    scores = res[emb]
    for k in k_list:
        scores.loc[:, f'ret_div@{k}'] = scores[f'ret@{k}'].apply(mean_div)

In [None]:
# Mean of every column whose name contains 'div' (list-level, DeepWalk and retrieved-only variants).
numeric_col = [name for name in res['graphsage'].columns if 'div' in name]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Novelty restricted to the correctly retrieved items, default embedding column.
for emb in ('combsage', 'graphsage', 'specter'):
    scores = res[emb]  # rows are queries; row.name is the query id
    for k in k_list:
        scores.loc[:, f'ret_nov@{k}'] = scores.apply(
            lambda row: mean_nov(row[f'ret@{k}'], row.name), axis=1
        )

In [None]:
# Mean novelty of the retrieved items, one row per method.
numeric_col = [name for name in res['graphsage'].columns if 'ret_nov' in name]
pd.DataFrame({emb: scores[numeric_col].mean() for emb, scores in res.items()}).transpose()

In [None]:
# Diversity restricted to the correctly retrieved items, measured on the 'deepwalk' column.
for emb in ('combsage', 'graphsage', 'specter'):
    scores = res[emb]
    for k in k_list:
        scores.loc[:, f'ret_div_dw@{k}'] = scores[f'ret@{k}'].apply(mean_div, attr='deepwalk')

In [None]:
# Novelty restricted to the correctly retrieved items, measured on the 'deepwalk' column.
for emb in ('combsage', 'graphsage', 'specter'):
    scores = res[emb]  # rows are queries; row.name is the query id
    for k in k_list:
        scores.loc[:, f'ret_nov_dw@{k}'] = scores.apply(
            lambda row: mean_nov(row[f'ret@{k}'], row.name, attr='deepwalk'),
            axis=1,
        )

In [None]:
# Render the summary table as LaTeX source and print it.
# NOTE(review): `table` is not defined in any cell visible here - confirm it is
# built earlier in the notebook, otherwise this cell fails on a fresh kernel.
latex = table.to_latex(
    position='p',
    multicolumn_format='c',
    multicolumn=True,
    multirow=True,
    escape=False,
    index=True,
)
print(latex)