In [None]:
from torch_geometric.nn import MLP
import torch_geometric
import torch
import torch.nn as nn

import matplotlib.pyplot as plt
from tqdm import tqdm
import json 
import scipy
import pickle
import numpy as np
import pandas as pd

In [None]:
# Run configuration shared by the cells below.
year = 2017                # interpolated into the co-citation, embedding and recommendation paths
training_years = [2016]    # joined with '_' to name the neural recommender checkpoint files
k_list = [10, 20, 30]      # cut-offs k behind the '@k' recommendation lists and metrics
methods = ['combsage', 'graphsage', 'specter']  # embedding methods scored by the neural recommender loop

In [None]:
def load_embeddings(method, year):
    """Load the stored embeddings of one method as a mapping keyed by id.

    All file paths are built from `method`, `year` and `year - 1`.

    - 'specter': reads a JSON-lines file, indexes it by its 'paper_id' column
      and returns the 'embedding' column as a dict.
    - 'tfidf': loads a sparse matrix plus a JSON index file and returns a dict
      that maps each key of the index to the matrix row the index points at.
    - any other method: returns the parsed content of that method's JSON file.
    """
    if method == 'specter':
        frame = pd.read_json(f'specter/{year}/output_{year-1}.jsonl', lines = True)
        return frame.set_index('paper_id')['embedding'].to_dict()

    if method == 'tfidf':
        matrix = scipy.sparse.load_npz(f'embeddings/tfidf_({year-1})_{year}.npz')
        with open(f'embeddings/tfidf_({year-1})_{year}_index.json') as index_file:
            row_of = json.load(index_file)
        return {key: matrix[row] for key, row in row_of.items()}

    # Fallback shared by every remaining method name.
    with open(f'embeddings/{method}_({year-1})_{year}.json','r') as infile:
        return json.load(infile)

In [None]:
class Net(torch_geometric.nn.models.MLP):
    """torch_geometric MLP whose forward output is passed through a sigmoid."""

    def __init__(self, channels, dropout = 0):
        """Hand `channels` to the parent as `channel_list`, together with `dropout`."""
        super().__init__(dropout=dropout, channel_list=channels)

    def forward(self, x):
        """Run the parent MLP on `x` and squash the result element-wise with a sigmoid."""
        raw_output = super().forward(x)
        return torch.sigmoid(raw_output)


In [None]:
# Read the co-citation file one JSON document per line and merge every parsed
# mapping into a single dict; that dict then becomes the columns of `co_cit_df`.
data = {}
skipped_lines = 0
with open(f'co_citations/{year}.json', 'r') as infile:
    # Iterate the file lazily instead of materialising every line with readlines().
    for json_line in infile:
        try:
            data.update(json.loads(json_line))
        except (ValueError, TypeError):
            # ValueError includes json.JSONDecodeError (malformed line); together with
            # TypeError it also covers a line that parses but cannot be merged by
            # dict.update. Such lines are still skipped (best effort), but they are
            # counted, and unrelated errors such as KeyboardInterrupt are no longer
            # swallowed the way the previous bare `except:` did.
            skipped_lines += 1

if skipped_lines:
    print(f'Skipped {skipped_lines} unparsable line(s) in co_citations/{year}.json')

co_cit_df = pd.DataFrame(data)
# Drop the intermediate dict once the DataFrame has been built.
del data

In [None]:
# For every method in `methods`, collect the set of keys of its embedding mapping.
# The configured `year` is used instead of a repeated literal 2017 so this cell
# cannot drift away from the rest of the notebook.
d = {}
for method in methods:
    d[method] = set(load_embeddings(method, year))

In [None]:
# Metadata table read from a JSON-lines file and indexed by its 'paperId' column.
final_df = (
    pd.read_json('meta_data.jsonl', lines = True)
    .set_index('paperId')
)

In [None]:
# Collects one recommendation DataFrame per method, filled in by the scoring cells.
recs = dict()

In [None]:
# tf-idf embeddings for the configured `year`. The literal 2017 was replaced by
# `year` because the results of this pipeline are later saved under a path built
# from `year`, so both must refer to the same value.
embs = load_embeddings('tfidf', year)

In [None]:
# Candidate ids, in the iteration order of `embs`.
targets = list(embs)
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
# The context manager closes the file handle, which the previous bare open() call leaked.
with open('params/recommenders/tfidf_2016.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Stack the values of `embs` row-wise into one sparse matrix; the rows follow
# the order in which pandas lays out the dict as a Series.
t = scipy.sparse.vstack(pd.Series(embs).to_numpy())

In [None]:
# Key under which the tf-idf scoring cell below stores its result in `recs`.
method = 'tfidf'

In [None]:
# Score every candidate in `targets` against each query with the pickled tf-idf
# model and keep the highest-scoring candidates for each cut-off in `k_list`.
preds = {}
rec_df = {}  # fixed: a stray leading space on this line made the whole cell an IndentationError
# Longest list that is needed; slicing it per k works for any order or size of k_list
# (the previous progressive slicing required ascending cut-offs of at most 50).
max_k = max(k_list)
for query in tqdm(co_cit_df.columns):
    # Repeat the query's row once per candidate so it can be placed next to `t`.
    q = embs[query]
    q = scipy.sparse.vstack([q]*len(targets))

    # Feature layout is [candidate | query].
    # NOTE(review): the neural recommender cell concatenates (query, candidate)
    # instead; confirm this order matches how the tf-idf model was trained.
    E = scipy.sparse.hstack([t,q])
    # Column 1 of predict_proba is used as the ranking score.
    pr = model.predict_proba(E)[:,1]

    # NOTE(review): `query` is a key of `embs`, so when `targets` was built from
    # `embs` the query is scored as a candidate for itself; confirm this is intended.
    recommendations = pd.Series(pr, index=targets).sort_values(ascending = False)
    top_k = list(recommendations.iloc[:max_k].index)
    # reversed() keeps the key order of the saved lists unchanged (last k_list entry first).
    rec_df[query] = {f'@{k}': top_k[:k] for k in reversed(k_list)}

recs[method] = pd.DataFrame(rec_df)
recs[method].to_json(f'recommendations/tfidf_{year}.json')

In [None]:
# For each embedding method: load its embeddings and the trained MLP, score every
# candidate against every query, then save the full score table and the top-k lists.
# NOTE: this loop rebinds `embs`, `targets`, `model` and `t`, which the tf-idf cells
# also use; re-run those cells before re-running the tf-idf scoring cell.
for method in methods:
    embs = load_embeddings(method, year)

    targets = list(embs)
    dim = len(embs[targets[0]])

    # The input is a query embedding concatenated with a candidate embedding, hence dim*2.
    model = Net([dim*2, 64, 1], dropout=0.2)

    y_str = '_'.join([str(y) for y in training_years])
    params = torch.load(f'params/recommenders/{method}_{y_str}_update.pth')
    model.load_state_dict(params)
    model.eval()

    preds = {}
    t = np.vstack(pd.Series(embs).values)

    # Longest list that is needed; slicing it per k works for any order or size of
    # k_list (the previous progressive slicing required ascending cut-offs of at most 50).
    max_k = max(k_list)

    rec_df = {}
    total_rank_df = {}
    for query in tqdm(co_cit_df.columns):
        # Each row of E is the query embedding followed by one candidate embedding.
        q = np.array(embs[query])
        q = q[None,:].repeat(len(targets),axis = 0)

        E = np.concatenate((q,t),axis=1)
        E = torch.from_numpy(E).float()

        # Inference only: skip autograd bookkeeping instead of detaching afterwards.
        with torch.no_grad():
            pr = model(E).numpy()
        # Column 0 of the model output is the score of each candidate.
        pr_d = dict(zip(targets, pr[:, 0]))

        # NOTE(review): `query` is itself one of `targets`, so it is scored as a
        # candidate for itself; confirm this is intended.
        recommendations = pd.Series(pr_d).sort_values(ascending = False)
        top_k = list(recommendations.iloc[:max_k].index)
        # reversed() keeps the key order of the saved lists unchanged (last k_list entry first).
        rec_df[query] = {f'@{k}': top_k[:k] for k in reversed(k_list)}
        total_rank_df[query] = pr_d

    pd.DataFrame(total_rank_df).to_json(f'recommendations/total/{method}_{year}.json')
    recs[method] = pd.DataFrame(rec_df)
    recs[method].to_json(f'recommendations/{method}_{year}.json')

In [None]:
# Reload the saved top-k recommendation tables from disk, one DataFrame per method
# (including 'tfidf', which is not part of `methods`).
recs = dict()
for method in ('graphsage', 'specter', 'combsage', 'tfidf'):
    recs[method] = pd.read_json(f'recommendations/{method}_{year}.json')

In [None]:
def eval_recommendation(rec_df):
    """Compute precision@k and recall@k for every query column of `co_cit_df`.

    Parameters
    ----------
    rec_df : pandas.DataFrame
        Indexed by query, with one column named ``'@{k}'`` for each k in the
        global `k_list`; each cell holds the collection of recommended ids.

    Returns
    -------
    pandas.DataFrame
        One row per query with the columns ``'prec@{k}'`` and ``'recall@{k}'``.
        Both values are NaN for a query that has no relevant items.
    """
    res = {}
    for query in tqdm(co_cit_df.columns):
        # Relevant items are the non-null index labels of the query's column.
        # They do not depend on k, so they are computed once per query instead
        # of once per cut-off; the same holds for the row lookup.
        relevant = set(co_cit_df[query].dropna().index)
        query_recs = rec_df.loc[query]
        r = {}
        for k in k_list:
            recommended = set(query_recs[f'@{k}'])
            retrieved = recommended.intersection(relevant)
            if len(relevant) == 0:
                # Nothing to retrieve: leave both metrics undefined.
                precision, recall = np.nan, np.nan
            else:
                # Precision divides by the cut-off k, not by the list length.
                precision = len(retrieved)/k
                recall = len(retrieved)/len(relevant)
            r[f'prec@{k}'] = precision
            r[f'recall@{k}'] = recall
        res[query] = r
    return pd.DataFrame(res).T

In [None]:
# Evaluate every method's recommendations; each table is transposed before it is
# passed on, because eval_recommendation looks queries up by row label.
res = {
    emb: eval_recommendation(rec_table.T)
    for emb, rec_table in recs.items()
}

In [None]:
# Mean of every metric column per method, shown with one row per method.
pd.DataFrame({emb: scores.mean() for emb, scores in res.items()}).T

In [None]:
# Hit rate per method and cut-off: share of rows whose precision@k is above zero.
# Rows whose precision is NaN fail the `> 0` test but still count in the denominator.
hitrate = {
    emb: {
        f'@{k}': len(scores[scores[f'prec@{k}'] > 0])/len(scores)
        for k in k_list
    }
    for emb, scores in res.items()
}

In [None]:
# Show the hit rates as a table: one column per method, one row per cut-off.
pd.DataFrame(data=hitrate)

In [None]:
def bootstrap(attr, rounding = 3, methods = ('tfidf',), n_rounds = 1000):
    """Summarise a metric from the global `res` with a bootstrap over its rows.

    For each method and each k in the global `k_list`, the column
    ``'{attr}@{k}'`` is resampled with replacement (same size as the column)
    `n_rounds` times and the mean of every resample is recorded.

    Parameters
    ----------
    attr : str
        Metric prefix, combined with each k into the column name ``'{attr}@{k}'``.
    rounding : int, default 3
        Number of decimals in the formatted output. This argument used to be
        ignored in favour of a hard-coded ``.3f``.
    methods : iterable of str, default ('tfidf',)
        Keys of `res` to summarise; the default matches the previous hard-coded list.
    n_rounds : int, default 1000
        Number of bootstrap resamples per column.

    Returns
    -------
    pandas.DataFrame
        One column per method and one row per ``'{attr}@{k}'``; each cell is the
        string ``'<mean>±<1.96 * std>'`` of the bootstrap means (presumably a
        normal-approximation 95% half-width; confirm before citing it as such).
    """
    # Named `summary` so the local no longer shadows the function's own name.
    summary = {}
    for method in methods:
        scores = res[method]
        cells = {}
        for k in k_list:
            column = scores[f'{attr}@{k}']
            means = np.array([
                column.sample(len(column), replace = True).mean()
                for _ in range(n_rounds)
            ])
            cells[f'{attr}@{k}'] = f'{means.mean():.{rounding}f}±{(1.96*means.std()):.{rounding}f}'
        summary[method] = cells
    return pd.DataFrame(summary)

In [None]:
# Bootstrap summaries for precision and recall, stacked and transposed so that
# each method becomes a row, then rendered as LaTeX source.
attrs = ['prec', 'recall']
table = pd.concat([bootstrap(name) for name in attrs], axis=0).transpose()
table.to_latex()

In [None]:
# Take the first column label of the co-citation table as a sample query.
q = co_cit_df.columns[0]