In [None]:
import copy
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn as nn
import torch_geometric
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from torch_geometric.nn import MLP
from tqdm import tqdm

In [None]:
# Run configuration read by the cells below.
# Embedding family: 'specter' and 'tfidf' have dedicated loaders; any other
# name is looked up as a JSON file under embeddings/.
emb_method = "graphsage"
# One co-citation file is loaded per listed year.
training_years = [2016]
# NOTE(review): `device` is not referenced by the visible cells - confirm it is still needed.
device = "cpu"

In [None]:
def load_embeddings(method, training_year, year = None):
    """Load precomputed paper embeddings from disk.

    Parameters
    ----------
    method : str
        'specter', 'tfidf', or any other name (treated as a JSON-backed method).
    training_year
        Year used in the file name of every method except 'specter'.
    year : optional
        Year of the embeddings to load; defaults to ``training_year``.

    Returns
    -------
    - 'specter': the ``embedding`` column of ``specter/{year}/output.jsonl``
      as a pandas Series indexed by ``paper_id``.
    - 'tfidf': a dict mapping every key of the ``*_index.json`` file to the
      row of the sparse ``.npz`` matrix that the index points at.
    - anything else: the parsed content of
      ``embeddings/{method}_({training_year})_{year}.json``.
    """
    # Identity check: `== None` can be hijacked by objects overriding __eq__.
    if year is None:
        year = training_year

    if method == 'specter':
        emb = pd.read_json(f'specter/{year}/output.jsonl', lines = True).set_index('paper_id')
        return emb['embedding']

    if method == 'tfidf':
        X = scipy.sparse.load_npz(f'embeddings/tfidf_({training_year})_{year}.npz')
        with open(f'embeddings/tfidf_({training_year})_{year}_index.json') as infile:
            ids = json.load(infile)
        # One sparse row per id, addressed through the stored row index.
        return {k: X[row] for k, row in ids.items()}

    with open(f'embeddings/{method}_({training_year})_{year}.json','r') as infile:
        return json.load(infile)

In [None]:
class Net(torch_geometric.nn.models.MLP):
    """PyG ``MLP`` whose forward output is squashed with a sigmoid."""

    def __init__(self, channels, dropout = 0):
        """Build the MLP; ``channels`` is passed on as ``channel_list`` and
        ``dropout`` is passed on unchanged."""
        super(Net, self).__init__(dropout=dropout, channel_list=channels)

    def forward(self, x):
        """Run the parent MLP on ``x`` and apply an element-wise sigmoid."""
        raw = super(Net, self).forward(x)
        return raw.sigmoid()


In [None]:
# Per training year: read the co-citation counts, attach a joint embedding to
# every (q, c) pair, and keep only the feature and the count.
co_citations = {}
for year in training_years:
    with open(f'co_citations/{year}.json', 'r') as f:
        d = json.load(f)

    # Flatten {q: {c: n}} into one record per (q, c) pair.
    pairs = [
        {'q': q, 'c': c, 'n': n}
        for q in tqdm(d)
        for c, n in d[q].items()
    ]

    emb = load_embeddings(emb_method, year)
    co_cit_df = pd.DataFrame(pairs)

    if emb_method == 'tfidf':
        def join_pair(row):
            # Sparse rows are placed side by side.
            return scipy.sparse.hstack([emb[row['q']], emb[row['c']]])
    else:
        def join_pair(row):
            # `+` on the two stored embeddings; presumably list concatenation
            # for JSON-loaded vectors - TODO confirm.
            return emb[row['q']] + emb[row['c']]

    co_cit_df['emb'] = co_cit_df.apply(join_pair, axis=1)
    co_citations[year] = co_cit_df[['emb', 'n']]

In [None]:
# Stack the per-year pair tables row-wise into a single training frame.
yearly_frames = [co_citations[year] for year in training_years]
train_df = pd.concat(yearly_frames, axis = 0)

In [None]:
# Balance the classes: keep every pair with n > -1 and draw an equally sized
# random subset of the pairs with n == -1.
positive_sample = train_df[train_df['n'] > -1]
negative_pool = train_df[train_df['n'] == -1]
# Sampling without replacement raises ValueError when asked for more rows than
# the pool holds, so cap the draw at the pool size.
n_negative = min(len(positive_sample), len(negative_pool))
# Fixed seed so a fresh "Restart & Run All" draws the same negatives.
negative_sample = negative_pool.sample(n_negative, random_state=0)
train_df = pd.concat([negative_sample,positive_sample])

In [None]:
# Stack the per-pair features into one matrix: scipy's sparse vstack for
# tf-idf, numpy's dense vstack for every other embedding method.
stack_rows = scipy.sparse.vstack if emb_method == 'tfidf' else np.vstack
X = stack_rows(train_df['emb'])
# Binary target: 0 where n == -1, 1 for everything else.
y = (train_df['n'] != -1).astype('int64').values

In [None]:
# Hold out 10% of the pairs for validation; the seed keeps the split
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state = 0)

# The tf-idf path produces scipy sparse matrices, which torch.from_numpy
# rejects (it only accepts numpy arrays). Densify them first; casting to
# float32 beforehand halves the size of the dense copy.
if scipy.sparse.issparse(X_train):
    X_train = X_train.astype(np.float32).toarray()
    X_val = X_val.astype(np.float32).toarray()

X_train, X_val = torch.from_numpy(X_train).float(), torch.from_numpy(X_val).float()
y_train, y_val = torch.from_numpy(y_train).float(), torch.from_numpy(y_val).float() 

# Optional feature standardisation using training-set statistics (disabled):
# m_train = X_train.mean(0, keepdim=True)
# s_train = X_train.std(0, unbiased=False, keepdim=True)
# X_train -= m_train
# X_train /= s_train
# X_val -= m_train
# X_val /= s_train

In [None]:
# Model, loss and optimiser. The network maps `dim` input features through one
# hidden layer of 64 units to a single sigmoid output, trained with BCE.
dim = X_train.shape[1]

model = Net([dim, 64, 1], dropout=0.2)
learning_rate = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def _train_batch(inputs, targets):
    """Run one optimisation step on a batch and return its loss as a float."""
    model.train()  # Set model to training mode
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

def _evaluate(inputs, targets):
    """Return the loss on (inputs, targets) in eval mode, without gradients."""
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model(inputs)
        val_loss = criterion(outputs, targets)
    return val_loss.item()

loss_list = []
best_loss = float('inf')
# Start from the initial weights so `best_params` always exists, even if no
# epoch ever improves on `best_loss` (e.g. the validation loss is NaN).
best_params = copy.deepcopy(model.state_dict())

# Anomaly detection is a debugging aid that slows every backward pass; flip
# the flag only while hunting NaN/inf gradients.
DETECT_ANOMALY = False
torch.autograd.set_detect_anomaly(DETECT_ANOMALY)

batch_size = len(X_train)
num_epochs = 1000
# Floor division: a trailing partial batch would be skipped. With
# batch_size == len(X_train) there is exactly one full batch per epoch.
num_batches = len(X_train) // batch_size

for epoch in range(num_epochs):
    losses = []
    for batch in range(num_batches):
        batch_start = batch * batch_size
        batch_end = (batch + 1) * batch_size
        # The input slice is copied before being fed to the model.
        inputs = X_train[batch_start:batch_end].clone()
        targets = y_train[batch_start:batch_end].reshape(-1, 1)
        losses.append(_train_batch(inputs, targets))
    epoch_loss = float(np.mean(losses))

    val_loss = _evaluate(X_val, y_val.reshape(-1, 1))

    if epoch % 100 == 0:
        print('epoch %d, loss: %.4f, val_loss: %.4f'
              % (epoch, epoch_loss, val_loss))
    if val_loss < best_loss:
        best_loss = val_loss
        # state_dict() hands back references to the live tensors; without a
        # deep copy the "best" snapshot would keep changing as training goes
        # on and end up equal to the final weights.
        best_params = copy.deepcopy(model.state_dict())

    # Record the mean training loss of the epoch.
    loss_list.append(epoch_loss)

In [None]:
# Load `best_params` (assigned in the training loop whenever the validation
# loss improved) into the model. As the cell's last expression, the result of
# load_state_dict is displayed.
model.load_state_dict(best_params)

In [None]:
# Join the training years with '_' to tag the file name, then write the
# `best_params` state dict to disk.
y_str = '_'.join(map(str, training_years))
checkpoint_path = f'params/recommenders/{emb_method}_{y_str}_update.pth'
torch.save(best_params, checkpoint_path)