In [1]:
#!pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html
import dgl
import torch as th
import torch
import numpy as np
from dgl import save_graphs, load_graphs
import torch.nn as nn
import dgl.nn as dglnn
import torch
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import dgl.function as fn
#from torchmetrics.classification import BinaryAUROC
import torch.nn as nn
from dgl.dataloading.negative_sampler import _BaseNegativeSampler
from dgl import backend as b

In [2]:
glist, label_dict = load_graphs("./graphs/hetero_graphs_all_authors.bin")
train_hetero_graph = glist[0]
val_hetero_graph = glist[1]
test_hetero_graph = glist[2]

In [3]:
class PerSourceUniformCustom(_BaseNegativeSampler):

    def __init__(self, k):
        self.k = k

    def _generate(self, g, eids, canonical_etype):
        unique_authors = torch.unique(g.edges(etype = "authored")[1])
        _, _, vtype = canonical_etype
        shape = b.shape(eids)
        dtype = b.dtype(eids)
        ctx = b.context(eids)
        shape = (shape[0] * self.k,)
        src, _ = g.find_edges(eids, etype=canonical_etype)
        src = b.repeat(src, self.k, 0)
        dst_indexes = th.randint(0, len(unique_authors), shape, dtype=dtype, device=ctx)
        dst = unique_authors[dst_indexes]
        return src, dst

In [4]:
def construct_negative_graph(graph, k, etype):
    utype, edge_type, vtype = etype
    src, dst = graph.edges(etype=etype)
    eids = graph.edge_ids(src, dst, etype=edge_type)
    neg_sampler = PerSourceUniformCustom(k)
    neg_src, neg_dst = neg_sampler(graph, {edge_type: eids})[etype]
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

In [13]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, num_classes_papers, num_classes_authors):

        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            "cites": dglnn.GraphConv(303, 512),
        })
        
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(512, 303)

    def forward(self, graph, inputs):
        classes_paper = inputs["paper"]
        classes_author = inputs["author"]

        h_cites = self.conv1(graph["cites"], {"paper": classes_paper, "author": classes_author})["paper"]
        h_cites = self.dropout(h_cites)
        h_cites = F.relu(h_cites)


        h_cites = self.linear(h_cites)
        return {"paper": h_cites, "author": classes_author}
    
class HeteroDotProductPredictor(nn.Module):
    def forward(self, graph, h, etype):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            return graph.edges[etype].data['score']
        
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names, num_classes_paper, num_classes_author):
        super().__init__()
        # Encoder
        self.sage = RGCN(in_features, hidden_features, out_features, num_classes_paper, num_classes_author)
        # Decoder
        self.pred = HeteroDotProductPredictor()
    def forward(self, g, neg_g, x, etype):
        h = self.sage(g, x)
        return h

    def scores(self, g, neg_g, x, etype):
        h = self(g, neg_g, x, etype)
        return self.pred(g, h, etype), self.pred(neg_g, h, etype)

In [6]:
def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).squeeze(1).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

def compute_loss(pos_score, neg_score):
    # Margin loss
    n_edges = pos_score.shape[0]
    return (1 - pos_score + neg_score.view(n_edges, -1)).clamp(min=0).mean()

def compute_loss_logits(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).to("cuda")
    
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).to(device)
    return F.binary_cross_entropy_with_logits(scores.squeeze(1), labels)

def accuracy(logits, graph):
  with torch.no_grad():
    all_papers = torch.unique(graph.edges(etype="authored")[0])
    src, dst = graph.edges(etype="authored")
    unique_authors = torch.unique(dst)
    tst = 0
    author_logits = logits["author"][unique_authors]

    for idx, index_paper in enumerate(all_papers):

      current_logits = logits["paper"][index_paper]
      dot_product_all = torch.sum(current_logits * author_logits, dim=-1)
      max = torch.argmax(dot_product_all)
      max = unique_authors[max].item()
      filter_acc = src == index_paper
      if max in dst[filter_acc]:
        tst += 1
    return tst/len(all_papers)

In [7]:
# Load data into GPU memory
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
# Load the nlp features: Paper: SPECTER; author: publication history's title as Word2Vec
author_feats = torch.load("./author_feats.pt")
paper_feats = torch.load("./paper_feats.pt")
paper_feats = torch.tensor(paper_feats)[:135246]

In [26]:
k = 8
loss_training_epoch = []
loss_validation_epoch = []

auc_training_epoch = []
auc_validation_epoch = []

acc_training = []
acc_validation = []

node_features = {'paper': paper_feats.to(device), "author": author_feats.type(torch.float32).to(device)}
pred = HeteroDotProductPredictor()

In [27]:
model = Model(256, 1024, 512, train_hetero_graph.etypes, len(train_hetero_graph.nodes("paper")), len(train_hetero_graph.nodes("author"))).to(device)

opt = torch.optim.Adam(model.parameters())

for epoch in range(5000000):
    negative_graph = construct_negative_graph(train_hetero_graph, k, ('paper', 'authored', 'author'))
    pos_score, neg_score = model.scores(train_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
    loss = compute_loss_logits(pos_score.to(device), neg_score.to(device))
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 200 == 0:
        with torch.no_grad():
          logits_train = model(train_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
          acc_train = accuracy(logits_train, train_hetero_graph.to(device))
          auc_train = compute_auc(pos_score.cpu(), neg_score.cpu())
          loss_train = loss.item()

          loss_training_epoch.append(loss_train)
          auc_training_epoch.append(auc_train)

          logits_val = model(val_hetero_graph.to(device), "x", node_features, ('paper', 'authored', 'author'))
          acc_val = accuracy(logits_val, val_hetero_graph.to(device))

          negative_graph = construct_negative_graph(val_hetero_graph, k, ('paper', 'authored', 'author'))
          pos_score_eval, neg_score_eval = model.scores(val_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
          loss_val = compute_loss_logits(pos_score_eval.to(device), neg_score.to(device)).item()
          auc_val = compute_auc(pos_score_eval.cpu(), neg_score_eval.cpu())

          loss_validation_epoch.append(loss_val)
          auc_validation_epoch.append(auc_val)

          acc_training.append(acc_train)
          acc_validation.append(acc_val)

          print(f"EPOCH: {epoch}; Loss: {loss_train}, AUC: {auc_train}, Acc Train: {acc_train}; Loss {loss_val}, AUC: {auc_val} Acc Evaluation: {acc_val}")

    """
    if epoch % 50 == 0:

      with torch.no_grad():
        total_list  = loss_training_epoch + loss_validation_epoch + auc_training_epoch + auc_validation_epoch + acc_training + acc_validation
        with open(f'./drive/MyDrive/Bachelor_thesis/models/author_metrics_{epoch}_final.txt', 'w') as f:
          for item in total_list:
              f.write(str(item) + '\n')
        torch.save(model, f"./drive/MyDrive/Bachelor_thesis/models/author_{epoch}_final.pt")
    """

EPOCH: 0; Loss: 0.45162147283554077, AUC: 0.5130368387395515, Acc Train: 0.00021891418563922942; Loss 0.3108879327774048, AUC: 0.5011177517674399 Acc Evaluation: 0.0033333333333333335
EPOCH: 200; Loss: 0.2891857922077179, AUC: 0.8054390910315407, Acc Train: 0.00032837127845884414; Loss 0.1369597464799881, AUC: 0.719358371645274 Acc Evaluation: 0.01
EPOCH: 400; Loss: 0.26397010684013367, AUC: 0.8479597443415426, Acc Train: 0.00021891418563922942; Loss 0.09830338507890701, AUC: 0.7488341331896425 Acc Evaluation: 0.0033333333333333335
EPOCH: 600; Loss: 0.25205519795417786, AUC: 0.8659057578210628, Acc Train: 0.000766199649737303; Loss 0.09268484264612198, AUC: 0.7637213416839196 Acc Evaluation: 0.01
EPOCH: 800; Loss: 0.2433425486087799, AUC: 0.8801204465541044, Acc Train: 0.0024080560420315237; Loss 0.118519127368927, AUC: 0.7679949360042916 Acc Evaluation: 0.05333333333333334
EPOCH: 1000; Loss: 0.2377735674381256, AUC: 0.8855926712703592, Acc Train: 0.0056370402802101574; Loss 0.11351606

In [None]:
with torch.no_grad():
    torch.save(model, f"./drive/MyDrive/Bachelor_thesis/models/author.pt")