In [1]:

import dgl
import torch as th
import numpy as np
from dgl import save_graphs, load_graphs
import torch.nn as nn
import dgl.nn as dglnn
import torch
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import dgl.function as fn
from torchmetrics.classification import BinaryAUROC
import torch.nn as nn
from dgl.dataloading.negative_sampler import _BaseNegativeSampler
from dgl import backend as b

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/cu117/repo.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
glist, label_dict = load_graphs("./drive/MyDrive/Bachelor_thesis/models/hetero_graphs_primary_all_features_w_test.bin")
train_hetero_graph = glist[0]
val_hetero_graph = glist[1]
test_hetero_graph = glist[2]

In [3]:
class PerSourceUniformCustom(_BaseNegativeSampler):

    def __init__(self, k):
        self.k = k

    def _generate(self, g, eids, canonical_etype):
        unique_authors = torch.unique(g.edges(etype = "authored")[1])
        #print(len(unique_authors))
        _, _, vtype = canonical_etype
        shape = b.shape(eids)
        dtype = b.dtype(eids)
        ctx = b.context(eids)
        shape = (shape[0] * self.k,)
        src, _ = g.find_edges(eids, etype=canonical_etype)
        src = b.repeat(src, self.k, 0)
        dst_indexes = th.randint(0, len(unique_authors), shape, dtype=dtype, device=ctx)
        dst = unique_authors[dst_indexes]
        return src, dst

In [4]:
def construct_negative_graph(graph, k, etype):
    utype, edge_type, vtype = etype
    src, dst = graph.edges(etype=etype)
    eids = graph.edge_ids(src, dst, etype=edge_type)
    #eids = torch.unique(train_hetero_graph.edges(etype="authored")[0])
    neg_sampler = PerSourceUniformCustom(k)
    #neg_samples = dgl.dataloading.negative_sampler.PerSourceUniform(k)
    neg_src, neg_dst = neg_sampler(graph, {edge_type: eids})[etype]
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

In [5]:
class HeteroDotProductPredictor(nn.Module):
    def forward(self, graph, h, etype):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            return graph.edges[etype].data['score']
        
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names, num_classes_paper, num_classes_author):
        super().__init__()
        # Encoder
        self.sage = RGCN(in_features, hidden_features, out_features, num_classes_paper, num_classes_author)
        # Decoder
        self.pred = HeteroDotProductPredictor()
    def forward(self, g, neg_g, x, etype):
        h = self.sage(g, x)
        return h

    def scores(self, g, neg_g, x, etype):
      h = self(g, neg_g, x, etype)
      return self.pred(g, h, etype), self.pred(neg_g, h, etype)

In [6]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, num_classes_papers, num_classes_authors):

        super().__init__()

        self.conv2 = dglnn.HeteroGraphConv({
            "cites": dglnn.GraphConv(num_classes_authors, 512),
        })
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(512, num_classes_authors)

    def forward(self, graph, inputs):
        classes_paper = inputs["paper"]
        classes_author = inputs["author"]

        h_cites = self.conv2(graph["cites"], {"paper": classes_paper, "author": classes_author})["paper"].flatten(1)
        h_cites = self.dropout(h_cites)
        h_cites = F.relu(h_cites)
        h_cites = self.linear(h_cites)

        return {"paper": h_cites, "author": classes_author}

In [7]:
def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).squeeze(1).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

def compute_loss(pos_score, neg_score):
    # Margin loss
    n_edges = pos_score.shape[0]
    return (1 - pos_score + neg_score.view(n_edges, -1)).clamp(min=0).mean()

def compute_loss_logits(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).to("cuda")
    
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).to("cuda")
    return F.binary_cross_entropy_with_logits(scores.squeeze(1), labels)

def accuracy(logits, graph):
  with torch.no_grad():
    all_papers = torch.unique(graph.edges(etype="authored")[0])
    src, dst = graph.edges(etype="authored")
    unique_authors = torch.unique(dst)
    tst = 0
    author_logits = logits["author"][unique_authors]

    for idx, index_paper in enumerate(all_papers):

      current_logits = logits["paper"][index_paper]
      dot_product_all = torch.sum(current_logits * author_logits, dim=-1)

      max = torch.argmax(dot_product_all)
      max = unique_authors[max].item()
      filter_acc = src == index_paper
      if max in dst[filter_acc]:
        tst += 1
    return tst/len(all_papers)

In [8]:
# Load data into GPU memory
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
# Sum Pooling Genders
train_hetero_graph.nodes["gender"].data["h"] = train_hetero_graph.nodes('gender').type(torch.float32)
train_hetero_graph.nodes["author"].data["h"] = th.zeros(len(train_hetero_graph.nodes("author")))
train_hetero_graph["gendered"].update_all(fn.copy_u('h', 'm'), fn.max('m', 'h'))
author_genders = train_hetero_graph.nodes["author"].data["h"].view(-1,1)

In [10]:
# Sum Pooling Countries
country_feats = F.one_hot(train_hetero_graph.nodes('country')).type(torch.float32)
train_hetero_graph.nodes["country"].data["h"] = country_feats
train_hetero_graph.nodes["affiliation"].data["h"] = th.zeros((len(train_hetero_graph.nodes("affiliation")), country_feats.shape[0]))
train_hetero_graph["contains"].update_all(fn.copy_u('h', 'm'), fn.max('m', 'h'))
affiliation_countries = train_hetero_graph.nodes["affiliation"].data["h"]

In [11]:
# Sum Pooling Affiliations
affiliation_feats = F.one_hot(train_hetero_graph.nodes('affiliation')).type(torch.float32)
train_hetero_graph.nodes["affiliation"].data["h"] = torch.concat([affiliation_feats, affiliation_countries], dim=1)
train_hetero_graph.nodes["author"].data["h"] = th.zeros((len(train_hetero_graph.nodes("author")), country_feats.shape[0]))
train_hetero_graph["affiliated"].update_all(fn.copy_u('h', 'm'), fn.max('m', 'h'))
author_affiliation_country = train_hetero_graph.nodes["author"].data["h"]

In [12]:
# Sum Pooling Authors
author_feats = F.one_hot(train_hetero_graph.nodes['author'].data['feature']).type(torch.float32)
author_feats = torch.concat([author_feats, author_genders, author_affiliation_country], dim=1)
train_hetero_graph.nodes["author"].data["h"] = author_feats 
train_hetero_graph.nodes["paper"].data["h"] = th.zeros((len(train_hetero_graph.nodes['paper'].data['feature']), author_feats.shape[0]))
train_hetero_graph["writes"].update_all(fn.copy_u('h', 'm'), fn.max('m', 'h'))
paper_feats = train_hetero_graph.nodes["paper"].data["h"]

In [13]:
author_feats = F.one_hot(train_hetero_graph.nodes['author'].data['feature']).type(torch.float32)
train_hetero_graph.nodes["author"].data["h"] = author_feats
train_hetero_graph.nodes["paper"].data["h"] = th.zeros((len(train_hetero_graph.nodes['paper'].data['feature']), author_feats.shape[0]))
train_hetero_graph["writes"].update_all(fn.copy_u('h', 'm'), fn.max('m', 'h'))
paper_feats = train_hetero_graph.nodes["paper"].data["h"]

In [14]:
node_features = {'paper': paper_feats.to(device), "author": author_feats.to(device)}


k = 8
loss_training_epoch = []
loss_validation_epoch = []

auc_training_epoch = []
auc_validation_epoch = []

acc_training = []
acc_validation = []

pred = HeteroDotProductPredictor()

In [15]:
model = Model(256, 1024, 512, train_hetero_graph.etypes, len(train_hetero_graph.nodes("paper")), author_feats.shape[1]).to(device)

opt = torch.optim.Adam(model.parameters())

for epoch in range(5000000):
    negative_graph = construct_negative_graph(train_hetero_graph, k, ('paper', 'authored', 'author'))
    pos_score, neg_score = model.scores(train_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
    loss = compute_loss_logits(pos_score.to(device), neg_score.to(device))
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 10 == 0:
        with torch.no_grad():
          logits_train = model(train_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
          acc_train = accuracy(logits_train, train_hetero_graph.to(device))
          auc_train = compute_auc(pos_score.cpu(), neg_score.cpu())
          loss_train = loss.item()

          loss_training_epoch.append(loss_train)
          auc_training_epoch.append(auc_train)

          logits_val = model(val_hetero_graph.to(device), "", node_features, ('paper', 'authored', 'author'))
          acc_val = accuracy(logits_val, val_hetero_graph.to(device))

          negative_graph = construct_negative_graph(val_hetero_graph, k, ('paper', 'authored', 'author'))
          pos_score_eval, neg_score_eval = model.scores(val_hetero_graph.to(device), negative_graph.to(device), node_features, ('paper', 'authored', 'author'))
          loss_val = compute_loss_logits(pos_score_eval.to(device), neg_score.to(device)).item()
          auc_val = compute_auc(pos_score_eval.cpu(), neg_score_eval.cpu())

          loss_validation_epoch.append(loss_val)
          auc_validation_epoch.append(auc_val)

          acc_training.append(acc_train)
          acc_validation.append(acc_val)

          print(f"EPOCH: {epoch}; Loss: {loss_train}, AUC: {auc_train}, Acc Train: {acc_train}; Loss {loss_val}, AUC: {auc_val} Acc Evaluation: {acc_val}")
    """
    if epoch % 50 == 0:
      with torch.no_grad():
        total_list  = loss_training_epoch + loss_validation_epoch + auc_training_epoch + auc_validation_epoch + acc_training + acc_validation
        with open(f'./drive/MyDrive/Bachelor_thesis/models/author_metrics_{epoch}_final.txt', 'w') as f:
          for item in total_list:
              f.write(str(item) + '\n')
        torch.save(model, f"./drive/MyDrive/Bachelor_thesis/models/author_{epoch}_final.pt")
    """

EPOCH: 0; Loss: 0.6933674216270447, AUC: 0.5058396841721274, Acc Train: 0.005870020964360587; Loss 0.6934388279914856, AUC: 0.5120361111111111 Acc Evaluation: 0.006666666666666667
EPOCH: 10; Loss: 0.6593655347824097, AUC: 0.7704891794013069, Acc Train: 0.13584905660377358; Loss 0.6532668471336365, AUC: 0.5977361111111111 Acc Evaluation: 0.04
EPOCH: 20; Loss: 0.5308005213737488, AUC: 0.6927981378197944, Acc Train: 0.033962264150943396; Loss 0.48865777254104614, AUC: 0.5649972222222223 Acc Evaluation: 0.01
EPOCH: 30; Loss: 0.3809870779514313, AUC: 0.6061314711531277, Acc Train: 0.012578616352201259; Loss 0.262660413980484, AUC: 0.5373951388888889 Acc Evaluation: 0.006666666666666667
EPOCH: 40; Loss: 0.3369716703891754, AUC: 0.6365892040135543, Acc Train: 0.009643605870020965; Loss 0.14426401257514954, AUC: 0.5321 Acc Evaluation: 0.006666666666666667
EPOCH: 50; Loss: 0.32721319794654846, AUC: 0.6996721516817637, Acc Train: 0.020545073375262055; Loss 0.12499365210533142, AUC: 0.54334166666

KeyboardInterrupt: ignored