# Split Relations Model

## Import libraries

In [1]:
import os
import sys
import itertools
import gc

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F

print('PyTroch Version', torch.__version__)
print('GPU Available:', torch.cuda.is_available())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

PyTroch Version 1.10.2
GPU Available: False


In [2]:
import pykeen
pykeen.env()

| Key             | Value                    |
|-----------------|--------------------------|
| OS              | posix                    |
| Platform        | Linux                    |
| Release         | 5.15.0-41-generic        |
| Time            | Sun Jul 31 00:46:46 2022 |
| Python          | 3.9.7                    |
| PyKEEN          | 1.7.0                    |
| PyKEEN Hash     | UNHASHED                 |
| PyKEEN Branch   |                          |
| PyTorch         | 1.10.2                   |
| CUDA Available? | false                    |
| CUDA Version    | N/A                      |
| cuDNN Version   | N/A                      |


## Import data

In [3]:
dataset_name = 'imdb'

In [4]:
if dataset_name == 'dblp':
    from torch_geometric.datasets import DBLP
    dataset = DBLP(root='./data/dblp')
elif dataset_name == 'imdb':
    from torch_geometric.datasets import IMDB
    dataset = IMDB(root='./data/imdb')

print('Dataset:', dataset)

print('Number of graphs:', len(dataset))

data = dataset[0]
print(data)

Dataset: IMDB()
Number of graphs: 1
HeteroData(
  [1mmovie[0m={
    x=[4278, 3066],
    y=[4278],
    train_mask=[4278],
    val_mask=[4278],
    test_mask=[4278]
  },
  [1mdirector[0m={ x=[2081, 3066] },
  [1mactor[0m={ x=[5257, 3066] },
  [1m(movie, to, director)[0m={ edge_index=[2, 4278] },
  [1m(movie, to, actor)[0m={ edge_index=[2, 12828] },
  [1m(director, to, movie)[0m={ edge_index=[2, 4278] },
  [1m(actor, to, movie)[0m={ edge_index=[2, 12828] }
)


## Parameterize datasets

In [5]:
data.metadata()

(['movie', 'director', 'actor'],
 [('movie', 'to', 'director'),
  ('movie', 'to', 'actor'),
  ('director', 'to', 'movie'),
  ('actor', 'to', 'movie')])

In [6]:
data.metadata()[0][0]

'movie'

In [7]:
main_entity = data.metadata()[0][0]
rest_entities = data.metadata()[0][1:]
print(main_entity, 'is the main entity')
print(rest_entities, 'are the rest entities')

movie is the main entity
['director', 'actor'] are the rest entities


In [8]:
num_classes = len(data[main_entity].y.unique())
print('Number of classes:', num_classes)
print('Classes:', data[main_entity].y.unique())

Number of classes: 3
Classes: tensor([0, 1, 2])


## Change split ratios

We are going to change split ratio into 20/10/70 for train/val/test with respect to class support proportion in each test.

In [9]:
data[main_entity].train_mask.sum()

tensor(400)

In [10]:
len(data[main_entity].x)

4278

In [16]:
class_support = torch.bincount(data[main_entity].y).tolist()
class_support

[1135, 1584, 1559]

In [17]:
w = [x/sum(class_support) for x in class_support]
w

[0.2653108929406265, 0.3702664796633941, 0.3644226273959794]

In [11]:
TRAIN_SIZE = 0.2
VAL_SIZE = 0.1

rand_sample = np.random.rand(len(data[main_entity].x))

train_msk = rand_sample < TRAIN_SIZE
val_msk = (rand_sample < TRAIN_SIZE + VAL_SIZE) & (rand_sample >= TRAIN_SIZE)
test_msk = rand_sample >= TRAIN_SIZE + VAL_SIZE

In [12]:
print('TRAIN SIZE', sum(train_msk))
print('VAL SIZE', sum(val_msk))
print('TEST SIZE', sum(test_msk))

TRAIN SIZE 786
VAL SIZE 433
TEST SIZE 2838


In [13]:
train_msk = torch.from_numpy(train_msk)
val_msk = torch.from_numpy(val_msk)
test_msk = torch.from_numpy(test_msk)

In [14]:
data[main_entity].train_mask.sum()

tensor(400)

In [15]:
train_msk.sum()

tensor(786)

In [16]:
data[main_entity].train_mask = train_msk
data[main_entity].val_mask = val_msk
data[main_entity].test_mask = test_msk

In [17]:
data[main_entity].train_mask.sum()

tensor(786)

## Create TransR embeddings

### Creating triples

We want to create a (n,3)-tensor where each row will be a (node_id, relation_id, node_id) triple. We have 3 types of nodes: movie, director, actor and 4 types of realtions.

In [302]:
print('Ranges of each node type')
print('Movie:',0,'-',data['movie'].x.size()[0]-1)
print('Director:', data['movie'].x.size()[0], '-', data['movie'].x.size()[0]+data['director'].x.size()[0]-1)
print('Actor:', data['movie'].x.size()[0]+data['director'].x.size()[0], data['movie'].x.size()[0]+data['director'].x.size()[0]+data['actor'].x.size()[0]-1)

Ranges of each node type
Movie: 0 - 4277
Director: 4278 - 6358
Actor: 6359 11615


In [159]:
print('Ranges of each node type')

# Main Entity
print("Main Entity")
print(main_entity,':',0,'-',data[main_entity].x.size()[0]-1)

# Rest entities
print("Rest Entities")
start = data[main_entity].x.size()[0]
for entity in rest_entities:
    try:
        end = start + data[entity].x.size()[0]-1
    except AttributeError:
        end = start + data[entity].num_nodes-1
    print(entity, ':', start, '-', end)
    start = end + 1

Ranges of each node type
Main Entity
author : 0 - 4056
Rest Entities
paper : 4057 - 18384
term : 18385 - 26107
conference : 26108 - 26127


Reindex tails in `movie_to_director` and `movie_to_actor` relations

In [160]:
data.metadata()

(['author', 'paper', 'term', 'conference'],
 [('author', 'to', 'paper'),
  ('paper', 'to', 'author'),
  ('paper', 'to', 'term'),
  ('paper', 'to', 'conference'),
  ('term', 'to', 'paper'),
  ('conference', 'to', 'paper')])

In [None]:
'''
main_entity_size = data[main_entity].x.size()[0]
current_size = main_entity_size
for entity in rest_entities:
    try:
        entity_size = data[entity].x.size()[0]
    except AttributeError:
        entity_size = data[entity].num_nodes
    offset_entity = torch.tensor([[0],[current_size]])
    offset_entity = offset_entity.tile(1, data[('movie', 'to', 'director')].edge_index.size()[1])
'''

In [303]:
movie_size = data['movie'].x.size()[0]
director_size = data['director'].x.size()[0]
print(movie_size, director_size)
movie_size = data['movie'].x.size()[0]
director_size = data['director'].x.size()[0]
offset_director = torch.tensor([[0],[movie_size]])
offset_director = offset_director.tile(1, data[('movie', 'to', 'director')].edge_index.size()[1])
movie_to_director = data[('movie', 'to', 'director')].edge_index + offset_director

offset_actor = torch.tensor([[0],[movie_size + director_size]])
offset_actor = offset_actor.tile(1, data[('movie', 'to', 'actor')].edge_index.size()[1])
movie_to_actor = data[('movie', 'to', 'actor')].edge_index + offset_actor
print(movie_to_director.size(), movie_to_actor.size())

4278 2081
torch.Size([2, 4278]) torch.Size([2, 12828])


- `movie-to-actor`: 0
- `movie-to-director`: 1

In [None]:
pad = torch.zeros(movie_to_actor.size()[1])
movie_to_actor = torch.column_stack((movie_to_actor[0],pad,movie_to_actor[1]))
pad = torch.ones(movie_to_director.size()[1])
movie_to_director = torch.column_stack((movie_to_director[0],pad,movie_to_director[1]))
print(movie_to_director.size(), movie_to_actor.size())

In [None]:
triples = torch.concat((movie_to_director, movie_to_actor))
triples.size()

#### DBLP

In [170]:
author_size = data['author'].x.size()[0]
paper_size = data['paper'].x.size()[0]
term_size = data['term'].x.size()[0]
print(author_size, paper_size, term_size)

# Author - Paper
offset_paper = torch.tensor([[0],[author_size]])
offset_paper = offset_paper.tile(1, data[('author', 'to', 'paper')].edge_index.size()[1])
author_to_paper = data[('author', 'to', 'paper')].edge_index + offset_paper

# Paper - Term
offset_term = torch.tensor([[0],[author_size + paper_size]])
offset_term = offset_term.tile(1, data[('paper', 'to', 'term')].edge_index.size()[1])
paper_to_term = data[('paper', 'to', 'term')].edge_index + offset_term

# Paper - Conference
offset_conference = torch.tensor([[0],[author_size + paper_size + term_size]])
offset_conference = offset_conference.tile(1, data[('paper', 'to', 'conference')].edge_index.size()[1])
paper_to_conference = data[('paper', 'to', 'conference')].edge_index + offset_conference

print(author_to_paper.size(), paper_to_term.size(), paper_to_conference.size())

4057 14328 7723
torch.Size([2, 19645]) torch.Size([2, 85810]) torch.Size([2, 14328])


- `author-to-paper`: 0
- `paper-to-term`: 1
- `paper-to-conference`: 2

In [171]:
pad = torch.zeros(author_to_paper.size()[1])
author_to_paper = torch.column_stack((author_to_paper[0],pad,author_to_paper[1]))
pad = torch.ones(paper_to_term.size()[1])
paper_to_term = torch.column_stack((paper_to_term[0],pad,paper_to_term[1]))
pad = torch.full((paper_to_conference.size()[1],), 2)
paper_to_conference = torch.column_stack((paper_to_conference[0],pad,paper_to_conference[1]))
print(author_to_paper.size(), paper_to_term.size(), paper_to_conference.size())

torch.Size([19645, 3]) torch.Size([85810, 3]) torch.Size([14328, 3])


torch.Size([17106, 3])

In [306]:
triples

tensor([[0.0000e+00, 1.0000e+00, 5.0670e+03],
        [1.0000e+00, 1.0000e+00, 4.9580e+03],
        [2.0000e+00, 1.0000e+00, 6.0350e+03],
        ...,
        [4.2770e+03, 0.0000e+00, 6.4590e+03],
        [4.2770e+03, 0.0000e+00, 7.4370e+03],
        [4.2770e+03, 0.0000e+00, 7.7980e+03]])

In [307]:
entity_ids = [i for i in range (data.num_nodes)]
relation_ids = [0,1]

In [308]:
triples.long().type()

'torch.LongTensor'

In [338]:
models = ['TransR', 'TransH', 'RotatE', 'DistMult', 'ComplEx']

### Pipeline

In [339]:
from pykeen.triples import CoreTriplesFactory
from pykeen.pipeline import pipeline

def train_kg_emb_model(model, triples, data, entity_ids, relation_ids):
    # Load training data
    num_epochs = 100
    emb_dim = 200
    rel_dim = 200
    
    if model == 'TransR':
        model_kwargs = {"embedding_dim":emb_dim,
                        "relation_dim": rel_dim}
    else:
        model_kwargs = {"embedding_dim":emb_dim}

    training = CoreTriplesFactory(mapped_triples=triples.long(), num_entities=data.num_nodes, 
                                    num_relations=2, create_inverse_triples=False,
                                    entity_ids=entity_ids, relation_ids=relation_ids)

    result = pipeline(
        training=training,
        testing=training,
        model=model,
        random_seed=42,
        model_kwargs=model_kwargs,
        training_kwargs={"num_epochs":num_epochs},
        #stopper='early', # early stopping arguments. You need the validation set with this.
        #stopper_kwargs=dict(frequency=3, patience=3, relative_delta=0.002),
        #epochs=5,  # short epochs for testing - you should go higher
    )

    # Save mode to a directory. You can load it afterwards
    result.save_to_directory('Models_pykeen/imdb_' + model + '_ep_' + str(num_epochs) + '_dim_' + str(emb_dim))
    return result

for model in models:
    result = train_kg_emb_model(model, triples, data, entity_ids, relation_ids)

Training epochs on cpu: 100%|██████████| 100/100 [11:20<00:00,  6.81s/epoch, loss=4.29e-5, prev_loss=4.53e-5]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 17.1k/17.1k [17:54<00:00, 15.9triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 1074.44s seconds
Training epochs on cpu: 100%|██████████| 100/100 [03:36<00:00,  2.16s/epoch, loss=0.00273, prev_loss=0.00274]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 17.1k/17.1k [07:29<00:00, 38.0triple/s]
INFO:pykeen.eval

## Create GNN

In [18]:
data.metadata()[1]

[('author', 'to', 'paper'),
 ('paper', 'to', 'author'),
 ('paper', 'to', 'term'),
 ('paper', 'to', 'conference'),
 ('term', 'to', 'paper'),
 ('conference', 'to', 'paper')]

In [19]:
import itertools
from torch_geometric.nn import Linear, HeteroConv, GCNConv, SAGEConv, GATConv

class SplitGCN(torch.nn.Module):
    def __init__(self, metadata, emb_size, dense_size, out_size, 
                    num_dense_layers, num_clf_layers, p, device='cpu', transr=None):

        # TODO: Implement a function that initializes self.convs, 
        # self.bns, and self.softmax.
        super(SplitGCN, self).__init__()

        self.num_relations = int(len(metadata[1])/2)
        self.device = device
        self.edge_conv_dict = {}
        self.num_dense_layers = num_dense_layers
        self.num_clf_layers = num_clf_layers
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()

        for node_type in [('author', 'paper'), ('paper', 'term'), ('paper', 'conference')]:
            self.conv = HeteroConv({
                (node_type[0], 'to', node_type[1]): SAGEConv((-1, -1), emb_size),
                (node_type[1], 'to', node_type[0]): SAGEConv((-1, -1), emb_size)
            })
            self.edge_conv_dict[node_type] = self.conv

        if transr is not None:
            self.transr = transr

        self.linears_rel = nn.ModuleList()
        for i in range (self.num_relations):
            self.rel_mlps = nn.ModuleList()
            try:
                linear = Linear(emb_size + self.transr[i].size()[1], dense_size)
            except:
                linear = Linear(emb_size, dense_size)
            self.rel_mlps.append(linear)
            for i in range(num_dense_layers-1):
                linear = Linear(emb_size, dense_size)
                self.rel_mlps.append(linear)
            self.linears_rel.append(self.rel_mlps)
        
        self.linears_clf = nn.ModuleList()
        clflinear = Linear(self.num_relations*dense_size, dense_size)
        self.linears_clf.append(clflinear)
        for i in range(self.num_clf_layers - 2):
            clflinear = Linear(dense_size, dense_size)

            self.linears_clf.append(clflinear)
        clflinear = Linear(dense_size, num_classes)
        self.linears_clf.append(clflinear)

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()


    def forward(self, x_dict, edge_index_dict, transr=None):
        # TODO: Implement a function that takes the feature tensor x and
        # edge_index tensor adj_t and returns the output tensor as
        # shown in the figure.
        
        concatenated_embs = []
        for node_type in [('author', 'paper'), ('paper', 'term'), ('paper', 'conference')]:
            conv = self.edge_conv_dict[node_type]
            
            relation = (node_type[0], 'to', node_type[1])
            reverse = (node_type[1], 'to', node_type[0])

            single_edge_index_dict = {relation: edge_index_dict[relation].to(self.device),
                                        reverse: edge_index_dict[reverse].to(self.device)}
            
            single_x_dict = {relation[0]: x_dict[relation[0]].to(self.device), 
                            relation[2]: x_dict[relation[2]].to(self.device)}

            single_x_dict = conv(single_x_dict, single_edge_index_dict)
            concatenated_embs.append(single_x_dict['author'])

        linear_outputs = []
        #print("here")
        #print(len(concatenated_embs), len(self.linears_rel))
        for i, (x, mlp) in enumerate(zip(concatenated_embs, self.linears_rel)):
            #print(type(x))
            #print(x.size())
            #print(len(transr))
            #print(transr[0].size())
            if transr:
                x = torch.cat((x, transr[i]), 1)
            #print(x.size())
            for linear in mlp:
                #print("im in")
                #print(x.size())
                x = linear(x)
                x = self.relu(x)
                x = self.dropout(x)
            
            #print('linear output', x.size())
            linear_outputs.append(x)
        
        #print(len(linear_outputs))
        out = torch.cat(linear_outputs, dim=-1)
        #print('clf layer input', out.size())
        for i in range(len(self.linears_clf) - 1):
            out = self.linears_clf[i](out)
            out = self.relu(out)
            out = self.dropout(out)
        out = self.linears_clf[-1](out)
        #print('clf layer output', out.size())
        return out


model = SplitGCN(data.metadata(), 128, 200, num_classes, 3, 3, 0.2)
print(model)
del model
gc.collect()


SplitGCN(
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (conv): HeteroConv(num_relations=2)
  (linears_rel): ModuleList(
    (0): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
    (1): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
    (2): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
  )
  (rel_mlps): ModuleList(
    (0): Linear(128, 200, bias=True)
    (1): Linear(128, 200, bias=True)
    (2): Linear(128, 200, bias=True)
  )
  (linears_clf): ModuleList(
    (0): Linear(600, 200, bias=True)
    (1): Linear(200, 200, bias=True)
    (2): Linear(200, 4, bias=True)
  )
)


672

In [220]:
import itertools
from torch_geometric.nn import Linear, HeteroConv, GCNConv, SAGEConv, GATConv

class SplitGCN(torch.nn.Module):
    def __init__(self, metadata, emb_size, dense_size, out_size, 
                    num_dense_layers, num_clf_layers, p, device='cpu', transr=None):

        # TODO: Implement a function that initializes self.convs, 
        # self.bns, and self.softmax.
        super(SplitGCN, self).__init__()

        self.num_relations = int(len(metadata[1])/2)
        self.device = device
        self.edge_conv_dict = {}
        self.num_dense_layers = num_dense_layers
        self.num_clf_layers = num_clf_layers
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()

        for node_type in ['actor', 'director']:
            self.conv = HeteroConv({
                ('movie', 'to', node_type): SAGEConv((-1, -1), emb_size),
                (node_type, 'to', 'movie'): SAGEConv((-1, -1), emb_size)
            })
            self.edge_conv_dict[node_type] = self.conv

        if transr is not None:
            self.transr = transr

        self.linears_rel = nn.ModuleList()
        for i in range (self.num_relations):
            self.rel_mlps = nn.ModuleList()
            try:
                linear = Linear(emb_size + self.transr[i].size()[1], dense_size)
            except:
                linear = Linear(emb_size, dense_size)
            self.rel_mlps.append(linear)
            for i in range(num_dense_layers-1):
                linear = Linear(emb_size, dense_size)
                self.rel_mlps.append(linear)
            self.linears_rel.append(self.rel_mlps)
        
        self.linears_clf = nn.ModuleList()
        clflinear = Linear(self.num_relations*dense_size, dense_size)
        self.linears_clf.append(clflinear)
        for i in range(self.num_clf_layers - 2):
            clflinear = Linear(dense_size, dense_size)

            self.linears_clf.append(clflinear)
        clflinear = Linear(dense_size, num_classes)
        self.linears_clf.append(clflinear)

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()


    def forward(self, x_dict, edge_index_dict, transr=None):
        # TODO: Implement a function that takes the feature tensor x and
        # edge_index tensor adj_t and returns the output tensor as
        # shown in the figure.
        
        concatenated_embs = []
        for node_type in ['actor', 'director']:
            conv = self.edge_conv_dict[node_type]
            
            relation = ('movie', 'to', node_type)
            reverse = (node_type, 'to', 'movie')

            single_edge_index_dict = {relation: edge_index_dict[relation].to(self.device),
                                        reverse: edge_index_dict[reverse].to(self.device)}
            
            single_x_dict = {relation[0]: x_dict[relation[0]].to(self.device), 
                            relation[2]: x_dict[relation[2]].to(self.device)}

            single_x_dict = conv(single_x_dict, single_edge_index_dict)
            concatenated_embs.append(single_x_dict['movie'])

        linear_outputs = []
        #print("here")
        #print(len(concatenated_embs), len(self.linears_rel))
        for i, (x, mlp) in enumerate(zip(concatenated_embs, self.linears_rel)):
            #print(type(x))
            #print(x.size())
            #print(len(transr))
            #print(transr[0].size())
            if transr:
                x = torch.cat((x, transr[i]), 1)
            #print(x.size())
            for linear in mlp:
                #print("im in")
                #print(x.size())
                x = linear(x)
                x = self.relu(x)
                x = self.dropout(x)
            
            #print('linear output', x.size())
            linear_outputs.append(x)
        
        #print(len(linear_outputs))
        out = torch.cat(linear_outputs, dim=-1)
        #print('clf layer input', out.size())
        for i in range(len(self.linears_clf) - 1):
            out = self.linears_clf[i](out)
            out = self.relu(out)
            out = self.dropout(out)
        out = self.linears_clf[-1](out)
        #print('clf layer output', out.size())
        return out


model = SplitGCN(data.metadata(), 128, 200, num_classes, 3, 3, 0.2)
print(model)
del model
gc.collect()


SplitGCN(
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (conv): HeteroConv(num_relations=2)
  (linears_rel): ModuleList(
    (0): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
    (1): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
  )
  (rel_mlps): ModuleList(
    (0): Linear(128, 200, bias=True)
    (1): Linear(128, 200, bias=True)
    (2): Linear(128, 200, bias=True)
  )
  (linears_clf): ModuleList(
    (0): Linear(400, 200, bias=True)
    (1): Linear(200, 200, bias=True)
    (2): Linear(200, 3, bias=True)
  )
)


473

In [30]:
import itertools
from torch_geometric.nn import Linear, HeteroConv, GCNConv, SAGEConv, GATConv

class SplitGCN(torch.nn.Module):
    def __init__(self, metadata, emb_size, dense_size, out_size, 
                    num_dense_layers, num_clf_layers, p, device='cpu', transr=None):

        # TODO: Implement a function that initializes self.convs, 
        # self.bns, and self.softmax.
        super(SplitGCN, self).__init__()

        self.num_relations = int(len(metadata[1])/2)
        self.device = device
        self.edge_conv_dict = {}
        self.num_dense_layers = num_dense_layers
        self.num_clf_layers = num_clf_layers
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()


        '''
        [('author', 'to', 'paper'),
        ('paper', 'to', 'author'),
        ('paper', 'to', 'term'),
        ('paper', 'to', 'conference'),
        ('term', 'to', 'paper'),
        ('conference', 'to', 'paper')]
        '''
    
        self.conv = HeteroConv({
            ('author', 'to', 'paper'): SAGEConv((-1, -1), emb_size),
            ('paper', 'to', 'author'): SAGEConv((-1, -1), emb_size)
        })
        self.edge_conv_dict['author'] = self.conv

        self.conv = HeteroConv({
            ('term', 'to', 'paper'): SAGEConv((-1, -1), emb_size),
            ('paper', 'to', 'term'): SAGEConv((-1, -1), emb_size)
        })
        self.edge_conv_dict['term'] = self.conv

        self.conv = HeteroConv({
            ('conference', 'to', 'paper'): SAGEConv((-1, -1), emb_size),
            ('paper', 'to', 'conference'): SAGEConv((-1, -1), emb_size)
        })
        self.edge_conv_dict['conference'] = self.conv


        if transr is not None:
            self.transr = transr

        self.linears_rel = nn.ModuleList()
        for i in range (self.num_relations):
            self.rel_mlps = nn.ModuleList()
            try:
                linear = Linear(emb_size + self.transr[i].size()[1], dense_size)
            except:
                linear = Linear(emb_size, dense_size)
            self.rel_mlps.append(linear)
            for i in range(num_dense_layers-1):
                linear = Linear(emb_size, dense_size)
                self.rel_mlps.append(linear)
            self.linears_rel.append(self.rel_mlps)
        
        self.linears_clf = nn.ModuleList()
        clflinear = Linear(self.num_relations*dense_size, dense_size)
        self.linears_clf.append(clflinear)
        for i in range(self.num_clf_layers - 2):
            clflinear = Linear(dense_size, dense_size)

            self.linears_clf.append(clflinear)
        clflinear = Linear(dense_size, num_classes)
        self.linears_clf.append(clflinear)

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()


    def forward(self, x_dict, edge_index_dict, transr=None):
        # TODO: Implement a function that takes the feature tensor x and
        # edge_index tensor adj_t and returns the output tensor as
        # shown in the figure.
        
        concatenated_embs = []
        for node_type in ['author', 'term']:
            conv = self.edge_conv_dict[node_type]
            
            relation = ('paper', 'to', node_type)
            reverse = (node_type, 'to', 'paper')

            single_edge_index_dict = {relation: edge_index_dict[relation].to(self.device),
                                        reverse: edge_index_dict[reverse].to(self.device)}
            
            single_x_dict = {relation[0]: x_dict[relation[0]].to(self.device), 
                            relation[2]: x_dict[relation[2]].to(self.device)}

            single_x_dict = conv(single_x_dict, single_edge_index_dict)
            concatenated_embs.append(single_x_dict['author'])

        linear_outputs = []
        #print("here")
        #print(len(concatenated_embs), len(self.linears_rel))
        for i, (x, mlp) in enumerate(zip(concatenated_embs, self.linears_rel)):
            #print(type(x))
            #print(x.size())
            #print(len(transr))
            #print(transr[0].size())
            if transr:
                print(x)
                x = torch.cat((x, transr[i]), 1)
            #print(x.size())
            for linear in mlp:
                #print("im in")
                #print(x.size())
                x = linear(x)
                x = self.relu(x)
                x = self.dropout(x)
            
            #print('linear output', x.size())
            linear_outputs.append(x)
        
        #print(len(linear_outputs))
        out = torch.cat(linear_outputs, dim=-1)
        #print('clf layer input', out.size())
        for i in range(len(self.linears_clf) - 1):
            out = self.linears_clf[i](out)
            out = self.relu(out)
            out = self.dropout(out)
        out = self.linears_clf[-1](out)
        #print('clf layer output', out.size())
        return out


model = SplitGCN(data.metadata(), 128, 200, num_classes, 3, 3, 0.2)
print(model)
del model
gc.collect()


SplitGCN(
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (conv): HeteroConv(num_relations=2)
  (linears_rel): ModuleList(
    (0): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
    (1): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
    (2): ModuleList(
      (0): Linear(128, 200, bias=True)
      (1): Linear(128, 200, bias=True)
      (2): Linear(128, 200, bias=True)
    )
  )
  (rel_mlps): ModuleList(
    (0): Linear(128, 200, bias=True)
    (1): Linear(128, 200, bias=True)
    (2): Linear(128, 200, bias=True)
  )
  (linears_clf): ModuleList(
    (0): Linear(600, 200, bias=True)
    (1): Linear(200, 200, bias=True)
    (2): Linear(200, 4, bias=True)
  )
)


1577

## Helper functions

In [21]:
def train(model, data, optimizer, loss_fn, target_entity, transr=None):
    # TODO: Implement a function that trains the model by 
    # using the given optimizer and loss_fn.

    model.train()
    optimizer.zero_grad()
    outputs = model(data.x_dict, data.edge_index_dict, transr)
    mask = data[target_entity].train_mask
    labels = data[target_entity].y[mask]
    loss = loss_fn(outputs[mask], labels)
    loss.backward()
    optimizer.step()

    return loss.item()

In [22]:
# Test function here
@torch.no_grad()
def test(model, data, target_entity, transr=None, save_model_results=False):
    # a function that tests the model by 
    # using the given split_idx and evaluator.
    model.eval()

    # The output of model on all data
    out = model(data.x_dict, data.edge_index_dict, transr)
    pred = out.argmax(dim=-1, keepdim=True)[:,0]
    
    accs = []
    f1scores = []
    for split in ['train_mask', 'val_mask', 'test_mask']:
        mask = data[target_entity][split]
        acc = (pred[mask] == data[target_entity].y[mask]).sum() / mask.sum()
        accs.append(float(acc))
        f1 = f1_score(data[target_entity].y[mask], pred[mask], average='macro')
        f1scores.append(float(f1))
        #print(pred[mask].size(), data['movie'].y[mask].size())
        #print(pred[mask].sum(), data['movie'].y[mask].sum())
        #print(mask.sum(), (pred[mask] == data['movie'].y[mask]).sum())

    if save_model_results:
      print ("Saving Model Predictions")

      data = {}
      data['y_pred'] = pred.view(-1).cpu().detach().numpy()

      df = pd.DataFrame(data=data)
      # Save locally as csv
      df.to_csv('imdb.csv', sep=',', index=False)


    return (accs, f1scores)

## Training & Evaluation

### Train & Evaluate with KG Embeddings

In [23]:
train_size = data[main_entity].train_mask.sum().item()
val_size = data[main_entity].val_mask.sum().item()
test_size = data[main_entity].test_mask.sum().item()
print("Train size:", train_size)
print("Valid size:", val_size)
print("Test size:", test_size)
print("Total size:", train_size + val_size + test_size)

Train size: 786
Valid size: 433
Test size: 2838
Total size: 4057


In [24]:
import copy

def run_train_eval(target_entity, data, kg_emb, seed, output_file):

    torch.manual_seed(seed)
    print('KG embedding model:', kg_emb, "Seed:", seed)

    transr_model = torch.load('Models_pykeen/' + kg_emb + '/trained_model.pkl')

    transr_emb_entity = transr_model.entity_representations[0](indices=None).detach().numpy()
    transr_emb_relation = transr_model.relation_representations[0](indices=None).detach().numpy()
    num_clf_nodes = data[data.node_types[0]].x.shape[0]
    embs = transr_emb_relation

    rel_embs_list = []
    for i in range (embs.shape[0]):
        rel_embs_list.append(torch.tensor(np.tile(embs[i], (num_clf_nodes, 1))))
    print('Number of relation embeddings:', len(rel_embs_list))
    print('Embeddings size', rel_embs_list[0].size())

    # Model Parameters
    emb_dim = 128
    hidden_dim = 128
    output_dim = num_classes
    num_rel_layers = 2
    num_clf_layers = 2
    p = 0.2

    model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_rel_layers, num_clf_layers, p, device, rel_embs_list)        
    # num_layers = 3
    # model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device, rel_embs_list)


    model, data = model.to(device), data.to(device)

    print(next(model.parameters()).device)

    # Reset model parameters
    # model.reset_parameters()

    # Define hyperparameters
    num_epochs = 300
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = F.cross_entropy

    best_model = None
    best_test_acc = 0
    best_test_f1 = 0

    for epoch in range(1, 1 + num_epochs):
        # train model
        loss = train(model, data, optimizer, loss_fn, target_entity, rel_embs_list)
        
        # evaluate model 
        (accuracy, f1) = test(model, data, target_entity, rel_embs_list)
        
        train_acc, valid_acc, test_acc = accuracy
        _, _, test_f1 = f1
        if test_f1 > best_test_f1:
            best_test_f1 = test_f1
            # best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {100 * train_acc:.2f}%, '
                f'Valid: {100 * valid_acc:.2f}% '
                f'Test: {100 * test_acc:.2f}%')
    
    with open(output_file, "a") as f:
        f.write(kg_emb + ' best test f1 score: ' + str(best_test_f1) + '\n')
    
    return model, best_test_f1

In [26]:
kg_emb_models = [] 
for model in os.listdir('./Models_pykeen'):
    if 'ep' in model and 'dblp' in model:
        kg_emb_models.append(model)
kg_emb_models

['dblp_TransR_ep_100_dim_200',
 'dblp_TransH_ep_100_dim_200',
 'dblp_ComplEx_ep_100_dim_200',
 'dblp_RotatE_ep_100_dim_200',
 'dblp_DistMult_ep_100_dim_200']

### Run Training

In [31]:
output_file = 'scores_sage_relu_dblp.txt'
for i, model in enumerate(kg_emb_models):
    test_acc_list = []
    for seed in range (5): 
        model_summary, acc = run_train_eval(main_entity, data, model, seed, output_file)
        test_acc_list.append(acc)
    avg_acc = sum(test_acc_list)/5

    if i==0 and seed==0:
        with open(output_file, "a") as f:
            f.write(model_summary)    
    
    with open(output_file, "a") as f:
        f.write('Average accuracy: ' + str(avg_acc) + '\n')
        

KG embedding model: dblp_TransR_ep_100_dim_200 Seed: 0
Number of relation embeddings: 2
Embeddings size torch.Size([4057, 200])
cpu
tensor([[ 0.0119, -0.0454,  0.1339,  ..., -0.0889, -0.0362,  0.0005],
        [ 0.1192,  0.0404,  0.0743,  ...,  0.2375,  0.1329,  0.0346],
        [-0.0386,  0.0194,  0.2685,  ...,  0.0118,  0.0783, -0.0123],
        ...,
        [-0.0230,  0.0302,  0.0034,  ...,  0.0028,  0.0186,  0.0046],
        [-0.0230,  0.0302,  0.0034,  ...,  0.0028,  0.0186,  0.0046],
        [ 0.0501, -0.0377,  0.1515,  ..., -0.0804, -0.0108, -0.0979]],
       grad_fn=<AddBackward0>)
[]


TypeError: expected Tensor as element 0 in argument 0, but got list

### Train & Evaluate without KG Embeddings

In [363]:
test_acc_list = []
for seed in range (5):
    torch.manual_seed(seed)

    # Model Parameters
    emb_dim = 128
    hidden_dim = 128
    output_dim = num_classes
    num_layers = 3

    model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device)

    model, data = model.to(device), data.to(device)

    print(next(model.parameters()).device)

    # Reset model parameters
    # model.reset_parameters()

    # Define hyperparameters
    num_epochs = 25
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = F.cross_entropy

    best_model = None
    best_test_acc = 0

    for epoch in range(1, 1 + num_epochs):
        # train model
        loss = train(model, data, optimizer, loss_fn)
        
        # evaluate model 
        result = test(model, data)
        
        train_acc, valid_acc, test_acc = result
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {100 * train_acc:.2f}%, '
                f'Valid: {100 * valid_acc:.2f}% '
                f'Test: {100 * test_acc:.2f}%')

    test_acc_list.append(best_test_acc)
    with open("scores_revised.txt", "a") as f:
        f.write( 'No KG Embeddings best test acc: ' + str(best_test_acc) + '\n')

avg_acc = sum(test_acc_list)/5
with open("scores_revised.txt", "a") as f:
    f.write('Average accuracy: ' + str(avg_acc) + '\n')

cpu
Epoch: 01, Loss: 1.1009, Train: 66.00%, Valid: 47.50% Test: 41.09%
Epoch: 02, Loss: 1.0828, Train: 73.75%, Valid: 51.50% Test: 43.62%
Epoch: 03, Loss: 1.0632, Train: 74.00%, Valid: 49.00% Test: 43.47%
Epoch: 04, Loss: 1.0405, Train: 74.75%, Valid: 49.00% Test: 43.21%
Epoch: 05, Loss: 1.0140, Train: 75.75%, Valid: 49.25% Test: 43.04%
Epoch: 06, Loss: 0.9834, Train: 77.00%, Valid: 49.25% Test: 43.13%
Epoch: 07, Loss: 0.9486, Train: 80.00%, Valid: 48.50% Test: 43.42%
Epoch: 08, Loss: 0.9097, Train: 83.25%, Valid: 49.50% Test: 43.65%
Epoch: 09, Loss: 0.8667, Train: 85.25%, Valid: 50.00% Test: 44.02%
Epoch: 10, Loss: 0.8199, Train: 88.50%, Valid: 50.00% Test: 44.62%
Epoch: 11, Loss: 0.7697, Train: 90.75%, Valid: 51.00% Test: 45.37%
Epoch: 12, Loss: 0.7166, Train: 92.00%, Valid: 51.50% Test: 46.89%
Epoch: 13, Loss: 0.6611, Train: 93.50%, Valid: 53.25% Test: 47.90%
Epoch: 14, Loss: 0.6040, Train: 94.50%, Valid: 54.00% Test: 48.79%
Epoch: 15, Loss: 0.5461, Train: 95.50%, Valid: 54.00% Test

### Train & Evaluate with look-up embedding

In [364]:
num_clf_nodes = data[data.node_types[0]].x.shape[0]
embs = [None, None]
embs[0] = torch.Tensor([0,1])
embs[1] = torch.Tensor([1,0])
rel_embs_list = []
for i in range (len(embs)):
    rel_embs_list.append(torch.tensor(np.tile(embs[i], (num_clf_nodes, 1))))
print('Number of relation embeddings:', len(rel_embs_list))

Number of relation embeddings: 2


In [365]:
test_acc_list = []
for seed in range (5):
    torch.manual_seed(seed)

    # Model Parameters
    emb_dim = 128
    hidden_dim = 128
    output_dim = num_classes
    num_layers = 3

    model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device, rel_embs_list)

    model, data = model.to(device), data.to(device)

    print(next(model.parameters()).device)

    # Reset model parameters
    # model.reset_parameters()

    # Define hyperparameters
    num_epochs = 30
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = F.cross_entropy

    best_model = None
    best_test_acc = 0

    for epoch in range(1, 1 + num_epochs):
        # train model
        loss = train(model, data, optimizer, loss_fn, rel_embs_list)
        
        # evaluate model 
        result = test(model, data, rel_embs_list)
        
        train_acc, valid_acc, test_acc = result
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {100 * train_acc:.2f}%, '
                f'Valid: {100 * valid_acc:.2f}% '
                f'Test: {100 * test_acc:.2f}%')

    test_acc_list.append(best_test_acc)
    with open("scores_revised.txt", "a") as f:
        f.write( 'Look-up Embedding best test acc: ' + str(best_test_acc) + '\n')

avg_acc = sum(test_acc_list)/5
with open("scores_revised.txt", "a") as f:
    f.write('Average accuracy: ' + str(avg_acc) + '\n')

cpu
Epoch: 01, Loss: 1.0966, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 02, Loss: 1.0775, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 03, Loss: 1.0572, Train: 42.00%, Valid: 40.00% Test: 36.49%
Epoch: 04, Loss: 1.0337, Train: 51.50%, Valid: 40.50% Test: 36.75%
Epoch: 05, Loss: 1.0065, Train: 62.50%, Valid: 42.50% Test: 37.98%
Epoch: 06, Loss: 0.9755, Train: 69.25%, Valid: 43.75% Test: 39.62%
Epoch: 07, Loss: 0.9403, Train: 72.25%, Valid: 45.75% Test: 41.69%
Epoch: 08, Loss: 0.9012, Train: 73.50%, Valid: 46.75% Test: 43.42%
Epoch: 09, Loss: 0.8580, Train: 76.50%, Valid: 47.75% Test: 44.45%
Epoch: 10, Loss: 0.8110, Train: 78.75%, Valid: 50.50% Test: 45.54%
Epoch: 11, Loss: 0.7606, Train: 82.50%, Valid: 51.75% Test: 45.86%
Epoch: 12, Loss: 0.7072, Train: 86.00%, Valid: 53.25% Test: 46.84%
Epoch: 13, Loss: 0.6516, Train: 89.25%, Valid: 53.75% Test: 47.67%
Epoch: 14, Loss: 0.5947, Train: 91.50%, Valid: 55.50% Test: 48.79%
Epoch: 15, Loss: 0.5373, Train: 93.75%, Valid: 54.75% Test