# Split Relations Model

## Import libraries

In [286]:
import os
import sys
import itertools
import gc

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

print('PyTroch Version', torch.__version__)
print('GPU Available:', torch.cuda.is_available())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

PyTroch Version 1.10.2
GPU Available: False


In [230]:
import pykeen
pykeen.env()

| Key             | Value                    |
|-----------------|--------------------------|
| OS              | posix                    |
| Platform        | Linux                    |
| Release         | 5.13.0-30-generic        |
| Time            | Sun Apr  3 22:30:15 2022 |
| Python          | 3.9.7                    |
| PyKEEN          | 1.7.0                    |
| PyKEEN Hash     | UNHASHED                 |
| PyKEEN Branch   |                          |
| PyTorch         | 1.10.2                   |
| CUDA Available? | false                    |
| CUDA Version    | N/A                      |
| cuDNN Version   | N/A                      |


## Import data

In [231]:
from torch_geometric.datasets import IMDB

dataset = IMDB(root='./data/imdb')
print('Dataset:', dataset)

print('Number of graphs:', len(dataset))

data = dataset[0]
print(data)

num_classes = len(data['movie'].y.unique())
print('Number of classes:', num_classes)
print('Classes:', data['movie'].y.unique())

Dataset: IMDB()
Number of graphs: 1
HeteroData(
  [1mmovie[0m={
    x=[4278, 3066],
    y=[4278],
    train_mask=[4278],
    val_mask=[4278],
    test_mask=[4278]
  },
  [1mdirector[0m={ x=[2081, 3066] },
  [1mactor[0m={ x=[5257, 3066] },
  [1m(movie, to, director)[0m={ edge_index=[2, 4278] },
  [1m(movie, to, actor)[0m={ edge_index=[2, 12828] },
  [1m(director, to, movie)[0m={ edge_index=[2, 4278] },
  [1m(actor, to, movie)[0m={ edge_index=[2, 12828] }
)
Number of classes: 3
Classes: tensor([0, 1, 2])


## Create TransR embeddings

### Creating triples

We want to create a (n,3)-tensor where each row will be a (node_id, relation_id, node_id) triple. We have 3 types of nodes: movie, director, actor and 4 types of realtions.

In [232]:
print('Ranges of each node type')
print('Movie:',0,'-',data['movie'].x.size()[0]-1)
print('Director:', data['movie'].x.size()[0], '-', data['movie'].x.size()[0]+data['director'].x.size()[0]-1)
print('Actor:', data['movie'].x.size()[0]+data['director'].x.size()[0], data['movie'].x.size()[0]+data['director'].x.size()[0]+data['actor'].x.size()[0]-1)

Ranges of each node type
Movie: 0 - 4277
Director: 4278 - 6358
Actor: 6359 11615


Reindex tails in `movie_to_director` and `movie_to_actor` relations

In [233]:
movie_size = data['movie'].x.size()[0]
director_size = data['director'].x.size()[0]
print(movie_size, director_size)
movie_size = data['movie'].x.size()[0]
director_size = data['director'].x.size()[0]
offset_director = torch.tensor([[0],[movie_size]])
offset_director = offset_director.tile(1, data[('movie', 'to', 'director')].edge_index.size()[1])
movie_to_director = data[('movie', 'to', 'director')].edge_index + offset_director
offset_actor = torch.tensor([[0],[movie_size + director_size]])
offset_actor = offset_actor.tile(1, data[('movie', 'to', 'actor')].edge_index.size()[1])
movie_to_actor = data[('movie', 'to', 'actor')].edge_index + offset_actor
print(movie_to_director.size(), movie_to_actor.size())

4278 2081
torch.Size([2, 4278]) torch.Size([2, 12828])


- `movie-to-actor`: 0
- `movie-to-director`: 1

In [234]:
pad = torch.zeros(movie_to_actor.size()[1])
movie_to_actor = torch.column_stack((movie_to_actor[0],pad,movie_to_actor[1]))
pad = torch.ones(movie_to_director.size()[1])
movie_to_director = torch.column_stack((movie_to_director[0],pad,movie_to_director[1]))
print(movie_to_director.size(), movie_to_actor.size())

torch.Size([4278, 3]) torch.Size([12828, 3])


In [235]:
triples = torch.concat((movie_to_director, movie_to_actor))
triples.size()

torch.Size([17106, 3])

In [236]:
triples

tensor([[0.0000e+00, 1.0000e+00, 5.0670e+03],
        [1.0000e+00, 1.0000e+00, 4.9580e+03],
        [2.0000e+00, 1.0000e+00, 6.0350e+03],
        ...,
        [4.2770e+03, 0.0000e+00, 6.4590e+03],
        [4.2770e+03, 0.0000e+00, 7.4370e+03],
        [4.2770e+03, 0.0000e+00, 7.7980e+03]])

In [237]:
entity_ids = [i for i in range (data.num_nodes)]
relation_ids = [0,1]

In [238]:
triples.long().type()

'torch.LongTensor'

In [270]:
models = ['TransR', 'TransH', 'RotatE', 'DistMult', 'ComplEx']

### Pipeline

In [298]:
from pykeen.triples import CoreTriplesFactory
from pykeen.pipeline import pipeline

def train_kg_emb_model(model, triples, data, entity_ids, relation_ids):
    # Load training data
    emb_dim = 200
    num_epochs = 50
    training = CoreTriplesFactory(mapped_triples=triples.long(), num_entities=data.num_nodes, 
                                    num_relations=2, create_inverse_triples=False,
                                    entity_ids=entity_ids, relation_ids=relation_ids)

    result = pipeline(
        training=training,
        testing=training,
        model=model,
        random_seed=42,
        model_kwargs={"embedding_dim":emb_dim},
        training_kwargs={"num_epochs":num_epochs},
        #stopper='early', # early stopping arguments. You need the validation set with this.
        #stopper_kwargs=dict(frequency=3, patience=3, relative_delta=0.002),
        #epochs=5,  # short epochs for testing - you should go higher
    )

    # Save mode to a directory. Yoy can load it afterwards
    result.save_to_directory('Models_pykeen/imdb_' + model + '_ep_' + str(num_epochs) + '_dim_' + str(emb_dim))
    return result

for model in models:
    result = train_kg_emb_model(model, triples, data, entity_ids, relation_ids)

Training epochs on cpu: 100%|██████████| 50/50 [02:04<00:00,  2.49s/epoch, loss=0.000898, prev_loss=0.00107]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 17.1k/17.1k [03:36<00:00, 79.2triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 216.11s seconds


## Create GNN

In [274]:
data.metadata()[1]

[('movie', 'to', 'director'),
 ('movie', 'to', 'actor'),
 ('director', 'to', 'movie'),
 ('actor', 'to', 'movie')]

In [287]:
import itertools
from torch_geometric.nn import Linear, HeteroConv, GCNConv, SAGEConv

class SplitGCN(torch.nn.Module):
    def __init__(self, metadata, emb_size, dense_size, out_size, num_layers, device='cpu', transr=None):
        # TODO: Implement a function that initializes self.convs, 
        # self.bns, and self.softmax.
        super(SplitGCN, self).__init__()

        self.num_relations = int(len(metadata[1])/2)
        self.device = device
        self.edge_conv_dict = {}
        self.num_layers = num_layers
        
        '''
        for edge_type in metadata[1]:
            self.convs = nn.ModuleList()
            for _ in range (num_layers):
                self.conv = HeteroConv({
                    edge_type: SAGEConv((-1,-1), emb_size)
                })
                self.convs.append(self.conv)
            self.edge_conv_dict[edge_type] = self.convs
        '''

        for node_type in ['actor', 'director']:
            self.conv = HeteroConv({
                ('movie', 'to', node_type): SAGEConv((-1,-1), emb_size),
                (node_type, 'to', 'movie'): SAGEConv((-1,-1), emb_size)
            })
            self.edge_conv_dict[node_type] = self.conv

        if transr is not None:
            self.transr = transr

        self.linears = nn.ModuleList()
        for i in range (self.num_relations):
            try:
                linear = Linear(emb_size + self.transr[i].size()[1], dense_size)
            except:
                linear = Linear(emb_size, dense_size)
            self.linears.append(linear)
        
        self.clflinear = Linear(self.num_relations*dense_size, out_size)


    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()


    def forward(self, x_dict, edge_index_dict, transr=None):
        # TODO: Implement a function that takes the feature tensor x and
        # edge_index tensor adj_t and returns the output tensor as
        # shown in the figure.
        '''
        concatenated_embs = []
        #print(len(self.convs))
        for relation, edge_tensor in edge_index_dict.items():
            # create one convolution subnetwork for each realtion
            convs = self.edge_conv_dict[relation]

            # isolate relations
            single_edge_index_dict = {relation: edge_tensor.to(self.device)}
            single_x_dict = {relation[0]: x_dict[relation[0]].to(self.device), 
                            relation[2]: x_dict[relation[2]].to(self.device)}

            # forward propagate data into each relation-specific GNN
            for i, conv in enumerate(convs):
                print(i,conv)
                print(single_x_dict.keys())
                print(single_edge_index_dict.keys())
                single_x_dict = conv(single_x_dict, single_edge_index_dict)
            concatenated_embs.append(single_x_dict['movie'])
        '''
        
        concatenated_embs = []
        for node_type in ['actor', 'director']:
            conv = self.edge_conv_dict[node_type]
            
            relation = ('movie', 'to', node_type)
            reverse = (node_type, 'to', 'movie')

            single_edge_index_dict = {relation: edge_index_dict[relation].to(self.device),
                                        reverse: edge_index_dict[reverse].to(self.device)}
            
            single_x_dict = {relation[0]: x_dict[relation[0]].to(self.device), 
                            relation[2]: x_dict[relation[2]].to(self.device)}

            single_x_dict = conv(single_x_dict, single_edge_index_dict)
            concatenated_embs.append(single_x_dict['movie'])

        linear_outputs = []
        for i, (x, linear) in enumerate(zip(concatenated_embs, self.linears)):
            #print(x.size())
            #print(transr.size())
            if transr:
                x = torch.cat((x, transr[i]), 1)
            x = linear(x)
            #print('linear output', x.size())
            linear_outputs.append(x)
        
        #print(len(linear_outputs))
        out = torch.cat(linear_outputs, dim=-1)
        #print('clf layer input', out.size())
        out = self.clflinear(out)
        #print('clf layer output', out.size())
        return out

'''
model = SplitGCN(data.metadata(), 256, 256, num_classes, 2, device=device, transr=rel_embs_list)
print(model)
del model
gc.collect()
'''

'\nmodel = SplitGCN(data.metadata(), 256, 256, num_classes, 2, device=device, transr=rel_embs_list)\nprint(model)\ndel model\ngc.collect()\n'

In [276]:
def train(model, data, optimizer, loss_fn, transr=None):
    # TODO: Implement a function that trains the model by 
    # using the given optimizer and loss_fn.

    model.train()
    optimizer.zero_grad()
    outputs = model(data.x_dict, data.edge_index_dict, transr)
    mask = data['movie'].train_mask
    labels = data['movie'].y[mask]
    loss = loss_fn(outputs[mask], labels)
    loss.backward()
    optimizer.step()

    return loss.item()

In [277]:
# Test function here
@torch.no_grad()
def test(model, data, transr=None, save_model_results=False):
    # a function that tests the model by 
    # using the given split_idx and evaluator.
    model.eval()

    # The output of model on all data
    out = model(data.x_dict, data.edge_index_dict, transr)
    pred = out.argmax(dim=-1, keepdim=True)[:,0]
    
    accs = []
    for split in ['train_mask', 'val_mask', 'test_mask']:
        mask = data['movie'][split]
        acc = (pred[mask] == data['movie'].y[mask]).sum() / mask.sum()
        accs.append(float(acc))
        #print(pred[mask].size(), data['movie'].y[mask].size())
        #print(pred[mask].sum(), data['movie'].y[mask].sum())
        #print(mask.sum(), (pred[mask] == data['movie'].y[mask]).sum())

    if save_model_results:
      print ("Saving Model Predictions")

      data = {}
      data['y_pred'] = pred.view(-1).cpu().detach().numpy()

      df = pd.DataFrame(data=data)
      # Save locally as csv
      df.to_csv('imdb.csv', sep=',', index=False)


    return accs

## Training & Evaluation

### Train & Evaluate with KG Embeddings

In [282]:
import copy

def run_train_eval(data, kg_emb):

    print('KG embedding model:', kg_emb)

    transr_model = torch.load('Models_pykeen/' + kg_emb + '/trained_model.pkl')

    transr_emb_entity = transr_model.entity_representations[0](indices=None).detach().numpy()
    transr_emb_relation = transr_model.relation_representations[0](indices=None).detach().numpy()
    num_clf_nodes = data[data.node_types[0]].x.shape[0]
    embs = transr_emb_relation

    rel_embs_list = []
    for i in range (embs.shape[0]):
        rel_embs_list.append(torch.tensor(np.tile(embs[i], (num_clf_nodes, 1))))
    print('Number of relation embeddings:', len(rel_embs_list))

    # Model Parameters
    emb_dim = 128
    hidden_dim = 128
    output_dim = num_classes
    num_layers = 3

    model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device, rel_embs_list)

    model, data = model.to(device), data.to(device)

    print(next(model.parameters()).device)

    # Reset model parameters
    # model.reset_parameters()

    # Define hyperparameters
    num_epochs = 30
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = F.cross_entropy

    best_model = None
    best_valid_acc = 0

    for epoch in range(1, 1 + num_epochs):
        # train model
        loss = train(model, data, optimizer, loss_fn, rel_embs_list)
        
        # evaluate model 
        result = test(model, data, rel_embs_list)
        
        train_acc, valid_acc, test_acc = result
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {100 * train_acc:.2f}%, '
                f'Valid: {100 * valid_acc:.2f}% '
                f'Test: {100 * test_acc:.2f}%')
    
    with open("scores.txt", "a") as f:
        f.write(kg_emb + ' best valid acc: ' + str(best_valid_acc) + '\n')

In [283]:
kg_emb_models = [] 
for model in os.listdir('./Models_pykeen'):
    if 'ep' in model:
        kg_emb_models.append(model)
kg_emb_models

['imdb_TransR_ep_50_dim_200',
 'imdb_TransH_ep_25_dim_200',
 'imdb_RotatE_ep_25_dim_200',
 'imdb_DistMult_ep_25_dim_200',
 'imdb_TransH_ep_50_dim_200',
 'imdb_TransR_ep_25_dim_200',
 'imdb_RotatE_ep_50_dim_200',
 'imdb_DistMult_ep_50_dim_200']

In [299]:
for model in kg_emb_models:
    run_train_eval(data, model)

cpu
Epoch: 01, Loss: 1.1074, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 02, Loss: 1.6132, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 03, Loss: 1.1192, Train: 35.25%, Valid: 39.50% Test: 36.23%
Epoch: 04, Loss: 1.1734, Train: 35.25%, Valid: 39.50% Test: 36.23%
Epoch: 05, Loss: 1.2742, Train: 36.50%, Valid: 22.75% Test: 28.72%
Epoch: 06, Loss: 1.2396, Train: 57.00%, Valid: 30.75% Test: 37.18%
Epoch: 07, Loss: 1.0959, Train: 73.50%, Valid: 46.25% Test: 41.55%
Epoch: 08, Loss: 0.9894, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 09, Loss: 1.0024, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 10, Loss: 1.0439, Train: 39.75%, Valid: 39.50% Test: 36.43%
Epoch: 11, Loss: 1.0188, Train: 41.50%, Valid: 39.50% Test: 36.46%
Epoch: 12, Loss: 0.9359, Train: 69.25%, Valid: 42.00% Test: 39.02%
Epoch: 13, Loss: 0.8500, Train: 92.00%, Valid: 51.75% Test: 48.13%
Epoch: 14, Loss: 0.7988, Train: 90.75%, Valid: 50.50% Test: 50.83%
Epoch: 15, Loss: 0.7824, Train: 81.50%, Valid: 42.50% Test

### Train & Evaluate without KG Embeddings

In [288]:
import copy

# Model Parameters
emb_dim = 128
hidden_dim = 128
output_dim = num_classes
num_layers = 3

model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device)

model, data = model.to(device), data.to(device)

print(next(model.parameters()).device)

# Reset model parameters
# model.reset_parameters()

# Define hyperparameters
num_epochs = 25
optimizer = torch.optim.Adam(model.parameters())
loss_fn = F.cross_entropy

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + num_epochs):
    # train model
    loss = train(model, data, optimizer, loss_fn)
    
    # evaluate model 
    result = test(model, data)
    
    train_acc, valid_acc, test_acc = result
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_model = copy.deepcopy(model)
    print(f'Epoch: {epoch:02d}, '
            f'Loss: {loss:.4f}, '
            f'Train: {100 * train_acc:.2f}%, '
            f'Valid: {100 * valid_acc:.2f}% '
            f'Test: {100 * test_acc:.2f}%')

cpu
Epoch: 01, Loss: 1.0969, Train: 62.25%, Valid: 46.50% Test: 41.14%
Epoch: 02, Loss: 1.0792, Train: 68.75%, Valid: 45.50% Test: 41.29%
Epoch: 03, Loss: 1.0598, Train: 70.50%, Valid: 46.75% Test: 41.75%
Epoch: 04, Loss: 1.0369, Train: 71.50%, Valid: 47.25% Test: 42.32%
Epoch: 05, Loss: 1.0100, Train: 73.00%, Valid: 47.75% Test: 42.58%
Epoch: 06, Loss: 0.9789, Train: 74.25%, Valid: 48.25% Test: 42.64%
Epoch: 07, Loss: 0.9435, Train: 75.25%, Valid: 48.50% Test: 43.36%
Epoch: 08, Loss: 0.9039, Train: 77.75%, Valid: 48.00% Test: 43.67%
Epoch: 09, Loss: 0.8602, Train: 80.75%, Valid: 48.50% Test: 44.13%
Epoch: 10, Loss: 0.8128, Train: 84.75%, Valid: 49.25% Test: 44.42%
Epoch: 11, Loss: 0.7619, Train: 87.50%, Valid: 50.50% Test: 45.26%
Epoch: 12, Loss: 0.7083, Train: 89.50%, Valid: 51.00% Test: 46.18%
Epoch: 13, Loss: 0.6526, Train: 93.00%, Valid: 52.50% Test: 47.38%
Epoch: 14, Loss: 0.5955, Train: 93.50%, Valid: 53.50% Test: 48.39%
Epoch: 15, Loss: 0.5379, Train: 94.25%, Valid: 54.50% Test

In [289]:
best_valid_acc

0.5899999737739563

### Train & Evaluate with look-up embedding

In [294]:
num_clf_nodes = data[data.node_types[0]].x.shape[0]
embs = [None, None]
embs[0] = torch.Tensor([0,1])
embs[1] = torch.Tensor([1,0])
rel_embs_list = []
for i in range (len(embs)):
    rel_embs_list.append(torch.tensor(np.tile(embs[i], (num_clf_nodes, 1))))
print('Number of relation embeddings:', len(rel_embs_list))

Number of relation embeddings: 2


In [296]:
import copy

# Model Parameters
emb_dim = 128
hidden_dim = 128
output_dim = num_classes
num_layers = 3

model = SplitGCN(data.metadata(), emb_dim, hidden_dim, output_dim, num_layers, device, rel_embs_list)

model, data = model.to(device), data.to(device)

print(next(model.parameters()).device)

# Reset model parameters
# model.reset_parameters()

# Define hyperparameters
num_epochs = 30
optimizer = torch.optim.Adam(model.parameters())
loss_fn = F.cross_entropy

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + num_epochs):
    # train model
    loss = train(model, data, optimizer, loss_fn, rel_embs_list)
    
    # evaluate model 
    result = test(model, data, rel_embs_list)
    
    train_acc, valid_acc, test_acc = result
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_model = copy.deepcopy(model)
    print(f'Epoch: {epoch:02d}, '
            f'Loss: {loss:.4f}, '
            f'Train: {100 * train_acc:.2f}%, '
            f'Valid: {100 * valid_acc:.2f}% '
            f'Test: {100 * test_acc:.2f}%')

cpu
Epoch: 01, Loss: 1.0855, Train: 44.75%, Valid: 40.50% Test: 36.72%
Epoch: 02, Loss: 1.0672, Train: 44.50%, Valid: 39.75% Test: 36.52%
Epoch: 03, Loss: 1.0472, Train: 48.25%, Valid: 40.25% Test: 36.69%
Epoch: 04, Loss: 1.0235, Train: 54.25%, Valid: 40.75% Test: 37.29%
Epoch: 05, Loss: 0.9955, Train: 63.00%, Valid: 41.75% Test: 38.04%
Epoch: 06, Loss: 0.9629, Train: 67.75%, Valid: 43.50% Test: 39.36%
Epoch: 07, Loss: 0.9255, Train: 70.75%, Valid: 46.50% Test: 41.17%
Epoch: 08, Loss: 0.8832, Train: 74.25%, Valid: 47.50% Test: 42.90%
Epoch: 09, Loss: 0.8362, Train: 77.25%, Valid: 49.00% Test: 44.16%
Epoch: 10, Loss: 0.7851, Train: 84.75%, Valid: 49.50% Test: 45.83%
Epoch: 11, Loss: 0.7305, Train: 89.00%, Valid: 52.50% Test: 47.12%
Epoch: 12, Loss: 0.6732, Train: 93.00%, Valid: 54.00% Test: 48.39%
Epoch: 13, Loss: 0.6142, Train: 94.50%, Valid: 55.25% Test: 50.37%
Epoch: 14, Loss: 0.5548, Train: 96.50%, Valid: 57.75% Test: 51.67%
Epoch: 15, Loss: 0.4959, Train: 97.00%, Valid: 58.75% Test