In [2]:
from ontolearn.knowledge_base import KnowledgeBase
from utils.dataloader import CSDataLoader
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

In [3]:
from argparse import Namespace
import json
import torch, pandas as pd
# Read the run settings from settings.json and expose each top-level key as an
# attribute (args.<key>) instead of a dictionary entry.
with open("settings.json") as setting:
    args = Namespace(**json.load(setting))

In [4]:
import numpy as np, time
from collections import defaultdict

In [5]:
def before_pad(arg):
    """Return the items of ``arg`` that come before the first 'PAD' entry.

    Args:
        arg: Iterable of tokens.

    Returns:
        A new list with the leading items of ``arg``, stopping at (and
        excluding) the first item equal to 'PAD'. If no item equals 'PAD',
        every item is returned.
    """
    kept = []
    for token in arg:
        if token == 'PAD':
            # Everything from the first padding token onwards is discarded.
            return kept
        kept.append(token)
    return kept

In [6]:
def map_to_token(model, idx_array):
    """Index ``model.inv_vocab`` with ``idx_array`` and return the result.

    Args:
        model: Object exposing an ``inv_vocab`` attribute that supports
            indexing.
        idx_array: Index (or array of indices) to look up in ``inv_vocab``.

    Returns:
        ``model.inv_vocab[idx_array]``.
    """
    vocabulary = model.inv_vocab
    return vocabulary[idx_array]

In [7]:
def collate_batch(batch):
    """Collate ``(pos_emb, neg_emb, label)`` samples into padded batch tensors.

    Args:
        batch: Iterable of 3-tuples ``(pos_emb, neg_emb, label)`` of tensors.
            Within each position the tensors may differ in their first
            dimension; ``pad_sequence`` pads them to the longest one.

    Returns:
        A tuple ``(pos_emb_list, neg_emb_list, target_labels)`` of batch-first
        tensors. The embeddings are padded with 0 and the labels with -100.
    """
    pos_emb_list = []
    neg_emb_list = []
    target_labels = []
    for pos_emb, neg_emb, label in batch:
        pos_emb_list.append(pos_emb)
        neg_emb_list.append(neg_emb)
        target_labels.append(label)
    pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0)
    neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0)
    # Labels use a distinct padding value (-100) so padded positions can be
    # told apart from real label indices.
    target_labels = pad_sequence(target_labels, batch_first=True, padding_value=-100)
    return pos_emb_list, neg_emb_list, target_labels

In [8]:
def get_data(kb, embeddings, kwargs):
    """Build the test dataloader for the knowledge base ``kb``.

    Reads ``datasets/{kb}/Test_data/Data.json``, wraps its items in a
    ``CSDataLoader`` dataset and prints how many learning problems it holds.

    Args:
        kb: Knowledge base name, used to build the data path.
        embeddings: Embeddings passed through to ``CSDataLoader``.
        kwargs: Settings object; ``batch_size`` and ``num_workers`` are read
            here, and the whole object is forwarded to ``CSDataLoader``.

    Returns:
        A ``DataLoader`` over the test dataset, unshuffled and collated with
        ``collate_batch``.
    """
    with open(f"datasets/{kb}/Test_data/Data.json", "r") as file:
        learning_problems = list(json.load(file).items())
    test_dataset = CSDataLoader(learning_problems, embeddings, kwargs)
    print("Number of learning problems: ", len(test_dataset))
    return DataLoader(
        test_dataset,
        batch_size=kwargs.batch_size,
        num_workers=kwargs.num_workers,
        collate_fn=collate_batch,
        shuffle=False,
    )

In [9]:
def get_ensemble_prediction(models, x1, x2):
    """Average the scores of several models and decode the ensemble prediction.

    Each model is switched to evaluation mode and called as ``model(x1, x2)``;
    the second element of its return value is taken as its scores.

    Args:
        models: Non-empty sized collection of models. Each must provide
            ``eval()``, be callable as ``model(x1, x2)`` and expose
            ``inv_vocab``.
        x1: First input passed to every model.
        x2: Second input passed to every model.

    Returns:
        A tuple ``(prediction, scores)``: ``scores`` is the element-wise mean
        of the per-model scores, and ``prediction`` is ``inv_vocab`` indexed
        by ``scores.argmax(1)``.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if len(models) == 0:
        # Previously this fell through to an UnboundLocalError on `scores`.
        raise ValueError("get_ensemble_prediction requires at least one model")
    scores = None
    for model in models:
        model.eval()
        _, model_scores = model(x1, x2)
        scores = model_scores if scores is None else scores + model_scores
    scores = scores / len(models)
    # The vocabulary is taken from the last model in the collection.
    # NOTE(review): assumes all models share the same inv_vocab and that
    # dimension 1 of the scores is the vocabulary axis; confirm.
    prediction = model.inv_vocab[scores.argmax(1)]
    return prediction, scores

In [10]:
# Knowledge base whose test learning problems are loaded below.
kb = "carcinogenesis"
# Entity embeddings for this knowledge base, indexed by the CSV's unnamed
# first column.
embeddings = pd.read_csv(
    f"embeddings/{kb}/ConEx_entity_embeddings.csv"
).set_index('Unnamed: 0')
# Optional override of the batch size read from settings.json:
#args.batch_size = 4
args.knowledge_base_path = f"datasets/{kb}/{kb}.owl"
dataloader = get_data(kb=kb, embeddings=embeddings, kwargs=args)



Number of learning problems:  98


In [11]:
# Take only the first batch from the test dataloader. The third element of the
# batch (the padded target labels returned by collate_batch) is not used below.
pos_emb, neg_emb, _ = next(iter(dataloader))

In [39]:
def predict_single(model_name):
    """Load one saved model for the current knowledge base and run it on the sample batch.

    Reads the notebook-level names ``kb``, ``pos_emb`` and ``neg_emb``.

    Args:
        model_name: File stem of the checkpoint under
            ``datasets/{kb}/Model_weights/`` (e.g. "SetTransformer").

    Returns:
        Whatever the loaded model returns for ``model(pos_emb, neg_emb)``; the
        notebook unpacks it as ``(predictions, scores)``.
    """
    # NOTE(security): torch.load unpickles the file and can execute arbitrary
    # code; only load checkpoints from a trusted source.
    model = torch.load(f"datasets/{kb}/Model_weights/{model_name}.pt", map_location=torch.device('cpu'))
    # Switch to evaluation mode, as get_ensemble_prediction does, so layers
    # such as dropout do not run in training mode during inference.
    model.eval()
    print(f"Predictions with {model_name}")
    return model(pos_emb, neg_emb)

## View some predictions

### Single model

In [40]:
# Run the saved SetTransformer model on the sample batch (pos_emb, neg_emb).
predictions, scores = predict_single("SetTransformer")

Predictions with SetTransformer


In [41]:
# Show the first entry of the single-model predictions.
predictions[0]

array(['Bond-2', ' ', '⊔', ' ', 'Di67a', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [42]:
# Show the second entry of the single-model predictions.
predictions[1]

array(['Iodine', ' ', '⊔', ' ', '(', '∃', ' ', 'inBond', '.', '(',
       'Carbon-17', ' ', '⊔', ' ', 'Fluorine-92', ')', ')', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

### Ensemble

In [28]:
def predict_ensemble(model_names):
    """Load several saved models and return their ensemble prediction on the sample batch.

    Reads the notebook-level names ``kb``, ``pos_emb`` and ``neg_emb``.

    Args:
        model_names: Iterable of checkpoint file stems under
            ``datasets/{kb}/Model_weights/``.

    Returns:
        The ``(prediction, scores)`` tuple produced by
        ``get_ensemble_prediction`` for the loaded models.
    """
    cpu = torch.device('cpu')
    models = []
    for name in model_names:
        checkpoint_path = f"datasets/{kb}/Model_weights/{name}.pt"
        # NOTE(security): torch.load unpickles the file; only load trusted
        # checkpoints.
        models.append(torch.load(checkpoint_path, map_location=cpu))
    print("Predictions with Ensemble model")
    return get_ensemble_prediction(models, pos_emb, neg_emb)

In [29]:
# Ensemble of SetTransformer and GRU. This rebinds the notebook-level name
# `scores` that the single-model cell also assigns.
predictions_ens, scores = predict_ensemble(["SetTransformer", "GRU"])

Predictions with Ensemble model


In [26]:
# Show the first entry of the current ensemble predictions.
# NOTE(review): this cell's execution count (26) is lower than that of the
# cell that assigns predictions_ens above (29); re-run the notebook top to
# bottom to confirm the stored output matches.
predictions_ens[0]

array(['Bond-2', ' ', '⊔', ' ', 'Di67a', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [27]:
# Show the second entry of the current ensemble predictions.
predictions_ens[1]

array(['Iodine', ' ', '⊔', ' ', '(', '∃', ' ', 'inBond', '.', '(',
       'Carbon-17', ' ', '⊔', ' ', 'Manganese', ')', ')', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [30]:
# Ensemble of LSTM and GRU; overwrites predictions_ens and scores.
predictions_ens, scores = predict_ensemble(["LSTM", "GRU"])

Predictions with Ensemble model


In [31]:
# Show the first entry of the current ensemble predictions.
predictions_ens[0]

array(['Bond-2', ' ', '⊔', ' ', 'Lead-121', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [32]:
# Show the second entry of the current ensemble predictions.
predictions_ens[1]

array(['Iodine', ' ', '⊔', ' ', '(', '∃', ' ', 'inBond', '.', '(',
       'Carbon-19', ' ', '⊔', ' ', 'Manganese', ')', ')', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [33]:
# Ensemble of LSTM and SetTransformer; overwrites predictions_ens and scores.
predictions_ens, scores = predict_ensemble(["LSTM", "SetTransformer"])

Predictions with Ensemble model


In [34]:
# Show the first entry of the current ensemble predictions.
predictions_ens[0]

array(['Bond-2', ' ', '⊔', ' ', 'Di67a', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [35]:
# Show the second entry of the current ensemble predictions.
predictions_ens[1]

array(['Iodine', ' ', '⊔', ' ', '(', '∃', ' ', 'inBond', '.', '(',
       'Carbon-17', ' ', '⊔', ' ', 'Fluorine', ')', ')', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [36]:
# Ensemble of all three models (LSTM, GRU, SetTransformer); overwrites
# predictions_ens and scores.
predictions_ens, scores = predict_ensemble(["LSTM", "GRU", "SetTransformer"])

Predictions with Ensemble model


In [37]:
# Show the first entry of the current ensemble predictions.
predictions_ens[0]

array(['Bond-2', ' ', '⊔', ' ', 'Di67a', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)

In [38]:
# Show the second entry of the current ensemble predictions.
predictions_ens[1]

array(['Iodine', ' ', '⊔', ' ', '(', '∃', ' ', 'inBond', '.', '(',
       'Carbon-17', ' ', '⊔', ' ', 'Manganese', ')', ')', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',
       'PAD', 'PAD', 'PAD', 'PAD'], dtype=object)