## Evaluate NCES

In [1]:
import os, random
from utils.syntax_checker import SyntaxChecker
from utils.evaluator import Evaluator
from ontolearn.knowledge_base import KnowledgeBase
from nces import BaseConceptSynthesis
from nces.synthesizer import ConceptSynthesizer
from utils.data import Data
from owlapy.parser import DLSyntaxParser
from dataloader import CSDataLoader
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [2]:
from argparse import Namespace
import json
import torch, pandas as pd
# Load the run configuration from settings.json and expose every key as an
# attribute of ``args``.
with open("settings.json") as setting:
    args = Namespace(**json.load(setting))

In [3]:
import numpy as np, time
from collections import defaultdict

In [4]:
def before_pad(arg):
    """Return the items of ``arg`` that come before the first 'PAD'.

    Iteration stops at the first element equal to 'PAD'; that element and
    everything after it are dropped. When no 'PAD' occurs, all items are
    returned. The result is always a new list.
    """
    kept = []
    for token in arg:
        if token == 'PAD':
            return kept
        kept.append(token)
    return kept

In [5]:
def compute_accuracy(prediction, target):
    """Compute the mean soft and hard token accuracy, both in percent.

    Every prediction/target pair is first reduced to the tokens that precede
    the first 'PAD' (see ``before_pad``). A string input is tokenised with
    ``BaseConceptSynthesis.decompose``; anything else is treated as an
    iterable of tokens.

    * soft accuracy: Jaccard similarity of the two token sets.
    * hard accuracy: number of positions holding the same token, divided by
      the length of the longer sequence.

    Args:
        prediction: iterable of predicted expressions (strings or token
            sequences).
        target: sized iterable of target expressions, paired positionally
            with ``prediction``.

    Returns:
        ``(soft_acc, hard_acc)``, each averaged over ``len(target)``.
        ``(0.0, 0.0)`` is returned when ``target`` is empty.
    """
    def tokens_of(expression):
        # Shared normalisation: tokenise strings, then cut at the first 'PAD'.
        if isinstance(expression, str):
            expression = BaseConceptSynthesis.decompose(expression)
        return before_pad(expression)

    def soft(arg1, arg2):
        tokens1, tokens2 = set(tokens_of(arg1)), set(tokens_of(arg2))
        union = tokens1 | tokens2
        if not union:
            # Two empty expressions are identical; avoid dividing by zero.
            return 100.0
        return 100*float(len(tokens1 & tokens2))/len(union)

    def hard(arg1, arg2):
        tokens1, tokens2 = tokens_of(arg1), tokens_of(arg2)
        longest = max(len(tokens1), len(tokens2))
        if longest == 0:
            # Two empty expressions are identical; avoid dividing by zero.
            return 100.0
        # zip stops at the shorter sequence; the missing positions of the
        # longer one count as mismatches through the denominator.
        matches = sum(x == y for x, y in zip(tokens1, tokens2))
        return 100*float(matches)/longest

    # len() rather than truthiness: ``target`` may be a numpy array.
    num_targets = len(target)
    if num_targets == 0:
        return 0.0, 0.0
    soft_acc = sum(map(soft, prediction, target))/num_targets
    hard_acc = sum(map(hard, prediction, target))/num_targets
    return soft_acc, hard_acc

In [6]:
def sample_examples(pos, neg, num_examples):
    """Draw up to ``num_examples`` examples, split between positives and negatives.

    When both pools hold at least ``num_examples // 2`` items the split is
    (almost) even, with the extra example of an odd ``num_examples`` taken
    from the larger pool. Otherwise the smaller pool is used completely and
    the remainder is requested from the other pool.

    The requested counts are capped at the pool sizes, so fewer than
    ``num_examples`` examples are returned when the pools are too small
    (previously this raised ``ValueError`` in ``random.sample``, or
    ``UnboundLocalError`` when both pools were equally small).

    Args:
        pos: sequence of positive examples.
        neg: sequence of negative examples.
        num_examples: total number of examples wanted.

    Returns:
        ``(positive, negative)``: two lists sampled without replacement.
    """
    half = num_examples//2
    if min(len(neg), len(pos)) >= half:
        if len(pos) > len(neg):
            num_neg_ex = half
            num_pos_ex = num_examples-num_neg_ex
        else:
            num_pos_ex = half
            num_neg_ex = num_examples-num_pos_ex
    elif len(pos) > len(neg):
        num_neg_ex = len(neg)
        num_pos_ex = num_examples-num_neg_ex
    else:
        # Covers len(pos) < len(neg) and the previously unhandled tie.
        num_pos_ex = len(pos)
        num_neg_ex = num_examples-num_pos_ex
    # Never ask random.sample for more items than a pool contains.
    num_pos_ex = min(num_pos_ex, len(pos))
    num_neg_ex = min(num_neg_ex, len(neg))
    positive = random.sample(pos, num_pos_ex)
    negative = random.sample(neg, num_neg_ex)
    return positive, negative

In [7]:
def map_to_token(model, idx_array):
    """Index ``model.inv_vocab`` with ``idx_array`` and return the result."""
    inverse_vocabulary = model.inv_vocab
    return inverse_vocabulary[idx_array]

In [8]:
def collate_batch(batch):
    """Collate ``(pos_emb, neg_emb, label)`` triples into padded batch tensors.

    Positive and negative embeddings are padded with 0 and labels with -100,
    each along the first dimension of the individual items, and stacked with
    the batch dimension first.

    Returns:
        ``(pos_batch, neg_batch, label_batch)`` as produced by ``pad_sequence``.
    """
    positives, negatives, labels = [], [], []
    for triple in batch:
        pos_emb, neg_emb, label = triple
        positives.append(pos_emb)
        negatives.append(neg_emb)
        labels.append(label)
    padded_pos = pad_sequence(positives, batch_first=True, padding_value=0)
    padded_neg = pad_sequence(negatives, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    return padded_pos, padded_neg, padded_labels

In [9]:
def get_data(kb, embeddings, kwargs):
    """Build the test ``DataLoader`` for knowledge base ``kb``.

    Reads ``datasets/{kb}/Test_data/Data.json``, turns its top-level entries
    into a list of ``(key, value)`` pairs and wraps them, together with
    ``embeddings`` and ``kwargs``, in a ``CSDataLoader`` dataset.

    Args:
        kb: knowledge base name (sub-directory of ``datasets/``).
        embeddings: embeddings object handed to ``CSDataLoader``.
        kwargs: settings object; ``batch_size`` and ``num_workers`` are read
            here and the whole object is forwarded to ``CSDataLoader``.

    Returns:
        A non-shuffling ``DataLoader`` that batches with ``collate_batch``.
    """
    with open(f"datasets/{kb}/Test_data/Data.json", "r") as file:
        learning_problems = list(json.load(file).items())
    test_dataset = CSDataLoader(learning_problems, embeddings, kwargs)
    print("Number of learning problems: ", len(test_dataset))
    return DataLoader(
        test_dataset,
        batch_size=kwargs.batch_size,
        num_workers=kwargs.num_workers,
        collate_fn=collate_batch,
        shuffle=False,
    )

In [10]:
def get_ensemble_prediction(models, x1, x2):
    """Predict tokens from the averaged scores of several models.

    Each model is switched to eval mode and called as ``model(x1, x2)``; the
    second element of its return value is taken as the score tensor. The
    scores are averaged, the argmax over dimension 1 is taken, and the
    resulting indices are looked up in the ``inv_vocab`` of the last model
    (all models are assumed to share one vocabulary — TODO confirm).

    Args:
        models: non-empty sized collection of models.
        x1: first model input.
        x2: second model input.

    Returns:
        ``inv_vocab`` indexed by the argmax of the mean scores.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("get_ensemble_prediction requires at least one model")
    scores = None
    vocab_model = None
    # Pure inference: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for model in models:
            model.eval()
            _, model_scores = model(x1, x2)
            scores = model_scores if scores is None else scores + model_scores
            vocab_model = model
    scores = scores/len(models)
    prediction = vocab_model.inv_vocab[scores.argmax(1)]
    return prediction

In [11]:
def predict_class_expressions(model_name, kb, args, ensemble=False):
    """Run a model (or the three-model ensemble) over the test set of ``kb``.

    Loads the ConEx entity embeddings and the test data loader for ``kb``,
    loads the stored model(s) on CPU, predicts a token sequence for every
    learning problem and prints the average soft/hard syntactic accuracy.

    Side effect: sets ``args.knowledge_base_path`` to the ontology path.

    Args:
        model_name: file stem of the model under
            ``datasets/{kb}/Model_weights``; ignored when ``ensemble`` is true.
        kb: knowledge base name.
        args: settings namespace forwarded to ``get_data``.
        ensemble: when true, average the scores of the SetTransformer, GRU
            and LSTM models instead of using a single model.

    Returns:
        ``(predictions, targets)``: per-batch token arrays concatenated along
        axis 0.

    Raises:
        ValueError: if the data loader yields no learning problem.
    """
    args.knowledge_base_path = "datasets/"+f"{kb}/{kb}.owl"
    embeddings = pd.read_csv(f"embeddings/{kb}/ConEx_entity_embeddings.csv").set_index('Unnamed: 0')
    dataloader = get_data(kb, embeddings, args)
    # NOTE: torch.load unpickles whole model objects; only load trusted files.
    if ensemble:
        models = [torch.load(f"datasets/{kb}/Model_weights/{name}.pt", map_location=torch.device('cpu'))
                  for name in ["SetTransformer", "GRU", "LSTM"]]
        # The first ensemble member supplies the vocabulary for the targets.
        model = models[0]
    else:
        model = torch.load(f"datasets/{kb}/Model_weights/{model_name}.pt", map_location=torch.device('cpu'))
    model.eval()
    soft_total, hard_total, num_problems = 0.0, 0.0, 0
    preds = []
    targets = []
    # Pure inference: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for x1, x2, labels in tqdm(dataloader):
            target_sequence = map_to_token(model, labels)
            if ensemble:
                pred_sequence = get_ensemble_prediction(models, x1, x2)
            else:
                pred_sequence, _ = model(x1, x2)
            preds.append(pred_sequence)
            targets.append(target_sequence)
            s_acc, h_acc = compute_accuracy(pred_sequence, target_sequence)
            # compute_accuracy returns per-batch means; weight them by the
            # batch size so that a smaller final batch is not over-weighted.
            batch_size = len(target_sequence)
            soft_total += s_acc*batch_size
            hard_total += h_acc*batch_size
            num_problems += batch_size
    if num_problems == 0:
        raise ValueError(f"No learning problems found for knowledge base {kb}")
    print(f"Average syntactic accuracy, Soft: {soft_total/num_problems}%, Hard: {hard_total/num_problems}%")
    return np.concatenate(preds, 0), np.concatenate(targets, 0)

In [12]:
def evaluate_ensemble(kb_name, args, verbose=False):
    """Evaluate the SetTransformer/GRU/LSTM ensemble on the test set of ``kb_name``.

    Predictions come from ``predict_class_expressions(..., ensemble=True)``.
    Each predicted token sequence is cut at its first 'PAD', parsed into a
    class expression (with a parenthesis repair and a ``SyntaxChecker``
    fallback) and scored with ``Evaluator.evaluate`` against the instances of
    the target expression. Problems whose prediction cannot be parsed, or
    whose evaluation raises ``NotImplementedError``, are reported and skipped.

    The metrics are printed and written to
    ``datasets/{kb_name}/Results/NCES_Ensemble.json``.

    Args:
        kb_name: knowledge base name (sub-directory of ``datasets/``).
        args: settings namespace forwarded to ``predict_class_expressions``.
        verbose: when true, print target, prediction and scores per problem.

    Returns:
        ``{"Ensemble": metrics}`` where ``metrics`` maps 'acc', 'prediction',
        'f1' and 'time' to a dict with a 'values' list (plus one-element
        'mean' and 'std' lists for the numeric metrics).
    """
    def parse_prediction(pred):
        # Turn a predicted token array into a class expression; returns None
        # (after printing a message) when nothing parseable can be derived.
        tokens = pred.tolist()
        num_open = (pred == '(').sum()
        num_close = (pred == ')').sum()
        if num_open > num_close:
            # One ')' is missing: try every position, right-most first.
            candidates = (tokens[:k] + [')'] + tokens[k:] for k in range(len(tokens), -1, -1))
        elif num_close > num_open:
            # One '(' is missing: try every position, left-most first.
            candidates = (tokens[:k] + ['('] + tokens[k:] for k in range(len(tokens)))
        else:
            candidates = (tokens,)
        # list.insert() returns None, so the candidate lists are built by
        # slicing instead of joining the result of insert().
        for candidate in candidates:
            try:
                return dl_parser.parse_expression("".join(candidate))
            except Exception:
                continue
        try:
            corrected = syntax_checker.correct("".join(tokens))
            corrected = list(syntax_checker.get_suggestions(corrected))[-1]
            return syntax_checker.get_concept(corrected)
        except Exception:
            print(f"Could not understand expression {pred}")
            return None

    print('#'*50)
    print('NCES evaluation on {} KB:'.format(kb_name))
    print('#'*50)
    All_metrics = {"Ensemble": defaultdict(lambda: defaultdict(list))}
    print()
    kb = KnowledgeBase(path=f"datasets/{kb_name}/{kb_name}.owl")
    namespace = kb.ontology()._onto.base_iri
    # For these two KBs the namespace is set explicitly instead of using the
    # ontology's base IRI.
    if kb_name == 'family-benchmark':
        namespace = 'http://www.benchmark.org/family#'
    if kb_name == 'vicodi':
        namespace = 'http://vicodi.org/ontology#'
    print("KB namespace: ", namespace)
    print()
    syntax_checker = SyntaxChecker(kb)
    evaluator = Evaluator(kb)
    dl_parser = DLSyntaxParser(namespace = namespace)
    All_individuals = set(kb.individuals())

    t0 = time.time()
    predictions, targets = predict_class_expressions("Ensemble", kb_name, args, ensemble=True)
    t1 = time.time()
    # Average wall-clock time per learning problem (includes model/data loading).
    duration = (t1-t0)/len(predictions)
    print()
    print(f"## Ensemble ##")
    print()
    for i, pb_str in enumerate(targets):
        pb_str = "".join(before_pad(pb_str))
        try:
            end_idx = np.where(predictions[i] == 'PAD')[0][0] # remove padding token
        except IndexError:
            # NOTE(review): without any 'PAD' only the first token is kept;
            # confirm this is intended rather than keeping the full sequence.
            end_idx = 1
        pred = predictions[i][:end_idx]
        prediction = parse_prediction(pred)
        if prediction is None:
            continue
        target_expression = dl_parser.parse_expression(pb_str) # The target class expression
        try:
            positive_examples = {ind.get_iri().as_str().split("/")[-1] for ind in kb.individuals(target_expression)}
            # NOTE(review): positive_examples holds strings while
            # All_individuals holds whatever kb.individuals() yields; verify
            # that this difference really removes the positives.
            negative_examples = All_individuals-positive_examples
            acc, f1 = evaluator.evaluate(prediction, positive_examples, negative_examples)
        except NotImplementedError:
            print("Invalid target or predicted expression, skipping")
            continue
        rendered = syntax_checker.renderer.render(prediction)
        if verbose:
            print(f'Problem {i}, Target: {pb_str}, Prediction: {rendered}, Acc: {acc}, F1: {f1}')
            print()
        All_metrics["Ensemble"]['acc']['values'].append(acc)
        All_metrics["Ensemble"]['prediction']['values'].append(rendered)
        All_metrics["Ensemble"]['f1']['values'].append(f1)
        All_metrics["Ensemble"]['time']['values'].append(duration)

    metrics = All_metrics["Ensemble"]
    if metrics:
        for metric in metrics:
            if metric != 'prediction':
                metrics[metric]['mean'] = [np.mean(metrics[metric]['values'])]
                metrics[metric]['std'] = [np.std(metrics[metric]['values'])]
        for label, metric, unit in (('Speed', 'time', 's'), ('Avg Acc', 'acc', '%'), ('Avg F1', 'f1', '%')):
            mean = round(metrics[metric]['mean'][0], 2)
            std = round(metrics[metric]['std'][0], 2)
            print(f"Ensemble {label}: {mean}{unit} +- {std} / lp")
    else:
        # Every problem was skipped: there is nothing to summarise.
        print("Ensemble: no learning problem could be evaluated")

    print()

    with open("datasets/"+kb_name+"/Results/NCES_Ensemble.json", "w") as file:
        json.dump(All_metrics, file, indent=3, ensure_ascii=False)
    return All_metrics

In [13]:
def evaluate_nces(kb_name, models, args, verbose=False):
    """Evaluate each model in ``models`` separately on the test set of ``kb_name``.

    For every model, predictions come from ``predict_class_expressions``.
    Each predicted token sequence is cut at its first 'PAD', parsed into a
    class expression (with a parenthesis repair and a ``SyntaxChecker``
    fallback) and scored with ``Evaluator.evaluate`` against the instances of
    the target expression. Problems whose prediction cannot be parsed, or
    whose evaluation or rendering raises, are reported and skipped.

    After each model the metrics collected so far are written to
    ``datasets/{kb_name}/Results/NCES.json``, or ``NCES_shuffle.json`` when
    ``args.shuffle_examples`` is true.

    Args:
        kb_name: knowledge base name (sub-directory of ``datasets/``).
        models: iterable of model names (file stems of the stored models).
        args: settings namespace; ``shuffle_examples`` is read here and the
            namespace is forwarded to ``predict_class_expressions``.
        verbose: when true, print target, prediction and scores per problem.

    Returns:
        Dict mapping each model name to its metrics: 'acc', 'prediction',
        'f1' and 'time', each a dict with a 'values' list (plus one-element
        'mean' and 'std' lists for the numeric metrics).
    """
    def parse_prediction(pred):
        # Turn a predicted token array into a class expression; returns None
        # (after printing a message) when nothing parseable can be derived.
        tokens = pred.tolist()
        num_open = (pred == '(').sum()
        num_close = (pred == ')').sum()
        if num_open > num_close:
            # One ')' is missing: try every position, right-most first.
            candidates = (tokens[:k] + [')'] + tokens[k:] for k in range(len(tokens), -1, -1))
        elif num_close > num_open:
            # One '(' is missing: try every position, left-most first.
            candidates = (tokens[:k] + ['('] + tokens[k:] for k in range(len(tokens)))
        else:
            candidates = (tokens,)
        # list.insert() returns None, so the candidate lists are built by
        # slicing instead of joining the result of insert().
        for candidate in candidates:
            try:
                return dl_parser.parse_expression("".join(candidate))
            except Exception:
                continue
        try:
            corrected = syntax_checker.correct("".join(tokens))
            corrected = list(syntax_checker.get_suggestions(corrected))[-1]
            return syntax_checker.get_concept(corrected)
        except Exception:
            print(f"Could not understand expression {pred}")
            return None

    print('#'*50)
    print('NCES evaluation on {} KB:'.format(kb_name))
    print('#'*50)
    desc = ""
    if args.shuffle_examples:
        desc = "_shuffle"
    All_metrics = {m: defaultdict(lambda: defaultdict(list)) for m in models}
    print()
    kb = KnowledgeBase(path=f"datasets/{kb_name}/{kb_name}.owl")
    namespace = kb.ontology()._onto.base_iri
    # For these two KBs the namespace is set explicitly instead of using the
    # ontology's base IRI.
    if kb_name == 'family-benchmark':
        namespace = 'http://www.benchmark.org/family#'
    if kb_name == 'vicodi':
        namespace = 'http://vicodi.org/ontology#'
    print("KB namespace: ", namespace)
    print()
    syntax_checker = SyntaxChecker(kb)
    evaluator = Evaluator(kb)
    dl_parser = DLSyntaxParser(namespace = namespace)
    All_individuals = set(kb.individuals())
    for model_name in models:
        t0 = time.time()
        predictions, targets = predict_class_expressions(model_name, kb_name, args)
        t1 = time.time()
        # Average wall-clock time per learning problem (includes model/data loading).
        duration = (t1-t0)/len(predictions)
        print()
        print(f"##{model_name}##")
        print()
        for i, pb_str in enumerate(targets):
            pb_str = "".join(before_pad(pb_str))
            try:
                end_idx = np.where(predictions[i] == 'PAD')[0][0] # remove padding token
            except IndexError:
                # NOTE(review): without any 'PAD' only the first token is
                # kept; confirm this is intended rather than keeping the
                # full sequence.
                end_idx = 1
            pred = predictions[i][:end_idx]
            prediction = parse_prediction(pred)
            if prediction is None:
                continue
            target_expression = dl_parser.parse_expression(pb_str) # The target class expression
            try:
                positive_examples = {ind.get_iri().as_str().split("/")[-1] for ind in kb.individuals(target_expression)}
                # NOTE(review): positive_examples holds strings while
                # All_individuals holds whatever kb.individuals() yields;
                # verify that this difference really removes the positives.
                negative_examples = All_individuals-positive_examples
                acc, f1 = evaluator.evaluate(prediction, positive_examples, negative_examples)
                # Rendered inside the try so that an unrenderable prediction
                # is skipped like any other invalid expression.
                rendered = syntax_checker.renderer.render(prediction)
            except Exception:
                print("Invalid target or predicted expression, skipping")
                continue
            if verbose:
                print(f'Problem {i}, Target: {pb_str}, Prediction: {rendered}, Acc: {acc}, F1: {f1}')
                print()
            All_metrics[model_name]['acc']['values'].append(acc)
            All_metrics[model_name]['prediction']['values'].append(rendered)
            All_metrics[model_name]['f1']['values'].append(f1)
            All_metrics[model_name]['time']['values'].append(duration)

        metrics = All_metrics[model_name]
        if metrics:
            for metric in metrics:
                if metric != 'prediction':
                    metrics[metric]['mean'] = [np.mean(metrics[metric]['values'])]
                    metrics[metric]['std'] = [np.std(metrics[metric]['values'])]
            for label, metric, unit in (('Speed', 'time', 's'), ('Avg Acc', 'acc', '%'), ('Avg F1', 'f1', '%')):
                mean = round(metrics[metric]['mean'][0], 2)
                std = round(metrics[metric]['std'][0], 2)
                print(f"{model_name} {label}: {mean}{unit} +- {std} / lp")
        else:
            # Every problem was skipped: there is nothing to summarise.
            print(f"{model_name}: no learning problem could be evaluated")
        print()

        # Written after every model so that partial results survive a later failure.
        with open("datasets/"+kb_name+"/Results/NCES"+desc+".json", "w") as file:
            json.dump(All_metrics, file, indent=3, ensure_ascii=False)
    return All_metrics

# Carcinogenesis

In [14]:
# Evaluate SetTransformer, GRU and LSTM one by one on the carcinogenesis KB,
# with args.shuffle_examples as loaded from settings.json.
F1_car = evaluate_nces("carcinogenesis", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on carcinogenesis KB:
##################################################





KB namespace:  http://dl-learner.org/carcinogenesis#

Number of learning problems:  98


100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


Average syntactic accuracy, Soft: 83.99277026828048%, Hard: 88.87130609819683%

##SetTransformer##

SetTransformer Speed: 0.04s +- 0.0 / lp
SetTransformer Avg Acc: 99.54% +- 2.94 / lp
SetTransformer Avg F1: 87.41% +- 25.34 / lp

Number of learning problems:  98


100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Average syntactic accuracy, Soft: 80.9322820037106%, Hard: 84.612996713837%

##GRU##

GRU Speed: 0.09s +- 0.0 / lp
GRU Avg Acc: 99.81% +- 0.63 / lp
GRU Avg F1: 87.02% +- 25.11 / lp

Number of learning problems:  98


100%|██████████| 1/1 [00:08<00:00,  8.11s/it]


Average syntactic accuracy, Soft: 74.26252319109464%, Hard: 81.5938496610765%

##LSTM##

LSTM Speed: 0.11s +- 0.0 / lp
LSTM Avg Acc: 99.69% +- 0.78 / lp
LSTM Avg F1: 82.46% +- 25.41 / lp



### After shuffling examples

In [15]:
# Turn on example shuffling for the following run; evaluate_nces then writes
# its results to NCES_shuffle.json. (How the data loader shuffles is not
# visible here.)
args.shuffle_examples = True

In [16]:
# Repeat the per-model carcinogenesis evaluation with args.shuffle_examples set to True.
F1_car_shuffle = evaluate_nces("carcinogenesis", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on carcinogenesis KB:
##################################################

KB namespace:  http://dl-learner.org/carcinogenesis#

Number of learning problems:  98


100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


Average syntactic accuracy, Soft: 83.99277026828048%, Hard: 88.87130609819683%

##SetTransformer##

SetTransformer Speed: 0.05s +- 0.0 / lp
SetTransformer Avg Acc: 99.54% +- 2.94 / lp
SetTransformer Avg F1: 87.41% +- 25.34 / lp

Number of learning problems:  98


100%|██████████| 1/1 [00:09<00:00,  9.18s/it]


Average syntactic accuracy, Soft: 82.22170686456401%, Hard: 86.21763857057972%

##GRU##

GRU Speed: 0.12s +- 0.0 / lp
GRU Avg Acc: 99.77% +- 0.65 / lp
GRU Avg F1: 86.22% +- 26.58 / lp

Number of learning problems:  98


100%|██████████| 1/1 [00:09<00:00,  9.44s/it]


Average syntactic accuracy, Soft: 73.32405689548548%, Hard: 81.96183018662006%

##LSTM##

LSTM Speed: 0.12s +- 0.0 / lp
LSTM Avg Acc: 99.6% +- 1.05 / lp
LSTM Avg F1: 79.76% +- 29.83 / lp



### Model ensembling

In [17]:
# Turn example shuffling off again before the ensemble run.
args.shuffle_examples = False

In [18]:
# Evaluate the three-model ensemble on carcinogenesis; results go to NCES_Ensemble.json.
F1_car_ensemble = evaluate_ensemble("carcinogenesis", args, verbose=False)

##################################################
NCES evaluation on carcinogenesis KB:
##################################################

KB namespace:  http://dl-learner.org/carcinogenesis#

Number of learning problems:  98


100%|██████████| 1/1 [00:10<00:00, 10.12s/it]


Average syntactic accuracy, Soft: 89.04298082869512%, Hard: 91.59430438842202%

## Ensemble ##

Ensemble Speed: 0.13s +- 0.0 / lp
Ensemble Avg Acc: 99.93% +- 0.31 / lp
Ensemble Avg F1: 96.13% +- 10.65 / lp



# Mutagenesis

In [19]:
# Evaluate SetTransformer, GRU and LSTM one by one on the mutagenesis KB (shuffling is off here).
F1_mut = evaluate_nces("mutagenesis", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on mutagenesis KB:
##################################################

KB namespace:  http://dl-learner.org/mutagenesis#

Number of learning problems:  22


100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://dl-learner.org/mutagenesis#','Bond-2')))


Average syntactic accuracy, Soft: 77.1659779614325%, Hard: 81.58861340679523%

##SetTransformer##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
SetTransformer Speed: 0.17s +- 0.0 / lp
SetTransformer Avg Acc: 99.9% +- 0.19 / lp
SetTransformer Avg F1: 87.88% +- 26.34 / lp

Number of learning problems:  22


100%|██████████| 1/1 [00:03<00:00,  3.34s/it]


Average syntactic accuracy, Soft: 64.56709956709956%, Hard: 71.95018365472912%

##GRU##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
GRU Speed: 0.23s +- 0.0 / lp
GRU Avg Acc: 99.12% +- 2.66 / lp
GRU Avg F1: 60.64% +- 45.42 / lp

Number of learning problems:  22


100%|██████████| 1/1 [00:03<00:00,  3.53s/it]


Average syntactic accuracy, Soft: 60.954348681621404%, Hard: 68.00177095631642%

##LSTM##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
LSTM Speed: 0.24s +- 0.0 / lp
LSTM Avg Acc: 99.94% +- 0.1 / lp
LSTM Avg F1: 78.64% +- 39.35 / lp



### After shuffling examples

In [20]:
# Turn on example shuffling for the following run; evaluate_nces then writes
# its results to NCES_shuffle.json.
args.shuffle_examples = True

In [21]:
# Repeat the per-model mutagenesis evaluation with args.shuffle_examples set to True.
F1_mut_shuffle = evaluate_nces("mutagenesis", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on mutagenesis KB:
##################################################

KB namespace:  http://dl-learner.org/mutagenesis#

Number of learning problems:  22


100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://dl-learner.org/mutagenesis#','Bond-2')))


Average syntactic accuracy, Soft: 77.1659779614325%, Hard: 81.58861340679523%

##SetTransformer##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
SetTransformer Speed: 0.18s +- 0.0 / lp
SetTransformer Avg Acc: 99.9% +- 0.19 / lp
SetTransformer Avg F1: 87.88% +- 26.34 / lp

Number of learning problems:  22


100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


Average syntactic accuracy, Soft: 67.94765840220386%, Hard: 68.56978879706152%

##GRU##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
GRU Speed: 0.27s +- 0.0 / lp
GRU Avg Acc: 99.85% +- 0.38 / lp
GRU Avg F1: 75.93% +- 36.45 / lp

Number of learning problems:  22


100%|██████████| 1/1 [00:03<00:00,  3.42s/it]


Average syntactic accuracy, Soft: 59.63154269972452%, Hard: 66.86179981634528%

##LSTM##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
LSTM Speed: 0.23s +- 0.0 / lp
LSTM Avg Acc: 99.86% +- 0.38 / lp
LSTM Avg F1: 80.16% +- 35.68 / lp



### Model ensembling

In [22]:
# Turn example shuffling off again before the ensemble run.
args.shuffle_examples = False

In [23]:
# Evaluate the three-model ensemble on mutagenesis; results go to NCES_Ensemble.json.
F1_mut_ensemble = evaluate_ensemble("mutagenesis", args, verbose=False)

##################################################
NCES evaluation on mutagenesis KB:
##################################################

KB namespace:  http://dl-learner.org/mutagenesis#

Number of learning problems:  22


100%|██████████| 1/1 [00:06<00:00,  6.57s/it]
Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://dl-learner.org/mutagenesis#','Bond-2')))


Average syntactic accuracy, Soft: 77.12121212121211%, Hard: 81.5082644628099%

## Ensemble ##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Ensemble Speed: 0.39s +- 0.0 / lp
Ensemble Avg Acc: 99.96% +- 0.07 / lp
Ensemble Avg F1: 89.87% +- 27.7 / lp



# Family Benchmark

In [24]:
# Evaluate SetTransformer, GRU and LSTM one by one on the family-benchmark KB (shuffling is off here).
F1_fam = evaluate_nces("family-benchmark", ["SetTransformer", "GRU", "LSTM"], args, verbose=False)

##################################################
NCES evaluation on family-benchmark KB:
##################################################

KB namespace:  http://www.benchmark.org/family#

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


Average syntactic accuracy, Soft: 67.02903577903575%, Hard: 67.36635434979762%

##SetTransformer##

Invalid target or predicted expression, skipping
SetTransformer Speed: 0.03s +- 0.0 / lp
SetTransformer Avg Acc: 89.46% +- 7.25 / lp
SetTransformer Avg F1: 71.73% +- 20.01 / lp

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Average syntactic accuracy, Soft: 65.01602564102564%, Hard: 58.60224446549995%

##GRU##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
GRU Speed: 0.03s +- 0.0 / lp
GRU Avg Acc: 90.39% +- 7.91 / lp
GRU Avg F1: 75.68% +- 19.38 / lp

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


Average syntactic accuracy, Soft: 65.34623362748363%, Hard: 63.765963126576914%

##LSTM##

Invalid target or predicted expression, skipping
LSTM Speed: 0.02s +- 0.0 / lp
LSTM Avg Acc: 86.43% +- 19.56 / lp
LSTM Avg F1: 70.99% +- 26.35 / lp



### After shuffling examples

In [25]:
# Turn on example shuffling for the following run; evaluate_nces then writes
# its results to NCES_shuffle.json.
args.shuffle_examples = True

In [26]:
# Repeat the per-model family-benchmark evaluation with args.shuffle_examples set to True.
F1_fam_shuffle = evaluate_nces("family-benchmark", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on family-benchmark KB:
##################################################

KB namespace:  http://www.benchmark.org/family#

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.38s/it]


Average syntactic accuracy, Soft: 67.02903577903575%, Hard: 67.36635434979762%

##SetTransformer##

Invalid target or predicted expression, skipping
SetTransformer Speed: 0.03s +- 0.0 / lp
SetTransformer Avg Acc: 89.46% +- 7.25 / lp
SetTransformer Avg F1: 71.73% +- 20.01 / lp

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


Average syntactic accuracy, Soft: 65.864396020646%, Hard: 59.77977887639349%

##GRU##

Invalid target or predicted expression, skipping
GRU Speed: 0.02s +- 0.0 / lp
GRU Avg Acc: 88.02% +- 15.31 / lp
GRU Avg F1: 72.86% +- 24.45 / lp

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.35s/it]

Average syntactic accuracy, Soft: 64.38368055555553%, Hard: 62.222663549389914%

##LSTM##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
LSTM Speed: 0.03s +- 0.0 / lp
LSTM Avg Acc: 89.31% +- 14.74 / lp
LSTM Avg F1: 73.84% +- 24.74 / lp






### Model ensembling

In [27]:
# Turn example shuffling off again before the ensemble run.
args.shuffle_examples = False

In [28]:
# Evaluate the three-model ensemble on family-benchmark; results go to NCES_Ensemble.json.
F1_fam_ensemble = evaluate_ensemble("family-benchmark", args, verbose=False)

##################################################
NCES evaluation on family-benchmark KB:
##################################################

KB namespace:  http://www.benchmark.org/family#

Number of learning problems:  48


100%|██████████| 1/1 [00:01<00:00,  1.88s/it]


Average syntactic accuracy, Soft: 70.31446273633775%, Hard: 68.9711133684919%

## Ensemble ##

Invalid target or predicted expression, skipping
Ensemble Speed: 0.04s +- 0.0 / lp
Ensemble Avg Acc: 91.89% +- 7.22 / lp
Ensemble Avg F1: 78.06% +- 20.68 / lp



# Semantic Bible

In [29]:
# Evaluate SetTransformer, GRU and LSTM one by one on the semantic_bible KB
# without shuffling. Stored under its own name so that the shuffled run
# further down does not overwrite these results.
F1_semb = evaluate_nces("semantic_bible", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on semantic_bible KB:
##################################################

KB namespace:  http://semanticbible.org/ns/2006/NTNames#

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.31s/it]


Average syntactic accuracy, Soft: 65.36630036630036%, Hard: 77.55838932309521%

##SetTransformer##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
SetTransformer Speed: 0.06s +- 0.0 / lp
SetTransformer Avg Acc: 98.83% +- 1.09 / lp
SetTransformer Avg F1: 78.77% +- 21.43 / lp

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


Average syntactic accuracy, Soft: 66.06449106449107%, Hard: 68.71954842543079%

##GRU##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
GRU Speed: 0.09s +- 0.0 / lp
GRU Avg Acc: 97.31% +- 6.7 / lp
GRU Avg F1: 79.82% +- 28.13 / lp

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.81s/it]

Average syntactic accuracy, Soft: 67.26807380653534%, Hard: 70.95756900104729%

##LSTM##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
LSTM Speed: 0.08s +- 0.0 / lp
LSTM Avg Acc: 95.29% +- 10.98 / lp
LSTM Avg F1: 78.23% +- 30.29 / lp






### After shuffling examples

In [30]:
# Turn on example shuffling for the following run; evaluate_nces then writes
# its results to NCES_shuffle.json.
args.shuffle_examples = True

In [31]:
# Repeat the per-model semantic_bible evaluation with args.shuffle_examples set to True.
F1_semb_shuffle = evaluate_nces("semantic_bible", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on semantic_bible KB:
##################################################

KB namespace:  http://semanticbible.org/ns/2006/NTNames#

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.56s/it]


Average syntactic accuracy, Soft: 65.36630036630036%, Hard: 77.55838932309521%

##SetTransformer##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
SetTransformer Speed: 0.07s +- 0.0 / lp
SetTransformer Avg Acc: 98.83% +- 1.09 / lp
SetTransformer Avg F1: 78.77% +- 21.43 / lp

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.87s/it]


Average syntactic accuracy, Soft: 67.96037296037296%, Hard: 76.70186023127201%

##GRU##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
GRU Speed: 0.08s +- 0.0 / lp
GRU Avg Acc: 94.98% +- 10.13 / lp
GRU Avg F1: 75.17% +- 28.5 / lp

Number of learning problems:  26


100%|██████████| 1/1 [00:01<00:00,  1.74s/it]

Average syntactic accuracy, Soft: 58.97935397935397%, Hard: 63.218267875018455%

##LSTM##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
LSTM Speed: 0.08s +- 0.0 / lp
LSTM Avg Acc: 93.74% +- 12.21 / lp
LSTM Avg F1: 68.86% +- 32.98 / lp






### Model ensembling

In [32]:
# Switch shuffling back off before the ensemble evaluation in the next cell.
# The flag lives on the shared `args` namespace, so it stays False for every
# later cell until it is explicitly set to True again.
args.shuffle_examples = False

In [33]:
# Ensemble evaluation on the semantic_bible KB (shuffle_examples is False here).
# The result is stored under the "semb" prefix used by the other semantic_bible
# cells; the previous "F1_mut_ensemble" name was a copy-paste leftover that did
# not match the KB being evaluated and could clobber another result of that name.
F1_semb_ensemble = evaluate_ensemble("semantic_bible", args, verbose=False)

##################################################
NCES evaluation on semantic_bible KB:
##################################################

KB namespace:  http://semanticbible.org/ns/2006/NTNames#

Number of learning problems:  26


100%|██████████| 1/1 [00:03<00:00,  3.15s/it]


Average syntactic accuracy, Soft: 72.22610722610723%, Hard: 77.68773709950183%

## Ensemble ##

Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Invalid target or predicted expression, skipping
Ensemble Speed: 0.13s +- 0.0 / lp
Ensemble Avg Acc: 97.67% +- 6.7 / lp
Ensemble Avg F1: 84.83% +- 22.84 / lp



# Vicodi

In [34]:
# Baseline Vicodi evaluation: args.shuffle_examples is still False at this point
# (last set in cell In [32]), so this is the *unshuffled* run. It is stored as
# F1_vic rather than F1_vic_shuffle, because the later "After shuffling examples"
# cell assigns F1_vic_shuffle and would otherwise overwrite this result.
F1_vic = evaluate_nces("vicodi", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on vicodi KB:
##################################################

KB namespace:  http://vicodi.org/ontology#

Number of learning problems:  157


100%|██████████| 1/1 [00:03<00:00,  3.91s/it]


Average syntactic accuracy, Soft: 86.07894228913334%, Hard: 89.61791008625025%

##SetTransformer##



Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://vicodi.org/ontology#','Illness')))


SetTransformer Speed: 0.05s +- 0.0 / lp
SetTransformer Avg Acc: 99.89% +- 0.46 / lp
SetTransformer Avg F1: 89.33% +- 23.1 / lp

Number of learning problems:  157


100%|██████████| 1/1 [00:15<00:00, 15.19s/it]


Average syntactic accuracy, Soft: 78.61069751197141%, Hard: 85.87574887125275%

##GRU##

GRU Speed: 0.13s +- 0.0 / lp
GRU Avg Acc: 99.86% +- 0.53 / lp
GRU Avg F1: 86.94% +- 26.93 / lp

Number of learning problems:  157


100%|██████████| 1/1 [00:15<00:00, 15.07s/it]


Average syntactic accuracy, Soft: 73.6817658632945%, Hard: 81.62208446397271%

##LSTM##

LSTM Speed: 0.13s +- 0.0 / lp
LSTM Avg Acc: 99.89% +- 0.31 / lp
LSTM Avg F1: 84.0% +- 28.05 / lp



### After shuffling examples

In [35]:
# Turn on the shuffle_examples flag on the shared `args` namespace for the Vicodi
# re-run in the next cell.
# NOTE(review): presumably read inside evaluate_nces — confirm against its definition.
args.shuffle_examples = True

In [36]:
# Evaluate the three NCES architectures on the vicodi KB while
# args.shuffle_examples is True (set in the preceding cell).
F1_vic_shuffle = evaluate_nces("vicodi", ["SetTransformer", "GRU", "LSTM"], args)

##################################################
NCES evaluation on vicodi KB:
##################################################

KB namespace:  http://vicodi.org/ontology#

Number of learning problems:  157


100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Average syntactic accuracy, Soft: 86.07894228913334%, Hard: 89.61791008625025%

##SetTransformer##



Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://vicodi.org/ontology#','Illness')))


SetTransformer Speed: 0.06s +- 0.0 / lp
SetTransformer Avg Acc: 99.89% +- 0.46 / lp
SetTransformer Avg F1: 89.33% +- 23.1 / lp

Number of learning problems:  157


100%|██████████| 1/1 [00:13<00:00, 13.77s/it]


Average syntactic accuracy, Soft: 81.37928419775552%, Hard: 85.17549038530674%

##GRU##

GRU Speed: 0.12s +- 0.0 / lp
GRU Avg Acc: 99.91% +- 0.31 / lp
GRU Avg F1: 88.78% +- 25.2 / lp

Number of learning problems:  157


100%|██████████| 1/1 [00:13<00:00, 13.17s/it]


Average syntactic accuracy, Soft: 72.95332586733859%, Hard: 81.41845582085365%

##LSTM##

LSTM Speed: 0.11s +- 0.0 / lp
LSTM Avg Acc: 99.89% +- 0.34 / lp
LSTM Avg F1: 85.95% +- 23.99 / lp



### Model ensembling

In [37]:
# Switch shuffling back off before the Vicodi ensemble evaluation in the next cell.
args.shuffle_examples = False

In [38]:
# Ensemble evaluation on the vicodi KB with args.shuffle_examples set to False
# (preceding cell); verbose=False is passed through to evaluate_ensemble.
F1_vic_ensemble = evaluate_ensemble("vicodi", args, verbose=False)

##################################################
NCES evaluation on vicodi KB:
##################################################

KB namespace:  http://vicodi.org/ontology#

Number of learning problems:  157


100%|██████████| 1/1 [00:13<00:00, 13.73s/it]


Average syntactic accuracy, Soft: 88.00231615518238%, Hard: 90.20052529794002%

## Ensemble ##



Object Complement Of not implemented at OWLObjectComplementOf(OWLClass(IRI('http://vicodi.org/ontology#','Illness')))


Ensemble Speed: 0.12s +- 0.0 / lp
Ensemble Avg Acc: 99.96% +- 0.15 / lp
Ensemble Avg F1: 95.53% +- 14.86 / lp

