## How often does ROCES outperform the state of the art without requiring all examples to be used?

In [1]:
import os, random
from utils.simple_solution import SimpleSolution
from utils.evaluator import Evaluator
from utils.data import Data
from ontolearn.knowledge_base import KnowledgeBase
from owlapy.render import DLSyntaxObjectRenderer
from roces import BaseConceptSynthesis
from roces.synthesizer import ConceptSynthesizer
from owlapy.parser import DLSyntaxParser
from utils.dataset import DatasetNoLabel
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from tqdm import tqdm
import json
import torch
import numpy as np, time
from collections import defaultdict
import re
import copy
from argparse import Namespace

In [98]:
def build_roces_vocabulary(data_train, data_test, kb, args):
    """Build the token vocabulary used by the ROCES concept synthesizer.

    The vocabulary is the sorted set of: rendered atomic concept names,
    object- and data-property names, description-logic syntax tokens, the
    literal values found inside ``[...]`` restrictions of the train/test
    class expressions, and the cardinalities ``1``..``11``. The ``'PAD'``
    token is appended last.

    Args:
        data_train: List of ``(class_expression, learning_problem)`` pairs.
        data_test: List of pairs with the same layout as ``data_train``.
        kb: Knowledge base exposing ``ontology()`` and ``individuals_count()``.
        args: Namespace providing ``num_examples``.

    Returns:
        ``(vocab, num_examples)``: the token list (ending with ``'PAD'``) and
        ``args.num_examples`` capped at half the number of individuals in ``kb``.
    """
    # Raw string: "\[" in a normal string literal is an invalid escape sequence.
    bracket_pattern = re.compile(r"\[(.*?)\]")

    def add_data_values():
        """Collect the last space-separated token of every ``[...]`` group."""
        print("\n*** Finding relevant data values ***")
        values = set()
        for ce, _ in data_train + data_test:
            for val in bracket_pattern.findall(ce):
                values.add(val.split(' ')[-1])
        print("*** Done! ***\n")
        print("Added values: ", values)
        print()
        return list(values)

    renderer = DLSyntaxObjectRenderer()
    ontology = kb.ontology()
    atomic_concept_names = [renderer.render(a) for a in ontology.classes_in_signature()]
    role_names = [rel.get_iri().get_remainder() for rel in ontology.object_properties_in_signature()] + \
                 [rel.get_iri().get_remainder() for rel in ontology.data_properties_in_signature()]
    syntax_tokens = ['⊔', '⊓', '∃', '∀', '¬', '⊤', '⊥', '.', ' ', '(', ')',
                     '⁻', '≤', '≥', 'True', 'False', '{', '}', ':', '[', ']',
                     'double', 'integer', 'date', 'xsd']
    quantified_restriction_values = [str(i) for i in range(1, 12)]
    data_values = add_data_values()
    vocab = atomic_concept_names + role_names + syntax_tokens + data_values + quantified_restriction_values
    # 'PAD' is appended after sorting so that it is always the last entry.
    vocab = sorted(set(vocab)) + ['PAD']
    print("Vocabulary size: ", len(vocab))
    num_examples = min(args.num_examples, kb.individuals_count() // 2)
    return vocab, num_examples


def before_pad(arg):
    """Return the items of ``arg`` that come before the first ``'PAD'`` token.

    All items are returned (as a new list) when ``arg`` contains no ``'PAD'``.
    """
    prefix = []
    stream = iter(arg)
    # 'PAD' doubles as the exhaustion sentinel, so both stop conditions
    # (padding reached, input consumed) end the loop the same way.
    token = next(stream, 'PAD')
    while token != 'PAD':
        prefix.append(token)
        token = next(stream, 'PAD')
    return prefix

# Number of rows the first embedding matrix of a batch is padded to.
# `predict` overwrites this module-level value with the model's own setting.
num_examples = 1000


def collate_batch(batch):
    """Collate ``(pos_emb, neg_emb, label)`` triples into padded batch tensors.

    Embeddings that are not 2-D are reshaped to a single row. The first
    positive and the first negative embedding are padded with zero rows up to
    the module-level ``num_examples`` before ``pad_sequence`` pads the rest of
    the batch to the longest item. Labels are padded with ``-100``.

    Returns:
        ``(pos_emb_batch, neg_emb_batch, target_labels)``, all batch-first.
    """
    def _as_matrix(emb):
        # Anything that is not already 2-D becomes one row.
        return emb if emb.ndim == 2 else emb.reshape(1, -1)

    def _pad_and_stack(matrices):
        head = matrices[0]
        rows_missing = num_examples - head.shape[0]
        matrices[0] = F.pad(head, (0, 0, 0, rows_missing), "constant", 0)
        return pad_sequence(matrices, batch_first=True, padding_value=0)

    pos_rows, neg_rows, labels = [], [], []
    for pos, neg, lab in batch:
        pos_rows.append(_as_matrix(pos))
        neg_rows.append(_as_matrix(neg))
        labels.append(lab)

    pos_batch = _pad_and_stack(pos_rows)
    neg_batch = _pad_and_stack(neg_rows)
    label_batch = pad_sequence(labels, batch_first=True, padding_value=-100)
    return pos_batch, neg_batch, label_batch

def predict(kb, positives, negatives, models, embedding_models, args):
    """Predict a token sequence for one learning problem with a model ensemble.

    Every ``(model, embedding_model)`` pair scores the same single learning
    problem; the scores are averaged over the ensemble and decoded with the
    inverse vocabulary of the first model.

    Side effects: sets ``args.path_to_triples`` and overwrites the module-level
    ``num_examples`` with ``models[0].num_examples``.

    Args:
        kb: Knowledge-base name; used to build ``datasets/{kb}/Triples/``.
        positives: Positive examples of the learning problem.
        negatives: Negative examples of the learning problem.
        models: Sequence of synthesizer models (must be indexable).
        embedding_models: Embedding models, aligned one-to-one with ``models``.
        args: Namespace providing ``batch_size`` and ``num_workers``; it is
            also passed to ``Data``.

    Returns:
        The decoded prediction for the (only) learning problem, i.e. the first
        row of ``inv_vocab[avg_scores.argmax(1)]``.

    Raises:
        ValueError: If ``models`` and ``embedding_models`` differ in length.
    """
    global num_examples
    if len(models) != len(embedding_models):
        # zip() would silently drop the surplus while the average below still
        # divides by len(models).
        raise ValueError("models and embedding_models must have the same length")
    args.path_to_triples = f"datasets/{kb}/Triples/"
    num_examples = models[0].num_examples
    inv_vocab = models[0].inv_vocab
    kb_embedding_data = Data(args)
    k = max(len(positives), len(negatives))
    test_dataset = DatasetNoLabel([("dummy_key", {"positive examples": positives, "negative examples": negatives})], kb_embedding_data, k) #data, triples_data, k
    cum_scores = None
    for model, embedding_model in zip(models, embedding_models):
        model = model.eval()
        test_dataset.load_embeddings(embedding_model.eval())
        test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False)
        scores = []
        # Inference only: no autograd graph needs to be recorded.
        with torch.no_grad():
            for x1, x2 in tqdm(test_dataloader):
                _, sc = model(x1, x2)
                scores.append(sc.detach())
        scores = torch.cat(scores, 0)
        cum_scores = scores if cum_scores is None else cum_scores + scores
    avg_scores = cum_scores / len(models)
    pred_sequence = inv_vocab[avg_scores.argmax(1)]
    return pred_sequence[0]


def initialize_synthesizer(vocab, num_examples, num_inds, args):
    """Create a refreshed ``ConceptSynthesizer`` and return its two models.

    ``num_inds`` is stored on ``args`` (as ``args.num_inds``) before the
    synthesizer is constructed.

    Returns:
        ``(model, embedding_model)`` taken from the synthesizer.
    """
    setattr(args, "num_inds", num_inds)
    synthesizer = ConceptSynthesizer(vocab, num_examples, args)
    synthesizer.refresh()
    model, embedding_model = synthesizer.model, synthesizer.embedding_model
    return model, embedding_model

def synthesize_class_expression(kb_name, vocab, positives, negatives, num_examples, num_inds, args):
    """Load one pretrained synthesizer per entry of ``num_inds`` and predict.

    Sets ``args.knowledge_base_path``, ``args.num_entities`` and
    ``args.num_relations`` (the latter two are read from the shapes of the
    stored 32-inducing-point embedding weights). Weight files carrying the
    ``uniform`` infix are used when ``args.sampling_strategy == 'uniform'``.

    Returns:
        Whatever ``predict`` returns for the loaded ensemble.
    """
    cpu = torch.device("cpu")
    args.knowledge_base_path = "datasets/" + f"{kb_name}/{kb_name}.owl"
    reference_weights = torch.load(f"datasets/{kb_name}/Model_weights/SetTransformer_{args.kb_emb_model}_Emb_inducing_points32.pt", map_location=cpu)
    args.num_entities = reference_weights['emb_ent_real.weight'].shape[0]
    args.num_relations = reference_weights['emb_rel_real.weight'].shape[0]

    ensemble, embedding_ensemble = [], []
    for inds in num_inds:
        synthesizer, embedder = initialize_synthesizer(vocab, num_examples, inds, args)
        if args.sampling_strategy == 'uniform':
            model_weights_path = f"datasets/{kb_name}/Model_weights/{args.kb_emb_model}_SetTransformer_uniform_inducing_points{inds}.pt"
            embedding_weights_path = f"datasets/{kb_name}/Model_weights/SetTransformer_{args.kb_emb_model}_Emb_uniform_inducing_points{inds}.pt"
        else:
            model_weights_path = f"datasets/{kb_name}/Model_weights/{args.kb_emb_model}_SetTransformer_inducing_points{inds}.pt"
            embedding_weights_path = f"datasets/{kb_name}/Model_weights/SetTransformer_{args.kb_emb_model}_Emb_inducing_points{inds}.pt"
        synthesizer.load_state_dict(torch.load(model_weights_path, map_location=cpu))
        embedder.load_state_dict(torch.load(embedding_weights_path, map_location=cpu))
        ensemble.append(synthesizer)
        embedding_ensemble.append(embedder)
    return predict(kb_name, positives, negatives, ensemble, embedding_ensemble, args)

def prepare_utilities_for_roces(kb_name, args):
    """Load the knowledge base and build everything ROCES needs around it.

    Reads ``datasets/{kb_name}/{kb_name}.owl`` plus the train/test ``Data.json``
    files, builds the vocabulary from them and prints the KB namespace.

    Returns:
        ``(kb, simple_solution, evaluator, dl_parser, all_individuals, vocab)``.
        The example count computed by ``build_roces_vocabulary`` is discarded.
    """
    knowledge_base = KnowledgeBase(path=f"datasets/{kb_name}/{kb_name}.owl")
    with open(f"datasets/{kb_name}/Test_data/Data.json", "r") as test_file:
        held_out_data = json.load(test_file)
    with open(f"datasets/{kb_name}/Train_data/Data.json", "r") as train_file:
        training_data = json.load(train_file)
    vocabulary, _ = build_roces_vocabulary(training_data, held_out_data, knowledge_base, args)

    # The namespace is taken from the first individual of the knowledge base.
    first_individual = list(knowledge_base.individuals())[0]
    kb_namespace = first_individual.get_iri().get_namespace()
    print("KB namespace: ", kb_namespace)
    print()

    return (
        knowledge_base,
        SimpleSolution(knowledge_base),
        Evaluator(knowledge_base),
        DLSyntaxParser(namespace=kb_namespace),
        set(knowledge_base.individuals()),
        vocabulary,
    )

def prepare_utilities_search_based(kb_name, args):
    """Load the knowledge base and the helpers used by search-based learners.

    ``args`` is accepted for signature parity with
    ``prepare_utilities_for_roces`` but is not used here.

    Returns:
        ``(kb, simple_solution, evaluator, dl_parser, all_individuals)``.
    """
    knowledge_base = KnowledgeBase(path=f"datasets/{kb_name}/{kb_name}.owl")

    # The namespace is taken from the first individual of the knowledge base.
    first_individual = list(knowledge_base.individuals())[0]
    kb_namespace = first_individual.get_iri().get_namespace()
    print("KB namespace: ", kb_namespace)
    print()

    return (
        knowledge_base,
        SimpleSolution(knowledge_base),
        Evaluator(knowledge_base),
        DLSyntaxParser(namespace=kb_namespace),
        set(knowledge_base.individuals()),
    )


def predict_with_roces(kb_name, vocab, positives, negatives, dl_parser, simpleSolution, args):
    """Synthesize a class expression with the ROCES ensemble and parse it.

    The ensemble consists of the Set-Transformer models with 32, 64 and 128
    inducing points. The module-level ``num_examples`` is forwarded to
    ``synthesize_class_expression``.

    Decoding steps:
      1. cut the predicted token sequence at the first ``'PAD'`` token;
      2. parse the joined tokens with ``dl_parser``;
      3. if that fails, let ``simpleSolution.predict`` propose a replacement
         for the joined string and parse that instead;
      4. if everything fails, fall back to ``⊤``.

    Args:
        kb_name: Name of the knowledge base (dataset folder name).
        vocab: Token vocabulary of the synthesizer.
        positives: Positive examples.
        negatives: Negative examples.
        dl_parser: Parser exposing ``parse(str)``.
        simpleSolution: Helper exposing ``predict(str)``.
        args: Namespace forwarded to ``synthesize_class_expression``.

    Returns:
        The parsed class expression (``⊤`` when nothing could be parsed).
    """
    ensemble_models = ("SetTransformer_I32", "SetTransformer_I64", "SetTransformer_I128")
    num_inds = [int(model_name.split("I")[-1]) for model_name in ensemble_models]
    pred = synthesize_class_expression(kb_name, vocab, positives, negatives, num_examples, num_inds, args)

    # Remove the padding. When no 'PAD' token is present the whole sequence is
    # kept; slicing with -1 here would silently drop the last real token.
    pad_positions = np.where(pred == 'PAD')[0]
    end_idx = pad_positions[0] if len(pad_positions) else len(pred)
    pred = pred[:end_idx]

    # Joined once and reused: unlike ndarray.sum(), str.join also works for
    # fixed-width string arrays and for an empty sequence.
    candidate = "".join(pred.tolist())
    prediction = None
    try:
        prediction = dl_parser.parse(candidate)
    except Exception:
        try:
            candidate = simpleSolution.predict(candidate)
            prediction = dl_parser.parse(candidate)
        except Exception:
            print(f"Could not understand expression {candidate}")
    if prediction is None:
        prediction = dl_parser.parse('⊤')
    return prediction

def query_oracle(prediction, oracle, kb, positives, negatives, all_individuals, pos_diff, neg_diff, remove_wrong_examples, subset_size):
    """Extend the example sets with up to ``subset_size`` new examples each.

    New positives are sampled from the individuals that are instances of both
    ``oracle`` and ``prediction``; new negatives from the individuals that are
    instances of neither. When such a pool is empty, the individuals that are
    not yet used as examples serve as the pool instead.

    Args:
        prediction: Currently predicted class expression.
        oracle: Target class expression.
        kb: Knowledge base exposing ``individuals(class_expression)``.
        positives: Current positive examples (list of individual names).
        negatives: Current negative examples (list of individual names).
        all_individuals: Set with the names of all individuals.
        pos_diff: Number of positives appended by the previous query.
        neg_diff: Number of negatives appended by the previous query.
        remove_wrong_examples: If true, the examples appended by the previous
            query are dropped first, provided at least ``subset_size``
            examples remain afterwards.
        subset_size: Maximum number of examples sampled per class.

    Returns:
        ``(new_positives, new_negatives)`` as new lists; the input lists are
        not modified.
    """
    def _names(class_expression):
        # Individuals are identified by the last "/"-separated part of their IRI.
        return {ind.get_iri().as_str().split("/")[-1] for ind in kb.individuals(class_expression)}

    if remove_wrong_examples:
        # Only roll back when something was actually appended: `xs[:-0]` is
        # `xs[:0]`, i.e. it would discard every example instead of none.
        if pos_diff > 0 and len(positives) - pos_diff >= subset_size:
            positives = positives[:-pos_diff]
        if neg_diff > 0 and len(negatives) - neg_diff >= subset_size:
            negatives = negatives[:-neg_diff]

    true_positive_examples = _names(oracle)
    true_negative_examples = all_individuals - true_positive_examples
    predicted_positives = _names(prediction)

    candidate_positives = predicted_positives & true_positive_examples
    candidate_negatives = true_negative_examples - predicted_positives
    # NOTE(review): candidates are not filtered against the current examples,
    # so an individual can be sampled more than once across queries.
    if not candidate_positives:
        candidate_positives = all_individuals.difference(set(positives).union(set(negatives)))
    if not candidate_negatives:
        candidate_negatives = all_individuals.difference(set(negatives).union(set(positives)))

    new_positives = positives + random.sample(list(candidate_positives), min(subset_size, len(candidate_positives)))
    new_negatives = negatives + random.sample(list(candidate_negatives), min(subset_size, len(candidate_negatives)))
    # Top up from the pool when there are still fewer than `subset_size` examples.
    if len(new_positives) < subset_size:
        new_positives = new_positives + list(candidate_positives)[:subset_size-1]
    if len(new_negatives) < subset_size:
        new_negatives = new_negatives + list(candidate_negatives)[:subset_size-1]
    return new_positives, new_negatives
    
def evaluate_prediction(kb, prediction, oracle, evaluator, simpleSolution, all_individuals, dl_parser=None):
    """Score ``prediction`` against the instances of ``oracle``.

    The instances of ``oracle`` are the positive examples and all remaining
    individuals are the negative examples. If the evaluator fails on
    ``prediction``, ``⊤`` is evaluated instead.

    Args:
        kb: Knowledge base exposing ``individuals(class_expression)``.
        prediction: Predicted class expression.
        oracle: Target class expression.
        evaluator: Object whose ``evaluate(ce, pos, neg)`` returns a pair with
            the F1 score as second element.
        simpleSolution: Object exposing ``renderer.render(ce)``.
        all_individuals: Set of all individuals of ``kb``.
        dl_parser: Optional parser used to build the ``⊤`` fallback. When
            omitted, one is created from the namespace of the first
            individual of ``kb``.

    Returns:
        ``(prediction_str, f1)``; ``prediction_str`` is ``"Unknown"`` when the
        expression cannot be rendered.
    """
    positive_examples = set(kb.individuals(oracle))
    negative_examples = all_individuals - positive_examples
    try:
        _, f1 = evaluator.evaluate(prediction, positive_examples, negative_examples)
    except Exception:
        print("Parsing error on ", prediction)
        if dl_parser is None:
            # No parser is in scope here, so build one for this knowledge base.
            namespace = list(kb.individuals())[0].get_iri().get_namespace()
            dl_parser = DLSyntaxParser(namespace=namespace)
        prediction = dl_parser.parse('⊤')
        _, f1 = evaluator.evaluate(prediction, positive_examples, negative_examples)
    try:
        prediction_str = simpleSolution.renderer.render(prediction)
    except Exception:
        prediction_str = "Unknown"
    return prediction_str, f1

def start_active_learning(kb_name, oracle, positives, negatives, args, max_iter=10, subset_size=5, approach="roces"):
    """Iteratively refine a ROCES prediction by querying an oracle for examples.

    Iteration 0 predicts from the given examples. Every later iteration first
    asks ``query_oracle`` for additional examples (rolling back the previous
    batch when the latest score dropped below the one before it, or is 0) and
    then predicts again. The loop stops after ``max_iter`` predictions or as
    soon as a prediction reaches an F1 score of 100.

    Args:
        kb_name: Name of the knowledge base (dataset folder name).
        oracle: Target class expression as a string in DL syntax.
        positives: Initial positive examples.
        negatives: Initial negative examples.
        args: Namespace forwarded to the ROCES helpers.
        max_iter: Maximum number of predictions.
        subset_size: Maximum number of examples requested per class and query.
        approach: Learning approach; only ``"roces"`` is implemented.

    Returns:
        ``(best_prediction, all_predictions, F1)``: the rendered prediction
        with the highest F1 score (the earliest one on ties), the rendered
        prediction of every iteration, and the F1 score of every iteration.

    Raises:
        NotImplementedError: If ``approach`` is not ``"roces"``.
    """
    if approach != "roces":
        # The search-based branch was never implemented; fail with a clear
        # message instead of an unbound-variable error inside the loop.
        raise NotImplementedError(f"approach {approach!r} is not implemented; use 'roces'")

    kb, simpleSolution, evaluator, dl_parser, all_individuals, vocab = prepare_utilities_for_roces(kb_name, args)
    all_individuals_str = {ind.get_iri().as_str().split("/")[-1] for ind in all_individuals}
    oracle = dl_parser.parse(oracle)

    best_prediction = None
    best_score = None
    all_predictions = []
    F1 = []
    new_positives, new_negatives = positives, negatives
    pos_diff = neg_diff = 0
    prediction = None
    f1 = None
    previous_score = None
    for i in range(max_iter):
        if i > 0:
            # `previous_score` is the score before the latest one, `f1` the latest.
            remove_wrong_examples = (previous_score > f1 or f1 == 0)
            copy_pos, copy_neg = new_positives, new_negatives
            new_positives, new_negatives = query_oracle(prediction, oracle, kb, new_positives, new_negatives, all_individuals_str, pos_diff, neg_diff, remove_wrong_examples, subset_size)
            pos_diff = len(new_positives) - len(copy_pos)
            neg_diff = len(new_negatives) - len(copy_neg)
            print("new positives", len(new_positives))
            print("new negatives", len(new_negatives))
        prediction = predict_with_roces(kb_name, vocab, new_positives, new_negatives, dl_parser, simpleSolution, args)
        last_score = f1
        prediction_str, f1 = evaluate_prediction(kb, prediction, oracle, evaluator, simpleSolution, all_individuals)
        # After the very first prediction there is no earlier score to compare with.
        previous_score = f1 if last_score is None else last_score
        all_predictions.append(prediction_str)
        F1.append(f1)
        if best_score is None:
            best_score, best_prediction = f1, prediction_str
        elif f1 > best_score:
            # The best prediction is tracked together with the best score, so
            # a later, merely "better than the previous one" result cannot
            # replace it.
            print("improved performance")
            best_score, best_prediction = f1, prediction_str
        print()
        if f1 == 100:
            break
    return best_prediction, all_predictions, F1

In [3]:
# Load the hyper-parameters from config.json into a Namespace, then select the
# embedding model and the sampling strategy used in this notebook.
with open("config.json") as config:
    nces_args = Namespace(**json.load(config))
nces_args.kb_emb_model = "ConEx"
nces_args.sampling_strategy = "original"

In [4]:
# Load the test learning problems of the chosen knowledge base, show the first
# five target expressions and use the first one as the oracle.
kb_name = "semantic_bible"
with open(f"datasets/{kb_name}/Test_data/Data.json") as file:
    test_data = json.load(file)
test_lps = [expression for expression, _ in test_data]
full_examples = [example_sets for _, example_sets in test_data]
print("\n".join(test_lps[:5]))
oracle = test_lps[0]
full_positives = full_examples[0]["positive examples"]
full_negatives = full_examples[0]["negative examples"]

CognitiveAgent ⊔ LandArea
Agent ⊓ (Human ⊔ (¬God)) ⊓ (∀ spouseOf.Woman)
Series ⊔ (∃ relativeOf.(Woman ⊓ (∀ visitedPlace.⊥)))
(LandArea ⊓ (¬City)) ⊔ (∃ member.Agent)
(Mountain ⊓ (∀ location.⊥)) ⊔ (∃ knows.(¬SonOfGod))


In [20]:
# Start from five randomly drawn positive and five randomly drawn negative examples.
positives, negatives = random.sample(full_positives, 5), random.sample(full_negatives, 5)

In [96]:
# Show the target class expression (the first learning problem of the test data).
oracle

'CognitiveAgent ⊔ LandArea'

In [31]:
# Show the sampled initial positive examples.
positives

['NTNames#Nain',
 'NTNames#JosephSonOfMattathias',
 'NTNames#Zebulun',
 'NTNames#Olivet',
 'NTNames#Alexander']

In [30]:
# Show the sampled initial negative examples.
negatives

['NTNames#NeapolisGeodata',
 'NTNames#Zealot',
 'NTNames#PhoenixGeodata',
 'NTNames#Ephesians',
 'NTNames#SycharGeodata']

In [104]:
# Run the active-learning loop with ROCES: at most 50 predictions, requesting
# up to twice the initial number of positives per class in every query.
best_prediction, all_predictions, F1 = start_active_learning(
    kb_name,
    oracle,
    positives,
    negatives,
    nces_args,
    max_iter=50,
    subset_size=2*len(positives),
    approach="roces",
)


*** Finding relevant data values ***
*** Done! ***

Added values:  {'35.583', '41.013', '44.42'}

Vocabulary size:  126
KB namespace:  http://semanticbible.org/ns/2006/NTNames#






100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.27it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]



new positives 15
new negatives 15


100%|██████████| 1/1 [00:00<00:00,  4.14it/s]
100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
100%|██████████| 1/1 [00:00<00:00,  4.26it/s]



new positives 25
new negatives 25


100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
100%|██████████| 1/1 [00:00<00:00,  3.44it/s]


improved performance

new positives 35
new negatives 35


100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
100%|██████████| 1/1 [00:00<00:00,  3.21it/s]



new positives 45
new negatives 45


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]



new positives 45
new negatives 45


100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
100%|██████████| 1/1 [00:00<00:00,  3.25it/s]


improved performance

new positives 55
new negatives 55


100%|██████████| 1/1 [00:00<00:00,  3.09it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]



new positives 55
new negatives 55


100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]



new positives 65
new negatives 65


100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]



new positives 65
new negatives 65


100%|██████████| 1/1 [00:00<00:00,  3.15it/s]
100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]



new positives 75
new negatives 75


100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]



new positives 85
new negatives 85


100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]



new positives 95
new negatives 95


100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]



new positives 105
new negatives 105


100%|██████████| 1/1 [00:00<00:00,  4.14it/s]
100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
100%|██████████| 1/1 [00:00<00:00,  3.08it/s]



new positives 115
new negatives 115


100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]



new positives 125
new negatives 125


100%|██████████| 1/1 [00:00<00:00,  3.90it/s]
100%|██████████| 1/1 [00:00<00:00,  3.47it/s]
100%|██████████| 1/1 [00:00<00:00,  4.14it/s]



new positives 135
new negatives 135


100%|██████████| 1/1 [00:00<00:00,  3.66it/s]
100%|██████████| 1/1 [00:00<00:00,  4.29it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]



new positives 145
new negatives 145


100%|██████████| 1/1 [00:00<00:00,  3.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.19it/s]



new positives 155
new negatives 155


100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.68it/s]



new positives 165
new negatives 165


100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]



new positives 175
new negatives 175


100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  3.15it/s]
100%|██████████| 1/1 [00:00<00:00,  3.10it/s]



new positives 185
new negatives 185


100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  3.26it/s]



new positives 195
new negatives 195


100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
100%|██████████| 1/1 [00:00<00:00,  3.41it/s]
100%|██████████| 1/1 [00:00<00:00,  3.56it/s]



new positives 205
new negatives 205


100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]



new positives 215
new negatives 215


100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]



new positives 225
new negatives 225


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]



new positives 235
new negatives 235


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]



new positives 245
new negatives 245


100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
100%|██████████| 1/1 [00:00<00:00,  3.34it/s]



new positives 255
new negatives 255


100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  2.54it/s]



new positives 265
new negatives 265


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
100%|██████████| 1/1 [00:00<00:00,  3.25it/s]



new positives 275
new negatives 275


100%|██████████| 1/1 [00:00<00:00,  3.12it/s]
100%|██████████| 1/1 [00:00<00:00,  2.93it/s]
100%|██████████| 1/1 [00:00<00:00,  3.20it/s]



new positives 285
new negatives 285


100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.31it/s]



new positives 295
new negatives 295


100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.19it/s]



new positives 305
new negatives 305


100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
100%|██████████| 1/1 [00:00<00:00,  2.65it/s]



new positives 315
new negatives 315


100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
100%|██████████| 1/1 [00:00<00:00,  2.99it/s]
100%|██████████| 1/1 [00:00<00:00,  3.17it/s]



new positives 325
new negatives 325


100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
100%|██████████| 1/1 [00:00<00:00,  3.07it/s]
100%|██████████| 1/1 [00:00<00:00,  2.70it/s]



new positives 335
new negatives 335


100%|██████████| 1/1 [00:00<00:00,  2.36it/s]
100%|██████████| 1/1 [00:00<00:00,  3.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]



new positives 345
new negatives 345


100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]



new positives 355
new negatives 355


100%|██████████| 1/1 [00:00<00:00,  3.21it/s]
100%|██████████| 1/1 [00:00<00:00,  2.42it/s]
100%|██████████| 1/1 [00:00<00:00,  3.04it/s]



new positives 365
new negatives 365


100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
100%|██████████| 1/1 [00:00<00:00,  3.41it/s]



new positives 375
new negatives 375


100%|██████████| 1/1 [00:00<00:00,  3.16it/s]
100%|██████████| 1/1 [00:00<00:00,  3.04it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]



new positives 385
new negatives 385


100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]



new positives 395
new negatives 395


100%|██████████| 1/1 [00:00<00:00,  3.05it/s]
100%|██████████| 1/1 [00:00<00:00,  3.02it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]



new positives 405
new negatives 405


100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  2.98it/s]
100%|██████████| 1/1 [00:00<00:00,  2.66it/s]



new positives 415
new negatives 415


100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.18it/s]
100%|██████████| 1/1 [00:00<00:00,  3.07it/s]



new positives 425
new negatives 425


100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
100%|██████████| 1/1 [00:00<00:00,  3.21it/s]



new positives 435
new negatives 435


100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.75it/s]



new positives 445
new negatives 445


100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
100%|██████████| 1/1 [00:00<00:00,  3.18it/s]
100%|██████████| 1/1 [00:00<00:00,  3.00it/s]



new positives 455
new negatives 455


100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
100%|██████████| 1/1 [00:00<00:00,  3.04it/s]
100%|██████████| 1/1 [00:00<00:00,  2.68it/s]



new positives 465
new negatives 465


100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  2.64it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]







In [105]:
# Show the F1 score of the prediction made in each iteration.
F1

[0.0,
 0.0,
 35.881,
 35.881,
 0.0,
 84.962,
 35.881,
 35.881,
 0.0,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962,
 84.962]

In [101]:
# Show the rendered prediction of each iteration.
all_predictions

['CitizenshipAttribute',
 'Mountain',
 'FixedHoliday',
 'FixedHoliday ⊔ (∃ location.Woman)',
 'SonOfGod',
 'EthnicGroup',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 'CognitiveAgent ⊔ BeliefGroup',
 'CognitiveAgent ⊔ BeliefGroup',
 'CognitiveAgent ⊔ BeliefGroup',
 'CognitiveAgent',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 '⊤',
 'CognitiveAgent',
 '⊤',
 'CognitiveAgent ⊔ BeliefGroup',
 'CognitiveAgent ⊔ BeliefGroup',
 'CognitiveAgent ⊔ BeliefGroup']

In [95]:
# Show the best prediction returned by start_active_learning.
best_prediction

'CognitiveAgent ⊔ LandArea'

In [6]:
import json

# Load the training learning problems of the carcinogenesis knowledge base.
with open("./datasets/carcinogenesis/Train_data/Data.json") as f:
    data = json.load(f)

In [7]:
# Print every training class expression that contains a max-cardinality ("≤") restriction.
for ce, example in data:
    if "≤" not in ce:
        continue
    print(ce)

≤ 9 hasAtom.⊤
≤ 9 hasAtom.⊤
≤ 9 hasStructure.⊤
≤ 9 hasStructure.⊤
Compound ⊓ (≤ 9 hasBond.⊤)
Compound ⊓ (≤ 9 hasBond.⊤)
≤ 9 hasBond.⊤
≤ 9 hasBond.⊤
