In [1]:
import os
from owlapy.parser import DLSyntaxParser
from nir.utils import read_embs, read_training_data, timeit, read_embs_and_apply_agg,read_and_prepare_pma_data
from nir.executer import Execute
from nir.models import NIRComposite, NIRLSTM
from nir.models.pmanet import PMAnet
import torch

In [2]:
from nir.config import NIRConfig

# Build the configuration with NIRConfig's default values. This single object
# is later passed to every model constructed in this notebook.
model_config = NIRConfig()

In [3]:
# Resolve the project root (two directory levels above the notebook's working
# directory) and derive the dataset and output locations from it.
cwd = os.getcwd()
parent_dir = os.path.dirname(os.path.dirname(cwd))
print("Parent dir: ",parent_dir)
# Join the path components one by one instead of embedding "\" separators in a
# plain string literal: "\d" and "\m" are invalid escape sequences (reported as
# a warning by recent Python versions), and hard-coded backslashes only form a
# valid path on Windows.
dataset_dir = os.path.join(parent_dir, "datasets", "datasets", "mutagenesis")
print("Dataset dir: ",dataset_dir)
output_dir = os.path.join(parent_dir, "output")
print("Output dir: ",output_dir)

Parent dir:  D:\PycharmProjects\CoNeuralReasoner
Dataset dir:  D:\PycharmProjects\CoNeuralReasoner\datasets\datasets\mutagenesis
Output dir:  D:\PycharmProjects\CoNeuralReasoner\output


#### test composite

In [4]:
# Display the configuration values that the models below are built from.
model_config

NIRConfig {
  "batch_training": true,
  "device": "cpu",
  "embedding_dim": 256,
  "hidden_size": 128,
  "individual_size": 256,
  "input_size": 256,
  "max_length": 128,
  "model_type": "nir",
  "num_attention_heads": 4,
  "num_encoder_layers": 6,
  "num_rnn_layers": 3,
  "output_size": 1,
  "pad_token_id": 0,
  "pe_dropout": 0.1,
  "transformers_version": "4.46.3",
  "vocab_size": 1000
}

In [5]:
# Rebuild the PMA network with the dimensions taken from the config and restore
# its weights from <output_dir>/model.pt.
pma_net = PMAnet(model_config.embedding_dim, model_config.num_attention_heads, 1)
# map_location pins the checkpoint tensors to the configured device, so a
# checkpoint that was saved from a CUDA run still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you
# trust (consider weights_only=True if the installed torch version supports it).
pma_state = torch.load(os.path.join(output_dir, "model.pt"),
                       map_location=model_config.device)
# Kept as the last expression so the key-matching report is still displayed.
pma_net.load_state_dict(pma_state)

<All keys matched successfully>

In [6]:
# Load the training data for the dataset. `data` is iterated further down as a
# collection of class-expression strings that are parsed with DLSyntaxParser.
# remove_atomic_concepts=True is forwarded to nir.utils (presumably filters out
# atomic concepts -- confirm in read_training_data).
data = read_training_data(dataset_dir, remove_atomic_concepts=True)


Running `<function read_training_data>`...
Function read_training_data with  Args:[<class 'str'>] | Kwargs:{'remove_atomic_concepts': <class 'bool'>} took 3.2102 seconds


In [7]:
# Read the knowledge base, the individuals and the embeddings for the dataset.
# nn_agg=pma_net hands the restored PMAnet to the loader as aggregation network
# (the log below reports "Building atomic concept embeddings with PMA").
# merge=True is forwarded as-is; its semantics are defined in nir.utils.
kb, all_individuals, embeddings = read_embs_and_apply_agg(dataset_dir, nn_agg=pma_net, merge=True)


Running `<function read_embs_and_apply_agg>`...

Building atomic concept embeddings with PMA: PMAnet(
  (mab): MAB(
    (fc_q): Linear(in_features=256, out_features=256, bias=True)
    (fc_k): Linear(in_features=256, out_features=256, bias=True)
    (fc_v): Linear(in_features=256, out_features=256, bias=True)
    (fc_o): Linear(in_features=256, out_features=256, bias=True)
  )
  (linear): Sequential(
    (0): Linear(in_features=512, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): GELU(approximate='none')
    (4): Linear(in_features=512, out_features=1, bias=True)
  )
  (activation): Sigmoid()
  (loss): BCELoss()
)
(A permutation-invariant architecture)


Running `<function read_embs>`...
Function read_embs with  Args:[<class 'str'>, <class 'bool'>] | Kwargs:{} took 0.9553 seconds
Function read_embs_and_apply_agg with  Args:[<class 'str'>] | Kwargs:{'nn_agg': <class 'nir.models.pmanet.PMAnet'>, 'merge': <class 'bool'

In [8]:
# Start from the full IRI of the first class in the ontology signature and trim
# it down to its namespace prefix. Separators are tried in priority order:
# the first "#", otherwise the last "/", otherwise the last ":"; the chosen
# separator is kept at the end of the prefix.
kb_namespace = list(kb.ontology.classes_in_signature())[0].str
for separator, use_last in (("#", False), ("/", True), (":", True)):
    cut = kb_namespace.rfind(separator) if use_last else kb_namespace.find(separator)
    if cut >= 0:
        kb_namespace = kb_namespace[:cut + 1]
        break

expression_parser = DLSyntaxParser(kb_namespace)

# Map each class expression in `data` to the set of individuals the knowledge
# base returns for it. An individual is identified by the part of its IRI that
# follows the last "/".
concept_to_instance_set = {}
for expr in data:
    parsed_expr = expression_parser.parse(expr)
    concept_to_instance_set[expr] = {
        ind.str.split("/")[-1] for ind in kb.individuals(parsed_expr)
    }

In [9]:
# Instantiate the composite model from the shared configuration.
model = NIRComposite(model_config)

In [13]:
from nir.trainer import Trainer

# Trainer signature, kept here for reference:
#   Trainer(model, tokenizer, data, embeddings, all_individuals, concept_to_instance_set,
#           num_examples, th, optimizer, pretrained_model_path=None, output_dir=None,
#           epochs=300, batch_size=256, num_workers=4, lr=3.5e-4, train_test_split=True)
#
# Every argument is passed by keyword: with sixteen parameters a purely
# positional call silently shifts values into the wrong slot as soon as the
# signature gains or loses a parameter.
trainer = Trainer(
    model=model,
    tokenizer=None,  # no tokenizer is supplied for the composite model
    data=data,
    embeddings=embeddings,
    all_individuals=all_individuals,
    concept_to_instance_set=concept_to_instance_set,
    num_examples=100,
    th=0.5,
    optimizer="adamw",
    pretrained_model_path=None,
    output_dir=output_dir,
    epochs=1,
    batch_size=16,
    num_workers=1,
    lr=3.5e-4,
    train_test_split=True,
)
trainer.train()


Train: <Batch: 0/548, Loss: nan, F1: 0, Jaccard: 0>:   0%|          | 0/548 [00:00<?, ?it/s][A
Train: <Batch: 1/548, Loss: 0.6944, F1: 0.20, Jaccard: 0.12>:   0%|          | 0/548 [01:31<?, ?it/s][A
Train: <Batch: 1/548, Loss: 0.6944, F1: 0.20, Jaccard: 0.12>:   0%|          | 0/548 [01:31<?, ?it/s][A
Train: <Batch: 1/548, Loss: 0.6944, F1: 0.20, Jaccard: 0.12>:   0%|          | 1/548 [01:31<13:56:29, 91.75s/it][A
Train: <Batch: 2/548, Loss: 0.7070, F1: 0.15, Jaccard: 0.09>:   0%|          | 1/548 [01:33<13:56:29, 91.75s/it][A
Train: <Batch: 2/548, Loss: 0.7070, F1: 0.15, Jaccard: 0.09>:   0%|          | 1/548 [01:33<13:56:29, 91.75s/it][A
Train: <Batch: 2/548, Loss: 0.7070, F1: 0.15, Jaccard: 0.09>:   0%|          | 2/548 [01:33<5:53:54, 38.89s/it] [A
Train: <Batch: 3/548, Loss: 0.6867, F1: 0.25, Jaccard: 0.16>:   0%|          | 2/548 [01:35<5:53:54, 38.89s/it][A
Train: <Batch: 3/548, Loss: 0.6867, F1: 0.25, Jaccard: 0.16>:   0%|          | 2/548 [01:35<5:53:54, 38.89s/it][A

####################################################################################################

===>Train: <Epoch 1/1 - Loss: 0.6425510939672916, F1: 0.2920984392487836, Jaccard: 0.21848383859208262 - Lr: 0.000349614666716641>

===>Validation: <Epoch 1/1 - Loss: 0.5395877957344055, F1: 0.388662075136612, Jaccard: 0.3200132508586>

Time taken:  1616.3997611999512
#################################################################################################### 


Best F1 score:  0.388662075136612


#### test lstm

In [None]:
# Reload the training data for the LSTM run. `data` is iterated below as a
# collection of class-expression strings that are parsed with DLSyntaxParser.
# remove_atomic_concepts=True is forwarded to nir.utils (presumably filters out
# atomic concepts -- confirm in read_training_data).
data = read_training_data(dataset_dir, remove_atomic_concepts=True)

In [None]:
# Read the knowledge base, the individuals and the embeddings for the dataset.
# No nn_agg is passed here, so the loader uses whatever aggregation is its
# default (defined in nir.utils -- confirm there); merge=True is forwarded as-is.
kb, all_individuals, embeddings = read_embs_and_apply_agg(dataset_dir, merge=True)

In [None]:
# Start from the full IRI of the first class in the ontology signature and trim
# it down to its namespace prefix. Separators are tried in priority order:
# the first "#", otherwise the last "/", otherwise the last ":"; the chosen
# separator is kept at the end of the prefix.
kb_namespace = list(kb.ontology.classes_in_signature())[0].str
for separator, use_last in (("#", False), ("/", True), (":", True)):
    cut = kb_namespace.rfind(separator) if use_last else kb_namespace.find(separator)
    if cut >= 0:
        kb_namespace = kb_namespace[:cut + 1]
        break

expression_parser = DLSyntaxParser(kb_namespace)

# Map each class expression in `data` to the set of individuals the knowledge
# base returns for it. An individual is identified by the part of its IRI that
# follows the last "/".
concept_to_instance_set = {}
for expr in data:
    parsed_expr = expression_parser.parse(expr)
    concept_to_instance_set[expr] = {
        ind.str.split("/")[-1] for ind in kb.individuals(parsed_expr)
    }

In [None]:
# Instantiate the LSTM variant from the shared configuration.
model = NIRLSTM(model_config)

In [None]:
# Build a BPE tokenizer whose training corpus is the description-logic symbols
# plus the atomic concept and role names found in the knowledge base.
from tokenizers import Tokenizer
from owlapy.render import DLSyntaxObjectRenderer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import AutoTokenizer, AutoModel, AutoConfig, PreTrainedTokenizerFast

renderer = DLSyntaxObjectRenderer()
atomic_concept_names = frozenset(
    renderer.render(a) for a in kb.ontology.classes_in_signature())
# Role names are the IRI fragment after the last "/" and then after the last
# "#", collected over both object and data properties.
role_names = frozenset(
    prop.str.split("/")[-1].split("#")[-1]
    for signature in (kb.ontology.object_properties_in_signature(),
                      kb.ontology.data_properties_in_signature())
    for prop in signature)
Vocab = ['⊔', '⊓', '∃', '∀', '¬', '⊤', '⊥', ')', '(', '.', '>=', '<=', 'True', 'False',
         '[', ']', '{', '}', '⁻'] + list(atomic_concept_names) + list(role_names)

# The raw tokenizer and its BPE trainer get their own names: the original bound
# the BpeTrainer to `trainer`, the same name the next cell uses for the model
# Trainer, and rebound `tokenizer` from a raw Tokenizer to the wrapped one.
bpe_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
bpe_trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
bpe_tokenizer.pre_tokenizer = WhitespaceSplit()
bpe_tokenizer.train_from_iterator(Vocab, bpe_trainer)

tokenizer = PreTrainedTokenizerFast(tokenizer_object=bpe_tokenizer)
# NOTE(review): "[PAD]" is the fourth special token handed to BpeTrainer, while
# model_config.pad_token_id is 0 -- confirm that the model pads with
# tokenizer.pad_token_id rather than with the config value.
tokenizer.pad_token = "[PAD]"

In [None]:
from nir.trainer import Trainer

# Train the LSTM model. Every option is spelled out by keyword, one per line,
# so each value can be read against its parameter name.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data=data,
    embeddings=embeddings,
    all_individuals=all_individuals,
    concept_to_instance_set=concept_to_instance_set,
    num_examples=40,
    th=0.5,
    optimizer="adamw",
    output_dir=output_dir,
    epochs=5,
    batch_size=4,
    num_workers=1,
    lr=3.5e-4,
    train_test_split=True,
)
trainer.train()

#### test gru

In [7]:
# Reload the training data for the GRU run. `data` is iterated below as a
# collection of class-expression strings that are parsed with DLSyntaxParser.
# remove_atomic_concepts=True is forwarded to nir.utils (presumably filters out
# atomic concepts -- confirm in read_training_data).
data = read_training_data(dataset_dir, remove_atomic_concepts=True)


Running `<function read_training_data>`...
Function read_training_data with  Args:[<class 'str'>] | Kwargs:{'remove_atomic_concepts': <class 'bool'>} took 4.5412 seconds


In [8]:
# Read the knowledge base, the individuals and the embeddings for the dataset.
# No nn_agg is passed here, so the loader uses whatever aggregation is its
# default (defined in nir.utils -- confirm there); merge=True is forwarded as-is.
kb, all_individuals, embeddings = read_embs_and_apply_agg(dataset_dir, merge=True)


Running `<function read_embs_and_apply_agg>`...

Running `<function read_embs>`...
Function read_embs with  Args:[<class 'str'>, <class 'bool'>] | Kwargs:{} took 1.0813 seconds
Function read_embs_and_apply_agg with  Args:[<class 'str'>] | Kwargs:{'merge': <class 'bool'>} took 36.7965 seconds


In [9]:
# Start from the full IRI of the first class in the ontology signature and trim
# it down to its namespace prefix. Separators are tried in priority order:
# the first "#", otherwise the last "/", otherwise the last ":"; the chosen
# separator is kept at the end of the prefix.
kb_namespace = list(kb.ontology.classes_in_signature())[0].str
for separator, use_last in (("#", False), ("/", True), (":", True)):
    cut = kb_namespace.rfind(separator) if use_last else kb_namespace.find(separator)
    if cut >= 0:
        kb_namespace = kb_namespace[:cut + 1]
        break

expression_parser = DLSyntaxParser(kb_namespace)

# Map each class expression in `data` to the set of individuals the knowledge
# base returns for it. An individual is identified by the part of its IRI that
# follows the last "/".
concept_to_instance_set = {}
for expr in data:
    parsed_expr = expression_parser.parse(expr)
    concept_to_instance_set[expr] = {
        ind.str.split("/")[-1] for ind in kb.individuals(parsed_expr)
    }

In [10]:
from nir.models import NIRGRU
# Instantiate the GRU variant from the shared configuration.
model = NIRGRU(model_config)

In [11]:
# Build a BPE tokenizer whose training corpus is the description-logic symbols
# plus the atomic concept and role names found in the knowledge base.
from tokenizers import Tokenizer
from owlapy.render import DLSyntaxObjectRenderer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import AutoTokenizer, AutoModel, AutoConfig, PreTrainedTokenizerFast

renderer = DLSyntaxObjectRenderer()
atomic_concept_names = frozenset(
    renderer.render(a) for a in kb.ontology.classes_in_signature())
# Role names are the IRI fragment after the last "/" and then after the last
# "#", collected over both object and data properties.
role_names = frozenset(
    prop.str.split("/")[-1].split("#")[-1]
    for signature in (kb.ontology.object_properties_in_signature(),
                      kb.ontology.data_properties_in_signature())
    for prop in signature)
Vocab = ['⊔', '⊓', '∃', '∀', '¬', '⊤', '⊥', ')', '(', '.', '>=', '<=', 'True', 'False',
         '[', ']', '{', '}', '⁻'] + list(atomic_concept_names) + list(role_names)

# The raw tokenizer and its BPE trainer get their own names: the original bound
# the BpeTrainer to `trainer`, the same name the next cell uses for the model
# Trainer, and rebound `tokenizer` from a raw Tokenizer to the wrapped one.
bpe_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
bpe_trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
bpe_tokenizer.pre_tokenizer = WhitespaceSplit()
bpe_tokenizer.train_from_iterator(Vocab, bpe_trainer)

tokenizer = PreTrainedTokenizerFast(tokenizer_object=bpe_tokenizer)
# NOTE(review): "[PAD]" is the fourth special token handed to BpeTrainer, while
# model_config.pad_token_id is 0 -- confirm that the model pads with
# tokenizer.pad_token_id rather than with the config value.
tokenizer.pad_token = "[PAD]"

In [12]:
from nir.trainer import Trainer

# Train the GRU model for 2 epochs with batches of 200 (the values the logged
# run below reports: "Epoch 1/2", 44 training batches).
#
# The arguments are passed by keyword on purpose. The previous positional call
# listed `output_dir` directly after the optimizer, but the Trainer signature
# quoted in the composite section has `pretrained_model_path` in that slot:
#   ..., optimizer, pretrained_model_path=None, output_dir=None, epochs=300,
#   batch_size=256, num_workers=4, lr=3.5e-4, train_test_split=True
# so every following value landed one parameter too early (output_dir became
# the pretrained path, 2 became output_dir, 200 became epochs, and so on).
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data=data,
    embeddings=embeddings,
    all_individuals=all_individuals,
    concept_to_instance_set=concept_to_instance_set,
    num_examples=100,
    th=0.5,
    optimizer="adamw",
    output_dir=output_dir,
    epochs=2,
    batch_size=200,
    num_workers=1,
    lr=3.5e-4,
    train_test_split=True,
)
trainer.train()

Train: <Batch: 44/44, Loss: 0.6938, F1: 0.23, Jaccard: 0.15>: 100%|██████████| 44/44 [12:44<00:00, 17.37s/it]
Validation: <Batch: 5/5, Loss: 0.6933, F1: 0.24, Jaccard: 0.16>: 100%|██████████| 5/5 [02:11<00:00, 26.37s/it]


####################################################################################################
Train: <Epoch 1/2 - Loss: 0.7017512619495392, F1: 0.22007067423788787, Jaccard: 0.1388556906197887>
Validation: <Epoch 1/2 - Loss: 0.694165825843811, F1: 0.24063956571428574, Jaccard: 0.1573073934583727>
Time taken:  906.5308048725128
#################################################################################################### 



Train: <Batch: 44/44, Loss: 0.6911, F1: 0.23, Jaccard: 0.16>: 100%|██████████| 44/44 [12:12<00:00, 16.64s/it]
Validation: <Batch: 5/5, Loss: 0.6857, F1: 0.25, Jaccard: 0.17>: 100%|██████████| 5/5 [02:12<00:00, 26.43s/it]


####################################################################################################
Train: <Epoch 2/2 - Loss: 0.6869967159899798, F1: 0.24505944862547627, Jaccard: 0.16419948802937717>
Validation: <Epoch 2/2 - Loss: 0.6843363165855407, F1: 0.24800898857142856, Jaccard: 0.16860678620547165>
Time taken:  874.8139200210571
#################################################################################################### 

Best F1 score:  0.24800898857142856


#### test pma

In [None]:
import os
from nir.utils import read_and_prepare_pma_data

In [None]:
# Locate the project root two directory levels above the current working
# directory, then derive the dataset and output folders from it.
cwd = os.getcwd()
parent_dir = os.path.dirname(os.path.dirname(cwd))
dataset_dir = os.path.join(parent_dir, "datasets/datasets/mutagenesis")
output_dir = os.path.join(parent_dir, "output")

# Echo the resolved locations so a wrong working directory is noticed early.
for _label, _location in (
    ("Parent dir: ", parent_dir),
    ("Dataset dir: ", dataset_dir),
    ("Output dir: ", output_dir),
):
    print(_label, _location)

In [None]:
# Load everything needed for PMA training in one call; the helper returns six
# objects. `data`, `valid_inds`, `instances` and `emb` are handed to PMATrainer
# further down. Note that `data` and `kb` rebind names used by the earlier
# sections of this notebook.
emb, data, instances, valid_inds, class_emb, kb = read_and_prepare_pma_data(dataset_dir)

In [None]:
import torch
import torch.optim as optim
from nir.models.pmanet import PMAnet

# PMAnet receives the same positional values that the composite section reads
# from the config (embedding_dim=256, num_attention_heads=4) plus a trailing 1.
pma_dims = (256, 4, 1)
model = PMAnet(*pma_dims)

# Adam over all PMAnet parameters.
learning_rate = 0.01
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from nir.trainer import PMATrainer
import datetime
import time

start_time = time.time()
print("\nStarting training...")
print(f"Start time:{datetime.datetime.now()}\n")
# NOTE(review): 0.4, 16, 2 and 1000 are passed positionally; their meaning is
# fixed by PMATrainer's signature, which is not visible from this notebook --
# confirm and pass them by keyword.
trainer = PMATrainer(model, optimizer, data, valid_inds,
                                  instances, emb, output_dir, 0.4,
                                  16, 2, device,1000)
train_result = trainer.train()
# start_time was captured above but never reported; print the wall-clock
# duration so the timing that was set up is actually visible.
print(f"Training time: {time.time() - start_time:.2f} seconds")
# Keep the return value of train() as the cell's last expression so anything
# it returned is still displayed exactly as before.
train_result


In [None]:
# Run the PMA trainer's test step; `trainer` is the PMATrainer created and
# trained in the previous cell.
trainer.test()