In [1]:
from rdflib import Graph
from rdflib.term import URIRef, BNode, Identifier, _unique_id
from cmatcher.module_search.pagerank import gen_pagerank_sparql_queries
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM
import torch
from rdflib.namespace import RDF
from cmatcher.rag.rag_reduce import ont_query_reduce, reduce_ont, get_detailed_instruct, gen_doc
from cmatcher.rag.prompt_gen import gen_prompt
from cmatcher.rag.prompt_to_edoal import match
from tqdm.auto import tqdm
from uuid import UUID
import gc
import dill
import subprocess
import os
import re
import random
import itertools
import torch.nn.functional as F
import difflib
from typing import Optional, Callable

from uuid import uuid4

# define deterministic behavior
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

random_generator = random.Random(0)

In [2]:
def _serial_number_generator() -> Callable[[], str]:
    """
    Generates UUID4-based but ncname-compliant identifiers.
    """   

    def _generator():
        return UUID(bytes=random_generator.randbytes(16), version=4).hex

    return _generator

def bnew(
    cls,
    value: Optional[str] = None,
    _sn_gen: Callable[[], str] = _serial_number_generator(),
    _prefix: str = _unique_id(),
) -> "BNode":
    """
    # only store implementations should pass in a value
    """
    if value is None:
        # so that BNode values do not collide with ones created with
        # a different instance of this module at some other time.
        node_id = _sn_gen()
        value = "%s%s" % (_prefix, node_id)
    else:
        # TODO: check that value falls within acceptable bnode value range
        # for RDF/XML needs to be something that can be serialzed
        # as a nodeID for N3 ??  Unless we require these
        # constraints be enforced elsewhere?
        pass  # assert is_ncname(str(value)), "BNode identifiers
        # must be valid NCNames" _:[A-Za-z][A-Za-z0-9]*
        # http://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#nodeID
    # type error: Incompatible return value type (got "Identifier", expected "BNode")
    return Identifier.__new__(cls, value)  # type: ignore[return-value]

BNode.__new__ = bnew

In [3]:
def gen_prompt(r1, r2, query, sample1='', sample2=''):
    sample_prompt = 'A complex correspondence is a mapping between similar entities in the ontologies with cardinality n:m.\nEDOAL stands for Expressive and Declarative Ontology Alignment Language and is a format based on XML to write complex alignments between ontologies.\nProblem: Write the complex correspondences between the three pairs of ontology1 and ontology2 in EDOAL containing the alignments of each pair.\nSolution:\n'

    if sample1 == '' and sample2 == '':
        sample_prompt = ''

    return f'''{sample_prompt}
{sample1}
---
{sample2}
---
<ontology1>
{r1}    
</ontology1>    
<ontology2>
{r2}
</ontology2>
<result>
'''

In [4]:
def batched(iterable, n):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    iterator = iter(iterable)
    while batch := tuple(itertools.islice(iterator, n)):
        yield batch
        
        
    

def embed(model, passages, prompt, max_length=4096, batch_size=2):
    with torch.no_grad():
        passage_embeddings = []

        for p in batched(passages, batch_size):
            passage_embeddings.append(model.encode(p, instruction=prompt, max_length=max_length))

        passage_embeddings = torch.cat(passage_embeddings, dim=0)

    return passage_embeddings



def gen_docs(g, max_entities=10):
    ls = list(filter(lambda x: (x, RDF.first, None) not in g, set(g.subjects())))
    docs = []
    for s in ls:
        docs.append(gen_doc(s, g, max_entities=max_entities))
    return ls, docs


def gen_prompts():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    model = AutoModel.from_pretrained(
        'nvidia/NV-Embed-v2',
        quantization_config=quantization_config,
        trust_remote_code=True,
        device_map='auto',
    )
    model.eval()

    base_path = '/projets/melodi/gsantoss/data/complex/conference/ont/'
    o1 = Graph().parse(base_path + 'cmt.owl')
    o2 = Graph().parse(base_path + 'conference.owl')

    queries1 = gen_pagerank_sparql_queries(o1)

    prompt = 'Given the following SPARQL query, retrieve relevant entities that are related to the query'
    with open('cmatcher/prompt_examples/sample1.txt', 'r') as f:
        sample1 = f.read()

    with open('cmatcher/prompt_examples/sample2.txt', 'r') as f:
        sample2 = f.read()

    prompts = []
    
    ls1, docs1 = gen_docs(o1)
    ls2, docs2 = gen_docs(o2)
    
    query_prefix = f"Instruct: {prompt}\nQuery: "
    
    embeddings1 = embed(model, docs1, '')
    embeddings2 = embed(model, docs2, '')

    for query in tqdm(queries1):
        
        queries = [
            query,
        ]
            
        query_embeddings = embed(model, queries, query_prefix)   

        
        scores1 = query_embeddings @ embeddings1.T
        scores2 = query_embeddings @ embeddings2.T
        
        module1 = reduce_ont(ls1, scores1, o1, top_n=2, i_max_depth=1, o_max_depth=2)
        module2 = reduce_ont(ls2, scores2, o2, top_n=2, i_max_depth=1, o_max_depth=2)

        prompts.append(gen_prompt(module1, module2, None, sample1, sample2))

    return prompts



In [5]:
prompts = gen_prompts()

with open('/projets/melodi/gsantoss/tmp/prompts2.pkl', 'wb') as f:
    dill.dump(prompts, f)


KeyboardInterrupt



In [6]:
with open('/projets/melodi/gsantoss/tmp/prompts1.pkl', 'rb') as f:
    prompts1 = dill.load(f)
    
with open('/projets/melodi/gsantoss/tmp/prompts2.pkl', 'rb') as f:
    prompts2 = dill.load(f)
    
for i, (p1, p2) in enumerate(zip(prompts1, prompts2)):
    diff = difflib.ndiff(p1.splitlines(), p2.splitlines())
    for line in diff:
        if line.startswith('- ') or line.startswith('+ '):
            print(line)

+ Based on the examples of the task of complex ontology alignment between the ontologies below with the results written in EDOAL format:
- Given the two ontologies below:
- <ontology1>
- @prefix owl: <http://www.w3.org/2002/07/owl#> .
- @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
- @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
- @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
- 
- <http://cmt#AssociatedChair> rdfs:subClassOf <http://cmt#ConferenceMember> .
- 
- <http://cmt#Author> rdfs:subClassOf <http://cmt#ConferenceMember> .
- 
- <http://cmt#ConferenceChair> rdfs:subClassOf <http://cmt#ConferenceMember> .
- 
- <http://cmt#ProgramCommitteeMember> rdfs:subClassOf <http://cmt#ConferenceMember> .
- 
- <http://cmt#Reviewer> rdfs:subClassOf <http://cmt#ConferenceMember> .
- 
- <http://cmt#acceptsHardcopySubmissions> rdfs:domain <http://cmt#Conference> .
- 
- <http://cmt#detailsEnteredBy> rdfs:domain <http://cmt#Conference> .
- 
- <http://cmt#enableVirtualMeet

In [7]:
queries1 = None
model = None
tokenizer = None
# prompts = None
prompt = None
sample1 = None
sample2 = None
module1 = None
module2 = None
quantization_config = None
o1 = None
o2 = None
uobj = gc.collect()
torch.cuda.empty_cache()

In [8]:
with open('/projets/melodi/gsantoss/tmp/prompts2.pkl', 'rb') as f:
    prompts = dill.load(f)

In [9]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_tokenizer.eos_token = llm_tokenizer.eos_token if llm_tokenizer.eos_token is not None else llm_tokenizer.pad_token
llm_quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=llm_quantization_config,

)
llm_model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [10]:
outputs = []
for prompt in tqdm(prompts):
    outputs.append(match(prompt, llm_tokenizer, llm_model))

with open('/projets/melodi/gsantoss/tmp/outputs.pkl', 'wb') as f:
    dill.dump(outputs, f)

  0%|          | 0/9 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention ma

In [11]:
with open('/projets/melodi/gsantoss/tmp/outputs.pkl', 'rb') as f:
    outputs = dill.load(f)

In [12]:
def is_valid_edoal(txt):
    return txt.endswith('</rdf:RDF>')


def can_repair(txt):
    return txt.rfind('<map>') > 0


def merge_edoals(outputs):
    repaired_edoals = []
    for output in outputs:

        if not output.startswith('<?xml version'):
            output = '''<?xml version='1.0' encoding='utf-8' standalone='no'?>
<rdf:RDF xmlns='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'
         xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'
         xmlns:xsd='http://www.w3.org/2001/XMLSchema#'
         xmlns:alext='http://exmo.inrialpes.fr/align/ext/1.0/'
         xmlns:align='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'
         xmlns:edoal='http://ns.inria.org/edoal/1.0/#'>\n''' + output

        output = re.sub(r'<Ontology rdf:about="([^"]+)" />',
                        r'<Ontology rdf:about="\1"><location>\1</location><formalism><Formalism align:name="owl" align:uri="http://www.w3.org/TR/owl-guide/"/></formalism></Ontology>',
                        output)
        if not is_valid_edoal(output) and can_repair(output):
            last_map_index = output.rfind('<map>')
            repaired_edoals.append(output[:last_map_index] + '\n\t</Alignment>\n</rdf:RDF>')
        else:
            repaired_edoals.append(output)

    final_edoal = None
    if len(repaired_edoals) > 1:
        final_edoal = ''
        first = repaired_edoals[0]
        final_edoal += first[:first.find('<map>')]
        for e in repaired_edoals[1:]:
            final_edoal += e[e.find('<map>'):e.rfind('</map>')] + '\n\t</map>'

        final_edoal += '\n\t</Alignment>\n</rdf:RDF>'

    elif len(repaired_edoals) == 1:
        final_edoal = repaired_edoals[0]

    return final_edoal

In [13]:
final_edoal = merge_edoals(outputs)

os.makedirs('/projets/melodi/gsantoss/tmp/cct1', exist_ok=True)
with open('/projets/melodi/gsantoss/tmp/cct1/final_edoal.edoal', 'w') as f:
    f.write(final_edoal)
#  -jar  $no1 $no2 $o1 $o2 $mo $cqa $out1
base_java = '/projets/melodi/gsantoss/canarde/jdk-21.0.1/bin/java'
base_eval = '/projets/melodi/gsantoss/canarde/evaluator.jar'
base_onts = '/projets/melodi/gsantoss/data/complex/conference_100/ont/'
base_cqas = '/projets/melodi/gsantoss/data/complex/conference_100/CQAs/'
base_al = '/projets/melodi/gsantoss/tmp/cct1'
base_out = '/projets/melodi/gsantoss/tmp/ccres'

os.makedirs(base_out, exist_ok=True)
with subprocess.Popen(
        [base_java, '-jar', base_eval, 'cmt', 'conference', base_onts + 'cmt.owl', base_onts + 'conference.owl',
         base_al, base_cqas, base_out]) as proc:
    proc.communicate()

!cat /projets/melodi/gsantoss/tmp/ccres/cmt_conference.csv

Evaluator
final_edoal.edoal,CQAs,0.06896551724137931,0.2413793103448276,0.15517241379310345,0.2413793103448276,0.1957821556140998
classical,recall-oriented,precision-oriented,overlap,query f-measure
MEAN,CQAs,0.068966,0.241379,0.155172,0.241379,0.195782


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
