# Attributes
- Attributes and values for the most part do not work very well for the retrieval. An exception is `Type,generic` dataset which can map to `Subtype` in ILCD.

In [3]:
# Define attribute-schema mapping
attribute_schema_mapping = {
    "EPD_DataSet": [
        "UUID",
        "Version",
        "Name (de)",
        "Name (en)",
        "Kategorie (original)",
        "Kategorie (en)",
        "Konformität",
        "Laenderkennung",
        "Typ",
        "Referenzjahr",
        "Gueltig bis",
        "URL",
        "Declaration owner",
        "Veroeffentlicht am",
        "Registrierungsnummer",
        "Registrierungsstelle",
        "UUID des Vorgängers",
        "Version des Vorgängers",
        "URL des Vorgängers",
        "Modul",
        "Szenario",
        "Szenariobeschreibung",
    ],
    # "EPD_FlowDataSet": [
    #     "PERE",
    #     "PERM",
    #     "PERT",
    #     "PENRE",
    #     "PENRM",
    #     "PENRT",
    #     "SM",
    #     "RSF",
    #     "NRSF",
    #     "FW",
    #     "HWD",
    #     "NHWD",
    #     "RWD",
    #     "CRU",
    #     "MFR",
    #     "MER",
    #     "EEE",
    #     "EET",
    # ],
    # "ILCD_FlowPropertyDataSet": [
        # "Bezugsgroesse",
        # "Bezugseinheit",
        # "Referenzfluss-UUID",
        # "Referenzfluss-Name",
        # "Schuettdichte (kg/m3)",
        # "Flaechengewicht (kg/m2)",
        # "Rohdichte (kg/m3)",
        # "Schichtdicke (m)",
        # "Ergiebigkeit (m2)",
        # "Laengengewicht (kg/m)",
        # "Stueckgewicht (kg)",
        # "Umrechungsfaktor auf 1kg",
        # "biogener Kohlenstoffgehalt in kg",
        # "biogener Kohlenstoffgehalt (Verpackung) in kg",
    # ],
    # "ILCD_LCIAMethodDataSet": [
        # "GWP",
        # "ODP",
        # "POCP",
        # "AP",
        # "EP",
        # "ADPE",
        # "ADPF",
        # "AP (A2)",
        # "GWPtotal (A2)",
        # "GWPbiogenic (A2)",
        # "GWPfossil (A2)",
        # "GWPluluc (A2)",
        # "ETPfw (A2)",
        # "PM (A2)",
        # "EPmarine (A2)",
        # "EPfreshwater (A2)",
        # "EPterrestrial (A2)",
        # "HTPc (A2)",
        # "HTPnc (A2)",
        # "IRP (A2)",
        # "SOP (A2)",
        # "ODP (A2)",
        # "POCP (A2)",
        # "ADPF (A2)",
        # "ADPE (A2)",
        # "WDP (A2)",
    # ],
}

In [None]:
# Attributes and Values EPD_DataSet
# attribute_schema_mapping = {
    # "EPD_DataSet": [
    #     'UUID,8ef7b1b9-da21-43c9-a7d1-9a14d9dc7bcf',
    #     'Version,00.01.002',
    #     'Name (de),"Beton C30/37 XC4 XF1 XA1 F3 16 M ECOPact, Rezept Nummer DA5234-DSZK Version 1, Transportbetonwerk Aachen-Haaren, Germany"',
    #     'Name (en),"Beton C30/37 XC4 XF1 XA1 F3 16 M ECOPact, Rezept Nummer DA5234-DSZK Version 1, Transportbetonwerk Aachen-Haaren, Germany"',
    #     "Kategorie (original),'Mineralische Baustoffe' / 'Mörtel und Beton' / 'Beton'",
    #     "Kategorie (en),'Mineral building products' / 'Binder' / 'Cement'",
    #     "Konformität,'EN 15804+A2' / 'ISO 14025'",
    #     'Laenderkennung,DE',
    #     'Typ,specific dataset',
    #     'Referenzjahr,2023',
    #     'Gueltig bis,2028',
    #     'URL,https://www.oekobaudat.de/OEKOBAU.DAT/resource/processes/8ef7b1b9-da21-43c9-a7d1-9a14d9dc7bcf?version=00.01.002',
    #     'Declaration owner,Holcim (Deutschland) GmbH',
    #     'Veroeffentlicht am,2023-10-19',
    #     'Registrierungsnummer,IBU-CEI-HOL-2205120-DE2023000648-1SUG001-DE',
    #     'Registrierungsstelle,Institut Bauen und Umwelt e.V.',
    #     'UUID des Vorgängers,8ef7b1b9-da21-43c9-a7d1-9a14d9dc7bcf',
    #     'Version des Vorgängers,00.01.001',
    #     'URL des Vorgängers,https://www.oekobaudat.de/OEKOBAU.DAT/resource/processes/8ef7b1b9-da21-43c9-a7d1-9a14d9dc7bcf?version=00.01.001',
    # ],
# }

# One Attribute at a time

In [12]:
import json
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Define JSON schema
json_schema = {
    "title": "AlignmentResponse",
    "description": "Response containing alignment mappings.",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "attribute": {
                "type": "string",
                "description": "The exact attribute name from dataset A without additional information",
            },
            "match_type": {
                "type": "string",
                "description": "The type of SKOS match",
                "enum": ["skos:exactMatch", "skos:closeMatch", "skos:relatedMatch"],
            },
            "field_name": {
                "type": "string",
                "description": "The exact 'Field Name (en)' from Schema B without additional information",
            },
        },
        "required": ["attribute", "match_type", "field_name"],
    },
}


def query_system(attribute, vectorstore_path, schema_filter):
    embeddings = OllamaEmbeddings(model="bge-m3:latest")
    vectorstore = FAISS.load_local(
        vectorstore_path, embeddings=embeddings, allow_dangerous_deserialization=True
    )

    retriever = vectorstore.as_retriever(
        search_kwargs={"filter": {"schema_type": schema_filter}}
    )
    retrieved_docs = retriever.invoke(attribute)

    if not retrieved_docs:
        return None

    context = "\n".join(doc.page_content for doc in retrieved_docs)

    prompt_template = ChatPromptTemplate.from_template(
        """
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attribute from dataset A to its corresponding attribute in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Attribute Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
{context}
</context>

Answer the following question:
Match the attribute '{attribute}' from dataset A to one and only one attribute from Schema B.

Return the response in JSON format adhering to the defined schema.
"""
    )

    final_prompt = prompt_template.format_prompt(
        context=context, attribute=attribute
    ).to_string()

    # Write the final prompt to a text file
    with open(f"../data/prompts/prompts_{llm_model_name}_1by1.txt", "a") as prompt_file:
        prompt_file.write(f"Final Prompt for {attribute}:\n")
        prompt_file.write(final_prompt + "\n")
        prompt_file.write("-" * 50 + "\n")

    print(final_prompt)

    model = ChatOllama(model=llm_model)
    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    raw_response = structured_llm.invoke(final_prompt)

    structured_response = raw_response.get("parsed", None)

    print(structured_response)
    print("-" * 50 + "\n")

    return structured_response



llm_model = "deepseek-r1:8b" # llama3.1:8b, granite3-dense:8b, marco-o1:latest, dolphin3:8b
llm_model_name = llm_model.split(":")[0]
vectorstore_path = "../embeddings/bge-m3/row_cs3000_co0_faiss_index"
output_file = f"../data/responses/response_{llm_model_name}_1by1.json"

# Reset the JSON file
with open(output_file, "w") as file:
    json.dump([], file)

all_responses = []

for schema, attributes in attribute_schema_mapping.items():
    for attribute in attributes:
        response = query_system(attribute, vectorstore_path, schema)
        if response:
            all_responses.extend(response)

with open(output_file, "w") as file:
    json.dump(all_responses, file, indent=2)

print(f"All responses saved to {output_file}")

Human: 
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attribute from dataset A to its corresponding attribute in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Attribute Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
'UUID des Datensatzes','UUID of Process data set','UUID','UUID','UUID des Datensatzes. Zusammen mit der Versionsnummer in "Datensatzversion" wird der Datensatz damit eindeutig identifizert','~','Automatically generated Universally Unique Identifier of this data set. Together with the "Data set version", the UUID uniquely 

# All Attributes per Schema File at a time

- Maximal Marginal Relevance (MMR) Retrieval works better thena the default similarity retrieval

In [133]:
import json
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Define JSON schema
json_schema = {
    "title": "AlignmentResponse",
    "description": "Response containing alignment mappings.",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "attribute": {
                "type": "string",
                "description": "The attribute from dataset A without additional information",
            },
            "match_type": {
                "type": "string",
                "description": "The type of SKOS match",
                "enum": ["skos:exactMatch", "skos:closeMatch", "skos:relatedMatch"],
            },
            "field_name": {
                "type": "string",
                "description": "The exact Field Name (en) from Schema B without additional information",
            },
        },
        "required": ["attribute", "match_type", "field_name"],
    },
}


def query_system(attributes, vectorstore_path, schema_filter):
    embeddings = OllamaEmbeddings(model=embed_model)
    vectorstore = FAISS.load_local(
        vectorstore_path, embeddings=embeddings, allow_dangerous_deserialization=True
    )

    # retriever = vectorstore.as_retriever(
    #     search_kwargs={"filter": {"schema_type": schema_filter}, "k": len(attributes)}
    # )
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "filter": {"schema_type": schema_filter},
            "k": 2 * len(attributes),
            "fetch_k": 50,
            "lambda_mult": 1
        }
    )
    retrieved_docs = retriever.invoke("\n".join(attributes))

    print(f"Number of attributes: {len(attributes)}")
    print(f"Number of retrieved documents: {len(retrieved_docs)}")

    if not retrieved_docs:
        return None

    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    prompt_template = ChatPromptTemplate.from_template(
        """
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attributes from dataset A to their corresponding fields in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Definition Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
{context}
</context>

Match the following attributes to the data under <context> in Schema B:
[
{attributes}
]

Return the response in JSON format adhering to the defined schema.
"""
    )

    final_prompt = prompt_template.format_prompt(
        context=context, attributes="\n".join(attributes)
    ).to_string()

    with open(f"{ollama_prompts}", "a") as prompt_file:
        prompt_file.write(final_prompt + "\n\n" + ("-" * 50) + "\n\n")

    print(final_prompt)

    model = ChatOllama(model=llm_model)
    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    raw_response = structured_llm.invoke(final_prompt)

    structured_response = raw_response.get("parsed", None)

    print(structured_response)
    print("-" * 50 + "\n")

    return structured_response


if __name__ == "__main__":
    llm_model = "dolphin3:8b" # llama3.1:8b, granite3-dense:8b, marco-o1:latest
    embed_model = "snowflake-arctic-embed2:latest" # bge-m3:latest

    if embed_model == "bge-m3:latest" or embed_model == "bge-large:latest":
        embed_model_name = embed_model.split(":")[0].replace("/", "_")
    else:
        embed_model_name = embed_model.split("-")[0].replace("/", "_")  
    chunk_strat = "row_cs1000_co0"
    vectorstore_path = f"../embeddings/{embed_model_name}/{chunk_strat}_faiss_index"
    output_file = f"../data/responses/response_{llm_model.split(":")[0]}.json"

    # Reset the JSON file
    with open(output_file, "w", encoding='utf-8') as file:
        json.dump([], file)

    # Reset the prompts text file
    ollama_prompts = "../data/prompts/prompts_ollama_csv02.txt"
    with open(f"{ollama_prompts}", "w") as prompt_file:
        prompt_file.write("")

    all_responses = []

    for schema, attributes in attribute_schema_mapping.items():
        response = query_system(attributes, vectorstore_path, schema)
        if response:
            all_responses.extend(response)

    with open(output_file, "w", encoding='utf-8') as file:
        json.dump(all_responses, file, indent=2)

    print(f"All responses saved to {output_file}")


Number of attributes: 8
Number of retrieved documents: 16
Human: 
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attributes from dataset A to their corresponding fields in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Definition Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
'UUID des Datensatzes','UUID of Process data set','UUID','UUID','UUID des Datensatzes. Zusammen mit der Versionsnummer in "Datensatzversion" wird der Datensatz damit eindeutig identifizert','~','Automatically generated Universally Unique Identifier of this data se

### Similarity Score

In [42]:
import json
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Define JSON schema
json_schema = {
    "title": "AlignmentResponse",
    "description": "Response containing alignment mappings.",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "attribute": {
                "type": "string",
                "description": "The attribute from dataset A without additional information",
            },
            "match_type": {
                "type": "string",
                "description": "The type of SKOS match",
                "enum": ["skos:exactMatch", "skos:closeMatch", "skos:relatedMatch"],
            },
            "field_name": {
                "type": "string",
                "description": "The exact Field Name (en) from Schema B without additional information",
            },
        },
        "required": ["attribute", "match_type", "field_name"],
    },
}


def query_system(attributes, vectorstore_path, schema_filter):
    embeddings = OllamaEmbeddings(model="bge-m3:latest")
    vectorstore = FAISS.load_local(
        vectorstore_path, embeddings=embeddings, allow_dangerous_deserialization=True
    )

    # Retrieve documents with scores
    retrieved_docs_with_scores = vectorstore.similarity_search_with_score(
        "\n".join(attributes), k=2 * len(attributes), search_kwargs={
            "filter": {"schema_type": schema_filter},}
    )

    print(f"Number of attributes: {len(attributes)}")
    print(f"Number of retrieved documents: {len(retrieved_docs_with_scores)}")

    # Print retrieved document scores
    for doc, score in retrieved_docs_with_scores:
        print(f"Score: {score:.4f}")
        print(f"Content: {doc.page_content}")
        print("-" * 50)

    if not retrieved_docs_with_scores:
        return None

    context = "\n\n".join(doc.page_content for doc, _ in retrieved_docs_with_scores)

    prompt_template = ChatPromptTemplate.from_template(
        """
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attributes from dataset A to their corresponding fields in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Definition Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
{context}
</context>

Match the following attributes to the data under <context> in Schema B:
[Attribute,Value
{attributes}
]

Return the response in JSON format adhering to the defined schema.
"""
    )

    final_prompt = prompt_template.format_prompt(
        context=context, attributes="\n".join(attributes)
    ).to_string()

    with open(f"{ollama_prompts}", "a", encoding='utf-8') as prompt_file:
        prompt_file.write(final_prompt + "\n\n" + ("-" * 50) + "\n\n")

    print(final_prompt)

    model = ChatOllama(model="dolphin3:8b")

    # Add structured output to the LLM model
    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    raw_response = structured_llm.invoke(final_prompt)

    structured_response = raw_response.get("parsed", None)

    print(structured_response)
    print("-" * 50 + "\n")

    return structured_response


if __name__ == "__main__":
    embed_model_name = ""
    vectorstore_path = "../embeddings/bge-m3_row_cs3000_faiss_index"
    output_file = "../data/responses/response_ollama.json"

    # Reset the JSON file
    with open(output_file, "w", encoding='utf-8') as file:
        json.dump([], file)

    # Reset the prompts text file
    ollama_prompts = "../data/prompts/prompts_ollama_csv02.txt"
    with open(f"{ollama_prompts}", "w", encoding='utf-8') as prompt_file:
        prompt_file.write("")

    all_responses = []

    for schema, attributes in attribute_schema_mapping.items():
        response = query_system(attributes, vectorstore_path, schema)
        if response:
            all_responses.extend(response)

    with open(output_file, "w", encoding='utf-8') as file:
        json.dump(all_responses, file, indent=2)

    print(f"All responses saved to {output_file}")


Number of attributes: 19
Number of retrieved documents: 38
Score: 1.0038
Content: 'Hintergrundbericht','Data set LCA report, background info','referenceToExternalDocumentation','GlobalReferenceType','Hier können relevante Dokumente wie z.B. Sicherheitsdatenblätter, Hintergrundbericht o.ä. angehängt werden','Relevant information such as safety declarations or background report can be attached','"Source data set(s)" of detailed LCA study on the process or product represented by this data set, as well as documents / files with overarching documentative information on technology, geographical and / or time aspects etc. (e.g. basic engineering studies, process simulation results, patents, plant documentation, model behind the parameterisation of the "Mathematical model" section, etc.) (Note: can indirectly reference to digital file.)'
--------------------------------------------------
Score: 1.0155
Content: 'Datenquellen','Data source(s) used for this data set','referenceToDataSource','Glob

# Testing Data

In [1]:
# Attributes and Chunks
attribute_schema_mapping = {
    "EPD_DataSet": [
        "UUID",
        "Version",
        "Name (de)",
        "Name (en)",
        "Kategorie (original)",
        "Kategorie (en)",
        "Konformität",
        "Laenderkennung",
        "Typ",
        "Referenzjahr",
        "Gueltig bis",
        "URL",
        "Declaration owner",
        "Veroeffentlicht am",
        "Registrierungsnummer",
        "Registrierungsstelle",
        "Version des Vorgängers",
        "Modul",
        "Szenario",
        "Szenariobeschreibung",
    ],
}

# German and English
attribute_to_expected_chunk = {
    "UUID": "'UUID des Datensatzes','UUID of Process data set','UUID','UUID','UUID des Datensatzes. Zusammen mit der Versionsnummer in \"Datensatzversion\" wird der Datensatz damit eindeutig identifiziert','~','Automatically generated Universally Unique Identifier of this data set. Together with the \"Data set version\", the UUID uniquely identifies each data set.'",    
    "Version": "'Datensatzversion','Data set version','dataSetVersion','Version','Versionsnummer des Datensatzes','~','Version number of data set. First two digits refer to major updates, the second two digits to minor revisions and error corrections etc. The third three digits are intended for automatic and internal counting of versions during data set development. Together with the data set's UUID, the \"Data set version\" uniquely identifies each data set.'",    
    "Name (de)": "'Name','Name','baseName','StringMultiLang','Allgemeiner Name des Produkts oder Systems','~','General descriptive name of the process and/or its main good(s) or service(s) and/or its level of processing.'",
    "Name (en)": "'Name','Name','baseName','StringMultiLang','Allgemeiner Name des Produkts oder Systems','~','General descriptive name of the process and/or its main good(s) or service(s) and/or its level of processing.'",    
    "Kategorie (original)": "'Klassifizierung','Classification','classification','nan','ein Gliederungssystem mit Gliederungsklassen','~','Optional statistical or other classification of the data set. Typically also used for structuring LCA databases.'",
    "Kategorie (en)": "'Klassifizierung','Classification','classification','nan','ein Gliederungssystem mit Gliederungsklassen','~','Optional statistical or other classification of the data set. Typically also used for structuring LCA databases.'",    
    "Konformität": "'Konformität','Compliance','compliance','nan','eine Konformitätsdeklaration','~','one compliance declaration'",    
    "Laenderkennung": "'Ort','Location','@location','NullableString','Region, für die der Datensatz repräsentativ ist ISO 3166-Ländercode oder Regionalcode','Region, for which the data set is representative / relevant. ISO 3166 country code or regional code','Location, country or region the data set represents. [Note 1: This field does not refer to e.g. the country in which a specific site is located that is represented by this data set but to the actually represented country, region, or site. Note 2: Entry can be of type \"two-letter ISO 3166 country code\" for countries, \"seven-letter regional codes\" for regions or continents, or \"market areas and market organisations\", as predefined for the ILCD. Also a name for e.g. a specific plant etc. can be given here (e.g. \"FR, Lyon, XY Company, Z Site\"; user defined). Note 3: The fact whether the entry refers to production or to consumption / supply has to be stated in the name-field \"Mix and location types\" e.g. as \"Production mix\".]'",    
    "Typ": "'Subtyp','Subtype','epd:subType','generic datasetGenerischer Datensatzrepresentative datasetRepräsentativer Datensatzaverage datasetDurchschnittsdatensatzspecific dataset(Hersteller-) Spezifischer Datensatztemplate datasetMusterdatensatz','Gibt den Datensatztypen hinsichtlich Repräsentativität an. Einer der folgenden vordefinierten Datensatztypen muss ausgewählt werden: -specific dataset(spezifischer Datensatz) - hersteller-(unternehmens-) spezifischer Datensatz für ein konkretes Produkt eines Werkes -average dataset(Durchschnittsdatensatz) - durchschnittliche Datensätze von Industrieverbänden, mehreren Firmen, mehreren Werken oder mehreren Produkten (d.h. auf Grundlage von Daten der Industrieproduktion von Unternehmen) -representative dataset(repräsentativer Datensatz) - representative dataset – Daten, die repräsentativ für ein Land / eine Region sind (z.B. Durchschnitt DE) -template dataset(Muster-EPD-Datensatz) - unspezifische Datensätze für spezifische Produkte, die auf Basis einer „Muster-EPD“ erstellt wurden -generic dataset(generischer Datensatz) - generische Daten gemäß EN 15804 sowie andere, nicht auf Basis von Industriedaten modellierte Daten (z.B. auf der Basis von Literatur, Expertenwissen etc.)','Indicates the type of data set regarding its representativeness. One of the following predefined data types has to be chosen: -specific dataset- vendor (company) specific data for a specific product from one production site -average dataset- avarage datasets from industry associations, multiple manufacturers, multiple production sites or multiple products, i.e. modelled based on industry data from an manufacturer -representative dataset- data that is representative for a country or region (e.g. average for Germany) -template dataset- sample EPD, unspecific datasets for specific products, that were created based on a sample EPD -generic dataset- generic data acc. to EN 15804 and data based on other non-industry data sources (e.g. literature, expert knowledge)','nan'",
    "Referenzjahr": "'Referenzjahr','Reference year','referenceYear','Year','Das erste Jahr der Gültigkeits des Datensatzes.','Start year of the time period for which the data set is valid (until year of \"Data set valid until\")','Start year of the time period for which the data set is valid (until year of \"Data set valid until:\"). For data sets that combine data from different years, the most representative year is given regarding the overall environmental impact. In that case, the reference year is derived by expert judgement.'",
    "Gueltig bis": "'Gültig bis','Data set valid until:','dataSetValidUntil','Year','Ende des Zeitabschnitts, bis zu dem der Datensatz gültig ist.','End year of the time period for which the data set is valid.','End year of the time period for which the data set is still valid / sufficiently representative. This date also determines when a data set revision / remodelling is required or recommended due to expected relevant changes in environmentally or technically relevant inventory values, including in the background system.'",
    "URL": "'Permanente Datensatz-URI','Permanent data set URI','permanentDataSetURI','anyURI','URI zum Original dieses Datensatzes','~','URI (i.e. an internet address) of the original of this data set. [Note: This equally globally unique identifier supports users and software tools to identify and retrieve the original version of a data set via the internet or to check for available updates. The URI must not represent an existing WWW address, but it should be unique and point to the data access point, e.g. by combining the data owner's www path with the data set's UUID, e.g. http://www.mycompany.com/lca/processes/50f12420-8855-12db-b606-0900210c9a66.]'",
    "Declaration owner": "'Eigner des Datensatzes','Owner of data set','referenceToOwnershipOfDataSet','GlobalReferenceType','\"Contact\"-Datensatz zum Eigentümer des Datensatzes','~','\"Contact data set\" of the person or entity who owns this data set. (Note: this is not necessarily the publisher of the data set.)'",
    "Veroeffentlicht am": "'Veröffentlichungsdatum der EPD','publication date of EPD','publicationDateOfEPD','xs:date','Datum der Veröffentlichung der EPD in der Form \"YYYY-MM-DD\".','Exact date of publication of the EPD in the form \"YYYY-MM-DD\".','nan'",
    "Registrierungsnummer": "'Registrierungsnummer','Registration number','registrationNumber','String','ID-Nummer der EPD oder des Projekts','ID number of EPD or project','A unique identifying number for this data set issued by the registration authority.'",
    "Registrierungsstelle": "'Herausgeber','Issuer','referenceToRegistrationAuthority','GlobalReferenceType','Kontaktdaten des Herausgebers des Datensatzes (z.B. EPD-Programmbetreiber)','~','\"Contact data set\" of the authority that has registered this data set.'",
    "Version des Vorgängers": "'Vorhergehende Datensatzversion','Preceding data set version','referenceToPrecedingDataSetVersion','GlobalReferenceType','nan','nan','Last preceding data set, which was replaced by this version. Either a URI of that data set (i.e. an internet address) or its UUID plus version number is given (or both).'",
    "Modul": "'Modul/Phase','Module/Phase','@epd:module','String','Modul oder Phase (z.B. \"A1-A3\")','Module or phase (e.g. \"A1-A3\")','nan'",
    "Szenario": "'Szenario','Scenario','@epd:scenario','String','Verweis auf die oben definierte ID eines Szenarios (falls definiert), für das dieser Wert gilt.','References ID of a scenario defined above','nan'",
    "Szenariobeschreibung": "'Beschreibung','Description','epd:description','FTMultiLang','Beschreibung des Szenarios','Description of the scenario','nan'",
}


# English-only
# attribute_to_expected_chunk = {
#     "UUID": "'UUID of Process data set','UUID','UUID','~','Automatically generated Universally Unique Identifier of this data set. Together with the \"Data set version\", the UUID uniquely identifies each data set.'",
#     "Version": "'Data set version','dataSetVersion','Version','~','Version number of data set. First two digits refer to major updates, the second two digits to minor revisions and error corrections etc. The third three digits are intended for automatic and internal counting of versions during data set development. Together with the data set's UUID, the \"Data set version\" uniquely identifies each data set.'",
# 	"Name (de)": "'Name','baseName','StringMultiLang','~','General descriptive name of the process and/or its main good(s) or service(s) and/or its level of processing.'",
#     "Name (en)": "'Name','baseName','StringMultiLang','~','General descriptive name of the process and/or its main good(s) or service(s) and/or its level of processing.'",
#     "Kategorie (original)": "'Classification','classification','nan','~','Optional statistical or other classification of the data set. Typically also used for structuring LCA databases.'",
#     "Kategorie (en)": "'Classification','classification','nan','~','Optional statistical or other classification of the data set. Typically also used for structuring LCA databases.'",
#     "Konformität": "'Compliance','compliance','nan','~','one compliance declaration'",
#     "Laenderkennung": "'Location','@location','NullableString','Region, for which the data set is representative / relevant. ISO 3166 country code or regional code','Location, country or region the data set represents. [Note 1: This field does not refer to e.g. the country in which a specific site is located that is represented by this data set but to the actually represented country, region, or site. Note 2: Entry can be of type \"two-letter ISO 3166 country code\" for countries, \"seven-letter regional codes\" for regions or continents, or \"market areas and market organisations\", as predefined for the ILCD. Also a name for e.g. a specific plant etc. can be given here (e.g. \"FR, Lyon, XY Company, Z Site\"; user defined). Note 3: The fact whether the entry refers to production or to consumption / supply has to be stated in the name-field \"Mix and location types\" e.g. as \"Production mix\".]'",
#     "Typ": "'Subtype','epd:subType','generic datasetGenerischer Datensatzrepresentative datasetRepräsentativer Datensatzaverage datasetDurchschnittsdatensatzspecific dataset(Hersteller-) Spezifischer Datensatztemplate datasetMusterdatensatz','Indicates the type of data set regarding its representativeness. One of the following predefined data types has to be chosen: -specific dataset- vendor (company) specific data for a specific product from one production site -average dataset- avarage datasets from industry associations, multiple manufacturers, multiple production sites or multiple products, i.e. modelled based on industry data from an manufacturer -representative dataset- data that is representative for a country or region (e.g. average for Germany) -template dataset- sample EPD, unspecific datasets for specific products, that were created based on a sample EPD -generic dataset- generic data acc. to EN 15804 and data based on other non-industry data sources (e.g. literature, expert knowledge)','nan'",
#     "Referenzjahr": "'Reference year','referenceYear','Year','Start year of the time period for which the data set is valid (until year of \"Data set valid until\")','Start year of the time period for which the data set is valid (until year of \"Data set valid until:\"). For data sets that combine data from different years, the most representative year is given regarding the overall environmental impact. In that case, the reference year is derived by expert judgement.'",
#     "Gueltig bis": "'Data set valid until:','dataSetValidUntil','Year','End year of the time period for which the data set is valid.','End year of the time period for which the data set is still valid / sufficiently representative. This date also determines when a data set revision / remodelling is required or recommended due to expected relevant changes in environmentally or technically relevant inventory values, including in the background system.'",
#     "URL": "'Permanent data set URI','permanentDataSetURI','anyURI','~','URI (i.e. an internet address) of the original of this data set. [Note: This equally globally unique identifier supports users and software tools to identify and retrieve the original version of a data set via the internet or to check for available updates. The URI must not represent an existing WWW address, but it should be unique and point to the data access point, e.g. by combining the data owner's www path with the data set's UUID, e.g. http://www.mycompany.com/lca/processes/50f12420-8855-12db-b606-0900210c9a66.]'",
#     "Declaration owner": "'Owner of data set','referenceToOwnershipOfDataSet','GlobalReferenceType','~','\"Contact data set\" of the person or entity who owns this data set. (Note: this is not necessarily the publisher of the data set.)'",
#     "Veroeffentlicht am": "'publication date of EPD','publicationDateOfEPD','xs:date','Exact date of publication of the EPD in the form \"YYYY-MM-DD\".','nan'",
#     "Registrierungsnummer": "'Registration number','registrationNumber','String','ID number of EPD or project','A unique identifying number for this data set issued by the registration authority.'",
#     "Registrierungsstelle": "'Issuer','referenceToRegistrationAuthority','GlobalReferenceType','~','\"Contact data set\" of the authority that has registered this data set.'",
#     "Version des Vorgängers": "'Preceding data set version','referenceToPrecedingDataSetVersion','GlobalReferenceType','nan','Last preceding data set, which was replaced by this version. Either a URI of that data set (i.e. an internet address) or its UUID plus version number is given (or both).'",
#     "Modul": "'Module/Phase','@epd:module','String','Module or phase (e.g. \"A1-A3\")','nan'",
#     "Szenario": "'Scenario','@epd:scenario','String','References ID of a scenario defined above','nan'",
#     "Szenariobeschreibung": "'Description','epd:description','FTMultiLang','Description of the scenario','nan'",
# }

# Context with rerank

In [2]:
# Context Creation and Validation

import os
import datetime
import pandas as pd
from typing import List, Dict, Optional, Set
import torch
import gc

# Sentence-Transformers
from sentence_transformers import SentenceTransformer, CrossEncoder

# LangChain & Community modules
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings


###############################################################################
# 1) Custom SentenceTransformer Embeddings Wrapper
###############################################################################
class CustomSentenceTransformerEmbeddings(Embeddings):
    """
    Allows using a SentenceTransformer model within a LangChain-based FAISS store.
    """

    def __init__(self, model: SentenceTransformer):
        self.model = model

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts).tolist()

    def unload_model(self):
        """
        Remove the model from memory after processing to free up GPU resources.
        """
        if self.model:
            del self.model  # Delete the model instance
            self.model = None  # Ensure the reference is cleared
            torch.cuda.empty_cache()  # Clear the GPU cache
            gc.collect()  # Run garbage collection
        else:
            print("[DEBUG] Embedding model was already None or not set.")


###############################################################################
# 2) Reranker Class (CrossEncoder)
###############################################################################
class Reranker:
    """
    A reranker that uses a CrossEncoder to score (query, doc) pairs and reorder them.
    """

    def __init__(self, model_name: str, device: str = "cuda"):
        self.model_name = model_name
        self.reranker = CrossEncoder(
            model_name,
            automodel_args={"torch_dtype": "auto"},
            trust_remote_code=True,
            device=device,
            revision="main",
        )

    def rerank(
        self, query: str, documents: List[str], top_n: Optional[int] = None
    ) -> List[Dict]:
        """
        Return a list of dicts: [{"text": doc_text, "score": float, "rank": int}, ...],
        sorted descending by score. If top_n is given, truncate the list.
        """
        if not documents:
            return []
        pairs = [[query, doc] for doc in documents]
        scores = self.reranker.predict(pairs)  # shape: [num_docs]

        # Sort by descending score
        doc_score_pairs = sorted(
            zip(documents, scores), key=lambda x: x[1], reverse=True
        )
        # Build result
        results = []
        for i, (text, score) in enumerate(doc_score_pairs):
            results.append({"text": text, "score": float(score), "rank": i + 1})
        if top_n is not None:
            results = results[:top_n]
        return results

    def evaluate_rerank(
        self,
        expected_chunk: str,
        reranked_results: List[Dict],
        comparison_length: int = 100,
    ) -> Dict:
        """
        Evaluate how well the reranker found the `expected_chunk`.
        Returns: {'found': bool, 'rank': int or None, 'mrr': float}
        """
        target_prefix = expected_chunk[:comparison_length].strip()
        found_rank = None
        for item in reranked_results:
            doc_prefix = item["text"][:comparison_length].strip()
            if doc_prefix == target_prefix:
                found_rank = item["rank"]
                break

        if found_rank is None:
            return {"found": False, "rank": None, "mrr": 0.0}
        else:
            return {"found": True, "rank": found_rank, "mrr": 1.0 / found_rank}

    def unload_model(self):
        """
        Remove the model from memory after processing to free up GPU resources.
        """
        if self.reranker:
            del self.reranker  # Delete the model instance
            self.reranker = None  # Ensure the reference is cleared
            torch.cuda.empty_cache()  # Clear the GPU cache
            gc.collect()  # Run garbage collection


###############################################################################
# 3) Utility Functions
###############################################################################
def evaluate_retrieval(
    retrieved_chunks: List[str],
    expected_chunks: List[str],
    comparison_length: int = 100,
):
    """
    Compare the first 'comparison_length' chars between retrieved vs. expected chunk(s)
    for an exact-match check.
    """
    expected_prefixes = set(c[:comparison_length].strip() for c in expected_chunks)
    retrieved_prefixes = set(c[:comparison_length].strip() for c in retrieved_chunks)

    correctly_retrieved = expected_prefixes.intersection(retrieved_prefixes)
    missed_chunks = expected_prefixes.difference(retrieved_prefixes)

    return correctly_retrieved, missed_chunks


def evaluate_ranked_retrieval(
    docs_with_ranks: List, expected_chunks: List[str], comparison_length: int = 100
):
    """
    For each expected chunk, see where it appears among docs_with_ranks (rank, doc).
    Then compute mean_rank, mrr, etc.
    """
    ranks = []
    for chunk in expected_chunks:
        chunk_prefix = chunk[:comparison_length].strip()
        matched_positions = [
            rank
            for (rank, doc) in docs_with_ranks
            if doc.page_content[:comparison_length].strip() == chunk_prefix
        ]
        ranks.append(matched_positions[0] if matched_positions else None)

    found_ranks = [r for r in ranks if r is not None]
    missed_count = sum(r is None for r in ranks)
    total_expected = len(expected_chunks)

    if found_ranks:
        if len(found_ranks) == 1:
            # Directly assign the single rank as integer
            mean_rank = found_ranks[0]
            rank = mean_rank
        else:
            mean_rank = sum(found_ranks) / len(found_ranks)
        mrr = sum((1.0 / r) for r in found_ranks) / len(found_ranks)
    else:
        mean_rank = None
        mrr = None

    return {
        "found": len(found_ranks),
        "missed": missed_count,
        "mean_rank": mean_rank,
        "rank": rank if "rank" in locals() else None,
        "mrr": mrr,
    }


def build_context(reranked_results: List[Dict], threshold: float = 0.2) -> Set[str]:
    """
    From a reranked list, build a 'context' set by including docs whose
    score is within 'threshold' of top doc's score.
    Returns a set of doc strings (unique).
    """
    print("\nInclude chunk in context threshold:", threshold)

    if not reranked_results:
        return set()

    sorted_docs = sorted(reranked_results, key=lambda x: x["score"], reverse=True)
    top_score = sorted_docs[0]["score"]
    context_set = set()
    for item in sorted_docs:
        score_diff = top_score - item["score"]
        if score_diff <= threshold:
            context_set.add(item["text"])
        else:
            break
    return context_set


def inspect_context(
    context_set: Set[str],
    expected_chunks: Dict[str, str],
    comparison_length: int = 100,
):
    """
    Inspect if the first 'comparison_length' characters of every expected chunk
    are present within the context.
    """
    print(
        f"\n## Inspection of Contexts for Expected Chunks (First {comparison_length} Characters)\n"
    )

    all_found = True
    missing_chunks = set()

    for attribute, expected_chunk_str in expected_chunks.items():
        expected_prefix = expected_chunk_str[:comparison_length].strip()

        found = any(
            chunk[:comparison_length].strip() == expected_prefix
            for chunk in context_set
        )
        status = "FOUND" if found else "MISSING"

        if status == "MISSING":
            missing_chunks.add(expected_prefix)
            all_found = False

        print(f"  - Attribute: {attribute} => {status}")
        print(f'    • Expected Prefix: "{expected_prefix}"')

    if all_found:
        print("  **All expected chunks are present in the context.**\n")
    else:
        print("  **Some expected chunks are missing in the context.**")
        for missing_chunk in missing_chunks:
            print(f"     - {missing_chunk}")


def print_retrieval_debug(
    attribute: str,
    k_value: int,
    retrieved_chunks: List[str],
    retrieval_eval: Dict,
    comparison_length: int,
    correctly_retrieved: Set[str],
    missed_chunks: Set[str],
):
    # Print original retrieval debug info (like old code)
    print(f"\n=== Attribute: {attribute} | k={k_value} ===")
    print(f"Query: {attribute}")
    print(f"Retrieved docs: {len(retrieved_chunks)}")

    # Exact-match
    found_count = retrieval_eval["found"]
    missed_count = retrieval_eval["missed"]
    total = found_count + missed_count

    print(f"\nExact-Match Evaluation (First {comparison_length} chars):")
    print(f"  Correctly retrieved: {found_count}/{total}")
    for prefix in correctly_retrieved:
        print(f"    ✔ {prefix}")
    print(f"  Missed: {missed_count}/{total}")
    for prefix in missed_chunks:
        print(f"    ✘ {prefix}")


def print_ranking_debug(rank_eval: Dict):
    # Print old style ranking debug
    print("\nRanked Evaluation:")
    found = rank_eval["found"]
    missed = rank_eval["missed"]
    total = found + missed

    mean_rank = rank_eval["mean_rank"]
    rank = rank_eval.get("rank")
    mrr = rank_eval.get("mrr")

    print(f"  Found: {found}/{total}")
    print(f"  Missed: {missed}/{total}")
    print(f"  Mean Rank: {mean_rank if mean_rank else 'N/A'}")
    print(f"  Rank: {rank if rank else 'N/A'}")
    print(f"  MRR: {mrr if mrr else 'N/A'}")


def print_reranking_debug(
    reranker_label: str,
    reranked: List[Dict],
    context_set: Set[str],
    k_value: int,
):
    # Show top k_value docs from reranked
    print(f"\n[Using Reranker: {reranker_label}]")
    for item in reranked[:k_value]:
        snippet_preview = item["text"][:70].replace("\n", " ")
        print(f"  R{item['rank']} | Score={item['score']:.4f} | {snippet_preview}...")

    # Show the context set
    print("\n[Context Set]:")
    for chunk in context_set:
        snippet_preview = chunk[:70].replace("\n", " ")
        print(f"  - {snippet_preview}...")


###############################################################################
# 4) Master Function to Create Context
###############################################################################
def create_context_pipeline(
    attributes: List[str],
    vectorstore_path: str,
    schema: str,
    expected_chunks: Dict[str, str],
    embedding_model: Embeddings,
    reranker: Reranker,
    context_threshold: float,
    k_value: int = 24,
    comparison_length: int = 100,
) -> Set[str]:
    """
    Creates context by retrieving and reranking documents for the given attributes.
    """
    context_set = set()

    # Load vectorstore
    vectorstore = FAISS.load_local(
        vectorstore_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )

    # For each attribute, perform retrieval and reranking
    for attribute in attributes:
        expected_chunk = expected_chunks.get(attribute)
        if not expected_chunk:
            print(f"[Warning] No expected chunk for '{attribute}'. Skipping.")
            continue

        # Retrieve using MMR
        retriever = vectorstore.as_retriever(
            search_type="mmr",
            search_kwargs={
                "filter": {"schema_type": schema},
                "k": k_value,
                "fetch_k": 100,
                "lambda_mult": 1.0,
            },
        )
        docs = retriever.invoke(attribute)
        retrieved_chunks = [d.page_content for d in docs]

        # Evaluate original retrieval
        correctly_retrieved, missed_chunks = evaluate_retrieval(
            retrieved_chunks, [expected_chunk], comparison_length
        )
        docs_with_ranks = list(enumerate(docs, start=1))
        rank_eval = evaluate_ranked_retrieval(
            docs_with_ranks, [expected_chunk], comparison_length
        )

        # Debugging output
        retrieval_eval = {
            "found": len(correctly_retrieved),
            "missed": len(missed_chunks),
        }
        print_retrieval_debug(
            attribute,
            k_value,
            retrieved_chunks,
            retrieval_eval,
            comparison_length,
            correctly_retrieved,
            missed_chunks,
        )
        print_ranking_debug(rank_eval)

        # Rerank the top k_value documents
        top_docs_for_rerank = retrieved_chunks[:k_value]
        print(f"\n[Original Top Docs (first {len(top_docs_for_rerank)})]")
        for i, doc_text in enumerate(top_docs_for_rerank, start=1):
            print(f"  {i}. {doc_text[:80].replace('\n',' ')}...")

        reranked = reranker.rerank(attribute, top_docs_for_rerank, top_n=None)

        # Evaluate rerank
        rerank_eval = reranker.evaluate_rerank(
            expected_chunk, reranked, comparison_length
        )
        # Build context based on threshold
        context_subset = build_context(reranked, threshold=context_threshold)
        context_set.update(context_subset)

        # Determine if expected chunk is in context
        expected_prefix = expected_chunk[:comparison_length].strip()
        context_contains_expected = any(
            chunk[:comparison_length].strip() == expected_prefix
            for chunk in context_subset
        )

        # Debugging output for reranking
        print_reranking_debug(reranker.model_name, reranked, context_subset, k_value)

    # Inspect the final context
    inspect_context(context_set, expected_chunks, comparison_length=comparison_length)

    return context_set


###############################################################################
# 5) Model Selection and Pipeline Execution
###############################################################################
# Configuration Parameters
EMBEDDING_MODEL_NAME = "jinaai/jina-embeddings-v3"  # Change as needed
RERANKER_MODEL_NAME = "jinaai/jina-reranker-v2-base-multilingual"  # jinaai/jina-reranker-v2-base-multilingual, BAAI/bge-reranker-v2-m3, Alibaba-NLP/gte-multilingual-reranker-base
CONTEXT_THRESHOLD = 0.2  # Adjustable threshold
K_VALUE = 10  # Number of top documents to consider
COMPARISON_LENGTH = 100  # Number of characters for comparison

# Paths and Mappings (Update these as per your environment)
vectorstore_paths = {
    "jinaai/jina-embeddings-v3": "../embeddings/jinaai_jina-embeddings-v3/row_cs2000_co0_faiss_index_COS",
    # Add other vectorstore paths if necessary
}


# Initialize Embedding Model
def initialize_embedding_model(model_name: str, device: str = "cuda") -> Embeddings:
    if model_name.startswith("jinaai"):
        st_model = SentenceTransformer(
            model_name,
            trust_remote_code=True,
            device=device,
            model_kwargs={"use_flash_attn": False},
        )
        embeddings = CustomSentenceTransformerEmbeddings(st_model)
    elif model_name.startswith("HIT-TMG"):
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={
                "trust_remote_code": True,
                "device": device,
            },
        )
    else:
        embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embedding model: {model_name}")
    return embeddings


# Initialize Reranker Model
def initialize_reranker(model_name: str, device: str = "cuda") -> Reranker:
    reranker = Reranker(model_name=model_name, device=device)
    print(f"Initialized reranker model: {model_name}")
    return reranker


# Select Single Embedding and Reranker Models
selected_embedding_model_name = EMBEDDING_MODEL_NAME
selected_reranker_model_name = RERANKER_MODEL_NAME

embedding_model = initialize_embedding_model(selected_embedding_model_name)
reranker = initialize_reranker(selected_reranker_model_name)

# Initialize Context Map
context_map: Dict[str, Set[str]] = {}

# Execute Context Creation Pipeline
all_contexts = set()

for schema, attributes in attribute_schema_mapping.items():
    print(f"\nProcessing Schema: {schema}")
    context = create_context_pipeline(
        attributes=attributes,
        vectorstore_path=vectorstore_paths[selected_embedding_model_name],
        schema=schema,
        expected_chunks={
            attr: attribute_to_expected_chunk[attr] for attr in attributes
        },
        embedding_model=embedding_model,
        reranker=reranker,
        context_threshold=CONTEXT_THRESHOLD,
        k_value=K_VALUE,
        comparison_length=COMPARISON_LENGTH,
    )
    all_contexts.update(context)

# Unload Models to Free GPU Memory
print("\n>>> Unloading Models from GPU Memory...")
embedding_model.unload_model()
reranker.unload_model()
print(">>> All models have been unloaded.")

# Final Context
print(f"\n>>> Final Context Size: {len(all_contexts)} chunks")

Initialized embedding model: jinaai/jina-embeddings-v3
Initialized reranker model: jinaai/jina-reranker-v2-base-multilingual

Processing Schema: EPD_DataSet

=== Attribute: UUID | k=10 ===
Query: UUID
Retrieved docs: 10

Exact-Match Evaluation (First 100 chars):
  Correctly retrieved: 1/1
    ✔ 'UUID des Datensatzes','UUID of Process data set','UUID','UUID','UUID des Datensatzes. Zusammen mit
  Missed: 0/1

Ranked Evaluation:
  Found: 1/1
  Missed: 0/1
  Mean Rank: 1
  Rank: 1
  MRR: 1.0

[Original Top Docs (first 10)]
  1. 'UUID des Datensatzes','UUID of Process data set','UUID','UUID','UUID des Datens...
  2. 'Eindeutiger Klassenidentifizierer','Unique class identifier','@classId','string...
  3. 'Permanente Datensatz-URI','Permanent data set URI','permanentDataSetURI','anyUR...
  4. 'Datensatzversion','Data set version','dataSetVersion','Version','Versionsnummer...
  5. 'Produktsystem-ID','Product system ID','@epd:productsystem-id','string','ID des ...
  6. 'CAS-Nummer','CAS number'

In [3]:
print(all_contexts)

{'\'Referenzjahr\',\'Reference year\',\'referenceYear\',\'Year\',\'Das erste Jahr der Gültigkeits des Datensatzes.\',\'Start year of the time period for which the data set is valid (until year of "Data set valid until")\',\'Start year of the time period for which the data set is valid (until year of "Data set valid until:"). For data sets that combine data from different years, the most representative year is given regarding the overall environmental impact. In that case, the reference year is derived by expert judgement.\'', "'Inhaltsangabe','Content Declaration','epd2:contentDeclaration','nan','Inhaltsangabe gemäss EN 15804/ISO 21930.','Content declaration according to EN 15804/ISO 219301.The content declaration may contain component, material and/or substance elements, which may (but do not have to) be nested.','nan'", "'Name','Name','@epd:name','string','Name des Szenarios','Name of the scenario','nan'", "'Veröffentlichung und Eigentümer','Publication and ownership','publicationAnd

In [10]:
# Passing Context to the LLM

import json
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Define JSON schema for LLM response
json_schema = {
    "title": "AlignmentResponse",
    "description": "Response containing alignment mappings.",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "attribute": {
                "type": "string",
                "description": "The attribute from dataset A without additional information",
            },
            "match_type": {
                "type": "string",
                "description": "The type of SKOS match",
                "enum": ["skos:exactMatch", "skos:closeMatch", "skos:relatedMatch"],
            },
            "field_name": {
                "type": "string",
                "description": "The exact Field Name (en) from Schema B without additional information",
            },
        },
        "required": ["attribute", "match_type", "field_name"],
    },
}

def query_system(
    attributes: List[str],
    context: Set[str],
    llm_model: str,
    prompts_file: str,
):
    """
    Queries the LLM with the provided context and attributes.
    """
    # Prepare the context string
    context_str = "\n\n".join(context)

    # Define the prompt template
    prompt_template = ChatPromptTemplate.from_template(
        """
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attributes from dataset A to their corresponding fields in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Definition Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
{context}
</context>

Match the following attributes to the data under <context> in Schema B:
[
{attributes}
]

Return the response in JSON format adhering to the defined schema.
"""
    )

    final_prompt = prompt_template.format_prompt(
        context=context_str, attributes="\n".join(attributes)
    ).to_string()

    # Save the prompt for auditing or debugging
    with open(prompts_file, "a") as prompt_file:
        prompt_file.write(final_prompt + "\n\n" + ("-" * 50) + "\n\n")

    print("Final Prompt:\n", final_prompt)

    # Initialize the LLM model
    model = ChatOllama(model=llm_model)
    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )

    # Invoke the LLM
    raw_response = structured_llm.invoke(final_prompt)

    # Extract the structured response
    structured_response = raw_response.get("parsed", None)

    print("Structured Response:\n", json.dumps(structured_response, indent=2))
    print("-" * 50 + "\n")

    return structured_response


###############################################################################
# Example Usage
###############################################################################
if __name__ == "__main__":
    # Configuration Parameters
    LLM_MODEL = "command-r7b:latest"  # falcon3:7b-instruct-q4_K_M, command-r7b:latest

    # Paths for prompts and responses
    prompts_file = f"../data/prompts/prompts_ollama_{LLM_MODEL.split(':')[0]}.txt"
    output_file = f"../data/responses/response_{LLM_MODEL.split(':')[0]}.json"

    # Initialize the response list
    all_responses = []

    # Assuming 'all_contexts' is available from Cell 1
    # If running in separate cells, ensure 'all_contexts' is accessible here
    # For example, by saving to a file in Cell 1 and loading here
    # Here, we assume both cells are run in the same session

    # Convert all_contexts to list for processing
    context_list = list(all_contexts)

    # Iterate over each schema and its attributes
    for schema, attributes in attribute_schema_mapping.items():
        print(f"\nProcessing Schema: {schema}")
        response = query_system(
            attributes=attributes,
            context=set(context_list),
            llm_model=LLM_MODEL,
            prompts_file=prompts_file,
        )
        if response:
            all_responses.extend(response)

    # Save all responses to the output JSON file
    with open(output_file, "w", encoding='utf-8') as file:
        json.dump(all_responses, file, indent=2)

    print(f"All responses saved to {output_file}")



Processing Schema: EPD_DataSet
Final Prompt:
 Human: 
You are an expert in semantic data alignment and ontology matching. Your task is to map the provided attributes from dataset A to their corresponding fields in Schema B. Use the SKOS relationship types to indicate the alignment:
- skos:exactMatch: Attributes are identical in meaning.
- skos:closeMatch: Attributes are strongly similar, differing only in minor details.
- skos:relatedMatch: Attributes are conceptually related but not hierarchically or equivalently aligned.

Definition Schema B:
<headers>
'Field Name (de)','Field Name (en)','Element/Attribute Name','Datatype','Definition (de)','Definition (en)','Original ILCD Format Definition'
</headers>
<context>
'EC-Nummer','EC number','@epd2:ECNumber','String with pattern 000-000-0','EC-Nummer des Materials oder der Substanz','EC Number of the material or substance','nan'

'Referenzjahr','Reference year','referenceYear','Year','Das erste Jahr der Gültigkeits des Datensatzes.','Star