In [None]:
import os
import json
import time
import ollama

In [None]:

def read_text_file(file_path):
    """Return the entire content of ``file_path`` as a string, decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

def split_text_into_chunks(text, max_chunk_size):
    """Split ``text`` into consecutive pieces of at most ``max_chunk_size`` characters.

    The split is purely positional: sizes are counted in characters (not model
    tokens), and a cut may fall in the middle of a word or sentence.

    Args:
        text: The string to split.
        max_chunk_size: Maximum length of each chunk; must be a positive integer.

    Returns:
        The chunks in their original order; concatenating them reproduces
        ``text``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        # A negative step makes range() empty, which would silently drop the whole
        # text; a zero step fails with an unrelated-looking range() error.
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size!r}"
        )
    return [
        text[start:start + max_chunk_size]
        for start in range(0, len(text), max_chunk_size)
    ]

# Ontology description

In [None]:
# Plain-text summary of the TetraOnto ontology (classes, object properties, data
# properties) that is interpolated into the system prompts further down.
# It is a plain string: there is nothing to interpolate, so no f-prefix is needed.
# NOTE(review): identifiers such as "relizedIn", "hasMixLowWaterFlow" and
# "hasNotchHight" look misspelled but are left untouched because they may match the
# real ontology IRIs -- confirm against the OWL file before changing them.
ontology_description = """
The ontology prefix: TetraOnto: <http://www.semanticweb.org/fethi/ontologies/2024/1/TetraOnto.owl#>\n\n
Classes :
    - "geographicZone",
    - "mainContractor", "projectOwner", "otherActor",
    - Migratory species classes:
        "amphibioticMigrant",

    - Fishway types:
        "basinPasses", "successivesBasinsPass", "preBarrage", "speedBumpsPass",
        "fishLift", "fishLock", "artificialRiver", "verticalSlotPassage", "baffleFishway",
        "eelLadder", "trapPass", "studSubstrate", "baffleBrush",

    - Structure types:
        "dam", "weir", "nozzle", "pondDyke", "gate",

    - Water body types:
        "affluent", "arm", "counterDrainageChannel", "pool", "river",

    - Restoration measure types:
        "reconnection", "poolCreation", "de-silting", "regularFlooding",
        "floodRetention", "poolRestoration", "riverbankRestoration", "channelCreation"\n

  Objects Properties:
    - "relizedIn", means that a "restoration measure" is relizedIn a "geographic zone",
    - "associatedTo", means that a "restoration measure" is associatedTo a "structure"
    - "isLocatedOn", means that a "structure" isLocatedOn a "water body"
    - "isManagedBy", means that a project of "restoration measure" isManagedBy a "project owner" and/or by "other actor"
    - "containsTechnicalElement", means that a "restoration measure" contains a "technical element"
    - "hasMainContractor",
    - "concerns", means that a "restoration measure" belongs to a "type of restoration"

  Data Properties:
    - "hasCost": a project of restoration measure has some cost
    - "startsAt": a restoration measure starts at a date time
    - "endsAt": a restoration measure ends at a date time
    - "hasHeight": a "structure" hasHeight "decimal"
    - "isImpassable": to say if a "structure" is considered impassable (true) if its height is greater than 1.0 m)
    - "hasMaxLowWaterFlow": a river has Max Low Water Flow
    - "hasMixLowWaterFlow": a river has Min Low Water Flow
    - "hasPoolLength": the pool length
    - "hasPoolWidth": the pool width
    - "hasNotchHight": the noch hight
    - "hasNotchWidth": the noch width
    - "numberOfPools": e.g., a river or artificial river instance has a number of pools
    - "numberOfBasins": the number of basins
    - "averageSlope": a river or artificial river instance has a number of average slope
    - "hasBiologicalMonitoring", a restoration mesure is subject or no to Biological Monitoring
    - "hasHydromorphologicalMonitoring", a restoration mesure is subject or no to Hydromorphological Monitoring
"""

# Input text files & LLMs list

In [None]:
# Directory that holds the input .txt documents
folder_path = "/.../base_test"

# Model names handed to ollama.chat; every input file is run through each of them
models = [
    "qwen2.5:14b",
    "qwen2:7b",
    "llama3.2:latest",
    "llama3.1:8b",
    "llama3:latest",
    "mistral:latest",
]

# Zero-shot without template

In [None]:

# Zero-shot extraction without a template: each (input file, model) pair gets one
# chat call, and the reply is saved as a Turtle file in `output_folder`.
output_folder = "/.../zero_shot_without_Template_outputs"

os.makedirs(output_folder, exist_ok=True)

# Timing registry keyed by (file_name, model). No earlier cell in the visible
# notebook defines it, so create it here to survive "Restart & Run All"; an
# already existing registry is kept as-is.
if "execution_times" not in globals():
    execution_times = {}

system_content = f"""You are provided with an ontology description and a text input. Create instances (individuals) extracted from the text based on this ontology.
The extracted indivuduals output will be in turtle format.
Do not include any additional information\n

Here is the current ontology description:
{ontology_description}\n\n

Text input:\n
"""

# Process each text file in the folder (sorted so runs are reproducible;
# os.listdir returns entries in arbitrary order)
for file_name in sorted(os.listdir(folder_path)):
    # Skip non-text files
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Read the content of the file
    user_content = read_text_file(file_path)

    for model in models:
        print(f"Processing file '{file_name}' with model '{model}'...")

        # Time only the model call
        start_time = time.time()

        response = ollama.chat(
            model=model,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ],
        )

        end_time = time.time()

        # Record and report the execution time
        execution_time = end_time - start_time
        execution_times[(file_name, model)] = execution_time

        minutes, seconds = divmod(execution_time, 60)
        formatted_time = f"{int(minutes)} minutes and {seconds:.2f} seconds"

        print(f"Execution time for model '{model}' on file '{file_name}': {formatted_time}")

        # ':' in the model name is replaced before it is used in the file name
        output_file_name = f"{os.path.splitext(file_name)[0]}_{model.replace(':', '_')}.ttl"
        output_file_path = os.path.join(output_folder, output_file_name)

        # LLM response content
        turtle_data = response["message"]["content"]

        # Overwrite instead of append: re-running the cell must not stack a second
        # prefix header and reply onto the file left by the previous run.
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write("@prefix : <http://www.semanticweb.org/fethi/ontologies/2024/1/TetraOnto.owl#> .\n")
            output_file.write("@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n")
            output_file.write("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n")
            output_file.write(turtle_data)

print("Processing complete for all files and models.")

# Zero-shot with template

In [None]:
# Zero-shot extraction with a template: same loop as the template-free run, but the
# system prompt additionally carries the content of template.txt.
template = read_text_file('/.../template.txt')

output_folder = "/.../zero_shot_template_output"

os.makedirs(output_folder, exist_ok=True)

# Timing registry keyed by (file_name, model). Create it when the kernel does not
# have one yet so this cell also runs on a fresh kernel; an existing registry is
# kept (entries with the same key are overwritten below).
if "execution_times" not in globals():
    execution_times = {}

system_content = f"""You are provided with an ontology description and a text input. Create instances (individuals) extracted from the text based on the ontology description and the provided following template:
Do not include any additional information\n

Here is the current ontology description:
{ontology_description}\n\n

Here is the given template to follow:\n
{template}\n\n

Text input:\n
"""

# Process each text file in the folder (sorted so runs are reproducible;
# os.listdir returns entries in arbitrary order)
for file_name in sorted(os.listdir(folder_path)):
    # Skip non-text files
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Read the content of the file
    user_content = read_text_file(file_path)

    for model in models:
        print(f"Processing file '{file_name}' with model '{model}'...")

        # Time only the model call
        start_time = time.time()

        response = ollama.chat(
            model=model,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ],
        )

        end_time = time.time()

        # Record and report the execution time
        execution_time = end_time - start_time
        execution_times[(file_name, model)] = execution_time

        minutes, seconds = divmod(execution_time, 60)
        formatted_time = f"{int(minutes)} minutes and {seconds:.2f} seconds"

        print(f"Execution time for model '{model}' on file '{file_name}': {formatted_time}")

        # ':' in the model name is replaced before it is used in the file name
        output_file_name = f"{os.path.splitext(file_name)[0]}_{model.replace(':', '_')}.ttl"
        output_file_path = os.path.join(output_folder, output_file_name)

        # LLM response content
        turtle_data = response["message"]["content"]

        # Overwrite instead of append: re-running the cell must not stack a second
        # prefix header and reply onto the file left by the previous run.
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write("@prefix : <http://www.semanticweb.org/fethi/ontologies/2024/1/TetraOnto.owl#> .\n")
            output_file.write("@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n")
            output_file.write("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n")
            output_file.write(turtle_data)

print("Processing complete for all files and models.")

# One-shot

In [None]:

def generate_response_with_context(chunks, model, ontology_summary, assistant_message):
    """Send every chunk to ``model`` and collect the replies.

    Each chunk goes out in its own ``ollama.chat`` call, with ``ontology_summary``
    as the system message and the chunk as the user message. Despite the function
    name, the calls are independent: replies to earlier chunks are not forwarded
    to later calls.

    Args:
        chunks: Iterable of text pieces, processed in order.
        model: Model name passed to ``ollama.chat``.
        ontology_summary: System prompt used for every call.
        assistant_message: Not used; kept so existing callers keep working.

    Returns:
        A list with the reply text for each chunk, in the same order as ``chunks``.
    """
    responses = []

    for chunk in chunks:
        # One model call per chunk
        response = ollama.chat(
            model=model,
            messages=[
                {"role": "system", "content": ontology_summary},
                {"role": "user", "content": chunk},
            ],
        )
        responses.append(response["message"]["content"])

    return responses

# One-shot system prompt: task instructions, the ontology description, the expected
# output layout, and one worked example (French input text plus its answer).
# In the answer, the individual referenced by :isManagedBy (the "Maître d'ouvrage")
# is typed :projectOwner and the one referenced by :hasMainContractor (the
# "Maître d'oeuvre") is typed :mainContractor, so class assertions and properties agree.
# NOTE(review): the answer still references "DépartementduBasRhin", which is never
# declared as an individual, and the :otherActor individual is not linked to anything
# -- confirm whether that is intended.
system_message = f"""
You are an experienced instance extractor. 
Extract all instances from the following text and map them to the appropriate entities on the provided ontology description.\n
Do not include any additional information.\n

Here is the ontology description:\n
{ontology_description}\n\n
For each instance, write the result as follows:

    IndividualName is a :<class>
        :<ObjectProperty> to <AnotherIndividualName>
        :<DataProperty> : PropertyValue 
        label: "Individual Name"
        
    <AnotherIndividualName> is a :<class>
        label : "Another Individual Name" \n

Create the relationship between <IndividualName> and <AnotherIndividualName>.\n

Example Text: 
Mesure de restauration : Passe à poissons de Strasbourg
Masse(s) d'eau: Le Rhin
Catégorie: Continuité longitudinale
Pays: France
Département/Land: Bas-Rhin
Commune(s): Strasbourg
Maître d'oeuvre: Centre d'Ingénierie Hydraulique d'EDF, EDF R&D
Maître d'ouvrage: Electricité De France (EDF) Unité de production est
Autres acteurs: Agence de l'eau Rhin-Meuse, DREAL Alsace, ONEMA
Début des travaux: 2012
Fin des travaux: Prévue en 2015
Pression(s) sur la zone: Le barrage de la centrale hydroélectrique est l’un des obstacles d’une série. Il perturbe la remonté du Rhin vers les zones favorables à la vie et à la reproduction des poissons. 
Travaux et aménagements: Création de la passe à poissons, avec :
   • Une succession de 18 bassins en bétons (hauteur entre bassins : 20cm)
Suivi biologique: oui
Suivi hydromorphologique: non
Coût total de l'opération: 15000000.0

Answer:\n
PasseAPoissonsDeStrasbourg is a :successivesBasinsPass 
    :associatedTo BarrageCentraleHydroélectrique
    :isManagedBy EDFUnitéDeProductionEst
    :hasMainContractor CentreIngénierieHydrauliqueEDFandEDF_R&D
    :numberOfBasins "18"^^xsd:integer 
    :startsAt "2012"^^xsd:dateTime 
    :endsAt "2015"^^xsd:dateTime
    :hasCost "15000000.0"^^xsd:decimal 
    :hasBiologicalMonitoring "true"^^xsd:boolean
    :hasHydromorphologicalMonitoring "false"^^xsd:boolean
    :concerns :LongitudinalContinuity 

BarrageCentraleHydroélectrique is a :dam 
    :isLocatedOn BasRhinStrasbourgFrance
    :isLocatedOn LeRhin
    :isImpassable "true"^^xsd:boolean
    label : "Barrage Centrale Hydroélectrique"

EDFUnitéDeProductionEst is a :projectOwner
    label : "EDF Unité de production est"

LeRhin is a :river 
    :isLocatedOn DépartementduBasRhin
    label : "Le Rhin"

CentreIngénierieHydrauliqueEDFandEDF_R&D is a :mainContractor 
    label : "Centre Ingénierie Hydraulique EDF, EDF R&D"

AgenceEauRhin-Meuse_DREALAlsace_ONEMA is a :otherActor 
    label : "Agence de l'eau Rhin-Meuse, DREAL Alsace, ONEMA"

BasRhinStrasbourgFrance is a :geographicZone
    label : "Bas-Rhin, Strasbourg, France"
"""


# One-shot pipeline, two model calls per (input file, model) pair:
#   1. each text chunk is sent with the one-shot system prompt and the replies are
#      saved to "<file>_<model>.txt";
#   2. those replies are sent back with the few-shot examples so the model rewrites
#      them, and that reply is saved to "<file>_<model>.ttl".

# Parameters
# Relative path: resolved against the kernel's current working directory
few_shot_examples = read_text_file('few_shot_3.txt')
assistant_message = few_shot_examples

ontology_summary = system_message

# Maximum chunk length handed to split_text_into_chunks
max_chunk_size = 3000

# Folder to save outputs
output_folder = "/.../one_shot_outputs"

os.makedirs(output_folder, exist_ok=True)

# Timing registry keyed by (file_name, model). Create it when the kernel does not
# have one yet; otherwise the run would fail with NameError only after both model
# calls have finished and before the .ttl file is written.
if "execution_times" not in globals():
    execution_times = {}

# Process each text file in the folder (sorted so runs are reproducible;
# os.listdir returns entries in arbitrary order)
for file_name in sorted(os.listdir(folder_path)):
    # Skip non-text files
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Read the content of the file
    user_content = read_text_file(file_path)

    # Split the text into manageable chunks
    chunks = split_text_into_chunks(user_content, max_chunk_size)

    # Apply each model to the chunks
    for model in models:
        print(f"Processing file '{file_name}' with model '{model}'...")

        # The timer covers both model calls and the intermediate file write
        start_time = time.time()

        # Generate responses for all chunks using the current model
        responses = generate_response_with_context(chunks, model, ontology_summary, assistant_message)

        # Build the intermediate text once: it is both saved to disk and used as
        # the user message of the second call (no need to read the file back).
        individuals = "".join(
            f"Response for chunk {i + 1}:\n\n{response}\n\n"
            for i, response in enumerate(responses)
        )

        output_stem = f"{os.path.splitext(file_name)[0]}_{model.replace(':', '_')}"
        chunk_output_path = os.path.join(output_folder, f"{output_stem}.txt")

        with open(chunk_output_path, "w", encoding="utf-8") as output_file:
            output_file.write(individuals)

        prompt2 = f"""Follow the format of generated individuals below to translate the following input instances\n
        Do not include any additional information\n
        Be sure to link all the generated individuals (no isolated individual)\n

        # Example of generated individuals:\n
        {assistant_message}\n\n

        # Input Instances to be translated:\n\n
        """
        response2 = ollama.chat(model=model, messages=[
            {"role": "system", "content": prompt2},
            {"role": "user", "content": individuals}
        ])

        # Stop timing the execution
        end_time = time.time()

        # Record and report the execution time
        execution_time = end_time - start_time
        execution_times[(file_name, model)] = execution_time

        minutes, seconds = divmod(execution_time, 60)
        formatted_time = f"{int(minutes)} minutes and {seconds:.2f} seconds"

        print(f"Execution time for model '{model}' on file '{file_name}': {formatted_time}")

        output_file_path = os.path.join(output_folder, f"{output_stem}.ttl")

        # LLM response content
        turtle_data = response2["message"]["content"]

        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(turtle_data)

        print(f"Saved responses to '{output_file_path}'.")

print("Processing complete for all files and models.")
