# DEMO-INTER-AutomatedTool (Jupyter Notebook)

This notebook provides a step-by-step guide for automatically matching ontology classes.  
It allows you to:
- Load an ontology from a URL or a local file.
- Extract and compare ontology class IRIs, labels, synonyms, and definitions.
- Display matches in a table and download the results as a CSV.

**Getting Started:**
Run the cells below in sequence, following the provided instructions.

## Find matching class IRIs

In [1]:
# Installs the required packages; comment out the following line if they are already installed
!pip install rdflib requests pandas

import io
import base64
import csv
import requests
import rdflib
import pandas as pd
from rdflib import Graph
from IPython.display import display, HTML

def load_ontology_from_url(ontology_url, timeout=30):
    """
    Fetches an ontology file from a URL and loads it into an RDF graph.

    Args:
        ontology_url: Address of an RDF/XML ontology document.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the notebook indefinitely.

    Returns:
        A Graph populated with the triples parsed from the response body.

    Raises:
        Exception: If the server answers with any status other than 200.
            Connection problems and timeouts surface as the exceptions
            raised by requests itself.
    """
    response = requests.get(ontology_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"❌ Error fetching ontology from {ontology_url}: {response.status_code}")

    graph = Graph()
    # Pass the raw bytes so the XML parser applies the encoding declared in
    # the document instead of the charset requests guessed for `.text`.
    graph.parse(data=response.content, format="xml")
    print(f"✅ Successfully fetched ontology from: {ontology_url}")
    return graph

def load_ontology_from_file(file_path):
    """
    Parses the RDF/XML ontology stored at a local path into a new RDF graph.

    Returns the populated graph after printing a confirmation message.
    """
    ontology_graph = Graph()
    ontology_graph.parse(file_path, format="xml")

    confirmation = f"✅ Successfully loaded ontology from local file: {file_path}"
    print(confirmation)
    return ontology_graph

def extract_filtered_classes(graph, excluded_base_uris):
    """
    Extracts class IRIs, labels, and definitions from the ontology while filtering out upper-level ontology classes.

    Args:
        graph: Object exposing `query(sparql_text)` whose result rows can be
            indexed by the names "class", "label" and "definition".
        excluded_base_uris: Iterable of IRI prefixes; classes whose IRI starts
            with any of them are left out. May be empty or None, in which case
            no class is excluded.

    Returns:
        Dict mapping each class IRI to {"label": ..., "definition": ...}, with
        "(no label)" / "(no definition)" substituted for missing values. When
        the query yields several rows for one class, the last row wins.
    """
    def _sparql_quote(text):
        # Escape backslashes and single quotes so a prefix cannot terminate
        # the single-quoted SPARQL string literal it is placed in.
        return text.replace("\\", "\\\\").replace("'", "\\'")

    excluded_filters = " || ".join(
        f"STRSTARTS(STR(?class), '{_sparql_quote(base_uri)}')"
        for base_uri in (excluded_base_uris or ())
    )
    # With nothing to exclude the filter must be omitted altogether:
    # `FILTER (!())` is not valid SPARQL and would make the query fail.
    filter_clause = f"FILTER (!({excluded_filters}))" if excluded_filters else ""

    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX iao: <http://purl.obolibrary.org/obo/IAO_>

    SELECT DISTINCT ?class ?label ?definition WHERE {{
        ?class a owl:Class .
        OPTIONAL {{ ?class rdfs:label ?label . }}
        OPTIONAL {{ ?class iao:0000115 ?definition . }}
        OPTIONAL {{ ?class rdfs:definition ?definition . }}
        {filter_clause}
    }}
    """
    results = graph.query(query)
    return {
        str(row["class"]): {
            "label": str(row["label"]) if row["label"] else "(no label)",
            "definition": str(row["definition"]) if row["definition"] else "(no definition)",
        }
        for row in results
    }

def find_common_class_iris(classes1, classes2):
    """
    Returns the set of class IRIs that appear as keys in both ontology dictionaries.
    """
    second_iris = classes2.keys()
    return {iri for iri in classes1.keys() if iri in second_iris}

def save_results_to_csv(common_class_iris, classes1, classes2, filename='matching_IRIs.csv'):
    """
    Saves common class IRIs, labels, and definitions to a CSV file and provides a download link.

    Args:
        common_class_iris: Iterable of class IRIs present in both ontologies.
        classes1: Dict mapping IRI -> {'label': ..., 'definition': ...} for ontology 1.
        classes2: Same structure for ontology 2.
        filename: Path of the CSV file to write; also used as the suggested
            name in the download link.

    Side effects:
        Writes `filename`, prints a confirmation, and displays both an HTML
        table and a download link in the notebook.
    """
    import html  # local import: only needed to escape the link attribute

    columns = [
        'Class IRI',
        'Label in Ontology 1',
        'Label in Ontology 2',
        'Definition in Ontology 1',
        'Definition in Ontology 2',
    ]
    # The IRIs usually arrive as a set, which has no stable order; sorting
    # makes the table and the CSV reproducible between runs.
    rows = [
        {
            'Class IRI': iri,
            'Label in Ontology 1': classes1[iri]['label'],
            'Label in Ontology 2': classes2[iri]['label'],
            'Definition in Ontology 1': classes1[iri]['definition'],
            'Definition in Ontology 2': classes2[iri]['definition'],
        }
        for iri in sorted(common_class_iris)
    ]
    # Explicit columns keep the header row even when there are no matches.
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(filename, index=False)

    print(f"\n✅ Matching class IRIs have been saved to: {filename}")

    # Display the results as a table in the Notebook
    display(HTML(df.to_html(index=False)))

    # Provide a download link for the CSV file. The filename is escaped
    # because it is interpolated into an HTML attribute, and text/csv is the
    # registered media type for CSV data.
    csv_data = df.to_csv(index=False)
    b64 = base64.b64encode(csv_data.encode()).decode()
    safe_name = html.escape(filename, quote=True)
    href = f'<a href="data:text/csv;charset=utf-8;base64,{b64}" download="{safe_name}">Download CSV</a>'
    display(HTML(href))

# --- MAIN EXECUTION ---

# First ontology: ask where it comes from, then load it.
print("Choose an option for the first ontology:")
print("1: Provide a URL")
print("2: Provide a file path")
option1 = input("Enter 1 or 2: ").strip()

# Reject anything other than the two menu choices before prompting further.
if option1 not in ("1", "2"):
    raise Exception("❌ Invalid selection. Please enter either 1 or 2.")
if option1 == "1":
    ontology_url1 = input("Enter the URL of the first ontology: ").strip()
    ontology1 = load_ontology_from_url(ontology_url1)
else:
    file_path1 = input("Enter the path to the first ontology file: ").strip()
    ontology1 = load_ontology_from_file(file_path1)

# Second ontology: same menu, same handling.
print("\nChoose an option for the second ontology:")
print("1: Provide a URL")
print("2: Provide a file path")
option2 = input("Enter 1 or 2: ").strip()

if option2 not in ("1", "2"):
    raise Exception("❌ Invalid selection. Please enter either 1 or 2.")
if option2 == "1":
    ontology_url2 = input("Enter the URL of the second ontology: ").strip()
    ontology2 = load_ontology_from_url(ontology_url2)
else:
    file_path2 = input("Enter the path to the second ontology file: ").strip()
    ontology2 = load_ontology_from_file(file_path2)

# IRI prefixes of upper-level ontologies whose classes are left out of the comparison.
excluded_base_uris = [
    "http://purl.obolibrary.org/obo/IAO_",
    "http://purl.obolibrary.org/obo/BFO_",
]

# Pull the class details out of each ontology, skipping the excluded prefixes.
classes1 = extract_filtered_classes(ontology1, excluded_base_uris)
classes2 = extract_filtered_classes(ontology2, excluded_base_uris)

# Class IRIs present in both ontologies.
common_class_iris = find_common_class_iris(classes1, classes2)

# Report how many shared IRIs were found.
print(f"\n🔍 Total common class IRIs found: {len(common_class_iris)}")

# With no shared IRIs there is nothing to save; otherwise write the CSV,
# show the table and offer the download link.
if not common_class_iris:
    print("❌ No common class IRIs found.")
else:
    save_results_to_csv(common_class_iris, classes1, classes2, filename="matching_IRIs.csv")


Choose an option for the first ontology:
1: Provide a URL
2: Provide a file path


Enter 1 or 2:  1
Enter the URL of the first ontology:  https://raw.githubusercontent.com/HumanBehaviourChangeProject/ontologies/refs/heads/master/Behaviour/bcio_behaviour.owl


✅ Successfully fetched ontology from: https://raw.githubusercontent.com/HumanBehaviourChangeProject/ontologies/refs/heads/master/Behaviour/bcio_behaviour.owl

Choose an option for the second ontology:
1: Provide a URL
2: Provide a file path


Enter 1 or 2:  2
Enter the path to the second ontology file:  copper2.rdf


✅ Successfully loaded ontology from local file: copper2.rdf

🔍 Total common class IRIs found: 5

✅ Matching class IRIs have been saved to: matching_IRIs.csv


Class IRI,Label in Ontology 1,Label in Ontology 2,Definition in Ontology 1,Definition in Ontology 2
http://humanbehaviourchange.org/ontology/BCIO_050300,personal attribute,personal attribute,A specifically dependent continuant that inheres in a person.,A specifically dependent continuant that inheres in a person.
http://humanbehaviourchange.org/ontology/BCIO_036042,physical performance behaviour,physical performance behaviour,"A health-related behaviour that involves maintenance or improvement of flexibility, strength, balance or cardiovascular fitness.",A life enhancement behaviour that has a physical fitness function.
http://purl.obolibrary.org/obo/MF_0000016,(no label),person,(no definition),A member of the species Homo Sapiens.
http://humanbehaviourchange.org/ontology/BCIO_006085,location,Location,A spatial <quality> that inheres in a bearer by virtue of its position relative to other entities.,A spatial quality that inheres in a bearer by virtue of its position relative to other entities.
http://humanbehaviourchange.org/ontology/BCIO_006099,social influence behaviour,social influence behaviour,An <inter-personal behaviour> where a person exerts an influence on the behaviour of another.,An inter-personal behaviour where a person exerts an influence on the behaviour of another.


## Compare two ontologies and identify classes with matching labels or synonyms but different IRIs

In [2]:
# Uncomment the following line if you need to install packages in your Notebook
# !pip install rdflib requests pandas

import io
import base64
import csv
import requests
import os
import rdflib
import pandas as pd
from rdflib import Graph
from IPython.display import display, HTML

def load_ontology_from_url(ontology_url, timeout=30):
    """
    Fetches an ontology file from a URL and loads it into an RDF graph.

    Args:
        ontology_url: Address of an RDF/XML ontology document.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the notebook indefinitely.

    Returns:
        A Graph populated with the triples parsed from the response body.

    Raises:
        Exception: If the server answers with any status other than 200.
            Connection problems and timeouts surface as the exceptions
            raised by requests itself.
    """
    response = requests.get(ontology_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"❌ Error fetching ontology from {ontology_url}: {response.status_code}")

    graph = Graph()
    # Pass the raw bytes so the XML parser applies the encoding declared in
    # the document instead of the charset requests guessed for `.text`.
    graph.parse(data=response.content, format="xml")
    print(f"✅ Successfully fetched ontology from: {ontology_url}")
    return graph

def load_ontology_from_file(file_path):
    """
    Parses the RDF/XML ontology stored at a local path into a new RDF graph.

    Returns the populated graph after printing a confirmation message.
    """
    ontology_graph = Graph()
    ontology_graph.parse(file_path, format="xml")

    confirmation = f"✅ Successfully loaded ontology from local file: {file_path}"
    print(confirmation)
    return ontology_graph

def extract_class_details(graph):
    """
    Extracts class labels, synonyms, and definitions from the ontology.

    Args:
        graph: Object exposing `query(sparql_text)` whose result rows can be
            indexed by the names 'class', 'label', 'synonym' and 'definition'.

    Returns:
        Dict mapping each class IRI to
        {'label': str or None, 'synonyms': set of lowercased str, 'definition': str or None}.
        The label and definition come from the first row seen for a class;
        synonyms are accumulated over all of its rows.
    """
    # All four synonym predicates are matched by ONE optional pattern using a
    # property-path alternation. Separate OPTIONAL blocks that bind the same
    # ?synonym variable are evaluated as successive left joins: once an
    # earlier block binds ?synonym, the later ones can only match that same
    # value, so e.g. exact synonyms were lost for any class that also had a
    # broad synonym.
    query = """
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX iao: <http://purl.obolibrary.org/obo/IAO_>
    PREFIX go: <http://www.geneontology.org/formats/oboInOwl#>

    SELECT ?class ?label ?synonym ?definition
    WHERE {
        ?class a owl:Class .
        OPTIONAL { ?class rdfs:label ?label . }
        OPTIONAL { ?class iao:0000115 ?definition . }
        OPTIONAL { ?class rdfs:definition ?definition . }
        OPTIONAL {
            ?class (go:hasBroadSynonym|go:hasNarrowSynonym|go:hasExactSynonym|go:hasRelatedSynonym) ?synonym .
        }
    }
    """
    results = graph.query(query)
    class_info = {}

    # Store extracted ontology class details in a dictionary
    for row in results:
        class_uri = str(row['class'])
        label = str(row['label']) if row['label'] else None
        synonym = str(row['synonym']) if row['synonym'] else None
        definition = str(row['definition']) if row['definition'] else None

        # setdefault only inserts on the first row for a class, so the first
        # label/definition encountered is the one that is kept.
        entry = class_info.setdefault(
            class_uri, {'label': label, 'synonyms': set(), 'definition': definition}
        )

        # Store synonyms as a lowercase set for case-insensitive matching
        if synonym:
            entry['synonyms'].add(synonym.lower())

    return class_info

def find_matching_classes(classes1, classes2):
    """
    Pairs up classes from two ontologies whose labels or synonyms agree.

    Two classes with different URIs are reported as a match when their labels
    are equal ignoring case, or when the label of one appears among the
    (lowercased) synonyms of the other. Each match is a tuple of
    (uri1, label1, definition1, uri2, label2, definition2).
    """
    def _labels_or_synonyms_agree(first, second):
        # Same three tests, in the same order, as a single short-circuit expression.
        return bool(
            (first['label'] and second['label']
             and first['label'].lower() == second['label'].lower())
            or (first['label'] and second['synonyms']
                and first['label'].lower() in second['synonyms'])
            or (second['label'] and first['synonyms']
                and second['label'].lower() in first['synonyms'])
        )

    found = []
    for uri1, info1 in classes1.items():
        for uri2, info2 in classes2.items():
            # Identical URIs are not reported here; only differing IRIs are of interest.
            if uri1 != uri2 and _labels_or_synonyms_agree(info1, info2):
                found.append(
                    (uri1, info1['label'], info1['definition'],
                     uri2, info2['label'], info2['definition'])
                )
    return found

def save_matches_to_csv(matching_classes, filename='matching_classes.csv'):
    """
    Saves matching classes to a CSV file and provides a download link.

    Args:
        matching_classes: Sequence of 6-tuples
            (uri1, label1, definition1, uri2, label2, definition2).
        filename: Path of the CSV file to write; also used as the suggested
            name in the download link.

    Side effects:
        Writes `filename`, prints a confirmation, and displays both an HTML
        table and a download link in the notebook.
    """
    import html  # local import: only needed to escape the link attribute

    df = pd.DataFrame(matching_classes, columns=[
        'Ontology 1 Class URI', 'Ontology 1 Class Label', 'Ontology 1 Class Definition',
        'Ontology 2 Class URI', 'Ontology 2 Class Label', 'Ontology 2 Class Definition'
    ])

    # Save matches to a CSV file
    df.to_csv(filename, index=False)

    print(f"\n✅ Matching classes have been saved to: {filename}")

    # Display the matches as a table in the Notebook
    display(HTML(df.to_html(index=False)))

    # Provide a download link for the CSV file. The filename is escaped
    # because it is interpolated into an HTML attribute, and text/csv is the
    # registered media type for CSV data.
    csv_data = df.to_csv(index=False)
    b64 = base64.b64encode(csv_data.encode()).decode()
    safe_name = html.escape(filename, quote=True)
    href = f'<a href="data:text/csv;charset=utf-8;base64,{b64}" download="{safe_name}">Download CSV</a>'
    display(HTML(href))

# --- MAIN EXECUTION ---

# First ontology: ask where it comes from, then load it.
print("Choose an option for the first ontology:")
print("1: Provide a URL")
print("2: Provide a file path")
option1 = input("Enter 1 or 2: ").strip()

# Reject anything other than the two menu choices before prompting further.
if option1 not in ("1", "2"):
    raise Exception("❌ Invalid selection. Please enter either 1 or 2.")
if option1 == "1":
    ontology_url1 = input("Enter the URL of the first ontology: ").strip()
    ontology1 = load_ontology_from_url(ontology_url1)
else:
    file_path1 = input("Enter the path to the first ontology file: ").strip()
    ontology1 = load_ontology_from_file(file_path1)

# Second ontology: same menu, same handling.
print("\nChoose an option for the second ontology:")
print("1: Provide a URL")
print("2: Provide a file path")
option2 = input("Enter 1 or 2: ").strip()

if option2 not in ("1", "2"):
    raise Exception("❌ Invalid selection. Please enter either 1 or 2.")
if option2 == "1":
    ontology_url2 = input("Enter the URL of the second ontology: ").strip()
    ontology2 = load_ontology_from_url(ontology_url2)
else:
    file_path2 = input("Enter the path to the second ontology file: ").strip()
    ontology2 = load_ontology_from_file(file_path2)

# Collect label, synonym and definition details for every class in each ontology.
classes1 = extract_class_details(ontology1)
classes2 = extract_class_details(ontology2)

# Pair up classes whose labels or synonyms agree.
matching_classes = find_matching_classes(classes1, classes2)

# Report how many pairs were found.
print(f"\n🔍 Number of matching class labels/synonyms found: {len(matching_classes)}")

# Write the CSV, show the table and offer the download link.
save_matches_to_csv(matching_classes, filename="matching_labelsSynonyms.csv")


Choose an option for the first ontology:
1: Provide a URL
2: Provide a file path


Enter 1 or 2:  1
Enter the URL of the first ontology:  https://raw.githubusercontent.com/HumanBehaviourChangeProject/ontologies/refs/heads/master/Behaviour/bcio_behaviour.owl


✅ Successfully fetched ontology from: https://raw.githubusercontent.com/HumanBehaviourChangeProject/ontologies/refs/heads/master/Behaviour/bcio_behaviour.owl

Choose an option for the second ontology:
1: Provide a URL
2: Provide a file path


Enter 1 or 2:  2
Enter the path to the second ontology file:  copper2.rdf


✅ Successfully loaded ontology from local file: copper2.rdf

🔍 Number of matching class labels/synonyms found: 1

✅ Matching classes have been saved to: matching_classes.csv


Ontology 1 Class URI,Ontology 1 Class Label,Ontology 1 Class Definition,Ontology 2 Class URI,Ontology 2 Class Label,Ontology 2 Class Definition
http://humanbehaviourchange.org/ontology/BCIO_036108,walking,"A locomotive behaviour that involves moving at a regular pace by lifting and setting down each foot in turn, never having both feet off the ground at once.",http://COPPER/ontology/COPPER_1020,Walking,"A physical performance behaviour that involves moving at a regular pace by lifting and setting down each foot in turn, never having both feet off the ground at once."


# Find matching definitions for target classes using a language model
This script automates the process of identifying semantically similar class definitions between a list of target classes provided in a CSV file and class definitions extracted from an ontology file. It uses a combination of techniques: semantic pre-filtering with cosine similarity and prompt-based evaluation using a LLaMA language model. The output is provided as a longlist for human review. This tool:
✅ Allows users to provide the ontology via a URL or a local file path
✅ Uses a language model to compare definitions
✅ Applies pre-filtering using cosine similarity
✅ Displays results in a table and provides a download link for a CSV file

In [None]:
import time
import csv
import requests
import os
from rdflib import Graph
from mlx_lm import load, generate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Try to import IPython.display for a clickable download link in a notebook.
try:
    from IPython.display import display, HTML
    NOTEBOOK = True
except ImportError:
    NOTEBOOK = False

SYSTEM_MSG = "You are an ontology expert."

def generateFromPrompt(promptStr, model, tokenizer, maxTokens=50):
    """
    Sends one user prompt to the language model and times the generation.

    Args:
        promptStr: Text of the user message.
        model: Model object passed through to `generate`.
        tokenizer: Tokenizer providing `apply_chat_template` and `decode`.
        maxTokens: Upper bound on generated tokens, forwarded as `max_tokens`.

    Returns:
        Tuple `(response, elapsed_time)` where `elapsed_time` is the number of
        seconds spent inside `generate`.
    """
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": promptStr}
    ]
    # The chat template output is decoded back into a string prompt before
    # being handed to generate().
    # NOTE(review): passing tokenize=False would presumably avoid this
    # encode/decode round trip — confirm against the tokenizer wrapper in use.
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    prompt = tokenizer.decode(input_ids)

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    response = generate(model, tokenizer, prompt=prompt, max_tokens=maxTokens)
    elapsed_time = time.perf_counter() - start_time
    return response, elapsed_time

def extract_class_definitions(source, base_iri):
    """
    Loads an ontology and returns the labelled classes under a base IRI.

    Args:
        source: Either an http(s) URL of an RDF/XML document or a local file
            path (for a path, the format is left to the parser to determine).
        base_iri: Only classes whose IRI starts with this prefix are returned.

    Returns:
        List of dicts with keys 'class_uri', 'label' and 'definition'. Classes
        without a definition get the text 'No definition provided'.

    Raises:
        Exception: If a URL source does not answer with status 200.
    """
    g = Graph()
    # Match the URL schemes exactly so a local file whose name merely starts
    # with "http" is still read from disk.
    if source.startswith(("http://", "https://")):
        # The timeout keeps an unresponsive server from hanging the run.
        response = requests.get(source, timeout=30)
        if response.status_code != 200:
            raise Exception(f"Failed to load ontology from URL: {source}")
        # Raw bytes let the XML parser honour the document's declared encoding.
        g.parse(data=response.content, format="xml")
    else:
        g.parse(source)

    # Escape characters that would otherwise terminate the double-quoted
    # SPARQL string literal the prefix is placed in.
    safe_base_iri = base_iri.replace("\\", "\\\\").replace('"', '\\"')
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX iao: <http://purl.obolibrary.org/obo/IAO_>

    SELECT ?class ?label ?definition
    WHERE {{
        ?class a owl:Class .
        ?class rdfs:label ?label .
        OPTIONAL {{ ?class iao:0000115 ?definition . }}
        OPTIONAL {{ ?class rdfs:definition ?definition . }}
        FILTER (STRSTARTS(str(?class), "{safe_base_iri}"))
    }}
    """

    class_definitions = []
    for row in g.query(query):
        # An unbound OPTIONAL variable comes back as None rather than being
        # absent, so a `.get(..., default)` lookup never used its default and
        # the definition ended up as the literal text "None".
        definition = row['definition']
        class_definitions.append({
            'class_uri': str(row['class']),
            'label': str(row['label']),
            'definition': str(definition) if definition is not None else 'No definition provided'
        })
    return class_definitions

def prefilter_classes(target, defs, embedder, threshold=0.5):
    """
    Narrows the ontology classes down to those worth sending to the language model.

    Args:
        target: Dict with 'label', 'class_uri' and 'definition' of the target class.
        defs: List of dicts with the same keys describing candidate classes.
        embedder: Object whose `encode(text)` returns an embedding vector.
        threshold: A candidate is kept only when the cosine similarity between
            its definition and the target definition is strictly greater
            than this value.

    Returns:
        The candidates from `defs` that pass the filter, in their original
        order. Candidates sharing the target's IRI or (case-insensitively)
        its label are skipped.
    """
    target_label = target['label'].lower()
    target_uri = target['class_uri']
    target_definition = target['definition']

    target_synonyms = set()  # Add synonyms if needed

    # Compute embedding for target definition
    target_embedding = embedder.encode(target_definition)

    candidates = []
    for def_ in defs:
        if def_['class_uri'] == target_uri:
            continue

        # Normalise the label once. A missing label becomes '' so that both
        # label checks are safe; previously only the first one was guarded
        # and a None label raised AttributeError on the second.
        candidate_label = (def_['label'] or '').lower()
        if candidate_label and (candidate_label == target_label or candidate_label in target_synonyms):
            continue

        compared_embedding = embedder.encode(def_['definition'])
        similarity = cosine_similarity([target_embedding], [compared_embedding])[0][0]
        if similarity > threshold:
            candidates.append(def_)
    return candidates

def create_prompt_for_class(target_class, defs):
    """
    Builds one yes/no comparison prompt per candidate definition.

    Returns a list of `(prompt_text, target_class, candidate)` tuples, one for
    each entry of `defs`, in the same order.
    """
    def _render(candidate):
        # Five lines joined by newlines; the last line asks the question.
        lines = (
            f"Target Label: {target_class['label']}",
            f"Target Definition: {target_class['definition']}",
            f"Compared Label: {candidate['label']}",
            f"Compared Definition: {candidate['definition']}",
            "Same meaning? (yes/no)",
        )
        return "\n".join(lines)

    return [(_render(candidate), target_class, candidate) for candidate in defs]

def process_prompts(prompts, model, tokenizer):
    """
    Runs every prompt through the language model and keeps the affirmed pairs.

    Args:
        prompts: Iterable of `(prompt_text, target_class, compared_class)` tuples.
        model: Model object forwarded to `generateFromPrompt`.
        tokenizer: Tokenizer forwarded to `generateFromPrompt`.

    Returns:
        List of `(target_class, compared_class)` tuples for which the model's
        response contains the word "yes" (case-insensitive).
    """
    import re  # local import: only needed for the whole-word match below

    # Match "yes" as a whole word. A plain substring test also fires on words
    # such as "eyes" that merely contain the letters.
    affirmative = re.compile(r"\byes\b", re.IGNORECASE)

    total_elapsed_time = 0
    similar_matches = []
    for prompt, target_class, compared_class in prompts:
        response, elapsed_time = generateFromPrompt(prompt, model, tokenizer, maxTokens=50)
        total_elapsed_time += elapsed_time
        if affirmative.search(response):
            similar_matches.append((target_class, compared_class))
    print(f"Total time to generate all responses: {total_elapsed_time:.2f} seconds")
    return similar_matches

def write_results_to_csv(matches, filename='matching_definitions.csv'):
    """
    Writes the matched class pairs to a UTF-8 CSV file.

    Each row holds the IRI, label and definition of the target class followed
    by the same three fields of the matched class. A confirmation line is
    printed once the file has been written.
    """
    header = ['HBO Class IRI', 'HBO Class Label', 'HBO Class Definition',
              'COPPER Class IRI', 'COPPER Class Label', 'COPPER Class Definition']
    fields = ('class_uri', 'label', 'definition')

    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(header)
        out.writerows(
            [target[name] for name in fields] + [match[name] for name in fields]
            for target, match in matches
        )
    print(f"✅ Results saved to: {filename}")

def read_target_classes_from_csv(filename):
    """
    Reads the target classes from a CSV file with Label, IRI and Definition columns.

    Args:
        filename: Path of the CSV file. It may or may not start with a UTF-8
            byte-order mark.

    Returns:
        List of dicts with keys 'class_uri', 'label' and 'definition', one per
        data row, in file order.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    target_classes = []
    # 'utf-8-sig' strips a leading byte-order mark when present and behaves
    # like plain UTF-8 otherwise, so the first header is always 'Label'
    # instead of only matching files whose header reads '\ufeffLabel'.
    # newline='' lets the csv module handle line endings itself.
    with open(filename, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file)
        print(f"CSV Headers: {reader.fieldnames}")
        for row in reader:
            target_classes.append({
                'class_uri': row['IRI'],
                'label': row['Label'],
                'definition': row['Definition']
            })
    return target_classes

def display_results_table(matches):
    """
    Shows the matched class pairs as a table and returns them as a DataFrame.

    Args:
        matches: Iterable of `(target, match)` pairs of dicts with keys
            'class_uri', 'label' and 'definition'.

    Returns:
        DataFrame with one row per pair and six fixed columns; the columns are
        present even when `matches` is empty.
    """
    columns = [
        "HBO Class IRI", "HBO Class Label", "HBO Class Definition",
        "COPPER Class IRI", "COPPER Class Label", "COPPER Class Definition",
    ]
    data = [
        {
            "HBO Class IRI": target['class_uri'],
            "HBO Class Label": target['label'],
            "HBO Class Definition": target['definition'],
            "COPPER Class IRI": match['class_uri'],
            "COPPER Class Label": match['label'],
            "COPPER Class Definition": match['definition'],
        }
        for target, match in matches
    ]
    # Explicit columns keep the header row even when nothing matched.
    df = pd.DataFrame(data, columns=columns)

    # display/HTML only exist when the IPython import succeeded; NOTEBOOK
    # records that, so fall back to plain text on the console otherwise.
    if NOTEBOOK:
        display(HTML(df.to_html(index=False)))
    else:
        print(df.to_string(index=False))
    return df

# Main script
if __name__ == "__main__":
    # base64 is needed for the download link below but is not among this
    # script's top-level imports, so bring it into scope here.
    import base64

    # Load the model and the embedding model
    model_name = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
    model, tokenizer = load(model_name)
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Prompt for ontology input: local file or URL
    ontology_choice = input("Enter 1 for local ontology file or 2 for ontology URL: ").strip()
    if ontology_choice == "1":
        ontology_source = input("Enter the path to your ontology file (e.g., COPPER2.rdf): ").strip()
    elif ontology_choice == "2":
        ontology_source = input("Enter the URL to your ontology file: ").strip()
    else:
        print("Invalid choice. Exiting.")
        # Raise SystemExit directly instead of calling exit(), which is a
        # convenience injected by the site module and not always available.
        raise SystemExit(1)

    # Prompt for the target classes CSV file and base IRI
    target_csv = input("Enter the path to your target classes CSV file (e.g., hboRR.csv): ").strip()
    base_iri = input("Enter the base IRI of the ontology to be searched (e.g., http://COPPER/ontology/COPPER_): ").strip()

    # Extract all class definitions from the ontology
    all_defs = extract_class_definitions(ontology_source, base_iri)

    # Read target classes from the CSV
    target_classes = read_target_classes_from_csv(target_csv)

    # Process each target class: pre-filter by embedding similarity, then ask
    # the language model about every remaining candidate.
    all_matches = []
    for target_class in target_classes:
        filtered_defs = prefilter_classes(target_class, all_defs, embedder)
        prompts = create_prompt_for_class(target_class, filtered_defs)
        similar_matches = process_prompts(prompts, model, tokenizer)
        all_matches.extend(similar_matches)

    print(f"Total number of matches found: {len(all_matches)}")

    # Display results as a table
    results_df = display_results_table(all_matches)

    # Write CSV file
    csv_filename = "matchingDefinitions.csv"
    write_results_to_csv(all_matches, filename=csv_filename)

    # Provide a download link using base64 encoding. The link can only be
    # rendered when IPython is available, which NOTEBOOK records.
    if NOTEBOOK:
        csv_data = results_df.to_csv(index=False)
        b64 = base64.b64encode(csv_data.encode()).decode()
        download_html = f'<a href="data:text/csv;charset=utf-8;base64,{b64}" download="{csv_filename}">Download CSV File</a>'
        display(HTML(download_html))
