# import

In [73]:
import pprint
import os
import sys

from dotenv import load_dotenv

from langchain_ollama import OllamaLLM, ChatOllama, OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever

from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

import psycopg
from langchain_postgres.vectorstores import PGVector
from SPARQLWrapper import SPARQLWrapper, JSON, POST, N3
from urllib.parse import urljoin

from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from threading import Lock

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, OWL, FOAF, XSD, SKOS, DCTERMS
import re
from collections import defaultdict
import json
import datetime

# loading env variables

In [74]:
load_dotenv()

True

# Langain & Ollama

## check ollama status

In [5]:
# !curl --location 'http://127.0.0.1:11434/api/generate' \
# --header 'Content-Type: application/json' \
# --data '{ \
#     "model": "llama3.2:3b", \
#     "prompt": "hello llama!",  \
#     "options": { \
#         "temperature": 0 \
#     } \
# }' \
# | python -m json.tool

In [6]:
# !curl http://localhost:11434/api/tags

## Ollama model: configuration

In [6]:
llm = OllamaLLM(model="llama3.2:3b", temperature=0)
# llm = OllamaLLM(model="deepseek-r1:8b", temperature=0)
llm

OllamaLLM(model='llama3.2:3b', temperature=0.0)

In [9]:
chat_ollama = ChatOllama(model="llama3.2:3b", temperature=0)
chat_ollama

ChatOllama(model='llama3.2:3b', temperature=0.0)

## testing llm: invoke


In [7]:
# llm.invoke(input="tell me a joke")
response = llm.invoke("hello ollama!")

# response = llm.invoke("Create an agent that uses Ollama function calling in Langchain.")

print(response)

Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


In [10]:
messages = [
    ("system", "You are a helpful translator. Translate the user sentence to French."),
    ("human", "I love programming."),
]
chat_ollama.invoke(messages)

AIMessage(content='Je aime programmer.', additional_kwargs={}, response_metadata={'model': 'llama3.2:3b', 'created_at': '2025-04-14T19:07:45.0711592Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2673019600, 'load_duration': 2457519600, 'prompt_eval_count': 42, 'prompt_eval_duration': 153058000, 'eval_count': 5, 'eval_duration': 57947000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-87d0096c-961f-4def-8049-07069bb59937-0', usage_metadata={'input_tokens': 42, 'output_tokens': 5, 'total_tokens': 47})

## testing llm: chat prompt template

In [None]:
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a world class technical documentation writer."),
    ("user", "{input}")
])

chain = chat_prompt_template | llm

response = chain.invoke({"input": "how can langsmith help with testing?"})

print(response)

<think>
Okay, so I'm trying to figure out how LangSmith can help with testing. I remember that LangSmith is some kind of AI tool related to language processing, maybe for writing or something like that. But I'm not exactly sure about its specific features beyond generating text.

The user mentioned testing in their question, so I guess they're asking if LangSmith has any features that assist in testing processes. Testing can be a broad term—like software testing, quality assurance, user acceptance testing, etc.—so I need to think about how an AI tool like LangSmith might fit into these contexts.

First, maybe LangSmith can help with automated testing. If it's capable of generating text based on inputs, perhaps it can create test cases or scenarios automatically. That would save time compared to manual testing. But I'm not sure if LangSmith has that feature or not.

Another angle is using LangSmith for functional testing. If you're testing a system's functionality, maybe LangSmith can s

## testing llm: chat prompt template & StrOutputParser

In [None]:
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a world class technical documentation writer."),
    ("user", "{input}")
])

output_parser = StrOutputParser()

chain = chat_prompt_template | llm | output_parser

response = chain.invoke({"input": "how can langsmith help with testing?"})

print(response)

<think>
Okay, so I'm trying to figure out how LangSmith can help with testing. I remember that LangSmith is some kind of AI tool related to language processing, maybe for writing or something like that. But I'm not exactly sure about its features beyond generating text.

The user mentioned testing in their question, so I guess they're asking if LangSmith can be used for testing purposes. Hmm, how does that work? Well, testing usually involves checking if a system works as expected, right? So maybe LangSmith can help test other AI systems or applications?

Wait, but LangSmith is more about generating text. Maybe it's used to create test cases or scenarios for testing something else. Or perhaps it can simulate user interactions to see how well another system responds. That could be useful for testing chatbots or other language-based applications.

Another thought: maybe LangSmith can help in automating tests. Like, if you have a lot of test cases, LangSmith could generate the necessary i

## create vector store & a retriever

In [None]:
# 1. select a specfic datasource. In this case a web page.
# 2. save extracted content from the web page as docs.
# 3. index the docs using FAISS vector store.
# 4. convert the vector store to retriever.

web_base_loader = WebBaseLoader("https://docs.smith.langchain.com/user_guide")

docs = web_base_loader.load()

# print(f"type(docs) : {type(docs)} \n")
# print(f"len(docs) : {len(docs)}\n")
# print(f"docs: {docs} \n")
# type(f"docs[0] : {docs[0]} \n")
# print(f"docs[0].page_content : {docs[0].page_content} \n")

recursive_character_text_splitter = RecursiveCharacterTextSplitter()
documents = recursive_character_text_splitter.split_documents(documents=docs)


# print(type(documents))
# print(len(documents))
# print(documents)
# print(documents[0])
# print(documents[2])

ollama_embedding = OllamaEmbeddings(model="llama3.2:3b")
vector_store = FAISS.from_documents(
    documents=documents, embedding=ollama_embedding)


# print(f"vector_store.index.ntotal: {vector_store.index.ntotal}")
# print(f"vector_store._get_retriever_tags() : {vector_store._get_retriever_tags()}")
# print(f"vector_store.index_to_docstore_id : {vector_store.index_to_docstore_id}")
# print(f"type(vector_store.index_to_docstore_id) : {type(vector_store.index_to_docstore_id)}")

vector_store_retriever = vector_store.as_retriever()
print(f"vector_store_retriever: {vector_store_retriever}")

vector_store_retriever: tags=['FAISS', 'OllamaEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000233810D3DD0> search_kwargs={}


## document chain

In [None]:
# 5. create a chat prompt template
# 6. create a stuff document chain that accepts a llm model and chat prompt template & we can also run stuff document chain by passing in documents directly

chat_prompt_template = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""
)

documents_chain = create_stuff_documents_chain(
    llm=llm, prompt=chat_prompt_template)
response = documents_chain.invoke(
    {
        "input": "how can langsmith help with testing?",
        "context": documents
    }
)
print(response)

There is no information provided in the context about LangSmith's capabilities or features related to testing. The text only mentions that LangSmith is a project or organization (indicated by the "LangSmith SDK" and "LangChain Python Docs" links), but it does not provide any details on how it can be used for testing.


## retrieval chain

In [None]:
# 7. create a document retrieval chain that takes vector store retriever and stuff document chain

retrieval_chain = create_retrieval_chain(
    vector_store_retriever, documents_chain)
response = retrieval_chain.invoke(
    {"input": "how can langsmith help with testing?"})

# print(type(response))
pprint.pprint(response, indent=4)

{   'answer': 'There is no information provided in the context about '
              "LangSmith's capabilities or features related to testing. The "
              'text only mentions that LangSmith is a project or organization '
              '(indicated by the "LangSmith SDK" and "LangChain Python Docs" '
              'links), but it does not provide any details on how it can be '
              'used for testing.',
    'context': [   Document(metadata={'source': 'https://docs.smith.langchain.com/user_guide', 'title': '🦜️🛠️ LangSmith', 'language': 'en'}, page_content='🦜️🛠️ LangSmith\n\n\n\n\n\n\nSkip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!API ReferenceRESTPythonJS/TSSearchRegionUSEUGo to AppPage Not FoundWe could not find what you were looking for.Head back to our main docs page or use the search bar to find the page you need.CommunityDiscordTwitterGitHubDocs CodeLangSmith SDKPythonJS/TSMoreHomepageBlogLangChain Pytho

## conversation retrieval chain

In [None]:
chat_prompt_template = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a search query to look up to get information relevant to the conversation")
])

history_aware_retriever_chain = create_history_aware_retriever(
    llm, vector_store_retriever, chat_prompt_template)

In [None]:
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system",
     "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}")
])

document_chain = create_stuff_documents_chain(llm, chat_prompt_template)
retrieval_chain = create_retrieval_chain(
    history_aware_retriever_chain, document_chain)

chat_history = [HumanMessage(
    content="Can LangSmith help test my LLM applications?"), AIMessage(content="Yes!")]

response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "tell me how"
})

pprint.pprint(response)

{'answer': "We'd be happy to help you test your Large Language Model (LLM) "
           'applications. Here are some ways we can assist:\n'
           '\n'
           '1. **Conversational Testing**: We can engage in conversations with '
           'your LLM, providing it with a variety of prompts and scenarios to '
           'test its understanding, accuracy, and response quality.\n'
           '2. **Error Identification**: Our team can help identify errors or '
           "biases in your LLM's responses, such as incorrect information, "
           'inconsistencies, or inappropriate content.\n'
           '3. **Performance Evaluation**: We can evaluate the performance of '
           'your LLM on specific tasks, such as answering questions, '
           'generating text, or completing tasks.\n'
           '4. **Data Quality Assessment**: We can assess the quality and '
           "relevance of the data used to train your LLM, ensuring it's "
           'accurate, diverse, and up-to-da

# embeddings

### initialize embedding model

In [12]:
# ollama_embedding = OllamaEmbeddings(model="mxbai-embed-large:335m")
# ollama_embedding = OllamaEmbeddings(model="nomic-embed-text:latest")
ollama_embedding = OllamaEmbeddings(model="bge-m3:567m")

### connect to pgvector

In [13]:
# Format: postgresql+psycopg2://user:password@host:port/dbname
# Database Connection Details
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

CONNECTION_STRING = f"postgresql+psycopg://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
COLLECTION_NAME = "dbpedia_docs"

In [None]:
print(f"\nConnecting to PGVector '{COLLECTION_NAME}'...")
try:
    # If the collection table doesn't exist, PGVector will try to create it.
    vectorstore = PGVector(
        connection=CONNECTION_STRING,
        embeddings=ollama_embedding,
        collection_name=COLLECTION_NAME,
        use_jsonb=True
        # pre_delete_collection=True
        # Use pre_delete_collection=True if you want to clear the collection on every run (USE WITH CAUTION!)
        # pre_delete_collection=False,
    )
    print(f"connection successfull!")
except psycopg.OperationalError as e:
    print(f"\nDatabase Connection Error: {e}")
    exit(1)
except Exception as e:
    print(f"\nAn error occurred during PGVector connection: {e}")
    exit(1)


Connecting to PGVector 'dbpedia_docs'...
connection successfull!


### connect to ontotext graph db and fetch all the entities

In [71]:
GRAPHDB_BASE_URL = os.getenv("GRAPHDB_BASE_URL")
GRAPHDB_REPOSITORY = os.getenv("GRAPHDB_REPOSITORY")

# Format: {base_url}/repositories/{repository_id}
SPARQL_ENDPOINT = urljoin(GRAPHDB_BASE_URL.strip('/') + '/', f"repositories/{GRAPHDB_REPOSITORY}")
MAX_THREADS = os.cpu_count() or 4

In [None]:
# query = r"""
# PREFIX owl: <http://www.w3.org/2002/07/owl#>

# SELECT ?class
# FROM <http://dbpedia.org/model>
# WHERE {
#   ?class a owl:Class .
#   FILTER (
#     regex(STRAFTER(STR(?class), "http://dbpedia.org/ontology/"), "^[\\x00-\\x7F]+$")
#   )
# }
# """
# sparql = SPARQLWrapper(SPARQL_ENDPOINT)
# sparql.setReturnFormat(JSON)

# entitiies = []

# try:
#     sparql.setQuery(query)
#     results = sparql.query().convert()
#     for result in results["results"]["bindings"]:
#         entitiies.append(result["class"]["value"])
#         # print(result["class"]["value"])
# except Exception as e:
#     print("Error:", e)

In [None]:
# len(entitiies), entitiies[:2]

(639,
 ['http://dbpedia.org/ontology/AcademicConference',
  'http://dbpedia.org/ontology/AcademicJournal'])

In [None]:
# def get_sparql():
#     sparql = SPARQLWrapper(SPARQL_ENDPOINT)
#     sparql.setReturnFormat(JSON)
#     return sparql

# # Step 3: Describe a single instance
# def describe_instance(instance_iri):
#     sparql = get_sparql()
#     sparql.setReturnFormat(N3)
#     sparql.setQuery(f"DESCRIBE <{instance_iri}>")
#     try:
#         rdf = sparql.query().convert()
#         rdf_str = rdf.decode('utf-8') if isinstance(rdf, bytes) else str(rdf)
#         return rdf_str
#     except Exception as e:
#         print(f"[Error] Describing {instance_iri}: {e}")
#         try:
#             folder_location = os.path.join(os.getcwd(), "datasets", "failed")
#             os.makedirs(folder_location, exist_ok=True)  # ensure 'data/' exists
#             failed_file_with_iri = os.path.join(folder_location, "failed_instances_iri.txt")
#             with open(failed_file_with_iri, "a", encoding="utf-8") as f:
#                 f.write(instance_iri + "\n")
#             print(f"Logged failed IRI to {failed_file_with_iri}")
#         except Exception as file_err:
#             print(f"[Error] Saving failed IRI: {file_err}")
#         return None


# url = "http://dbpedia.org/resource/%22V%22_Is_for_Vengeance"
# str_format_rdf = describe_instance(instance_iri=url)

In [92]:
OUTPUT_FILENAME_DIR = os.path.join(
    "c:\\Users\\deepa\\data\\workspace\\notebooks", "datasets", "instance_description")
OUTPUT_FILENAME = os.path.join(
    OUTPUT_FILENAME_DIR, "instance_description.jsonl")

FAILED_LOG_DIR = os.path.join(
    "c:\\Users\\deepa\\data\\workspace\\notebooks", "datasets", "failed")
FAILED_CLASS_LOG = os.path.join(FAILED_LOG_DIR, "failed_class_iri.txt")
FAILED_INSTANCE_LOG = os.path.join(FAILED_LOG_DIR, "failed_instance_iri.txt")

In [96]:
# --- Global Lock for File Writing ---
# This single lock will protect all file write operations (main output + error logs)
file_lock = Lock()

# --- SPARQL and Helper Functions ---


def get_sparql(return_format=JSON):
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setReturnFormat(return_format)
    return sparql


def split_camel_case_to_lower_words(name):
    """Splits CamelCase or PascalCase and returns lowercase words separated by spaces."""
    if not name:
        return ""
    # Handle simple cases first
    if name.islower() or '_' in name or not re.search('[A-Z]', name):
        # Replace underscores and lowercase
        return name.replace('_', ' ').lower()

    # Insert space before uppercase letters (except at the start)
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    # Insert space before uppercase letters that follow lowercase or digit
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1)
    return s2.lower()  # Convert the whole result to lowercase


def clean_uri_for_llm_key(uri_str):
    """Cleans a predicate URI string into a readable key (lowercase, space-separated)."""
    if not uri_str:
        return "unknown property"

    # Specific overrides first (already lowercase)
    if uri_str == str(RDF.type):
        return "type"
    if uri_str == str(RDFS.label):
        return "label"
    if uri_str == str(FOAF.name):
        return "name"
    # Add other specific overrides if needed (e.g., DCTERMS.subject -> "subject")

    # General cleaning - extract local name
    if '#' in uri_str:
        name = uri_str.split('#')[-1]
    else:
        name = uri_str.split('/')[-1]

    # Split camel case and convert to lowercase words
    return split_camel_case_to_lower_words(name)


def clean_uri_for_llm_value(uri_str):
    """Cleans a resource URI string into a readable value for LLM output."""
    if not uri_str:
        return "Unknown Resource"
    if '#' in uri_str:
        name = uri_str.split('#')[-1]
    else:
        name = uri_str.split('/')[-1]
    # Basic URL decoding for parentheses
    name = name.replace('%28', '(').replace('%29', ')')
    # Special handling for Wikidata URIs to just show the QID
    if uri_str.startswith("http://www.wikidata.org/entity/"):
        return name  # Just return QID like Q215380
    # Default: replace underscores with spaces
    return name.replace('_', ' ')


def format_rdf_term_for_llm_value(term_data):
    """
    Formats a term (represented as dict from Step 1 or URI string)
    into a simple string value for LLM output.
    """
    if isinstance(term_data, dict):  # Literal dictionary
        val = term_data.get("value", "")
        # Clean common literal suffixes for LLM readability
        val = re.sub(r'@\w+(-[A-Za-z0-9]+)*$', '', val)  # Remove @lang tags
        val = re.sub(r'\^\^<.*>$', '', val)  # Remove ^^<datatype>
        val = val.strip('"')  # Remove surrounding quotes if any
        return val
    elif isinstance(term_data, str):  # URI string
        return clean_uri_for_llm_value(term_data)
    else:
        return str(term_data)  # Fallback


def format_rdf_term(term):
    """Creates the intermediate structured representation for RDF terms."""
    if isinstance(term, Literal):
        dt = str(term.datatype) if term.datatype else None
        # Assign default datatypes if missing
        if dt is None and term.language:
            dt = str(RDF.langString)
        elif dt is None:
            dt = str(XSD.string)
        return {"value": str(term), "language": term.language, "datatype": dt}
    elif isinstance(term, URIRef):
        return str(term)
    else:  # Handle Blank Nodes etc.
        return str(term)


def extract_structured_description(rdf_n3_string, instance_iri):
    """Parses N3 RDF data and extracts outgoing/incoming relationships."""
    if not rdf_n3_string:
        # Return empty structure if no N3 data provided (e.g., empty DESCRIBE/CONSTRUCT)
        return {"instance_iri": instance_iri, "outgoing": {}, "incoming": {}}
    g = Graph()
    try:
        # Use instance_iri as base URI for resolving relative URIs if any
        g.parse(data=rdf_n3_string, format="n3", publicID=instance_iri)
    except Exception as e:
        # Log parsing errors specifically
        print(
            f"[Error] Parsing N3 data for {instance_iri}: {type(e).__name__} - {e}")
        return None  # Indicate failure
    instance_ref = URIRef(instance_iri)
    outgoing_data = defaultdict(list)
    incoming_data = defaultdict(list)

    # Outgoing properties
    for pred, obj in g.predicate_objects(subject=instance_ref):
        pred_uri_str = str(pred)
        formatted_obj = format_rdf_term(obj)
        # Avoid adding exact duplicates (important for literals)
        if formatted_obj not in outgoing_data[pred_uri_str]:
            outgoing_data[pred_uri_str].append(formatted_obj)

    # Incoming relationships
    for subj, pred in g.subject_predicates(object=instance_ref):
        # Avoid reflexive triples (where subject is the instance itself)
        if subj == instance_ref:
            continue
        pred_uri_str = str(pred)
        subj_uri_str = str(subj)
        # Avoid adding duplicate incoming subjects for the same predicate
        if subj_uri_str not in incoming_data[pred_uri_str]:
            incoming_data[pred_uri_str].append(subj_uri_str)

    # Final Structure: Convert defaultdicts, sort values for consistency
    final_outgoing = {pred: sorted(values, key=str)
                      for pred, values in outgoing_data.items()}
    # Sort incoming subjects as well
    final_incoming = {pred: sorted(values)
                      for pred, values in incoming_data.items()}
    return {"instance_iri": instance_iri, "outgoing": final_outgoing, "incoming": final_incoming}


def format_for_llm_custom_layout(structured_data):
    """
    Takes the structured dictionary and formats it into the specific
    two-part layout requested by the user (revised key/predicate format).
    """
    if not structured_data or (not structured_data.get("outgoing") and not structured_data.get("incoming")):
        instance_iri = structured_data.get("instance_iri", "Unknown Instance")
        instance_name = clean_uri_for_llm_value(instance_iri)
        # Provide a minimal output even if no data found after parsing
        return f"name: {instance_name}\n(No description properties found)"

    instance_iri = structured_data.get("instance_iri")
    instance_name_cleaned = clean_uri_for_llm_value(instance_iri)

    output_lines_part1 = []
    output_lines_part2 = []

    # --- Part 1: Outgoing Properties (key: value) ---
    outgoing_properties = structured_data.get("outgoing", {})
    primary_name_val = instance_name_cleaned  # Default name

    temp_outgoing_formatted = {}
    for pred_uri in sorted(outgoing_properties.keys()):
        llm_key = clean_uri_for_llm_key(pred_uri)
        values = outgoing_properties[pred_uri]
        cleaned_values_for_key = []
        for term_data in values:
            cleaned_val = format_rdf_term_for_llm_value(term_data)
            # Add value if it's not empty and not already added
            if cleaned_val and cleaned_val not in cleaned_values_for_key:
                cleaned_values_for_key.append(cleaned_val)
        if cleaned_values_for_key:
            # Sort the cleaned values before joining
            value_string = ", ".join(sorted(cleaned_values_for_key))
            temp_outgoing_formatted[llm_key] = value_string
            # Update primary name if this is the 'name' key
            if llm_key == 'name':
                primary_name_val = value_string

    # Generate output lines for part 1, ensuring 'name' is first
    if 'name' in temp_outgoing_formatted:
        output_lines_part1.append(f"name: {temp_outgoing_formatted['name']}")
    elif instance_name_cleaned:  # Add fallback if no name property found
        output_lines_part1.append(f"name: {instance_name_cleaned}")

    # Add other properties sorted by key
    for key in sorted(temp_outgoing_formatted.keys()):
        if key == 'name':
            continue  # Skip name as it's already added
        output_lines_part1.append(f"{key}: {temp_outgoing_formatted[key]}")

    # --- Part 2: Incoming Relationships (Subject : Predicate : Object) ---
    incoming_relationships = structured_data.get("incoming", {})
    instance_name_for_part2 = primary_name_val  # Use name determined in Part 1

    incoming_tuples = []
    for pred_uri, subjects in incoming_relationships.items():
        # Get cleaned predicate name (lowercase, space-separated)
        if '#' in pred_uri:
            pred_local_name = pred_uri.split('#')[-1]
        else:
            pred_local_name = pred_uri.split('/')[-1]
        pred_cleaned_for_output = split_camel_case_to_lower_words(
            pred_local_name)

        # Create a separate entry for each subject
        for subj_uri in subjects:
            cleaned_subj = clean_uri_for_llm_value(subj_uri)
            if cleaned_subj:
                # Add tuple: (cleaned_subject, cleaned_predicate, instance_name)
                incoming_tuples.append(
                    (cleaned_subj, pred_cleaned_for_output, instance_name_for_part2))

    # Sort the tuples primarily by subject name, then by predicate name
    incoming_tuples.sort()

    # Generate output lines for part 2 from sorted tuples
    for subj, pred, obj in incoming_tuples:
        output_lines_part2.append(f"{subj} : {pred} : {obj}")

    # --- Combine Output ---
    final_output = "\n".join(output_lines_part1)
    # Add separator only if both parts have content
    if output_lines_part1 and output_lines_part2:
        final_output += "\n\n"  # Add blank line separator
    if output_lines_part2:
        final_output += "\n".join(output_lines_part2)

    # Handle cases where only incoming relationships were found
    if not output_lines_part1 and output_lines_part2:
        final_output = f"name: {instance_name_cleaned}\n(No outgoing properties found)\n\n" + \
            final_output

    return final_output

# --- Core Logic Functions ---

# Step 1: Fetch all ontology classes


def fetch_classes():
    """Fetches a sample of DBpedia ontology classes."""
    print("Fetching ontology classes...")
    sparql = get_sparql(return_format=JSON)
    # Refined query for DBpedia ontology classes
    class_query = r"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>

    SELECT ?class
    FROM <http://dbpedia.org/model>
    WHERE {
      ?class a owl:Class .
      FILTER (
        regex(STRAFTER(STR(?class), "http://dbpedia.org/ontology/"), "^[\\x00-\\x7F]+$")
      )
    }
    ORDER BY ?class
    LIMIT 10
    """
    sparql.setQuery(class_query)
    try:
        results = sparql.query().convert()
        classes = [result["class"]["value"]
                   for result in results["results"]["bindings"]]
        print(f"Fetched {len(classes)} sample classes.")
        return classes
    except Exception as e:
        print(f"[Error] Fetching classes: {type(e).__name__} - {e}")
        return []


# Step 2: Fetch instances of a class
def fetch_instances_for_class(ontology_class):
    """Fetches a sample of instances for a given DBpedia class."""
    sparql = get_sparql(return_format=JSON)
    instance_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?instance
    FROM <http://dbpedia.org/model>
    FROM <http://dbpedia.org/data>
    WHERE {{
        BIND(<{ontology_class}> AS ?entity)
        ?instance a ?entity .
    }}
    ORDER BY ?instance
    LIMIT 1
    """
    sparql.setQuery(instance_query)
    try:
        results = sparql.query().convert()
        instances = [result["instance"]["value"]
                     for result in results["results"]["bindings"]]
        return instances
    except Exception as e:
        print(
            f"[Error] Fetching instances for {ontology_class}: {type(e).__name__} - {e}")
        # Log failed class with thread safety
        try:
            os.makedirs(FAILED_LOG_DIR, exist_ok=True)
            with file_lock:  # Protect file write
                with open(FAILED_CLASS_LOG, "a", encoding="utf-8") as f:
                    f.write(ontology_class + "\n")
        except Exception as file_err:
            print(
                f"[Error] Saving failed class IRI to {FAILED_CLASS_LOG}: {file_err}")
        return []


# Step 3: Describe a single instance
def describe_instance(instance_iri):
    """
    Fetches description data (N3), processes it structurally, and formats for LLM.
    Uses CONSTRUCT for potentially better reliability than DESCRIBE.
    """
    sparql = get_sparql(return_format=N3)
    query = f"DESCRIBE <{instance_iri}>"
    sparql.setQuery(query)
    try:
        result_bytes = sparql.query().convert()
        rdf_n3_string = result_bytes.decode('utf-8') if result_bytes else ""

        # Step 1 (Internal): Extract structured data
        structured_data = extract_structured_description(
            rdf_n3_string, instance_iri)

        if structured_data is None:
            return None  # Error during parsing

        # Step 2 (Internal): Format for LLM
        llm_input_string = format_for_llm_custom_layout(structured_data)
        return llm_input_string

    except Exception as e:
        print(f"[Error] Describing {instance_iri}: {type(e).__name__} - {e}")
        # Log failed instance with thread safety
        try:
            os.makedirs(FAILED_LOG_DIR, exist_ok=True)
            with file_lock:  # Protect file write
                with open(FAILED_INSTANCE_LOG, "a", encoding="utf-8") as f:
                    f.write(instance_iri + "\n")
        except Exception as file_err:
            print(
                f"[Error] Saving failed instance IRI to {FAILED_INSTANCE_LOG}: {file_err}")
        return None


# Step 4: Threaded orchestration
def process_class(ontology_class, output_filename, lock):
    """Fetches instances for a class and writes their descriptions to a file."""
    instances = fetch_instances_for_class(ontology_class)
    if not instances:
        return

    for iri in instances:
        describe_instance_str = describe_instance(iri)
        if describe_instance_str is not None:
           # Create a dictionary for the JSON object
            output_data = {
                "iri": iri,
                "description": describe_instance_str
            }
            # Convert dictionary to JSON string
            # ensure_ascii=False is important for non-ASCII characters in descriptions/IRIs
            json_line = json.dumps(output_data, ensure_ascii=False)

            # Acquire lock before writing to the shared file
            with lock:
                try:
                    with open(output_filename, "a", encoding="utf-8") as f:
                        f.write(json_line + "\n")
                except Exception as e:
                    print(f"[Error] Writing to file for {iri}: {e}")


# Step 5: Main runner with threading
def main():
    """Main function to orchestrate fetching and processing."""
    try:
        os.makedirs(FAILED_LOG_DIR, exist_ok=True)
    except Exception as e:
        print(f"[Error] Creating failed log directory {FAILED_LOG_DIR}: {e}")
        return
    try:
        os.makedirs(OUTPUT_FILENAME_DIR, exist_ok=True)
    except Exception as e:
        print(f"[Error] Creating failed log directory {FAILED_LOG_DIR}: {e}")
        return

    owl_classes = fetch_classes()
    if not owl_classes:
        print("No classes fetched. Exiting.")
        return

    print(
        f"Processing {len(owl_classes)} classes using up to {MAX_THREADS} threads...")

    processed_count = 0
    # Use the global lock defined earlier
    global file_lock

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # Submit tasks with necessary arguments
        futures = {executor.submit(process_class, owl_class, OUTPUT_FILENAME, file_lock): owl_class
                   for owl_class in owl_classes}

        for future in as_completed(futures):
            owl_class = futures[future]
            try:
                future.result()  # Check for exceptions raised within the thread
                processed_count += 1
                # Print progress indicator
                print(
                    f" \r > Progress: {processed_count}/{len(owl_classes)} classes processed...", end="")
            except Exception as e:
                # Log errors from the thread execution itself
                print(
                    f"\n[Error in thread result for {owl_class}]: {type(e).__name__} - {e}")

    print(f"\nProcessing complete. Descriptions saved to {OUTPUT_FILENAME}")
    print(f"Check {FAILED_CLASS_LOG} and {FAILED_INSTANCE_LOG} for any errors.")


main()

Fetching ontology classes...
Fetched 10 sample classes.
Processing 10 classes using up to 32 threads...
 > Progress: 10/10 classes processed... 
Processing complete. Descriptions saved to c:\Users\deepa\data\workspace\notebooks\datasets\instance_description\instance_description.jsonl
Check c:\Users\deepa\data\workspace\notebooks\datasets\failed\failed_class_iri.txt and c:\Users\deepa\data\workspace\notebooks\datasets\failed\failed_instance_iri.txt for any errors.


In [98]:

def read_one_jsonl(filename):
    """
    Reads a JSONL file line by line and yields each parsed JSON object.
    This allows processing one record at a time without loading the whole file.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, 1):
                line = line.strip()
                if not line: continue
                try:
                    yield json.loads(line) # Yield the parsed dictionary
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON on line {line_number} in {filename}")
    except FileNotFoundError:
        print(f"Error: File not found - {filename}")
    except Exception as e:
        print(f"An unexpected error occurred while reading {filename}: {e}")

# Example Usage:
for record in read_one_jsonl(OUTPUT_FILENAME):
    print("---"*50)
    print(f"IRI: {record.get('iri')}")
    print(f"Description: {record.get('description')}")
    # Process the record here

------------------------------------------------------------------------------------------------------------------------------------------------------
IRI: http://dbpedia.org/resource/100_Word_Story
Description: name: 100 Word Story
abbreviation: 100 Word Story
academic discipline: Literary magazine
editor: Grant Faulkner
first publication year: 2011
frequency of publication: Quarterly
type: AcademicJournal, CreativeWork, PeriodicalLiterature, Q1092563, Q234460, Q386724, Thing, Work, WrittenWork
------------------------------------------------------------------------------------------------------------------------------------------------------
IRI: http://dbpedia.org/resource/AAAI/ACM_Conference_on_AI,_Ethics,_and_Society
Description: name: AIES
academic discipline: Computer science
frequency of publication: Annual
publisher: Association for Computing Machinery
type: AcademicConference, Event, Q1656682, Q2020153, SocietalEvent, Thing
----------------------------------------------------

In [None]:
def get_sparql():
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setReturnFormat(JSON)
    return sparql

# Step 1: Fetch all ontology classes
def fetch_classes():
    sparql = get_sparql()
    class_query = r"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>

    SELECT ?class
    FROM <http://dbpedia.org/model>
    WHERE {
      ?class a owl:Class .
      FILTER (
        regex(STRAFTER(STR(?class), "http://dbpedia.org/ontology/"), "^[\\x00-\\x7F]+$")
      )
    }
    ORDER BY ?class
    LIMIT 10
    """
    sparql.setQuery(class_query)
    results = sparql.query().convert()
    return [result["class"]["value"] for result in results["results"]["bindings"]]

# Step 2: Fetch instances of a class
def fetch_instances_for_class(ontology_class):
    sparql = get_sparql()
    instance_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?instance
    FROM <http://dbpedia.org/model>
    FROM <http://dbpedia.org/data>
    WHERE {{
        BIND(<{ontology_class}> AS ?entity)
        ?instance a ?entity .
    }}
    ORDER BY ?instance
    LIMIT 1
    """
    sparql.setQuery(instance_query)
    try:
        results = sparql.query().convert()
        return [(result["instance"]["value"]) for result in results["results"]["bindings"]]
    except Exception as e:
        print(f"[Error] Fetching instances for {ontology_class}: {e}")
        try:
            folder_location = os.path.join(os.getcwd(), "datasets", "failed")
            os.makedirs(folder_location, exist_ok=True)
            failed_class_file = os.path.join(
                folder_location, "failed_class_iri.txt")
            with open(failed_class_file, "a", encoding="utf-8") as f:
                f.write(ontology_class + "\n")
            print(f"Logged failed class IRI to {failed_class_file}")
        except Exception as file_err:
            print(f"[Error] Saving failed class IRI: {file_err}")
        return []


# def get_sparql():
#     sparql = SPARQLWrapper(SPARQL_ENDPOINT)
#     return sparql


def split_camel_case_to_lower_words(name):
    """Splits CamelCase or PascalCase and returns lowercase words separated by spaces."""
    if not name:
        return ""
    # Handle simple cases first
    if name.islower() or '_' in name or not re.search('[A-Z]', name):
        # Replace underscores and lowercase
        return name.replace('_', ' ').lower()

    # Insert space before uppercase letters (except at the start)
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    # Insert space before uppercase letters that follow lowercase or digit
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1)
    return s2.lower()  # Convert the whole result to lowercase


def clean_uri_for_llm_key(uri_str):
    """Cleans a predicate URI string into a readable key (lowercase, space-separated)."""
    if not uri_str:
        return "unknown property"

    # Specific overrides first (already lowercase)
    if uri_str == str(RDF.type):
        return "type"
    if uri_str == str(RDFS.label):
        return "label"
    if uri_str == str(FOAF.name):
        return "name"
    # Add other specific overrides if needed

    # General cleaning - extract local name
    if '#' in uri_str:
        name = uri_str.split('#')[-1]
    else:
        name = uri_str.split('/')[-1]

    # Split camel case and convert to lowercase words
    return split_camel_case_to_lower_words(name)


def clean_uri_for_llm_value(uri_str):
    if not uri_str:
        return "Unknown Resource"
    if '#' in uri_str:
        name = uri_str.split('#')[-1]
    else:
        name = uri_str.split('/')[-1]
    name = name.replace('%28', '(').replace('%29', ')')
    if uri_str.startswith("http://www.wikidata.org/entity/"):
        return name  # Just return QID like Q215380
    return name.replace('_', ' ')


def format_rdf_term_for_llm_value(term_data):
    if isinstance(term_data, dict):  # Literal dictionary
        val = term_data.get("value", "")
        val = re.sub(r'@\w+$', '', val)
        val = re.sub(r'\^\^<.*>$', '', val)
        val = val.strip('"')
        return val
    elif isinstance(term_data, str):  # URI string
        return clean_uri_for_llm_value(term_data)
    else:
        return str(term_data)


def format_rdf_term(term):
    # ... (same as before) ...
    if isinstance(term, Literal):
        dt = str(term.datatype) if term.datatype else None
        if dt is None and term.language:
            dt = str(RDF.langString)
        elif dt is None:
            dt = str(XSD.string)
        return {"value": str(term), "language": term.language, "datatype": dt}
    elif isinstance(term, URIRef):
        return str(term)
    else:
        return str(term)


def extract_structured_description(rdf_n3_string, instance_iri):
    # ... (same as before) ...
    if not rdf_n3_string:
        return {"instance_iri": instance_iri, "outgoing": {}, "incoming": {}}
    g = Graph()
    try:
        g.parse(data=rdf_n3_string, format="n3", publicID=instance_iri)
    except Exception as e:
        print(f"[Error] Parsing N3 data for {instance_iri}: {e}")
        return None
    instance_ref = URIRef(instance_iri)
    outgoing_data = defaultdict(list)
    incoming_data = defaultdict(list)
    # Outgoing
    for pred, obj in g.predicate_objects(subject=instance_ref):
        pred_uri_str = str(pred)
        formatted_obj = format_rdf_term(obj)
        if formatted_obj not in outgoing_data[pred_uri_str]:
            outgoing_data[pred_uri_str].append(formatted_obj)
    # Incoming
    for subj, pred in g.subject_predicates(object=instance_ref):
        if subj == instance_ref:
            continue
        pred_uri_str = str(pred)
        subj_uri_str = str(subj)
        incoming_data[pred_uri_str].append(subj_uri_str)
    # Final Structure
    final_outgoing = {pred: sorted(values, key=str)
                      for pred, values in outgoing_data.items()}
    final_incoming = {pred: values for pred, values in incoming_data.items()}
    return {"instance_iri": instance_iri, "outgoing": final_outgoing, "incoming": final_incoming}


def format_for_llm_custom_layout(structured_data):
    """
    Takes the structured dictionary and formats it into the specific
    two-part layout requested by the user (revised key/predicate format).

    Args:
        structured_data: The dictionary produced by extract_structured_description.

    Returns:
        A formatted string for LLM input in the custom layout.
    """
    if not structured_data or (not structured_data.get("outgoing") and not structured_data.get("incoming")):
        instance_iri = structured_data.get("instance_iri", "Unknown Instance")
        instance_name = clean_uri_for_llm_value(instance_iri)
        return f"name: {instance_name}\n(No description data found)"

    instance_iri = structured_data.get("instance_iri")
    instance_name_cleaned = clean_uri_for_llm_value(instance_iri)

    output_lines_part1 = []
    output_lines_part2 = []

    # --- Part 1: Outgoing Properties (key: value) ---
    outgoing_properties = structured_data.get("outgoing", {})
    primary_name_val = instance_name_cleaned

    temp_outgoing_formatted = {}
    # Use the modified clean_uri_for_llm_key here
    for pred_uri in sorted(outgoing_properties.keys()):
        llm_key = clean_uri_for_llm_key(pred_uri)  # Uses new cleaning logic
        values = outgoing_properties[pred_uri]
        cleaned_values_for_key = []
        for term_data in values:
            cleaned_val = format_rdf_term_for_llm_value(term_data)
            if cleaned_val and cleaned_val not in cleaned_values_for_key:
                cleaned_values_for_key.append(cleaned_val)
        if cleaned_values_for_key:
            value_string = ", ".join(sorted(cleaned_values_for_key))
            temp_outgoing_formatted[llm_key] = value_string
            if llm_key == 'name':  # Check against the cleaned key 'name'
                primary_name_val = value_string

    # Generate output lines for part 1
    if 'name' in temp_outgoing_formatted:
        output_lines_part1.append(f"name: {temp_outgoing_formatted['name']}")
    elif instance_name_cleaned:
        output_lines_part1.append(f"name: {instance_name_cleaned}")

    for key in sorted(temp_outgoing_formatted.keys()):
        if key == 'name':
            continue
        output_lines_part1.append(f"{key}: {temp_outgoing_formatted[key]}")

    # --- Part 2: Incoming Relationships (Subject : Predicate : Object) ---
    incoming_relationships = structured_data.get("incoming", {})
    instance_name_for_part2 = primary_name_val

    incoming_tuples = []
    for pred_uri, subjects in incoming_relationships.items():
        # **MODIFICATION**: Apply new cleaning logic to predicate for Part 2 output
        # Extract local name first
        if '#' in pred_uri:
            pred_local_name = pred_uri.split('#')[-1]
        else:
            pred_local_name = pred_uri.split('/')[-1]
        # Apply split and lowercase logic
        pred_cleaned_for_output = split_camel_case_to_lower_words(
            pred_local_name)

        for subj_uri in subjects:
            cleaned_subj = clean_uri_for_llm_value(subj_uri)
            if cleaned_subj:
                incoming_tuples.append(
                    (cleaned_subj, pred_cleaned_for_output, instance_name_for_part2))

    # Sort the tuples primarily by subject, then predicate
    incoming_tuples.sort()

    # Generate output lines for part 2 from sorted tuples
    for subj, pred, obj in incoming_tuples:
        output_lines_part2.append(f"{subj} : {pred} : {obj}")

    # --- Combine Output ---
    final_output = "\n".join(output_lines_part1)
    if output_lines_part2:
        if output_lines_part1:
            final_output += "\n\n"
        final_output += "\n".join(output_lines_part2)

    return final_output

# Step 3: Describe a single instance
def describe_instance(instance_iri):
    """
    Fetches DESCRIBE data, processes it structurally, and then formats
    it into the custom two-part layout for LLM input (revised key/predicate format).
    """
    sparql = get_sparql()
    sparql.setReturnFormat(N3)
    query = f"DESCRIBE <{instance_iri}>"
    sparql.setQuery(query)

    try:
        result_bytes = sparql.query().convert()
        rdf_n3_string = result_bytes.decode('utf-8')

        structured_data = extract_structured_description(
            rdf_n3_string, instance_iri)

        if structured_data is None:
            raise ValueError("Failed to parse RDF data.")

        llm_input_string = format_for_llm_custom_layout(structured_data)
        return llm_input_string

    except Exception as e:
        print(f"[Error] Describing {instance_iri}: {e}")
        try:
            folder_location = os.path.join(os.getcwd(), "datasets", "failed")
            os.makedirs(folder_location, exist_ok=True)  # ensure 'data/' exists
            failed_instance_file = os.path.join(folder_location, "failed_instance_iri.txt")
            with open(failed_instance_file, "a", encoding="utf-8") as f:
                f.write(instance_iri + "\n")
            print(f"Logged failed IRI to {failed_instance_file}")
        except Exception as file_err:
            print(f"[Error] Saving failed IRI: {file_err}")
        return None

# Step 4: Threaded orchestration
def process_class(ontology_class):
    instances = fetch_instances_for_class(ontology_class)
    for iri in instances:
        describe_instance_str = describe_instance(iri)
        print(f"\n[DESCRIBE] {iri}\n{describe_instance_str}\n")

# Step 5: Main runner with threading
def main():
    owl_classes = fetch_classes()
    print(f"Total owl classes fetched: {len(owl_classes)}")

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = [executor.submit(process_class, owl_class)
                   for owl_class in owl_classes]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"[Error in thread result] {e}")


main()

Total owl classes fetched: 10

[DESCRIBE] http://dbpedia.org/resource/AAAI/ACM_Conference_on_AI,_Ethics,_and_Society
name: AIES
academic discipline: Computer science
frequency of publication: Annual
publisher: Association for Computing Machinery
type: AcademicConference, Event, Q1656682, Q2020153, SocietalEvent, Thing


[DESCRIBE] http://dbpedia.org/resource/%22Glozel_est_Authentique!%22
name: Glozel est Authentique!
genre: Horror fiction
publisher: Theatre of the Mind Enterprises
type: Activity, Game, Q11410, Q1914636, Thing


[DESCRIBE] http://dbpedia.org/resource/100_Word_Story
name: 100 Word Story
abbreviation: 100 Word Story
academic discipline: Literary magazine
editor: Grant Faulkner
first publication year: 2011
frequency of publication: Quarterly
type: AcademicJournal, CreativeWork, PeriodicalLiterature, Q1092563, Q234460, Q386724, Thing, Work, WrittenWork


[DESCRIBE] http://dbpedia.org/resource/'Arsh
name: ʿArsh
country: Yemen
elevation: 1455.0
original name: عرش
population t

In [None]:
# Summarize the following information about the entity http://dbpedia.org/resource/100_Word_Story:

# Name: 100 Word Story
# Abbreviation: 100 Word Story
# Type: Academic Journal, Periodical Literature, Written Work, Creative Work
# First published in: 2011
# Frequency of publication: Quarterly
# Academic discipline: Literary Magazine
# Editor: Grant Faulkner
# Homepage: http://www.100wordstory.org/

In [64]:
def get_sparql():
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    return sparql

# **MODIFIED FUNCTION**
def split_camel_case_to_lower_words(name):
    """Splits CamelCase or PascalCase and returns lowercase words separated by spaces."""
    if not name: return ""
    # Handle simple cases first
    if name.islower() or '_' in name or not re.search('[A-Z]', name):
        return name.replace('_', ' ').lower() # Replace underscores and lowercase

    # Insert space before uppercase letters (except at the start)
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    # Insert space before uppercase letters that follow lowercase or digit
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1)
    return s2.lower() # Convert the whole result to lowercase

# **MODIFIED FUNCTION**
def clean_uri_for_llm_key(uri_str):
    """Cleans a predicate URI string into a readable key (lowercase, space-separated)."""
    if not uri_str: return "unknown property"

    # Specific overrides first (already lowercase)
    if uri_str == str(RDF.type): return "type"
    if uri_str == str(RDFS.label): return "label"
    if uri_str == str(FOAF.name): return "name"
    # Add other specific overrides if needed

    # General cleaning - extract local name
    if '#' in uri_str: name = uri_str.split('#')[-1]
    else: name = uri_str.split('/')[-1]

    # Split camel case and convert to lowercase words
    return split_camel_case_to_lower_words(name)

# (Other helper functions remain the same)
def clean_uri_for_llm_value(uri_str):
    if not uri_str: return "Unknown Resource"
    if '#' in uri_str: name = uri_str.split('#')[-1]
    else: name = uri_str.split('/')[-1]
    name = name.replace('%28', '(').replace('%29', ')')
    if uri_str.startswith("http://www.wikidata.org/entity/"):
        return name # Just return QID like Q215380
    return name.replace('_', ' ')

def format_rdf_term_for_llm_value(term_data):
    if isinstance(term_data, dict): # Literal dictionary
        val = term_data.get("value", "")
        val = re.sub(r'@\w+$', '', val)
        val = re.sub(r'\^\^<.*>$', '', val)
        val = val.strip('"')
        return val
    elif isinstance(term_data, str): # URI string
        return clean_uri_for_llm_value(term_data)
    else:
        return str(term_data)

# --- Step 1: Extract Structured Data (Unchanged Predicates) ---
# (This function remains exactly the same as in the previous answers)
def format_rdf_term(term):
    # ... (same as before) ...
    if isinstance(term, Literal):
        dt = str(term.datatype) if term.datatype else None
        if dt is None and term.language: dt = str(RDF.langString)
        elif dt is None: dt = str(XSD.string)
        return {"value": str(term), "language": term.language, "datatype": dt}
    elif isinstance(term, URIRef): return str(term)
    else: return str(term)

def extract_structured_description(rdf_n3_string, instance_iri):
    # ... (same as before) ...
    if not rdf_n3_string:
        return {"instance_iri": instance_iri, "outgoing": {}, "incoming": {}}
    g = Graph()
    try:
        g.parse(data=rdf_n3_string, format="n3", publicID=instance_iri)
    except Exception as e:
        print(f"[Error] Parsing N3 data for {instance_iri}: {e}")
        return None
    instance_ref = URIRef(instance_iri)
    outgoing_data = defaultdict(list)
    incoming_data = defaultdict(list)
    # Outgoing
    for pred, obj in g.predicate_objects(subject=instance_ref):
        pred_uri_str = str(pred)
        formatted_obj = format_rdf_term(obj)
        if formatted_obj not in outgoing_data[pred_uri_str]:
             outgoing_data[pred_uri_str].append(formatted_obj)
    # Incoming
    for subj, pred in g.subject_predicates(object=instance_ref):
        if subj == instance_ref: continue
        pred_uri_str = str(pred)
        subj_uri_str = str(subj)
        incoming_data[pred_uri_str].append(subj_uri_str)
    # Final Structure
    final_outgoing = {pred: sorted(values, key=str) for pred, values in outgoing_data.items()}
    final_incoming = {pred: values for pred, values in incoming_data.items()}
    return {"instance_iri": instance_iri, "outgoing": final_outgoing, "incoming": final_incoming}


# --- Step 2: Format Structured Data into Custom Layout for LLM (Revised Keys/Predicates) ---

def format_for_llm_custom_layout(structured_data):
    """
    Takes the structured dictionary and formats it into the specific
    two-part layout requested by the user (revised key/predicate format).

    Args:
        structured_data: The dictionary produced by extract_structured_description.

    Returns:
        A formatted string for LLM input in the custom layout.
    """
    if not structured_data or (not structured_data.get("outgoing") and not structured_data.get("incoming")):
        instance_iri = structured_data.get("instance_iri", "Unknown Instance")
        instance_name = clean_uri_for_llm_value(instance_iri)
        return f"name: {instance_name}\n(No description data found)"

    instance_iri = structured_data.get("instance_iri")
    instance_name_cleaned = clean_uri_for_llm_value(instance_iri)

    output_lines_part1 = []
    output_lines_part2 = []

    # --- Part 1: Outgoing Properties (key: value) ---
    outgoing_properties = structured_data.get("outgoing", {})
    primary_name_val = instance_name_cleaned

    temp_outgoing_formatted = {}
    # Use the modified clean_uri_for_llm_key here
    for pred_uri in sorted(outgoing_properties.keys()):
        llm_key = clean_uri_for_llm_key(pred_uri) # Uses new cleaning logic
        values = outgoing_properties[pred_uri]
        cleaned_values_for_key = []
        for term_data in values:
            cleaned_val = format_rdf_term_for_llm_value(term_data)
            if cleaned_val and cleaned_val not in cleaned_values_for_key:
                cleaned_values_for_key.append(cleaned_val)
        if cleaned_values_for_key:
            value_string = ", ".join(sorted(cleaned_values_for_key))
            temp_outgoing_formatted[llm_key] = value_string
            if llm_key == 'name': # Check against the cleaned key 'name'
                primary_name_val = value_string

    # Generate output lines for part 1
    if 'name' in temp_outgoing_formatted:
         output_lines_part1.append(f"name: {temp_outgoing_formatted['name']}")
    elif instance_name_cleaned:
         output_lines_part1.append(f"name: {instance_name_cleaned}")

    for key in sorted(temp_outgoing_formatted.keys()):
        if key == 'name': continue
        output_lines_part1.append(f"{key}: {temp_outgoing_formatted[key]}")


    # --- Part 2: Incoming Relationships (Subject : Predicate : Object) ---
    incoming_relationships = structured_data.get("incoming", {})
    instance_name_for_part2 = primary_name_val

    incoming_tuples = []
    for pred_uri, subjects in incoming_relationships.items():
        # **MODIFICATION**: Apply new cleaning logic to predicate for Part 2 output
        # Extract local name first
        if '#' in pred_uri: pred_local_name = pred_uri.split('#')[-1]
        else: pred_local_name = pred_uri.split('/')[-1]
        # Apply split and lowercase logic
        pred_cleaned_for_output = split_camel_case_to_lower_words(pred_local_name)

        for subj_uri in subjects:
            cleaned_subj = clean_uri_for_llm_value(subj_uri)
            if cleaned_subj:
                incoming_tuples.append((cleaned_subj, pred_cleaned_for_output, instance_name_for_part2))

    # Sort the tuples primarily by subject, then predicate
    incoming_tuples.sort()

    # Generate output lines for part 2 from sorted tuples
    for subj, pred, obj in incoming_tuples:
         output_lines_part2.append(f"{subj} : {pred} : {obj}")


    # --- Combine Output ---
    final_output = "\n".join(output_lines_part1)
    if output_lines_part2:
        if output_lines_part1:
            final_output += "\n\n"
        final_output += "\n".join(output_lines_part2)

    return final_output


# --- Main Execution Logic --- (Unchanged)

def describe_instance_custom_layout(instance_iri):
    """
    Fetches DESCRIBE data, processes it structurally, and then formats
    it into the custom two-part layout for LLM input (revised key/predicate format).
    """
    sparql = get_sparql()
    sparql.setReturnFormat(N3)
    query = f"DESCRIBE <{instance_iri}>"
    sparql.setQuery(query)

    try:
        result_bytes = sparql.query().convert()
        rdf_n3_string = result_bytes.decode('utf-8')

        structured_data = extract_structured_description(rdf_n3_string, instance_iri)

        if structured_data is None:
             raise ValueError("Failed to parse RDF data.")

        llm_input_string = format_for_llm_custom_layout(structured_data)
        return llm_input_string

    except Exception as e:
        print(f"[Error] Describing or formatting for LLM {instance_iri}: {e}")
        return None

In [66]:
# --- Example Usage --- (Unchanged)
if __name__ == "__main__":
    test_iri_band = "http://dbpedia.org/resource/!!!"
    llm_input_band = describe_instance_custom_layout(test_iri_band)

    if llm_input_band:
        print(f"\n--- ({test_iri_band}) ---")
        print(llm_input_band)
    else:
        print(f"\nFailed to get or format description for {test_iri_band}")

    print("\n----------------------------------------")

    test_iri_journal = "http://dbpedia.org/resource/100_Word_Story"
    llm_input_journal = describe_instance_custom_layout(test_iri_journal)
    if llm_input_journal:
         print(f"\n--- ({test_iri_journal}) ---")
         print(llm_input_journal)
    else:
         print(f"\nFailed to get or format description for {test_iri_journal}")


--- (http://dbpedia.org/resource/!!!) ---
name: !!!
active years start year: 1996
alias: Chk Chk Chk
background: group_or_band
band member: Nic Offer
former band member: Jerry Fuchs, Justin Van Der Volgen
genre: Alternative dance, Dance-punk, Disco-rock, Funk rock, Indietronica
hometown: California, Sacramento, California
record label: Gold Standard Laboratories, Touch and Go Records, Warp (record label)
type: Agent, Band, Group, MusicGroup, Organisation, Organization, Q215380, Q24229398, Q43229, SocialPerson, Thing

 One Boy : artist : !!!
Chris Coady : associated band : !!!
Chris Coady : associated musical artist : !!!
Jerry Fuchs : associated band : !!!
Jerry Fuchs : associated musical artist : !!!
Maserati (band) : associated band : !!!
Maserati (band) : associated musical artist : !!!
Nic Offer  Nic Offer  1 : associated band : !!!
Nic Offer  Nic Offer  1 : associated musical artist : !!!
Raleigh Moncrief : associated band : !!!
Raleigh Moncrief : associated musical artist : !!!
