# import

In [12]:
import pprint
import os
import sys

from dotenv import load_dotenv

from langchain_ollama import OllamaLLM, ChatOllama, OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever

from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

import psycopg
from langchain_postgres.vectorstores import PGVector
from SPARQLWrapper import SPARQLWrapper, JSON, POST, N3
from urllib.parse import urljoin

import concurrent
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from threading import Lock

from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, OWL, FOAF, XSD, SKOS, DCTERMS
import re
from collections import Counter, defaultdict
import json
import datetime
import logging
import time

import urllib.parse
from typing import List, Tuple, Dict, Set, DefaultDict

In [13]:
# Module-level logger shared by the cells below. basicConfig() configures the
# root logger at INFO so the logger.info(...) calls in this notebook are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)  # Change to WARNING or ERROR in production

# loading env variables

In [14]:
# Load environment variables from a .env file; the DB_* and GRAPHDB_* values
# read later via os.getenv() are expected to be defined there.
load_dotenv()

True

# LangChain & Ollama

## check ollama status

In [None]:
# !curl --location 'http://127.0.0.1:11434/api/generate' \
# --header 'Content-Type: application/json' \
# --data '{ \
#     "model": "llama3.2:3b", \
#     "prompt": "hello llama!",  \
#     "options": { \
#         "temperature": 0 \
#     } \
# }' \
# | python -m json.tool

In [None]:
# !curl http://localhost:11434/api/tags

## Ollama model: configuration

In [15]:
# Text-completion client for the local Ollama model used by the chains below.
llm = OllamaLLM(model="llama3.2:3b", temperature=0)
# Alternative model, kept for quick switching:
# llm = OllamaLLM(model="deepseek-r1:8b", temperature=0)
# Bare expression so the notebook echoes the configured client.
llm

OllamaLLM(model='llama3.2:3b', temperature=0.0)

In [16]:
# Chat-message client for the same Ollama model; used in the message-based test below.
chat_ollama = ChatOllama(model="llama3.2:3b", temperature=0)
# Bare expression so the notebook echoes the configured client.
chat_ollama

ChatOllama(model='llama3.2:3b', temperature=0.0)

## testing llm: invoke


In [None]:
# Smoke test: send a single prompt to the completion client and log the reply.
# llm.invoke(input="tell me a joke")
response = llm.invoke("hello ollama!")

# response = llm.invoke("Create an agent that uses Ollama function calling in Langchain.")

logger.info(response)

In [None]:
# Smoke test for the chat client: a system instruction followed by one human
# message, given as (role, content) tuples.
messages = [
    ("system", "You are a helpful translator. Translate the user sentence to French."),
    ("human", "I love programming."),
]
# The returned message is the cell's displayed value; it is not stored.
chat_ollama.invoke(messages)

## testing llm: chat prompt template

In [None]:
# Prompt with a fixed system role and a single "{input}" slot for the user text.
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a world class technical documentation writer."),
    ("user", "{input}")
])

# Pipe the rendered prompt straight into the completion client.
chain = chat_prompt_template | llm

# The dict key must match the "{input}" placeholder in the template.
response = chain.invoke({"input": "how can langsmith help with testing?"})

print(response)

## testing llm: chat prompt template & StrOutputParser

In [None]:
# Same prompt as the previous cell; this cell adds an output parser to the chain.
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a world class technical documentation writer."),
    ("user", "{input}")
])

output_parser = StrOutputParser()

# prompt -> model -> string parser
chain = chat_prompt_template | llm | output_parser

response = chain.invoke({"input": "how can langsmith help with testing?"})

print(response)

## create vector store & a retriever

In [None]:
# 1. select a specific data source. In this case a web page.
# 2. save extracted content from the web page as docs.
# 3. index the docs using FAISS vector store.
# 4. convert the vector store to retriever.

# Requires network access: the page is fetched when load() is called.
web_base_loader = WebBaseLoader("https://docs.smith.langchain.com/user_guide")

docs = web_base_loader.load()

# print(f"type(docs) : {type(docs)} \n")
# print(f"len(docs) : {len(docs)}\n")
# print(f"docs: {docs} \n")
# type(f"docs[0] : {docs[0]} \n")
# print(f"docs[0].page_content : {docs[0].page_content} \n")

# Split the loaded page into chunks using the splitter's default settings.
recursive_character_text_splitter = RecursiveCharacterTextSplitter()
documents = recursive_character_text_splitter.split_documents(documents=docs)


# print(type(documents))
# print(len(documents))
# print(documents)
# print(documents[0])
# print(documents[2])

# NOTE(review): this embeds with the chat model "llama3.2:3b"; the later
# "embeddings" section rebinds ollama_embedding to "bge-m3:567m".
ollama_embedding = OllamaEmbeddings(model="llama3.2:3b")
# Embed every chunk and build an in-memory FAISS index from the result.
vector_store = FAISS.from_documents(
    documents=documents, embedding=ollama_embedding)


# print(f"vector_store.index.ntotal: {vector_store.index.ntotal}")
# print(f"vector_store._get_retriever_tags() : {vector_store._get_retriever_tags()}")
# print(f"vector_store.index_to_docstore_id : {vector_store.index_to_docstore_id}")
# print(f"type(vector_store.index_to_docstore_id) : {type(vector_store.index_to_docstore_id)}")

# Retriever view over the index; consumed by the retrieval chains below.
vector_store_retriever = vector_store.as_retriever()
print(f"vector_store_retriever: {vector_store_retriever}")

## document chain

In [None]:
# 5. create a chat prompt template
# 6. create a stuff-documents chain from the llm and the prompt template; the
#    chain can also be run directly by passing documents in as "context"

# The template exposes two variables: {context} (the documents) and {input} (the question).
chat_prompt_template = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""
)

documents_chain = create_stuff_documents_chain(
    llm=llm, prompt=chat_prompt_template)
# Direct run without a retriever: every chunk in `documents` is passed as context.
response = documents_chain.invoke(
    {
        "input": "how can langsmith help with testing?",
        "context": documents
    }
)
print(response)

## retrieval chain

In [None]:
# 7. create a retrieval chain that combines the vector store retriever with the
#    stuff-documents chain, so only "input" has to be supplied by the caller

retrieval_chain = create_retrieval_chain(
    vector_store_retriever, documents_chain)
response = retrieval_chain.invoke(
    {"input": "how can langsmith help with testing?"})

# print(type(response))
# Pretty-print the whole response object rather than just the answer text.
pprint.pprint(response, indent=4)

## conversation retrieval chain

In [None]:
# Prompt that asks the model to turn the conversation so far plus the latest
# user input into a search query. "chat_history" is filled in at invoke time.
chat_prompt_template = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a search query to look up to get information relevant to the conversation")
])

# Retriever wrapper built from the llm, the base retriever and the prompt above.
history_aware_retriever_chain = create_history_aware_retriever(
    llm, vector_store_retriever, chat_prompt_template)

In [None]:
# Answering prompt: retrieved context in the system message, then the prior
# conversation, then the new user input.
chat_prompt_template = ChatPromptTemplate.from_messages([
    ("system",
     "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}")
])

# Note: this rebinds retrieval_chain, replacing the one from the earlier cell.
document_chain = create_stuff_documents_chain(llm, chat_prompt_template)
retrieval_chain = create_retrieval_chain(
    history_aware_retriever_chain, document_chain)

# Hand-written two-turn history so the follow-up "tell me how" has something to refer to.
chat_history = [HumanMessage(
    content="Can LangSmith help test my LLM applications?"), AIMessage(content="Yes!")]

response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "tell me how"
})

pprint.pprint(response)

# embeddings

### initialize embedding model

In [17]:
# Embedding model handed to PGVector below. The commented lines are
# alternatives kept for quick switching.
# ollama_embedding = OllamaEmbeddings(model="mxbai-embed-large:335m")
# ollama_embedding = OllamaEmbeddings(model="nomic-embed-text:latest")
ollama_embedding = OllamaEmbeddings(model="bge-m3:567m")

### connect to pgvector

In [18]:
# Database connection details, read from the environment.
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")


def build_connection_string(user, password, host, port, dbname):
    """Build the SQLAlchemy URL used to reach the pgvector database.

    Format: postgresql+psycopg://user:password@host:port/dbname

    The user and password are percent-encoded so that characters such as
    '@', ':' or '/' inside a credential cannot be mistaken for URL
    delimiters. Values are passed through str() first, so an unset (None)
    setting renders as the text "None" instead of raising.

    Args:
        user: Database user name.
        password: Database password (may contain special characters).
        host: Database host name or address.
        port: Database port.
        dbname: Database name.

    Returns:
        The connection URL as a string.
    """
    user_part = urllib.parse.quote_plus(str(user))
    password_part = urllib.parse.quote_plus(str(password))
    return f"postgresql+psycopg://{user_part}:{password_part}@{host}:{port}/{dbname}"


CONNECTION_STRING = build_connection_string(DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME)
COLLECTION_NAME = "dbpedia_docs"

In [19]:
# Open the PGVector collection used to store the entity embeddings. Any
# failure is logged with its traceback and the run is aborted, because the
# later cells cannot do anything useful without the store.
logger.info(f"\nConnecting to PGVector '{COLLECTION_NAME}'...")
try:
    # If the collection table doesn't exist, PGVector will try to create it.
    vectorstore = PGVector(
        connection=CONNECTION_STRING,
        embeddings=ollama_embedding,
        collection_name=COLLECTION_NAME,
        use_jsonb=True
        # pre_delete_collection=True
        # Use pre_delete_collection=True if you want to clear the collection on every run (USE WITH CAUTION!)
        # pre_delete_collection=False,
    )
    print("connection successful!")
except psycopg.OperationalError as e:
    logger.exception(f"\nDatabase Connection Error: {e}")
    # sys.exit rather than the bare exit(): the latter is a helper injected by
    # the `site` module for interactive shells and is not guaranteed to exist.
    sys.exit(1)
except Exception as e:
    logger.exception(f"\nAn error occurred during PGVector connection: {e}")
    sys.exit(1)

INFO:__main__:
Connecting to PGVector 'dbpedia_docs'...


connection successfull!


### connect to ontotext graph db and fetch all the entities and the description

#### Constants

In [20]:
# GraphDB location, read from the environment.
# NOTE(review): if GRAPHDB_BASE_URL is unset, os.getenv returns None and the
# .strip('/') call below raises AttributeError — confirm the .env always defines it.
GRAPHDB_BASE_URL = os.getenv("GRAPHDB_BASE_URL")
GRAPHDB_REPOSITORY = os.getenv("GRAPHDB_REPOSITORY")

# Format: {base_url}/repositories/{repository_id}
# Slashes are stripped from the base URL and exactly one is re-added so that
# urljoin appends the repository path instead of replacing the last segment.
SPARQL_ENDPOINT = urljoin(GRAPHDB_BASE_URL.strip('/') + '/', f"repositories/{GRAPHDB_REPOSITORY}")

# Output file: one JSON object per line (iri + description), appended to by the workers.
# NOTE(review): the base directory is a hard-coded absolute Windows path.
OUTPUT_FILENAME_DIR = os.path.join("c:\\Users\\deepa\\data\\workspace\\notebooks", "datasets", "instance_description")
OUTPUT_FILENAME = os.path.join(OUTPUT_FILENAME_DIR, "instance_description.jsonl")

# Plain-text logs (one IRI per line) of classes/instances whose fetch or processing failed.
FAILED_LOG_DIR = os.path.join("c:\\Users\\deepa\\data\\workspace\\notebooks", "datasets", "failed")
FAILED_CLASS_LOG = os.path.join(FAILED_LOG_DIR, "failed_class_iri.txt")
FAILED_INSTANCE_LOG = os.path.join(FAILED_LOG_DIR, "failed_instance_iri.txt")


# NOTE(review): the three constants below are not referenced by any code
# visible in this section of the notebook.
INSTANCE_BATCH_SIZE = 100
DELAY_BETWEEN_BATCHES = 1.0
DELAY_BETWEEN_CLASSES = 2.0 # Only used when iterating classes

In [21]:
# Size of the thread pool that main() creates for each class.
# MAX_WORKERS = os.cpu_count()
MAX_WORKERS = 2

# One lock per shared file; each append to that file is made while holding its lock.
fail_class_file_lock = threading.Lock()      # guards FAILED_CLASS_LOG
fail_instance_file_lock = threading.Lock()   # guards FAILED_INSTANCE_LOG
output_file_lock = threading.Lock()          # guards OUTPUT_FILENAME

In [22]:
def get_sparql(return_format=JSON):
    """Create a fresh SPARQLWrapper for SPARQL_ENDPOINT.

    Args:
        return_format: Result format to request from the endpoint
            (defaults to JSON).

    Returns:
        A new SPARQLWrapper with the return format already set; the caller
        still has to set the query.
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setReturnFormat(return_format)
    return client

#### All Classes

In [24]:
def fetch_classes():
    """Return the IRIs of all owl:Class resources in the model graph.

    The query keeps only classes whose local name under
    http://dbpedia.org/ontology/ consists of ASCII characters, ordered by IRI.

    Returns:
        A list of class IRI strings, or an empty list if the query or the
        result handling fails (the error is logged, not raised).
    """
    logger.info("Fetching ontology classes from model graph")

    class_query = r"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>

    SELECT ?class
    FROM <http://dbpedia.org/model>
    WHERE {
      ?class a owl:Class .
      FILTER (
        regex(STRAFTER(STR(?class), "http://dbpedia.org/ontology/"), "^[\\x00-\\x7F]+$")
      )
    }
    ORDER BY ?class
    """
    try:
        client = get_sparql(return_format=JSON)
        client.setQuery(class_query)
        payload = client.query().convert()
        classes = []
        for row in payload["results"]["bindings"]:
            classes.append(row["class"]["value"])
        logger.info(f"Fetched {len(classes)} classes.")
    except Exception as exc:
        logger.exception(f"[Error] Fetching classes: {type(exc).__name__} - {exc}")
        return []
    return classes

#### All Instances of a particular Class

In [25]:
def fetch_instances_for_class(ontology_class):
    """Return the IRIs of every resource typed as ``ontology_class``.

    The query reads from both the model and the data graph and orders the
    result by instance IRI.

    Args:
        ontology_class: IRI of the class (without angle brackets).

    Returns:
        A list of instance IRI strings. On any failure the error is logged,
        the class IRI is appended to FAILED_CLASS_LOG, and an empty list is
        returned.
    """
    logger.info(f"Fetching instances of type class {ontology_class} from data graph")

    instance_query = f"""
    SELECT ?instance
    FROM <http://dbpedia.org/model>
    FROM <http://dbpedia.org/data>
    WHERE {{
        BIND(<{ontology_class}> AS ?entity)
        ?instance a ?entity .
    }}
    ORDER BY ?instance
    """
    try:
        client = get_sparql(return_format=JSON)
        client.setQuery(instance_query)
        payload = client.query().convert()
        found = []
        for row in payload["results"]["bindings"]:
            found.append(row["instance"]["value"])
        return found
    except Exception as exc:
        logger.exception(f"[Error] Fetching instances for {ontology_class}: {type(exc).__name__} - {exc}")
        # Remember the class so it can be retried later; a failure to write
        # the log is itself only logged.
        try:
            with fail_class_file_lock, open(FAILED_CLASS_LOG, "a", encoding="utf-8") as log_file:
                log_file.write(ontology_class + "\n")
        except Exception as file_err:
            logger.exception(f"[Error] Saving failed class IRI to {FAILED_CLASS_LOG}: {file_err}")
        return []

#### Describe Instance

In [26]:
# Step 3: Describe a single instance
def describe_instance(instance_iri):
    """Run a SPARQL DESCRIBE for one instance and return the result as text.

    Args:
        instance_iri: IRI of the instance (without angle brackets).

    Returns:
        The endpoint's response requested in N3 format: decoded as UTF-8 when
        it arrives as bytes, otherwise converted with str(). Returns None on
        any failure, after logging the error and appending the IRI to
        FAILED_INSTANCE_LOG.
    """
    logger.info(f"Describing instance {instance_iri} from data graph")

    query = f"DESCRIBE <{instance_iri}>"
    try:
        client = get_sparql(return_format=N3)
        client.setQuery(query)
        raw = client.query().convert()
        if isinstance(raw, bytes):
            return raw.decode('utf-8')
        return str(raw)
    except Exception as exc:
        logger.exception(f"[Error] Describing {instance_iri}: {type(exc).__name__} - {exc}")
        # Remember the instance so it can be retried later; a failure to
        # write the log is itself only logged.
        try:
            with fail_instance_file_lock, open(FAILED_INSTANCE_LOG, "a", encoding="utf-8") as log_file:
                log_file.write(instance_iri + "\n")
        except Exception as file_err:
            logger.exception(f"[Error] Saving failed instance IRI to {FAILED_INSTANCE_LOG}: {file_err}")
        return None

In [None]:
# instance_iri = "http://dbpedia.org/resource/Roger_Federer"
# response = describe_instance(instance_iri)
# print(response)

#### transform describe output to key-value and tuple format

In [27]:
def get_label_from_uri(uri_str: str) -> str:
    """Extract a human-readable label from an angle-bracketed URI string.

    The label is taken from the URI fragment if there is one, otherwise from
    the last non-empty path segment. It is percent-decoded, '_' and '-' are
    turned into spaces, camelCase boundaries are split, and whitespace is
    collapsed.

    Args:
        uri_str: A term such as ``<http://dbpedia.org/ontology/birthPlace>``.

    Returns:
        The derived label (``birth Place`` for the example above). Anything
        that is not a string wrapped in ``<`` and ``>`` is returned as
        ``str(uri_str)``. If the URI cannot be parsed, the URI without its
        brackets is returned.
    """
    if not isinstance(uri_str, str) or not (uri_str.startswith('<') and uri_str.endswith('>')):
        return str(uri_str)  # Not an angle-bracketed URI: return as is

    uri = uri_str.strip('<>')
    try:
        parsed = urllib.parse.urlparse(uri)
    except ValueError:
        return uri  # Fall back to the raw URI when it cannot be parsed

    if parsed.fragment:
        label_part = parsed.fragment
    else:
        path_parts = [part for part in parsed.path.split('/') if part]
        label_part = path_parts[-1] if path_parts else uri

    decoded_label = urllib.parse.unquote(label_part)
    # Replace common separators with spaces
    label = decoded_label.replace('_', ' ').replace('-', ' ')
    # Split camelCase only where a lowercase letter or digit is followed by a
    # capital, so runs of capitals such as "USA" are kept together instead of
    # being split into single letters.
    label = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', label)
    # Collapse runs of whitespace and trim the ends
    label = re.sub(r'\s+', ' ', label).strip()
    return label if label else label_part  # Keep the original part if the label ends up empty

# --- clean_value: turn a single RDF term into readable text ---
def clean_value(rdf_term: str) -> str:
    """Turn a single RDF term (URI, literal or blank node) into readable text.

    Expects ``rdf_term`` to be already stripped of surrounding whitespace.

    Args:
        rdf_term: The term exactly as it appears in a triple, e.g.
            ``<http://...>``, ``"text"@en``, ``"1"^^<datatype>`` or ``_:b0``.

    Returns:
        * for ``<...>`` terms, the label produced by ``get_label_from_uri``;
        * for quoted literals, the text between the quotes, without any
          language tag or datatype, and with the ``\\"`` and ``\\\\`` escapes
          decoded;
        * anything else (blank nodes, bare tokens, unterminated literals)
          unchanged.
    """
    if rdf_term.startswith('<') and rdf_term.endswith('>'):
        return get_label_from_uri(rdf_term)
    if rdf_term.startswith('"'):
        # Consume escape sequences as units so an escaped quote inside the
        # literal does not end the match early and truncate the value.
        match = re.match(r'"((?:[^"\\]|\\.)*)"', rdf_term)
        if match is None:
            return rdf_term
        # Decode only the quote and backslash escapes; other escapes are kept.
        return re.sub(r'\\(["\\])', r'\1', match.group(1))
    return rdf_term

# --- process_n3_simplified: summarise DESCRIBE output around its main subject ---
def process_n3_simplified(n3_data: str) -> str:
    """
    Summarise line-oriented RDF triples around their most frequent URI subject.

    Each non-empty, non-comment line must hold one complete triple of the
    form ``subject predicate object .`` with a ``<...>`` or ``_:`` subject
    and a ``<...>`` predicate. Lines that do not match are skipped and
    reported through the logging module.

    The URI that occurs most often as a subject is treated as the main
    entity. Triples with that subject become "Outgoing Relationships"
    (grouped by lowercase predicate label). Triples whose object is the main
    entity become "Incoming Relationships" tuples. All other triples are
    ignored.

    Args:
        n3_data: A string containing triples, one per line.

    Returns:
        A formatted multi-line summary starting with ``IRI:`` and ``label:``
        lines, or one of the fixed messages ``"No valid triples found."`` /
        ``"No URI subjects found to determine a main entity."``.
    """
    log = logging.getLogger(__name__)
    triples: List[Tuple[str, str, str]] = []
    uri_subjects: List[str] = []

    # 1. Parse triples
    triple_pattern = re.compile(r'^\s*(<[^>]+>|_:\S+)\s+(<[^>]+>)\s+(.*)\s*\.\s*$')
    for line in n3_data.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        match = triple_pattern.match(line)
        if match is None:
            # Logged instead of printed: this runs inside worker threads.
            log.warning("Skipping malformed line: %s", line)
            continue
        s, p, o_raw = match.groups()
        triples.append((s, p, o_raw))
        if s.startswith('<'):
            uri_subjects.append(s)

    if not triples:
        return "No valid triples found."
    if not uri_subjects:
        return "No URI subjects found to determine a main entity."

    # 2. Identify the main subject: the URI used most often as a subject
    main_subject_uri = Counter(uri_subjects).most_common(1)[0][0]
    main_subject_iri = main_subject_uri.strip('<>')
    derived_main_label = get_label_from_uri(main_subject_uri)

    # 3. Split triples into outgoing properties and incoming references
    properties: DefaultDict[str, Set[str]] = defaultdict(set)
    incoming: Set[Tuple[str, str, str]] = set()

    for s, p, o_raw in triples:
        predicate_label = get_label_from_uri(p).lower()
        o_stripped = o_raw.strip()

        if s == main_subject_uri:
            properties[predicate_label].add(clean_value(o_stripped))
        elif o_stripped == main_subject_uri:
            incoming.add((clean_value(s), predicate_label, derived_main_label))

    # 4. Format output. Prefer an explicit label property; the first
    # non-empty candidate (alphabetically smallest value) wins.
    explicit_label = None
    for label_pred in ('label', 'rdfs label', 'skos pref label', 'name'):
        if label_pred in properties:
            explicit_label = sorted(properties[label_pred])[0]
            if explicit_label:
                break

    output_lines = [
        f"IRI: {main_subject_iri}",
        f"label: {explicit_label if explicit_label else derived_main_label}",
    ]

    if properties:
        output_lines.append("\nOutgoing Relationships:")
        for predicate_label in sorted(properties):
            values = ', '.join(sorted(properties[predicate_label]))
            output_lines.append(f"{predicate_label}: {values}")

    if incoming:
        output_lines.append("\nIncoming Relationships:")
        for subj_l, pred_l, obj_l in sorted(incoming):
            output_lines.append(f"({subj_l}, {pred_l}, {obj_l})")

    return "\n".join(output_lines)

In [None]:
# output = process_n3_simplified(response)
# print(output)

#### worker

In [28]:
def _record_failed_instance(instance_iri: str, context: str) -> None:
    """Append ``instance_iri`` to FAILED_INSTANCE_LOG; log (never raise) if that fails."""
    try:
        with fail_instance_file_lock, open(FAILED_INSTANCE_LOG, "a", encoding="utf-8") as f:
            f.write(instance_iri + "\n")
    except Exception as file_err:
        logger.exception(f"[Error] Saving failed instance IRI ({context}) to {FAILED_INSTANCE_LOG}: {file_err}")


def process_instance_worker(instance_iri: str):
    """Describe one instance, summarise it and append the result to the output file.

    Intended to run inside a thread pool. It never raises: every failure is
    logged and the IRI is recorded in FAILED_INSTANCE_LOG so it can be
    retried. ``describe_instance`` records its own failures and signals them
    by returning None, so nothing more is done in that case.

    Args:
        instance_iri: IRI of the instance to process (without angle brackets).

    Returns:
        None. On success one JSON line ``{"iri": ..., "description": ...}``
        is appended to OUTPUT_FILENAME.
    """
    try:
        logger.debug(f"Worker processing instance: {instance_iri}")
        n3_data = describe_instance(instance_iri)  # Handles its own failure logging
        if n3_data is None:
            return

        output_data = {
            "iri": instance_iri,
            "description": process_n3_simplified(n3_data),
        }
        # Serialise before taking the lock and write the record in a single
        # call, so a serialisation error cannot leave a partial line behind.
        record = json.dumps(output_data) + "\n"
        try:
            with output_file_lock, open(OUTPUT_FILENAME, "a", encoding="utf-8") as f:
                f.write(record)
        except Exception as file_err:
            logger.exception(f"[Error] Writing output for {instance_iri} to {OUTPUT_FILENAME}: {file_err}")
            # The description was lost, so the instance must appear in the
            # failure log; otherwise it would be silently missing from the output.
            _record_failed_instance(instance_iri, "output write error")
            return
        logger.debug(f"Successfully processed and saved: {instance_iri}")

    except Exception as e:
        logger.exception(f"[Error] Unexpected error in worker for instance {instance_iri}: {e}")
        _record_failed_instance(instance_iri, "worker error")

#### main method

In [29]:
def _count_lines(path: str) -> int:
    """Return the number of lines in the UTF-8 text file at ``path``."""
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def main():
    """Describe every instance of every ontology class and write the results to disk.

    Steps:
      1. Create the output and failure-log directories if needed.
      2. Fetch all classes; stop if none are returned.
      3. For each class, fetch its instances and process them in a thread
         pool of MAX_WORKERS threads via ``process_instance_worker``.
      4. Log totals, including the number of lines currently in each failure log.

    The "instances submitted" total counts submissions, not successes.
    The failure logs are opened in append mode by the code that writes them,
    so the reported failure counts include entries left by earlier runs.
    """
    os.makedirs(OUTPUT_FILENAME_DIR, exist_ok=True)
    os.makedirs(FAILED_LOG_DIR, exist_ok=True)

    classes = fetch_classes()
    if not classes:
        logger.warning("No classes fetched. Exiting.")
        return

    total_instances_processed = 0
    for class_number, ontology_class in enumerate(classes, start=1):
        logger.info(f"Processing class {class_number}/{len(classes)}: {ontology_class} ---")
        instances = fetch_instances_for_class(ontology_class)

        if not instances:
            logger.warning(f"No instances found or fetch failed for class {ontology_class}. Skipping.")
            continue

        logger.info(f"Submitting {len(instances)} instances of class {ontology_class} for processing...")

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Keep the IRI next to each future so an unexpected failure can be attributed.
            future_to_iri = {
                executor.submit(process_instance_worker, instance_iri): instance_iri
                for instance_iri in instances
            }

            # Wait for all tasks of this class and log progress every 100 completions.
            completed = concurrent.futures.as_completed(future_to_iri)
            for processed_count, future in enumerate(completed, start=1):
                if processed_count % 100 == 0:
                    logger.info(f"Class {ontology_class}: Processed {processed_count}/{len(instances)} instances...")
                try:
                    future.result()
                except Exception:
                    # The worker handles its own errors, so anything arriving
                    # here is unexpected and must not be dropped silently.
                    logger.exception(
                        f"[Error] Unhandled exception escaped worker for instance {future_to_iri[future]}"
                    )

        total_instances_processed += len(instances)  # Count submitted instances
        logger.info(f"Finished processing instances for class {ontology_class}.")

    logger.info("Instance description process completed")
    logger.info(f"Total classes processed: {len(classes)}")
    logger.info(f"Total instances submitted for processing: {total_instances_processed}")

    # Report how many IRIs each failure log currently holds.
    failure_logs = (
        (FAILED_INSTANCE_LOG, "instance", "Total instances failed during description/processing (check log)"),
        (FAILED_CLASS_LOG, "class", "Total classes failed during instance fetching (check log)"),
    )
    for path, kind, summary in failure_logs:
        try:
            logger.info(f"{summary}: {_count_lines(path)}")
        except FileNotFoundError:
            logger.info(f"No {kind} failures logged.")
        except Exception as e:
            logger.error(f"Could not read failed {kind} log count: {e}")

#### main method call

In [33]:
# main()

In [35]:
# main()

## read the entity Description from the output file

In [None]:
def read_one_jsonl(filename):
    """
    Lazily iterate over the records of a JSON Lines file.

    Each non-blank line is parsed on demand and yielded as a Python object,
    so the file is never held in memory as a whole. Problems are reported
    with print() rather than raised: a line that is not valid JSON is
    skipped with a warning, and a missing or unreadable file ends the
    iteration after an error message.
    """
    try:
        with open(filename, "r", encoding="utf-8") as handle:
            for position, raw_line in enumerate(handle, start=1):
                text = raw_line.strip()
                if text:
                    try:
                        yield json.loads(text)  # Hand the parsed record to the caller
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON on line {position} in {filename}")
    except FileNotFoundError:
        print(f"Error: File not found - {filename}")
    except Exception as err:
        print(f"An unexpected error occurred while reading {filename}: {err}")

# Example usage: stream the generated output file and print each record.
for record in read_one_jsonl(OUTPUT_FILENAME):
    print("---"*50)  # visual separator between records
    # .get() is used so a record that lacks a key prints None instead of raising.
    print(f"IRI: {record.get('iri')}")
    print(f"Description: {record.get('description')}")
    # Process the record here

In [None]:
# Summarize the following information about the entity http://dbpedia.org/resource/100_Word_Story:

# Name: 100 Word Story
# Abbreviation: 100 Word Story
# Type: Academic Journal, Periodical Literature, Written Work, Creative Work
# First published in: 2011
# Frequency of publication: Quarterly
# Academic discipline: Literary Magazine
# Editor: Grant Faulkner
# Homepage: http://www.100wordstory.org/