## Initializing Project

In [1]:
import logging
import os
import json
import asyncio
import nest_asyncio
import gradio
from dotenv import load_dotenv
from pathlib import Path
from pprint import pprint
from typing import Any

In [None]:
from ogmyrag.prompts import PROMPT

from ogmyrag.my_logging import configure_logger

from ogmyrag.storage import (
   MongoDBStorage,
   PineconeStorage,
   Neo4jStorage
)

from ogmyrag.graph_construction import (
   get_formatted_company_data, 
   get_formatted_entities_relationships_parsing_query, 
   get_formatted_entities_and_relationships,
   get_formatted_entity_for_vectordb,
   get_formatted_entity_for_graphdb,
   get_formatted_relationship_for_graphdb
)

from ogmyrag.llm import fetch_completion_openai

from ogmyrag.graph_query import (
   GraphQuerySystem
)

from ogmyrag.util import (
   get_formatted_ontology, 
   get_formatted_report_definitions,
   get_formatted_current_datetime,
   get_normalized_string
)

In [None]:
# Patch the event loop to support re-entry in Jupyter.
# Later cells use top-level `await`, so this cell should run before them.

nest_asyncio.apply()

In [None]:
# Set up logging
#
# The application logger keeps configure_logger's default console setting;
# every other logger is created with to_console=False.

def _file_only_logger(name: str, log_file: str):
    """Create a DEBUG-level logger configured with to_console=False."""
    return configure_logger(
        name=name,
        log_level=logging.DEBUG,
        log_file=log_file,
        to_console=False,
    )


app_logger = configure_logger(
    name='og-myrag',
    log_level=logging.DEBUG,
    log_file='logs/app.log',
)

# Storage back ends.
mongo_logger = _file_only_logger('mongodb', 'logs/mongodb.log')
pinecone_logger = _file_only_logger('pinecone', 'logs/pinecone.log')
neo4j_logger = _file_only_logger('neo4j', 'logs/neo4j.log')

# Query agents.
front_agent_logger = _file_only_logger('front-agent', 'logs/front_agent.log')
vector_search_agent_logger = _file_only_logger('vector-search-agent', 'logs/vector_search_agent.log')
text2cypher_agent_logger = _file_only_logger('text2cypher-agent', 'logs/text2cypher_agent.log')

In [None]:
# Load environment variables
#
# Every setting is read once into a module-level name. A missing setting is
# only reported through app_logger; nothing is raised here, so the connection
# cells below still run and fail on their own if a value is absent.

load_dotenv(override=True)

mongo_db_uri = os.getenv("MONGO_DB_URI", "")

openai_api_key = os.getenv("OPENAI_API_KEY", "")

pinecone_api_key = os.getenv("PINECONE_API_KEY", "")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT", "")
pinecone_cloud = os.getenv("PINECONE_CLOUD", "")
pinecone_metric = os.getenv("PINECONE_METRIC", "")
# NOTE(review): this stays a string (or None) exactly as read from the
# environment - confirm PineconeStorage converts it to an int itself.
pinecone_dimensions = os.getenv("PINECONE_DIMENSIONS")

neo4j_uri = os.getenv("NEO4J_URI", "")
neo4j_username = os.getenv("NEO4J_USERNAME", "")
neo4j_password = os.getenv("NEO4J_PASSWORD", "")

if not mongo_db_uri:
    app_logger.error("Please set the MONGO_DB_URI environment variable.")

if not openai_api_key:
    app_logger.error("Please set the OPENAI_API_KEY environment variable.")

if not pinecone_api_key or not pinecone_environment or not pinecone_cloud or not pinecone_metric or not pinecone_dimensions:
    app_logger.error("Please set the PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_CLOUD, PINECONE_METRIC, and PINECONE_DIMENSIONS environment variables.")

if not neo4j_uri or not neo4j_username or not neo4j_password:
    # The message must name the variables actually read above (NEO4J_*, with
    # the letter O); it previously said "NE04J_*" with a digit zero.
    app_logger.error("Please set the NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD environment variables.")

In [None]:
# Connect to MongoDB and select the default database / collection.
#
# A failure is only logged, so `mongo` may be left unassigned (or not fully
# configured) for the cells that follow.

try:
    mongo = MongoDBStorage(mongo_db_uri)
    mongo.use_database("ogmyrag")
    mongo.use_collection("company_disclosures")
except Exception as err:
    app_logger.error(f"Could not connect to MongoDB: {err!s}")

In [None]:
# Connect to Pinecone
#
# All settings come from the environment cell above; a failure is only logged.

try:
    pinecone_settings = {
        "index_name": "ogmyrag",
        "pinecone_api_key": pinecone_api_key,
        "pinecone_environment": pinecone_environment,
        "pinecone_cloud": pinecone_cloud,
        "pinecone_metric": pinecone_metric,
        "pinecone_dimensions": pinecone_dimensions,
        "openai_api_key": openai_api_key,
    }
    pinecone = PineconeStorage(**pinecone_settings)
except Exception as err:
    app_logger.error(f"Could not connect to Pinecone: {err!s}")

In [None]:
# Connect to Neo4j with the credentials read from the environment.
# A failure is only logged, so `neo4j` may be left unassigned afterwards.
try:
    neo4j = Neo4jStorage(neo4j_uri, neo4j_username, neo4j_password)
except Exception as err:
    app_logger.error(f"Could not connect to Neo4j: {err!s}")

## Uploading Documents to MongoDB

In [None]:
# For development purposes, we will use the local file system to read the files and upload to mongodb
# TODO: Support formatting for definitions
#
# Each .txt file in the folder becomes one document. A file whose formatted
# name already exists in the active MongoDB collection is skipped.

folder_name = "adb_prospectus"
folder_path = Path.cwd() / folder_name

if not folder_path.is_dir():
    app_logger.info(f"Folder '{folder_name}' not found in project root.")
    app_logger.info("Project is terminated")
elif not (txt_files := list(folder_path.glob("*.txt"))):
    app_logger.info("No .txt files found in the folder.")
else:
    for txt_file in txt_files:
        current_data = get_formatted_company_data(
            txt_file.read_text(encoding='utf-8'),
            txt_file.stem,
            "prospectus",
            "Autocount Dotcom Berhad",
        )
        existing_documents = mongo.read_documents({"name": current_data["name"]})

        # Guard clause: never insert the same document name twice.
        if existing_documents:
            app_logger.info(f"Document with name '{current_data['name']}' already exists in the database.")
            continue

        try:
            document_id = mongo.create_document(current_data)
            app_logger.info(f"Inserted document with name '{current_data['name']}' and id '{document_id}' into the database.")
        except Exception as err:
            app_logger.error(f"Error inserting document with name '{current_data['name']}': {err!s}")

## Entities and Relationships Parsing

### Load word definitions from mongodb

In [None]:
# Fetch the stored prospectus definitions and format them for the prompt.
# Only the first matching document is used; its "content" field is parsed as JSON.
# Any failure (including no matching document) is logged, which leaves
# `formatted_definitions` unassigned.
try:
    definitions_filter = {
        "type": "PROSPECTUS DEFINITIONS",
        "from_company": "AUTOCOUNT DOTCOM BERHAD",
    }
    raw_definitions = mongo.read_documents(definitions_filter)
    definitions_payload = json.loads(raw_definitions[0]["content"])
    formatted_definitions = get_formatted_report_definitions(definitions_payload)
except Exception as err:
    app_logger.error(f"Error getting formatted word definitions: {err!s}")

### Load ontology

In [None]:
# Read the ontology definition and format it for the parsing prompt.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (the .txt sources are also read as UTF-8).
with open("ontology.json", encoding="utf-8") as f:
    raw_ontology = json.load(f)

formatted_ontology = get_formatted_ontology(raw_ontology)

### Prepare system prompt

In [None]:
# Build the system prompt from the parsing template, the formatted ontology
# and the formatted word definitions prepared in the cells above.
entities_relationships_parsing_system_prompt = get_formatted_entities_relationships_parsing_query(
   prompt_template=PROMPT["ENTITIES_RELATIONSHIPS_PARSING"],
   ontology = formatted_ontology,
   source_txt_definitions=formatted_definitions
)

# Logged at DEBUG level so the full prompt can be inspected.
app_logger.debug(f"Entities and relationships parsing system prompt: {entities_relationships_parsing_system_prompt}")

### Extract entities and relationships from the documents and upload them to MongoDB

In [None]:
# TODO: Add caching mechanism to avoid reprocessing the same document if error occur
async def process_company_data(
    get_data_by: dict[str, Any],
    system_prompt: str,
    llm_model: str = "gpt-4.1-mini",
    max_tokens: int = 16384,
    temperature: float = 0.3
) -> None:
    """Extract entities and relationships from stored documents and save them.

    Documents matching ``get_data_by`` are read from the
    ``company_disclosures`` collection. Every document that has content is
    sent to ``fetch_completion_openai`` concurrently, and each completion is
    parsed with ``get_formatted_entities_and_relationships``. The parsed
    entities and relationships are inserted into the ``entities`` and
    ``relationships`` collections, and the source document is then updated
    with ``is_parsed: True``.

    Args:
        get_data_by: MongoDB filter selecting the documents to process.
        system_prompt: System prompt passed to the LLM for every document.
        llm_model: Model name forwarded to ``fetch_completion_openai``.
        max_tokens: Token limit forwarded to ``fetch_completion_openai``.
        temperature: Sampling temperature forwarded to
            ``fetch_completion_openai``.

    Returns:
        None. Any exception is logged through ``app_logger`` and not
        re-raised.
    """
    try:
        # Select the source collection explicitly. This function switches the
        # active collection while writing, so a retry after a failure must not
        # read from whichever collection the previous run left selected.
        mongo.use_collection("company_disclosures")
        documents = mongo.read_documents(get_data_by)

        if not documents:
            app_logger.info("There are no documents in the database to extract entities and relationships from.")
            return

        # Keep only the documents that are actually sent to the LLM. The
        # results are paired with this list, never with `documents`, so a
        # document without content cannot shift the pairing.
        documents_with_content = [
            document for document in documents if document.get("content")
        ]

        if not documents_with_content:
            app_logger.info("No documents with content to process")
            return

        # asyncio.gather returns results in the order the awaitables were passed.
        results = await asyncio.gather(*(
            fetch_completion_openai(
                model=llm_model,
                user_prompt=document["content"],
                system_prompt=system_prompt,
                history_messages=None,
                max_tokens=max_tokens,
                temperature=temperature
            )
            for document in documents_with_content
        ))

        for document, result in zip(documents_with_content, results):
            entities, relationships = get_formatted_entities_and_relationships(result)

            app_logger.info(f"Inserting {len(entities)} entity(ies) into the database for document ID: {document['_id']}")
            mongo.use_collection("entities")
            for entity in entities:
                mongo.create_document(entity)

            app_logger.info(f"Inserting {len(relationships)} relationship(s) into the database for document ID: {document['_id']}")
            mongo.use_collection("relationships")
            for relationship in relationships:
                mongo.create_document(relationship)

            app_logger.info(f"Updating the is_parsed status of the document with ID: {document['_id']}")
            mongo.use_collection("company_disclosures")
            mongo.update_document(
                {"_id": document["_id"]},
                {"is_parsed": True}
            )
            app_logger.info(f"The document with ID {document['_id']} has been successfully processed, along with the upload of {len(entities)} entity(ies) and {len(relationships)} relationship(s).")
    except Exception as e:
        app_logger.error(f"Error occur while processing company data: {e}")

In [None]:
# Parse a single, not-yet-parsed prospectus section.
# `llm_model` is passed explicitly here, overriding the function's default.
await process_company_data(
   llm_model= "gpt-4o-mini",
   get_data_by = {"is_parsed": False, "name": "ADB_PROSPECTUS_SECTION_1"},
   system_prompt = entities_relationships_parsing_system_prompt
)

### Add source_entity_id and target_entity_id for each relationship

In [None]:
# TODO: Write a better logic to update the source_entity_id and target_entity_id in the relationships collection
# try:
#    mongo.use_collection("relationships")
#    relationships = mongo.read_documents()
   
#    enriched_relationships = []
   
#    mongo.use_collection("entities") 
   
#    for relationship in relationships:
#      source_name = relationship.get("source")
#      target_name = relationship.get("target")

#      source_entities = mongo.read_documents({"name": source_name})
#      if source_entities and isinstance(source_entities, list):
#         source_entity_id = str(source_entities[0].get("_id"))

#      target_entities = mongo.read_documents({"name": target_name})
#      if target_entities and isinstance(target_entities, list):
#         target_entity_id = str(target_entities[0].get("_id"))

#      if not source_entity_id:
#        app_logger.warning(f"Source entity not found for: {source_name}")
#      if not target_entity_id:
#        app_logger.warning(f"Target entity not found for: {target_name}")

#      relationship["source_entity_id"] = source_entity_id if source_entity_id else ""
#      relationship["target_entity_id"] = target_entity_id if target_entity_id else ""
   
#    mongo.use_collection("relationships") 
#    for relationship in relationships:
#       mongo.update_document(
#          {"_id": relationship["_id"]},
#          {
#             "source_entity_id": relationship["source_entity_id"],
#             "target_entity_id": relationship["target_entity_id"]
#          }
#       )

#    app_logger.info(f"Updated {len(relationships)} relationship(s) with source and target entity IDs.")
# except Exception as e:
#    app_logger.error(f"Error while reading relationships: {e}")


### Deduplication of entities and relationships

In [None]:
# TODO

## Uploading Entities to Pinecone

### Read and format entities that have not been uploaded to Pinecone

In [None]:
# Entities whose inserted_into_vectordb_at field is still empty, and their
# vector-database representation. Both start empty so a failed read never
# leaves results from an earlier run of this cell in place.
entities = []
formatted_entities = []

try:
    mongo.use_collection("entities")
    entities = mongo.read_documents({"inserted_into_vectordb_at": ""})
    formatted_entities = [get_formatted_entity_for_vectordb(entity) for entity in entities]
    app_logger.info(f"Read {len(formatted_entities)} entity(ies) that have not been formatted from the database.")
except Exception as e:
    # Reset both lists so the timestamp-update cell cannot mark entities that
    # were read but never formatted.
    entities, formatted_entities = [], []
    # The f prefix was missing, so the literal text "{e}" was logged.
    app_logger.error(f"Error while reading entity(ies): {e}")

### Upload to Pinecone and update the entities' inserted_into_vectordb_at field

In [None]:
# Send the formatted entities to Pinecone.
# NOTE(review): a failure is only logged, so the next cell still stamps
# inserted_into_vectordb_at on every entity - confirm that is intended.
try:
   await pinecone.create_vectors(formatted_entities)
except Exception as e:
   app_logger.error(f"Error while uploading vectors: {e}")

In [None]:
# Record when each entity was sent to the vector database.
try:
    # Select the collection explicitly instead of relying on whichever one an
    # earlier cell left active.
    mongo.use_collection("entities")
    for entity in entities:
        mongo.update_document(
            {"_id": entity["_id"]},
            {"inserted_into_vectordb_at": get_formatted_current_datetime("Asia/Kuala_Lumpur")}
        )
    app_logger.info(f"Updated {len(entities)} entity(ies) with inserted_into_vectordb_at field.")
except Exception as e:
    # The f prefix was missing, so the literal text "{e}" was logged.
    app_logger.error(f"Error while updating entity(ies): {e}")

## Uploading Entities and Relationships to Neo4j

### Format and upload entities

In [None]:
def get_entities_for_graphdb_by_type(entity_type: str) -> list:
    """Return entities of one type whose inserted_into_graphdb_at is still empty.

    Args:
        entity_type: Entity type to look up; it is passed through
            get_normalized_string before being used in the query.

    Returns:
        The documents read from the "entities" collection, or an empty list
        when nothing matches or an error occurs (errors are logged).
    """
    try:
        mongo.use_collection("entities")
        pending_filter = {
            "type": get_normalized_string(entity_type),
            "inserted_into_graphdb_at": "",
        }
        pending = mongo.read_documents(pending_filter)

        if pending:
            app_logger.info(f"Read {len(pending)} entity(ies) for type {get_normalized_string(entity_type)} that have not been uploaded to Neo4j.")
            return pending

        app_logger.info(f"No entities found for type: {entity_type} that have not been uploaded to Neo4j.")
        return []
    except Exception as err:
        app_logger.error(f"Error while getting entities by type: {err}")
        return []

In [None]:
def upload_entities_to_graphdb_by_type(entity_type: str):
    """Insert all pending entities of one type into Neo4j and timestamp them.

    Args:
        entity_type: Entity type to upload; its normalized form is used as
            the label passed to neo4j.insert_entities.

    Returns:
        None. Does nothing when there are no pending entities; errors are
        logged rather than raised.
    """
    try:
        pending = get_entities_for_graphdb_by_type(entity_type)
        if not pending:
            return

        graph_payload = [get_formatted_entity_for_graphdb(item) for item in pending]
        neo4j.insert_entities(entities=graph_payload, label=get_normalized_string(entity_type))

        # Stamp each entity's inserted_into_graphdb_at field after the insert.
        mongo.use_collection("entities")
        for item in pending:
            mongo.update_document(
                {"_id": item["_id"]},
                {"inserted_into_graphdb_at": get_formatted_current_datetime("Asia/Kuala_Lumpur")},
            )

        app_logger.info(f"Uploaded {len(pending)} entity(ies) of type {get_normalized_string(entity_type)} to Neo4j.")
    except Exception as err:
        app_logger.error(f"Error while uploading entities to Neo4j: {err}")

In [None]:
# Upload entities to Neo4j, one entity type at a time.

try:
    entity_types = ["PERSON", "COMPANY", "PRODUCT", "SERVICE", "PLACE"]
    for entity_type in entity_types:
        upload_entities_to_graphdb_by_type(entity_type)
except Exception as e:
    # The f prefix was missing, so the literal text "{e}" was logged.
    app_logger.error(f"Error while uploading entity(ies) to neo4j: {e}")

### Format and upload relationships

In [None]:
def get_relationships_for_graphdb() -> list:
    """Return relationships whose inserted_into_graphdb_at is still empty.

    Returns:
        The documents read from the "relationships" collection, or an empty
        list when nothing matches or an error occurs (errors are logged).
    """
    try:
        mongo.use_collection("relationships")
        pending = mongo.read_documents({"inserted_into_graphdb_at": ""})

        if pending:
            app_logger.info(f"Read {len(pending)} relationship(s) that have not been uploaded to Neo4j.")
            return pending

        app_logger.info("There are no relationships that have not been uploaded to Neo4j.")
        return []
    except Exception as err:
        app_logger.error(f"Error while getting relationship(s): {err}")
        return []

In [None]:
def upload_relationships_to_graphdb():
    """Insert all pending relationships into Neo4j and timestamp them.

    Returns:
        None. Does nothing when there are no pending relationships; errors
        are logged rather than raised.
    """
    try:
        pending = get_relationships_for_graphdb()
        if not pending:
            return

        graph_payload = [get_formatted_relationship_for_graphdb(item) for item in pending]
        neo4j.insert_relationships(graph_payload)

        # Stamp each relationship's inserted_into_graphdb_at field after the insert.
        mongo.use_collection("relationships")
        for item in pending:
            mongo.update_document(
                {"_id": item["_id"]},
                {"inserted_into_graphdb_at": get_formatted_current_datetime("Asia/Kuala_Lumpur")},
            )

        app_logger.info(f"Uploaded {len(pending)} relationship(s) to Neo4j.")
    except Exception as err:
        app_logger.error(f"Error while uploading relationships to Neo4j: {err}")

In [None]:
# Upload relationships to Neo4j

try:
    upload_relationships_to_graphdb()
except Exception as e:
    # The f prefix was missing, so the literal text "{e}" was logged.
    app_logger.error(f"Error while uploading relationship(s) to neo4j: {e}")

## Setup query functionality

In [None]:
try:
   results = await pinecone.get_similar_results_with_namespace(
      batch_queries=[
         {"namespace": "company", "query_texts": ["autocount dotcom"]},
         {"namespace": "person", "query_texts": ["liew soung yue", "chin peng"]}
      ],
      top_k=3
   )
   pprint(results)
except Exception as e:
   app_logger.error(f"Error while getting similar results: {e}")

### Initialize GraphQuerySystem

In [None]:
# Load the raw ontology for the graph query system.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open("ontology.json", encoding="utf-8") as f:
    raw_ontology = json.load(f)

In [None]:
# try:
#    query_system = GraphQuerySystem(ontology=raw_ontology)
# except Exception as e:
#    app_logger.error(f"Error while creating graph query system: {e}")

In [None]:
# query_result = await query_system.handle_request("Autocount Dotcom, liew soung yue, chin peng")
# app_logger.info(query_result)

### Setup chat interface with Gradio


In [None]:
def gradio_chat_interface(user_input: str, history: list[dict[str,str]]):
    """
    Generator callback that drives the Gradio chat window.

    Each step yields a ``(history, "")`` pair; the empty string is the second
    output value. The incoming ``history`` list (or a new list when it is
    empty or None) is extended in place with three messages: the user's
    input, a fixed "processing" notice, and finally the answer from
    ``query_system.handle_request`` or a generic error message when that
    call raises.
    """
    conversation = history or []

    def _push(role, content):
        # Append one chat message and return the pair that gets yielded.
        conversation.append({"role": role, "content": content})
        return conversation, ""

    # Show the user's message immediately.
    yield _push("user", user_input)

    # Show a processing notice while the query runs.
    yield _push("assistant", "Processing user query...")

    # Placeholder that is filled with either the answer or the error text.
    conversation.append({"role": "assistant", "content": ""})
    try:
        # NOTE(review): a commented-out cell in this notebook awaits
        # handle_request - confirm it returns a plain value here.
        answer = query_system.handle_request(request_data = user_input)
        conversation[-1]["content"] = answer
        yield conversation, ""
    except Exception as err:
        app_logger.error(f"Error while processing response: {err}")
        conversation[-1]["content"] = "An error occurred while processing your request."
        yield conversation, ""

In [None]:
def launch_chatbot():
    """Build the Gradio chat UI and launch it with debug enabled.

    Submitting the textbox calls gradio_chat_interface with the textbox and
    chatbot values, and writes its outputs back to the chatbot and textbox.
    """
    with gradio.Blocks() as demo:
        chat_window = gradio.Chatbot(type="messages")
        prompt_box = gradio.Textbox(placeholder="Type your message here...")

        prompt_box.submit(
            gradio_chat_interface,
            inputs=[prompt_box, chat_window],
            outputs=[chat_window, prompt_box],
        )

    demo.launch(debug=True)

In [None]:
# Start the chat UI.
# NOTE(review): gradio_chat_interface reads the global `query_system`, which
# is only created in a later cell - make sure that cell has run first.
launch_chatbot()

In [None]:
# Manual check: look up one PERSON node by name using a parameterized query.
cypher_query = "MATCH (n:PERSON) WHERE n.name = $name RETURN n"
parameters = {'name': 'LIEW SOUNG YUE'}

result = neo4j.run_query(cypher_query, parameters)

pprint(result)

In [None]:
# Load the raw ontology passed to GraphQuerySystem below.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open("ontology.json", encoding="utf-8") as f:
    raw_ontology = json.load(f)

In [None]:
# Create the graph query system from the raw ontology.
# A failure is only logged, which leaves `query_system` unassigned.
try:
    query_system = GraphQuerySystem(ontology=raw_ontology)
except Exception as err:
    app_logger.error(f"Error while creating graph query system: {err}")

In [None]:
# Manual check: send one natural-language request through the query system
# and log whatever it returns.
query_result = query_system.handle_request("Retrieve the name and description of the person named LIEW SOUNG YUE.")
app_logger.info(query_result)

In [None]:
# Manual check: parse a hand-written JSON payload (a Cypher query plus its
# parameters) and pass the resulting dict to run_query_for_text2cypher_agent.
# `test` is first the JSON string and is then rebound to the parsed dict.
test = "{\"query\": \"MATCH (p:PERSON {name: $name}) RETURN p.name, p.description\", \"parameters\": {\"name\": \"LIEW SOUNG YUE\"}}"
test = json.loads(test)
pprint(test)
result = neo4j.run_query_for_text2cypher_agent(test)

pprint(result)