## Initializing Project

In [None]:
import logging
import os
import json
import asyncio
import nest_asyncio
from dotenv import load_dotenv
from pathlib import Path
from pprint import pprint
from typing import Any

In [None]:
from prompts import PROMPT

from ogmyrag.my_logging import configure_logger

from ogmyrag.storage import (
   MongoDBStorage,
   PineconeStorage
)

from ogmyrag.graph_construction import (
   get_formatted_company_data, 
   get_formatted_entities_relationships_parsing_query, 
   get_formatted_entities_and_relationships,
   get_formatted_entity_for_vectordb
)

from ogmyrag.llm import fetch_completion_openai

from ogmyrag.util import (
   get_formatted_ontology, 
   get_formatted_report_definitions,
   get_formatted_current_datetime
)

In [None]:
# Patch event loop to support re-entry in Jupyter.
# Later cells in this notebook use top-level `await`, so the loop must
# tolerate being re-entered.

nest_asyncio.apply()

In [None]:
# Set up logging
#
# Three DEBUG-level loggers, each writing to its own file under logs/.
# The two storage loggers are created with to_console=False; the application
# logger relies on configure_logger's default for that option.

app_logger = configure_logger(
    name="og-myrag",
    log_level=logging.DEBUG,
    log_file="logs/app.log",
)
mongo_logger = configure_logger(
    name="mongodb",
    log_level=logging.DEBUG,
    log_file="logs/mongodb.log",
    to_console=False,
)
pinecone_logger = configure_logger(
    name="pinecone",
    log_level=logging.DEBUG,
    log_file="logs/pinecone.log",
    to_console=False,
)

In [None]:
# Load environment variables
#
# Values from the .env file take precedence over variables already present in
# the process environment (override=True).

load_dotenv(override=True)

# Storage / LLM credentials; missing values fall back to an empty string.
mongo_db_uri = os.getenv("MONGO_DB_URI", "")
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# Pinecone index settings.
pinecone_api_key = os.getenv("PINECONE_API_KEY", "")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT", "")
pinecone_cloud = os.getenv("PINECONE_CLOUD", "")
pinecone_metric = os.getenv("PINECONE_METRIC", "")
# NOTE(review): os.getenv returns a string (or None when unset); confirm that
# PineconeStorage converts pinecone_dimensions to an integer itself.
pinecone_dimensions = os.getenv("PINECONE_DIMENSIONS")

# Missing settings are only reported; execution continues with the next cell.
if not mongo_db_uri:
    app_logger.error("Please set the MONGO_DB_URI environment variable.")

if not openai_api_key:
    app_logger.error("Please set the OPENAI_API_KEY environment variable.")

if not all((
    pinecone_api_key,
    pinecone_environment,
    pinecone_cloud,
    pinecone_metric,
    pinecone_dimensions,
)):
    app_logger.error("Please set the PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_CLOUD, PINECONE_METRIC, and PINECONE_DIMENSIONS environment variables.")

In [None]:
# Connect to MongoDB
#
# Creates the shared `mongo` handle used by the rest of the notebook and
# selects the "ogmyrag" database and the "company_disclosures" collection.
# A failure is logged rather than raised, so `mongo` may be left undefined.

try:
    mongo = MongoDBStorage(mongo_db_uri)
    mongo.use_database("ogmyrag")
    mongo.use_collection("company_disclosures")
except Exception as exc:
    app_logger.error(f"Could not connect to MongoDB: {exc!s}")

In [None]:
# Connect to Pinecone
#
# Creates the shared `pinecone` handle for the "ogmyrag" index from the
# settings loaded from the environment. A failure is logged rather than
# raised, so `pinecone` may be left undefined.

try:
    pinecone = PineconeStorage(
        index_name="ogmyrag",
        pinecone_api_key=pinecone_api_key,
        pinecone_environment=pinecone_environment,
        pinecone_cloud=pinecone_cloud,
        pinecone_metric=pinecone_metric,
        pinecone_dimensions=pinecone_dimensions,
        openai_api_key=openai_api_key,
    )
except Exception as exc:
    app_logger.error(f"Could not connect to Pinecone: {exc!s}")

## Uploading Documents to MongoDB

In [None]:
# For development purposes, we will use the local file system to read the files and upload to mongodb
# TODO: Support formatting for definitions
#
# Every .txt file in <cwd>/adb_prospectus is formatted as a "prospectus"
# document of "Autocount Dotcom Berhad" and inserted unless a document with
# the same name already exists. Failures are handled per file so that one bad
# lookup or insert does not abort the remaining uploads.

folder_name = "adb_prospectus"
folder_path = Path.cwd() / folder_name

if not folder_path.is_dir():
    app_logger.info(f"Folder '{folder_name}' not found in project root.")
    app_logger.info("Project is terminated")
else:
    txt_files = list(folder_path.glob("*.txt"))
    if not txt_files:
        app_logger.info("No .txt files found in the folder.")

    for txt_file in txt_files:
        current_data = get_formatted_company_data(
            txt_file.read_text(encoding='utf-8'),
            txt_file.stem,
            "prospectus",
            "Autocount Dotcom Berhad"
        )
        document_name = current_data["name"]

        # The duplicate check talks to the database as well, so it is guarded
        # just like the insert below.
        try:
            existing_documents = mongo.read_documents({"name": document_name})
        except Exception as e:
            app_logger.error(f"Error checking for existing document with name '{document_name}': {str(e)}")
            continue

        if existing_documents:
            app_logger.info(f"Document with name '{document_name}' already exists in the database.")
            continue

        try:
            document_id = mongo.create_document(current_data)
            app_logger.info(f"Inserted document with name '{document_name}' and id '{document_id}' into the database.")
        except Exception as e:
            app_logger.error(f"Error inserting document with name '{document_name}': {str(e)}")

## Entities and Relationships Parsing

### Load word definitions from mongodb

In [None]:
# Fetch the prospectus definitions document for the company and format its
# JSON content for use in the system prompt.
# On any failure `formatted_definitions` is left unset and the problem is
# logged; the prompt cell below needs it, so fix the cause and re-run.
try:
    raw_definitions = mongo.read_documents({
        "type": "PROSPECTUS DEFINITIONS",
        "from_company": "AUTOCOUNT DOTCOM BERHAD"
    })
    if not raw_definitions:
        # Report the empty result explicitly instead of letting the index
        # lookup below fail with an opaque "list index out of range".
        app_logger.error("Error getting formatted word definitions: no PROSPECTUS DEFINITIONS document found for AUTOCOUNT DOTCOM BERHAD")
    else:
        # Only the first matching document is used.
        formatted_definitions = get_formatted_report_definitions(json.loads(raw_definitions[0]["content"]))
except Exception as e:
    app_logger.error(f"Error getting formatted word definitions: {str(e)}")

### Load ontology

In [None]:
# Read the ontology definition from ontology.json in the working directory.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("ontology.json", encoding="utf-8") as f:
    raw_ontology = json.load(f)

# Formatting does not need the file handle, so it runs after the file is closed.
formatted_ontology = get_formatted_ontology(raw_ontology)

### Prepare system prompt

In [None]:
# Build the system prompt by filling the ENTITIES_RELATIONSHIPS_PARSING
# template with the formatted ontology and the formatted word definitions.
entities_relationships_parsing_system_prompt = get_formatted_entities_relationships_parsing_query(
    prompt_template=PROMPT["ENTITIES_RELATIONSHIPS_PARSING"],
    ontology=formatted_ontology,
    source_txt_definitions=formatted_definitions,
)

# Record the complete prompt at DEBUG level so it can be inspected in the log.
app_logger.debug(
    f"Entities and relationships parsing system prompt: {entities_relationships_parsing_system_prompt}"
)

### Extract entities and relationships from the documents and upload them to MongoDB

In [None]:
# TODO: Add caching mechanism to avoid reprocessing the same document if error occur
async def process_company_data(
    get_data_by: dict[str, Any],
    system_prompt: str,
    llm_model: str = "gpt-4.1-mini",
    max_tokens: int = 16384,
    temperature: float = 0.3
):
    """Extract entities and relationships from company disclosures and store them.

    Documents matching ``get_data_by`` are read from the "company_disclosures"
    collection. The content of every document that has any is sent to the LLM
    concurrently. For each successful completion the parsed entities are
    inserted into "entities", the parsed relationships into "relationships",
    and the source document is updated with ``is_parsed: True``.

    Args:
        get_data_by: Query filter passed to ``mongo.read_documents``.
        system_prompt: System prompt sent with every LLM request.
        llm_model: Model name passed to ``fetch_completion_openai``.
        max_tokens: Token limit passed to ``fetch_completion_openai``.
        temperature: Sampling temperature passed to ``fetch_completion_openai``.

    Returns:
        None. Errors are logged through ``app_logger`` instead of being raised.
    """
    try:
        # The shared `mongo` handle keeps whichever collection was selected
        # last (this function and the vector-upload cells both switch it), so
        # select the source collection explicitly before reading.
        mongo.use_collection("company_disclosures")
        documents = mongo.read_documents(get_data_by)

        if not documents:
            app_logger.info("There are no documents in the database to extract entities and relationships from.")
            return

        # Only documents with content are sent to the LLM. Keep exactly this
        # list so each completion is paired with the document it came from.
        documents_with_content = [
            document for document in documents if document.get("content")
        ]

        if not documents_with_content:
            app_logger.info("No documents with content to process")
            return

        # return_exceptions=True keeps one failed request from discarding the
        # completions of every other document.
        results = await asyncio.gather(
            *(
                fetch_completion_openai(
                    model=llm_model,
                    user_prompt=document["content"],
                    system_prompt=system_prompt,
                    history_messages=None,
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                for document in documents_with_content
            ),
            return_exceptions=True
        )

        for document, result in zip(documents_with_content, results):
            if isinstance(result, BaseException):
                # Leave the document unparsed so a later run can retry it.
                app_logger.error(f"Error occur while processing company data: LLM request failed for document ID {document['_id']}: {result}")
                continue

            entities, relationships = get_formatted_entities_and_relationships(result)

            app_logger.info(f"Inserting {len(entities)} entity(ies) into the database for document ID: {document['_id']}")
            mongo.use_collection("entities")
            for entity in entities:
                mongo.create_document(entity)

            app_logger.info(f"Inserting {len(relationships)} relationship(s) into the database for document ID: {document['_id']}")
            mongo.use_collection("relationships")
            for relationship in relationships:
                mongo.create_document(relationship)

            app_logger.info(f"Updating the is_parsed status of the document with ID: {document['_id']}")
            mongo.use_collection("company_disclosures")
            mongo.update_document(
                {"_id": document["_id"]},
                {"is_parsed": True}
            )
            app_logger.info(f"The document with ID {document['_id']} has been successfully processed, along with the upload of {len(entities)} entity(ies) and {len(relationships)} relationship(s).")
    except Exception as e:
        app_logger.error(f"Error occur while processing company data: {e}")

In [None]:
# Run the extraction for documents named ADB_PROSPECTUS_SECTION_1 whose
# is_parsed flag is False, using gpt-4o-mini instead of the function's default
# model and the system prompt built above.
await process_company_data(
   llm_model= "gpt-4o-mini",
   get_data_by = {"is_parsed": False, "name": "ADB_PROSPECTUS_SECTION_1"},
   system_prompt = entities_relationships_parsing_system_prompt
)

## Uploading entities to Pinecone

### Read and format entities that have not been uploaded to Pinecone

In [None]:
# Entities whose inserted_into_vectordb_at field is an empty string are read
# from the "entities" collection and converted for the vector database.
# Both names are initialised up front so the following cells always find them
# defined, even when the read fails.
formatted_entities = []
entities = []

try:
    mongo.use_collection("entities")
    entities = mongo.read_documents({"inserted_into_vectordb_at": ""})
    formatted_entities = [
        get_formatted_entity_for_vectordb(entity) for entity in entities
    ]
    app_logger.info(f"Read {len(formatted_entities)} entity(ies) that have not been formatted from the database.")
except Exception as e:
    # Reset both lists together so a partial failure cannot leave entities
    # queued for a timestamp update when nothing was prepared for upload.
    entities = []
    formatted_entities = []
    app_logger.error(f"Error while reading entity(ies): {e}")

### Upload to Pinecone and update the entities' inserted_into_vectordb_at status

In [None]:
try:
   await pinecone.create_vectors(formatted_entities)
except Exception as e:
   app_logger.error(f"Error while uploading vectors: {e}")

In [None]:
# Stamp every entity read earlier with the current Asia/Kuala_Lumpur time in
# its inserted_into_vectordb_at field.
# NOTE(review): this cell runs independently of the upload cell, so entities
# are stamped even if the upload logged an error. Check the log before running.
try:
    # update_document acts on the handle's active collection, so select it
    # explicitly rather than relying on the state left by an earlier cell.
    mongo.use_collection("entities")
    for entity in entities:
        mongo.update_document(
            {"_id": entity["_id"]},
            {"inserted_into_vectordb_at": get_formatted_current_datetime("Asia/Kuala_Lumpur")}
        )
    app_logger.info(f"Updated {len(entities)} entity(ies) with inserted_into_vectordb_at field.")
except Exception as e:
    app_logger.error(f"Error while updating entity(ies): {e}")

In [None]:
try:
   results = await pinecone.get_similar_results(query_text="ADB", namespace="COMPANY")
   pprint(results)
except Exception as e:
   app_logger.error(f"Error while getting similar results: {e}")