## Initializing Project

In [1]:
import logging
import os
import json
import asyncio
import nest_asyncio
from dotenv import load_dotenv
from pathlib import Path
from pprint import pprint
from typing import Any

In [2]:
from prompts import PROMPT

from ogmyrag.my_logging import configure_logger

from ogmyrag.storage import MongoDBStorage

from ogmyrag.graph_construction import (
   get_formatted_company_data, 
   get_formatted_entities_relationships_parsing_query, 
   get_formatted_entities_and_relationships
)

from ogmyrag.llm import fetch_completion_openai

from ogmyrag.util import get_formatted_ontology, get_formatted_report_definitions, get_formatted_current_datetime

In [3]:
# Patch asyncio so an event loop can be re-entered: Jupyter already runs its own
# loop, and this notebook later calls asyncio.run(...) from inside a cell.

nest_asyncio.apply()

In [4]:
# Create the two loggers used throughout the notebook:
#   - app_logger:   general application messages, written to logs/app.log
#   - mongo_logger: MongoDB messages, written to logs/mongodb.log with console output disabled

app_logger = configure_logger(
    name='og-myrag',
    log_level=logging.DEBUG,
    log_file='logs/app.log',
)
mongo_logger = configure_logger(
    name='mongodb',
    log_level=logging.DEBUG,
    log_file='logs/mongodb.log',
    to_console=False,
)

In [5]:
# Read configuration from the .env file; values in .env take precedence over
# variables already present in the process environment (override=True).

load_dotenv(override=True)

mongo_db_uri = os.getenv("MONGO_DB_URI", "")
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# Report every missing setting (the notebook keeps running; later cells will fail
# if a value is actually required).
for env_name, env_value in (
    ("MONGO_DB_URI", mongo_db_uri),
    ("OPENAI_API_KEY", openai_api_key),
):
    if not env_value:
        app_logger.error(f"Please set the {env_name} environment variable.")

In [6]:
# Open the MongoDB connection and select the database / collection that the
# following cells work against by default.

try:
    mongo = MongoDBStorage(mongo_db_uri)
    mongo.use_database("ogmyrag")
    mongo.use_collection("company_disclosures")
except Exception as exc:
    # The failure is only logged; `mongo` may be left undefined for later cells.
    app_logger.error(f"Could not connect to MongoDB: {exc}")

## Uploading Documents to MongoDB

In [7]:
# Development helper: read every .txt file from a local folder and upload it to
# MongoDB, skipping files whose document name is already stored.

folder_name = "adb_prospectus"
folder_path = Path.cwd() / folder_name

if folder_path.is_dir():
    txt_files = list(folder_path.glob("*.txt"))
    if not txt_files:
        app_logger.info("No .txt files found in the folder.")

    for txt_file in txt_files:
        current_data = get_formatted_company_data(
            txt_file.read_text(encoding='utf-8'),
            txt_file.stem,
            "prospectus",
            "Autocount Dotcom Berhad",
        )
        document_name = current_data["name"]

        # Skip documents that were uploaded by a previous run.
        existing_documents = mongo.read_documents({"name": document_name})
        if existing_documents:
            app_logger.info(f"Document with name '{document_name}' already exists in the database.")
            continue

        try:
            document_id = mongo.create_document(current_data)
            app_logger.info(f"Inserted document with name '{document_name}' and id '{document_id}' into the database.")
        except Exception as exc:
            app_logger.error(f"Error inserting document with name '{document_name}': {exc}")
else:
    app_logger.info(f"Folder '{folder_name}' not found in project root.")
    app_logger.info("Project is terminated")

2025-04-16 12:52:15,748 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_1' already exists in the database.
2025-04-16 12:52:15,888 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_10' already exists in the database.
2025-04-16 12:52:15,973 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_2' already exists in the database.
2025-04-16 12:52:16,113 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_3' already exists in the database.
2025-04-16 12:52:16,341 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_4' already exists in the database.
2025-04-16 12:52:16,480 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_5' already exists in the database.
2025-04-16 12:52:16,543 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_6' already exists in the database.
2025-04-16 12:52:16,824 - og-myrag - INFO - Document with name 'ADB_PROSPECTUS_SECTION_7A' already exists in the database.
2025-04-16 12:52:17,24

## Entities and Relationships Parsing

### Load word definitions from MongoDB

In [8]:
# Fetch the prospectus definitions document for the company and format its
# JSON-encoded "content" field for use in the system prompt.
try:
    raw_definitions = mongo.read_documents(
        {
            "type": "PROSPECTUS DEFINITIONS",
            "from_company": "AUTOCOUNT DOTCOM BERHAD",
        }
    )
    # Only the first matching document is used.
    formatted_definitions = get_formatted_report_definitions(
        json.loads(raw_definitions[0]["content"])
    )
except Exception as exc:
    app_logger.error(f"Error getting formatted word definitions: {exc}")

### Load ontology

In [9]:
# Read the ontology definition from disk. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding (the other
# text inputs in this notebook are read as UTF-8 as well).
with open("ontology.json", encoding="utf-8") as f:
    raw_ontology = json.load(f)

# Formatting does not need the file handle, so it happens after the file is closed.
formatted_ontology = get_formatted_ontology(raw_ontology)

### Prepare system prompt

In [10]:
# Fill the parsing prompt template with the formatted ontology and the
# formatted word definitions loaded in the previous cells.
entities_relationships_parsing_system_prompt = get_formatted_entities_relationships_parsing_query(
    prompt_template=PROMPT["ENTITIES_RELATIONSHIPS_PARSING"],
    ontology=formatted_ontology,
    source_txt_definitions=formatted_definitions,
)

# Log the complete prompt at DEBUG level for inspection.
app_logger.debug(
    f"Entities and relationships parsing system prompt: {entities_relationships_parsing_system_prompt}"
)

2025-04-16 12:52:23,668 - og-myrag - DEBUG - Entities and relationships parsing system prompt: 
Goal: 
   You are an information extraction system grounded in a specific ontology. You will be provided with a piece of text and an ontology definition consisting of a list of valid entity classes and valid relationships between those classes. Your task is to extract only the entities and relationships that match the ontology exactly.

General Rules:
   1. Extract only entities that participate in at least one defined relationship. Entities that do not participate in any valid relationship defined in the ontology must not be included in the output.

   2. Do not create or infer any entity types or relationship types that are not explicitly defined in the ontology.

   3. Use the exact relationship names and entity classes as specified in the ontology.
   
   4. During actual parsing, you will be provided with key-value pairs that help interpret the source text. You must apply these mappings

### Extract entities and relationships from the documents and upload them to MongoDB

In [None]:
async def process_company_data(
    get_data_by: dict[str, Any],
    system_prompt: str,
    llm_model: str = "gpt-4.1-mini",
    max_tokens: int = 16384,
    temperature: float = 0.3
):
    """Extract entities and relationships from stored documents and upload them.

    Documents matching ``get_data_by`` are read from the "company_disclosures"
    collection. For every document that has a non-empty "content" field, one LLM
    completion is requested (all requests run concurrently). Each response is
    parsed into entities and relationships, which are stamped with an
    ``inserted_at`` time and written to the "entities" and "relationships"
    collections. The source document is then updated with ``{"is_parsed": True}``.

    Args:
        get_data_by: Filter passed to ``mongo.read_documents`` to select documents.
        system_prompt: System prompt sent with every completion request.
        llm_model: Model name forwarded to ``fetch_completion_openai``.
        max_tokens: Token limit forwarded to ``fetch_completion_openai``.
        temperature: Sampling temperature forwarded to ``fetch_completion_openai``.

    Returns:
        None. Any exception is caught and logged via ``app_logger.error``.
    """
    try:
        # Other cells switch the active collection on the shared `mongo` object,
        # so select the source collection explicitly instead of relying on
        # whatever was active last.
        mongo.use_collection("company_disclosures")
        documents = mongo.read_documents(get_data_by)

        if not documents:
            app_logger.info("There are no documents in the database to extract entities and relationships from.")
            return

        # Keep the exact list of documents that get a request, so each result can
        # be paired with the document that produced it. Zipping results against
        # the unfiltered list would misattribute them whenever a document has no
        # content.
        documents_with_content = [
            document for document in documents if document.get("content")
        ]

        if not documents_with_content:
            app_logger.info("No documents with content to process")
            return

        # asyncio.gather returns results in the same order as the awaitables.
        results = await asyncio.gather(*(
            fetch_completion_openai(
                model=llm_model,
                user_prompt=document["content"],
                system_prompt=system_prompt,
                history_messages=None,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            for document in documents_with_content
        ))

        for document, result in zip(documents_with_content, results):
            entities, relationships = get_formatted_entities_and_relationships(result)
            current_time = get_formatted_current_datetime("Asia/Kuala_Lumpur")

            mongo.use_collection("entities")
            for entity in entities:
                entity["inserted_at"] = current_time
                mongo.create_document(entity)

            mongo.use_collection("relationships")
            for relationship in relationships:
                relationship["inserted_at"] = current_time
                mongo.create_document(relationship)

            # Switch back to the source collection before flagging the document.
            mongo.use_collection("company_disclosures")
            mongo.update_document(
                {"_id": document["_id"]},
                {"is_parsed": True}
            )

            app_logger.info(f"The document with ID {document['_id']} has been successfully processed, along with the upload of {len(entities)} entity(ies) and {len(relationships)} relationship(s).")
    except Exception as e:
        app_logger.error(f"Error occur while processing company data: {e}")

In [16]:
# Process the not-yet-parsed document named ADB_PROSPECTUS_SECTION_1 using the
# system prompt prepared above.
asyncio.run(process_company_data(
   get_data_by = {"is_parsed": False, "name": "ADB_PROSPECTUS_SECTION_1"},
   system_prompt = entities_relationships_parsing_system_prompt
))

2025-04-16 12:53:35,267 - og-myrag - DEBUG - Sending query to gpt-4.1-mini ...
2025-04-16 12:53:35,283 - og-myrag - DEBUG - Current conversation:

system: 
Goal: 
   You are an information extraction system grounded in a specific ontology. You will be provided with a piece of text and an ontology definition consisting of a list of valid entity classes and valid relationships between those classes. Your task is to extract only the entities and relationships that match the ontology exactly.

General Rules:
   1. Extract only entities that participate in at least one defined relationship. Entities that do not participate in any valid relationship defined in the ontology must not be included in the output.

   2. Do not create or infer any entity types or relationship types that are not explicitly defined in the ontology.

   3. Use the exact relationship names and entity classes as specified in the ontology.
   
   4. During actual parsing, you will be provided with key-value pairs that h

In [19]:
# Log every document stored in the "entities" collection
mongo.use_collection("entities")
entities = mongo.read_documents()
for entity in entities:
    app_logger.info(entity)

app_logger.info("\n\n\n")
# Log every document stored in the "relationships" collection
# NOTE(review): this cell leaves `mongo` pointing at "relationships"; cells that
# expect "company_disclosures" must switch back before reading.
mongo.use_collection("relationships")
relationships = mongo.read_documents()
for relationship in relationships:
    app_logger.info(relationship)


2025-04-16 13:02:44,180 - og-myrag - INFO - {'_id': ObjectId('67ff37e6879ff5edf381cd2a'), 'name': 'AUTOCOUNT DOTCOM BERHAD', 'type': 'COMPANY', 'description': 'Autocount Dotcom Berhad is a registered company with registration number 202201006885 (1452582-U). It has a corporate directory including a board of directors, company secretary, registered office, principal place of business, and various professional service providers.', 'created_at': '2025-04-16 12:53:58', 'inserted_at': '2025-04-16 12:53:58', 'last_modified_at': '2025-04-16 12:53:58'}
2025-04-16 13:02:44,181 - og-myrag - INFO - {'_id': ObjectId('67ff37e8879ff5edf381cd2b'), 'name': 'CHOO CHIN PENG', 'type': 'PERSON', 'description': 'Choo Chin Peng is an Executive Director and Chairman of Autocount Dotcom Berhad.', 'created_at': '2025-04-16 12:53:58', 'inserted_at': '2025-04-16 12:53:58', 'last_modified_at': '2025-04-16 12:53:58'}
2025-04-16 13:02:44,181 - og-myrag - INFO - {'_id': ObjectId('67ff37ea879ff5edf381cd2c'), 'name': 